In [2]:
import glob
import os
import numpy as np
import pandas as pd
import nltk

In [3]:
def get_data(split):
    if split.lower() == 'train':
        folder = 'train'
    elif split.lower() == 'test':
        folder = 'test'
    else:
        raise ValueError('Invalid data split specified.')
        
    file_names = []
    text = []
    is_positive = []
    
    # read all positive files
    files = glob.glob(os.path.join("data", folder, 'pos', '*'))
    for file in files:
        head, tail = os.path.split(file)
        file_names.append(tail)
        is_positive.append(1)
        with open(file, 'rb') as open_file:
            text.append(open_file.readlines()[0])
            
    # read all negative files
    files = glob.glob(os.path.join("data", folder, 'neg', '*'))
    for file in files:
        head, tail = os.path.split(file)
        file_names.append(tail)
        is_positive.append(0)
        with open(file, 'rb') as open_file:
            text.append(open_file.readlines()[0])
            
    return pd.DataFrame(data={'file': file_names, 'text': text, 'is_positive': is_positive})


train_df = get_data('train')
train_df

Unnamed: 0,file,text,is_positive
0,0_9.txt,b'Bromwell High is a cartoon comedy. It ran at...,1
1,10000_8.txt,b'Homelessness (or Houselessness as George Car...,1
2,10001_10.txt,b'Brilliant over-acting by Lesley Ann Warren. ...,1
3,10002_7.txt,b'This is easily the most underrated film inn ...,1
4,10003_8.txt,b'This is not the typical Mel Brooks film. It ...,1
...,...,...,...
24995,9998_4.txt,"b""Towards the end of the movie, I felt it was ...",0
24996,9999_3.txt,b'This is the kind of movie that my enemies co...,0
24997,999_3.txt,"b""I saw 'Descent' last night at the Stockholm ...",0
24998,99_1.txt,"b""Some films that you pick up for a pound turn...",0


### Get only 5000 records from the original 25000 highly polar movie reviews

In [4]:
positive_reviews = train_df[:2500]   #take first 2500 reviews which are positive
negative_reviews = train_df[22500:]  #last last 2500 reviews which are negative
new_train = positive_reviews.append(negative_reviews, ignore_index=True)


In [5]:
new_train

Unnamed: 0,file,text,is_positive
0,0_9.txt,b'Bromwell High is a cartoon comedy. It ran at...,1
1,10000_8.txt,b'Homelessness (or Houselessness as George Car...,1
2,10001_10.txt,b'Brilliant over-acting by Lesley Ann Warren. ...,1
3,10002_7.txt,b'This is easily the most underrated film inn ...,1
4,10003_8.txt,b'This is not the typical Mel Brooks film. It ...,1
...,...,...,...
4995,9998_4.txt,"b""Towards the end of the movie, I felt it was ...",0
4996,9999_3.txt,b'This is the kind of movie that my enemies co...,0
4997,999_3.txt,"b""I saw 'Descent' last night at the Stockholm ...",0
4998,99_1.txt,"b""Some films that you pick up for a pound turn...",0


In [86]:
from nltk.tokenize.casual import casual_tokenize  # we use casual tokenize because this is colloquial text
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stop_words  # sklearn stop words is larger than nltk

import re

def decontracted(phrase):
    # Taken from https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
    
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase


phrase = decontracted(str(new_train['text'][20]).replace('\\', '').replace("b'", "")) # remove backslashes and replace contractions
tokens = casual_tokenize(phrase, reduce_len=True, strip_handles=True)
normalized_tokens = [x.lower() for x in tokens] #  convert to all lowercase
filtered_tokens = [x for x in normalized_tokens if x not in sklearn_stop_words] #  filter stop words
filtered_tokens = [x for x in filtered_tokens if x and x not in '- \t\n."\':[...][\\]()/[br]<>*~,;!?'] #  filter punctuations
filtered_tokens

['night',
 'listener',
 'better',
 'people',
 'generally',
 'saying',
 'weaknesses',
 'having',
 'genre',
 'identity',
 'crisis',
 'doubt',
 'think',
 'creepy',
 'atmosphere',
 'intriguing',
 'performances',
 'make',
 'thing',
 'feels',
 'like',
 'fireside',
 'happened',
 'friend',
 'friend',
 'ghost',
 'stories',
 'big',
 'complaint',
 'movie',
 'pacing',
 'slow',
 'awkward',
 'pacing',
 'deliberate',
 'unfolds',
 'movie',
 'kept',
 'realm',
 'possibility',
 'real',
 'life',
 'just',
 'sort',
 'plods',
 'alongxc',
 '2x97no',
 'flashy',
 'endings',
 'earth-shattering',
 'revelations',
 'showdown',
 'scenes',
 'thank',
 'heaven',
 'zone',
 'watching',
 'movie',
 'forget',
 'reservations',
 'expectations',
 'makes',
 'conventionally',
 'good',
 'movie',
 'williams',
 'terrific',
 'easily',
 'meets',
 'needs',
 'story',
 'plus',
 'character',
 'supposed',
 'somewhat',
 'generic',
 'everyman',
 'avatar',
 'enter',
 'story',
 'toni',
 'collette',
 'performance',
 'nominated',
 'oscar',
 'ma