In [1]:
import pyprind
import pandas as pd
import os
import numpy as np


pbar = pyprind.ProgBar(50000)
labels = {'pos': 1, 'neg': 0}
df = pd.DataFrame()
for s in ('test', 'train'):
        for l in ('pos', 'neg'):
                path = './aclImdb/%s/%s' % (s, l)
                for file in os.listdir(path):
                        with open(os.path.join(path, file), 'r', encoding='utf-8') as infile: 
                                txt = infile.read()
                        df = pd.concat([df, pd.DataFrame([[txt, labels[l]]], columns=['review', 'sentiment'])], ignore_index=True)
                        pbar.update()

df.columns = ['review', 'sentiment']    

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:11:47


In [2]:
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('./movie_data.csv', index = False)

In [3]:
df = pd.read_csv('./movie_data.csv')
df

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0
...,...,...
49995,"OK, lets start with the best. the building. al...",0
49996,The British 'heritage film' industry is out of...,0
49997,I don't even know where to begin on this one. ...,0
49998,Richard Tyler is a little boy who is scared of...,0


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
docs = np.array(['The sun is shining',
                 'The weather is sweet',
                 'The sun is shining and the weather is sweet'])
bag = count.fit_transform(docs)

In [5]:
print(count.vocabulary_)

{'the': 5, 'sun': 3, 'is': 1, 'shining': 2, 'weather': 6, 'sweet': 4, 'and': 0}


In [6]:
print(bag.toarray())

[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


In [7]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
np.set_printoptions(precision=2)
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[0.   0.43 0.56 0.56 0.   0.43 0.  ]
 [0.   0.43 0.   0.   0.56 0.43 0.56]
 [0.4  0.48 0.31 0.31 0.31 0.48 0.31]]


In [8]:
df.loc[0, 'review'][-50:]


'is seven.<br /><br />Title (Brazil): Not Available'

In [9]:
import re

def preprocessor(text):
    # Remove HTML tags
    text = re.sub('<[^>]*>', '', text)
    
    # Find all emoticons
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    
    # Remove non-word characters and concatenate emoticons at the end
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' ' + ''.join(emoticons).replace('-', '')

    return text


In [10]:
preprocessor(df.loc[0, 'review'][-50:])

'is seven title brazil not available '

In [11]:
preprocessor("</a>This :) is :( a test :-)!")


'this is a test  :):(:)'

In [12]:
df['review'] = df['review'].apply(preprocessor)

In [13]:
def tokenizer(text):
    return text.split()

tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [14]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

tokenizer_porter('runners like running and thus they run')


['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [15]:
import nltk
nltk.download('stopwords')


from nltk.corpus import stopwords
stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs alot')[-10:] if w not in stop]


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cleil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['runner', 'like', 'run', 'run', 'alot']

In [16]:
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values

X_test = df.loc[25000:, 'review'].values
y_tes = df.loc[25000:, 'sentiment'].values

In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer


tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None) 

In [18]:
param_grid = [{'vect__ngram_range': [(1,1)],
 'vect__stop_words': [stop, None],
 'vect__tokenizer': [tokenizer,
 tokenizer_porter],
 'clf__penalty': ['l1', 'l2'],
 'clf__C': [1.0, 10.0, 100.0]},
 {'vect__ngram_range': [(1,1)],
 'vect__stop_words': [stop, None],
 'vect__tokenizer': [tokenizer,
 tokenizer_porter],
 'vect__use_idf':[False],
 'vect__norm':[None],
 'clf__penalty': ['l1', 'l2'],
 'clf__C': [1.0, 10.0, 100.0]}
 ]

In [19]:
lr_tfidf = Pipeline([('vect', tfidf),
 ('clf',
 LogisticRegression(random_state=0))])

In [20]:
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
 scoring='accuracy',
 cv=5, verbose=1,
 n_jobs=-1)

In [22]:
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


KeyboardInterrupt: 

In [24]:
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)

Best parameter set: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x000001B4ECD4CB80>} 


In [26]:
print('CV Accuracy: %.3f'% gs_lr_tfidf.best_score_)

CV Accuracy: 0.896


In [27]:
clf = gs_lr_tfidf.best_estimator_

In [28]:
print('Test Accuracy: %.3f'
 % clf.score(X_test, y_tes))

Test Accuracy: 0.897
