In [None]:
# Cleaning and preparing text data
# Building feature vectors from text documents
# training a machine learning model to classify positive and negative movie reviews
# working with large text datasets using out-of-core learning

In [9]:
import pyprind
import pandas as pd
import os

pbar = pyprind.ProgBar(50000)
labels = {'pos':1, 'neg':0}
df = pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos','neg'):
        path = './aclImdb/%s/%s' % (s,l)
        for file in os.listdir(path):
            with open(os.path.join(path,file), 'r') as infile:txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()
df.columns = ['review','sentiment']

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:11:22


In [10]:
#randomise and save
import numpy as np
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('./movie_data.csv', index=False)


In [11]:
df = pd.read_csv('./movie_data.csv')
df.head(5)

Unnamed: 0,review,sentiment
0,It is not surprising that this film was made b...,1
1,Big disappointment. CLASH BY NIGHT is much to ...,0
2,The movie seemed to appeal me because of the n...,0
3,I first saw this movie about 20 years ago and ...,1
4,Disappointing film. Performance of actors is w...,0


In [14]:
"""Bag of words - allows us to represent text as numerical feature vectors
- we create a vocabulary of unique tokens - for example, words - from the entire set of documents
- we construct a feature vector from each document that contains the counts of how often each word occurs in the particular document.
"""

""" 
CountVectorizer class takes an array of text data, which can be documents or just sentences,
and constructs the bad-of-words model
"""

from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining and the weather is sweet'
])
bag = count.fit_transform(docs)
print(count.vocabulary_)
print(bag.toarray())

{'the': 5, 'sun': 3, 'is': 1, 'shining': 2, 'weather': 6, 'sweet': 4, 'and': 0}
[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


In [17]:
# term frequency-inverse document frequency is used to downweight frequently ocurring words in the feature vectors
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
np.set_printoptions(precision=2)
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[ 0.    0.43  0.56  0.56  0.    0.43  0.  ]
 [ 0.    0.43  0.    0.    0.56  0.43  0.56]
 [ 0.4   0.48  0.31  0.31  0.31  0.48  0.31]]


In [35]:
"""CLEANING DATA"""
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',text)
    text = re.sub('[\W]+', ' ', text.lower()) + \
    ''.join(emoticons).replace('-','')
    return text
preprocessor(df.loc[0, 'review'][-50:])   
preprocessor("</a>This :) is :(a test :-) !")

'this is a test :):(:)'

In [36]:
df['review'] = df['review'].apply(preprocessor)

In [38]:
"""Processing documents into tokens
split the tect corpora into individual elements

One way to tokenize documents is to split them into individual words by splitting the cleaned
document at its whitespace characters

"""

def tokenizer(text):
    return text.split()
tokenizer('running like running and thus they run')

"""Another technique is WORD STEMMING - which is the process of transforming a word into
its root form that allows us to map related words to the same stem. (Porter stemmer algorithm)
"""

from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [39]:
# Stop-word removal
"""
stop-words are simply those words that are extremely common in all sorts of texts and likely
bear no useful information that can be used to distinguish between classes of documents.
"""

import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/salaudeen/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [41]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

In [49]:
# We will train a logistic regression model to classify the movie reviews into positive and negative reviews
# we will divide the dataframe of cleaned text documents into 25,000 documents for training and 25,000 documents for testing

X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(strip_accents=None,
                       lowercase=False,
                       preprocessor=None)
param_grid = [{'vect__ngram_range': [(1,1)],
              'vect__stop_words': [stop, None],
              'vect__tokenizer': [tokenizer, tokenizer_porter],
              'clf__penalty': ['l1','l2'],
              'clf__C': [1.0,10.0,100.0]},
             {'vect__ngram_range': [(1,1)],
             'vect__stop_words': [stop, None],
             'vect__tokenizer': [tokenizer, tokenizer_porter],
             'vect__use_idf': [False],
             'vect__norm': [None],
             'clf__penalty': ['l1','l2'],
             'clf__C': [1.0,10.0,100.0]}]
lr_tfidf = Pipeline([('vect',tfidf),
                    ('clf',
                    LogisticRegression(random_state=0))])
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                          scoring='accuracy',
                          cv=5, verbose=1,
                          n_jobs=-1)
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 30.8min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 154.6min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 200.5min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's...se_idf': [False], 'vect__norm': [None], 'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0]}],
       pre_dispatch='2*n_jobs', refit=True, scoring='ac

In [50]:
print('Best parameter set: %s ' %gs_lr_tfidf.best_params_)

Best parameter set: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x7f7212705840>} 


In [52]:
print('CV Accuracy: %.3f' %gs_lr_tfidf.best_score_)
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' %clf.score(X_test, y_test))

CV Accuracy: 0.897
Test Accuracy: 0.901
