In [2]:
import os
import tarfile
from six.moves import urllib

In [4]:
DATA_DOWNLOAD_URL = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
DATA_PATH = os.path.join("data")

def fetch_data(data_download_url = DATA_DOWNLOAD_URL, data_path = DATA_PATH):
    if not os.path.isdir(data_path):
        os.makedirs(data_path)
    tgz_path = os.path.join(data_path, "aclImdb_v1.tar.gz")
    urllib.request.urlretrieve(data_download_url, tgz_path) 
    reviews_tgz = tarfile.open(tgz_path)
    reviews_tgz.extractall(path=data_path)
    reviews_tgz.close()
    

In [5]:
fetch_data()

In [7]:
tgz_path = os.path.join(os.path.join("data"), "aclImdb_v1.tar.gz")
print(tgz_path)

data/aclImdb_v1.tar.gz


In [19]:
import pandas as pd
import os
import pyprind

basepath = 'data/aclImdb'

labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000) # Create a processing bar.
df = pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s , l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()
df.columns = ['review', 'sentiment']

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:02:07


In [14]:
%%bash
pip install pyprind

Collecting pyprind
  Downloading https://files.pythonhosted.org/packages/1e/30/e76fb0c45da8aef49ea8d2a90d4e7a6877b45894c25f12fb961f009a891e/PyPrind-2.11.2-py3-none-any.whl
Installing collected packages: pyprind
Successfully installed pyprind-2.11.2


tensorboard 1.8.0 has requirement bleach==1.5.0, but you'll have bleach 2.1.3 which is incompatible.
tensorboard 1.8.0 has requirement html5lib==0.9999999, but you'll have html5lib 1.0.1 which is incompatible.


In [24]:
# Randomize data-frame and save as a CSV file
import numpy as np
np.random.seed(0)
# randomize data frame
df = df.reindex(np.random.permutation(df.index))
# sace data frame to a csv file called movie_data.csv
df.to_csv('movie_data.csv', index = False, encoding = 'utf-8')

## ===== Read CSV file and check frist 5 rows ====##
df = pd.read_csv('movie_data.csv', encoding = 'utf-8')
df.head()

Unnamed: 0,review,sentiment
0,This 1931 comedy gets better with every viewin...,1
1,Maybe it's because I'm no fan of the comics (b...,0
2,"This service comedy, for which Peter Marshall ...",0
3,"9/10- 30 minutes of pure holiday terror. Okay,...",1
4,This Drummond entry is lacking in continuity. ...,0


In [26]:
df.loc[0, 'review'][-999:]

'This 1931 comedy gets better with every viewing because of the comedic talents of Marion Davies and a terrific performance by C. Aubrey Smith. Smith plays a gruff old man who gathers his grown children (from his younger days as a rake) in his declining years. One is American (Davies), one English (Ray Milland who looks about 18), and one Italian (Nina Quartero). There are some surprises as the plot moves along with Ralph Forbes(was has no appeal at all) falling for Davies.<br /><br />Davies and Smith are just wonderful together and very touching. Davies also gets to do a few dances and make a few "big" entrances. And of course Davies is just gorgeous.<br /><br />Halliwell Hobbes, Doris Lloyd, Elizabeth Murray, Guinn Williams, Edgar Norton, and David Torrence co-star. Had they given out supporting Oscar awards in 1931, Smith might well have been nominated. He\'s just excellent in this this gem.'

In [27]:
def tokenizer1(text):
    return text.split()
tokenizer1('running like running and thus they run')

['running', 'like', 'running', 'and', 'thus', 'they', 'run']

In NLP, there’s a technique to generate words into their root form. This technique is called "[word stemming](https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html)".  "[Porter Stemmer](http://snowball.tartarus.org/algorithms/porter/stemmer.html)" is quite popular among researchers in the NLP domain. In the below code segment you can see how we can use NLTK package’s PorterStemmer to obtain the root form of words

In [32]:
%%bash
pip install nltk

Collecting nltk
  Downloading https://files.pythonhosted.org/packages/50/09/3b1755d528ad9156ee7243d52aa5cd2b809ef053a0f31b53d92853dd653a/nltk-3.3.0.zip (1.4MB)
Building wheels for collected packages: nltk
  Running setup.py bdist_wheel for nltk: started
  Running setup.py bdist_wheel for nltk: finished with status 'done'
  Stored in directory: /home/aapfuser/.cache/pip/wheels/d1/ab/40/3bceea46922767e42986aef7606a600538ca80de6062dc266c
Successfully built nltk
Installing collected packages: nltk
Successfully installed nltk-3.3


tensorboard 1.8.0 has requirement bleach==1.5.0, but you'll have bleach 2.1.3 which is incompatible.
tensorboard 1.8.0 has requirement html5lib==0.9999999, but you'll have html5lib 1.0.1 which is incompatible.


In [35]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

tokenizer_porter('running like running and thus they run')

['run', 'like', 'run', 'and', 'thu', 'they', 'run']

Another vital concept in the data cleaning and pre-processing step is the concept known as “**stop word removal**”. “stop words” are the words that are commonly occur in all forms of texts and probably bear no useful information. Few ‘stop words’ are, is, and, has, are, have, like… Stop word removal makes our text processing mechanism efficient as it reduces the number of words we need to analyze. 

In [39]:
tokenizer_porter('a running likes running and runs a lot')[-10:]

['a', 'run', 'like', 'run', 'and', 'run', 'a', 'lot']

In [40]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop = stopwords.words('english')
[w for w in tokenizer_porter('a running likes running and runs a lot') if w not in stop]

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/aapfuser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['run', 'like', 'run', 'run', 'lot']

In [41]:
import numpy as np
import re
from nltk.corpus import stopwords

from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

In [42]:
if Version(sklearn_version) < '0.18':
    clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
else:
    clf = SGDClassifier(loss='log', random_state=1, max_iter=1)

In [90]:
doc_stream = stream_docs(path='movie_data.csv')

stop = stopwords.words('english')
def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv)  # skip header
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label

In [91]:
next(stream_docs('movie_data.csv'))

('"This 1931 comedy gets better with every viewing because of the comedic talents of Marion Davies and a terrific performance by C. Aubrey Smith. Smith plays a gruff old man who gathers his grown children (from his younger days as a rake) in his declining years. One is American (Davies), one English (Ray Milland who looks about 18), and one Italian (Nina Quartero). There are some surprises as the plot moves along with Ralph Forbes(was has no appeal at all) falling for Davies.<br /><br />Davies and Smith are just wonderful together and very touching. Davies also gets to do a few dances and make a few ""big"" entrances. And of course Davies is just gorgeous.<br /><br />Halliwell Hobbes, Doris Lloyd, Elizabeth Murray, Guinn Williams, Edgar Norton, and David Torrence co-star. Had they given out supporting Oscar awards in 1931, Smith might well have been nominated. He\'s just excellent in this this gem."',
 1)

In [92]:
def get_minibatch(doc_stream, size): 
    # size is the minibath put from doc_stream
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

In [93]:
text_list, label_list = get_minibatch(stream_docs('movie_data.csv'), 2)

In [94]:
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

In [95]:
tokenizer(text_list[0])

['1931',
 'comedy',
 'gets',
 'better',
 'every',
 'viewing',
 'comedic',
 'talents',
 'marion',
 'davies',
 'terrific',
 'performance',
 'c',
 'aubrey',
 'smith',
 'smith',
 'plays',
 'gruff',
 'old',
 'man',
 'gathers',
 'grown',
 'children',
 'younger',
 'days',
 'rake',
 'declining',
 'years',
 'one',
 'american',
 'davies',
 'one',
 'english',
 'ray',
 'milland',
 'looks',
 '18',
 'one',
 'italian',
 'nina',
 'quartero',
 'surprises',
 'plot',
 'moves',
 'along',
 'ralph',
 'forbes',
 'appeal',
 'falling',
 'davies',
 'davies',
 'smith',
 'wonderful',
 'together',
 'touching',
 'davies',
 'also',
 'gets',
 'dances',
 'make',
 'big',
 'entrances',
 'course',
 'davies',
 'gorgeous',
 'halliwell',
 'hobbes',
 'doris',
 'lloyd',
 'elizabeth',
 'murray',
 'guinn',
 'williams',
 'edgar',
 'norton',
 'david',
 'torrence',
 'co',
 'star',
 'given',
 'supporting',
 'oscar',
 'awards',
 '1931',
 'smith',
 'might',
 'well',
 'nominated',
 'excellent',
 'gem']

> We initialized HashingVectorizer with tokenizer funciton and set the number of features to math(2**21).  Furthermore, we reinitialized a logistic regression classifier by setting the loss parameter of the SGDClassifier to ‘log’. The reason to choose a large number of features in HashingVectorizer is to reduce the chance of causing hash collisions while increasing the number of coefficients in the logistic regression model.

In [96]:
vect = HashingVectorizer(decode_error='ignore', 
                         n_features=2**21,
                         preprocessor=None, 
                         tokenizer=tokenizer)

In [97]:
classes = np.array([0, 1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)

In [98]:
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.870


In [119]:
import pickle
import os 
dest = os.path.join('movieclassifier', 'pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)


What ‘dump’ method does is, it serialize the trained logistic regression model as well as ‘stop word’ set from NLTK library. 

In [110]:
pickle.dump(stop, 
            open(os.path.join(dest, 'stopwords.pkl'), 'wb'),
            protocol = 4)

In [118]:
pickle.dump(clf, open(os.path.join(dest, 'classifier.plk'), 'wb'), protocol=4)

constraints.txt
data
docs
download_data.ipynb
gen_requirements_txt
licence_research.ipynb
licenser
movieclassifier
movie_data.csv
requirements_all_py3_dev.txt
requirements_all_py3.txt
requirements_analytics_py3.in
requirements.txt
setup.cfg
text_file.txt
tox.ini
vendor
