In [1]:
import numpy as np
import re
from nltk.corpus import stopwords

In [2]:
# English stop-word list from NLTK's corpus reader. tokenizer() filters these
# words out, and the same list is pickled later next to the classifier.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# downloaded beforehand (nltk.download("stopwords")) — confirm for your setup.
stop = stopwords.words("english")

def tokenizer(text, stop_words=None):
    """Clean a raw review and split it into lowercase tokens without stop words.

    Processing steps: strip HTML tags, collect emoticons such as ``:)`` or
    ``;-(``, lowercase the text, replace every run of non-word characters
    with a single space, append the emoticons (with the ``-`` nose removed)
    and finally drop stop words.

    Parameters
    ----------
    text : str
        Raw document text; may contain HTML markup.
    stop_words : iterable of str, optional
        Words to discard. Defaults to the module-level ``stop`` list, so
        single-argument callers (e.g. ``HashingVectorizer``) are unaffected.

    Returns
    -------
    list of str
        Word tokens in document order, followed by any emoticons found.
    """
    if stop_words is None:
        stop_words = stop
    # A set makes the per-token membership test O(1) instead of a list scan.
    excluded = frozenset(stop_words)

    text = re.sub(r'<[^>]*>', '', text)  # remove HTML tags
    lowered = text.lower()

    # The pattern lists upper-case "D"/"P" but runs on lowercased text, so it
    # must match case-insensitively or ":D" / ":P" can never be found.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', lowered,
                           flags=re.IGNORECASE)

    words_only = re.sub(r'[\W]+', ' ', lowered)
    # The explicit " " keeps the first emoticon from being glued onto the last
    # word when the text ends in a word character; split() ignores any
    # surplus whitespace.
    text = words_only + " " + " ".join(emoticons).replace("-", "")

    return [w for w in text.split() if w not in excluded]

In [3]:
def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a review CSV, one per row.

    The first line is treated as a header and skipped. Every following
    non-blank row must end with ``,<label>`` where ``<label>`` is an integer;
    everything before that last comma is returned verbatim as the text
    (CSV quoting is not interpreted).

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Yields
    ------
    tuple of (str, int)
        The raw text and its integer label.

    Raises
    ------
    ValueError
        If a non-blank row does not end in an integer label.
    """
    with open(path, "r") as f:
        # Default of None: an empty file must not raise StopIteration inside
        # the generator (that surfaces as RuntimeError under PEP 479).
        next(f, None)  # skip header
        for line in f:
            row = line.rstrip("\n")
            if not row:
                continue  # tolerate blank lines
            # Split on the last comma instead of fixed offsets so a final row
            # without a trailing newline is parsed correctly.
            text, _, label = row.rpartition(",")
            yield text, int(label)

In [4]:
# Sanity check: pull the first (text, label) pair from a fresh generator.
# This generator is separate from the one used for training further down,
# so nothing is consumed from the training stream.
next(stream_docs(path="moive_data.csv"))

('"I recently rented Twister, a movie I\'d seen several years ago on TV, and it has aged well; I found myself laughing out loud several times at it and as weird as all these people are, by the end I profoundly cared about them. This is the sort of little movie that is made for a cult audience because, rather like Howdy\'s gazpacho (well, I think that\'s what it is), it\'s an acquired taste: you have to be attuned to its peculiar wavelength. The production values might be charitably called inexpensive and the pace and atmosphere take a while to get settled, but the film has a ""look"", especially in some wonderful shots contrasting the dry flatness of the land with the cluttery nouveau-riche opulence of the mansion interior: Michael Almereyda had a good eye even then. Life with sodapop magnate Eugene Cleveland (Harry Dean Stanton) and his household (two adult children, a grandchild, and a housekeeper) seems so detached from life outside we could be in Gormenghast. Everyone in this film 

In [5]:
def get_minibatch(doc_stream, size):
    """Pull up to ``size`` documents from ``doc_stream``.

    Parameters
    ----------
    doc_stream : iterator of (str, int)
        Iterator yielding ``(text, label)`` pairs, e.g. ``stream_docs(...)``.
    size : int
        Maximum number of documents to take.

    Returns
    -------
    tuple
        ``(docs, y)`` — two parallel lists of texts and labels. If the stream
        ends part-way through, the documents collected so far are returned as
        a shorter batch. ``(None, None)`` is returned only when the stream was
        already exhausted and nothing could be collected.
    """
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        # Keep a partially filled final batch instead of discarding it;
        # signal exhaustion only when there is nothing left at all.
        if not docs:
            return None, None
    return docs, y


In [6]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

# HashingVectorizer is data-independent (it needs no fitting), which makes it
# suitable for streaming. The number of features is set to 2**21: a large
# hash space lowers the chance of hash collisions, at the price of more
# coefficients in the logistic regression model.
vect = HashingVectorizer(decode_error="ignore", n_features=2**21, preprocessor=None, tokenizer=tokenizer)

# The former `n_iter=1` argument is dropped: the parameter was removed from
# SGDClassifier (TypeError on scikit-learn >= 0.21), and this notebook trains
# only through partial_fit, which performs a single pass over each batch
# regardless of that setting.
# NOTE(review): loss="log" selects logistic regression; newer scikit-learn
# releases spell it "log_loss" — adjust if the installed version rejects "log".
clf = SGDClassifier(loss="log", random_state=1)

# One shared generator: the training and evaluation cells consume it in turn.
doc_stream = stream_docs("moive_data.csv")

In [7]:
import pyprind

# Class labels have to be passed to partial_fit explicitly.
classes = np.array([0, 1])
pbar = pyprind.ProgBar(45)

# Out-of-core training: at most 45 mini-batches of 1000 documents each.
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if X_train:
        # Hash the raw texts into features, then update the model incrementally.
        X_train = vect.transform(X_train)
        clf.partial_fit(X_train, y_train, classes=classes)
        pbar.update()
    else:
        # get_minibatch signalled that the stream ran dry.
        break

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:01:47


In [8]:
# Draw the next 5000 documents from the same stream as a held-out set; they
# come after everything the training loop consumed.
# NOTE(review): get_minibatch may return (None, None) when the stream is
# exhausted, in which case transform() below would fail — confirm that the
# dataset holds enough rows.
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)  # same hashing as used for the training batches
print("Accuracy: %.3f" % clf.score(X_test, y_test))

Accuracy: 0.874


In [9]:
# Fold the held-out batch into the model as well, now that accuracy has been
# measured. After this call the score printed above no longer describes
# performance on unseen data.
clf = clf.partial_fit(X_test, y_test)  # update model

In [10]:
import pickle
import os

# Persist the stop-word list and the trained classifier so that another
# process can reload them without retraining.
dest = os.path.join("moiveclassifier", "pkl_objects")
# exist_ok avoids the check-then-create race and makes the cell re-runnable.
os.makedirs(dest, exist_ok=True)

# Context managers guarantee each file is flushed and closed even if
# pickling raises.
with open(os.path.join(dest, "stopwords.pkl"), "wb") as pkl_file:
    pickle.dump(stop, pkl_file, protocol=4)
with open(os.path.join(dest, "classifier.pkl"), "wb") as pkl_file:
    pickle.dump(clf, pkl_file, protocol=4)