In [1]:
# stochastic gradient descent
import numpy as np
import re
from nltk.corpus import stopwords
# Stored as a frozenset because the stop words are only used for per-token
# membership tests (`w not in stop`); a set lookup is O(1), whereas the list
# returned by NLTK is scanned linearly for every token.
stop = frozenset(stopwords.words('english'))

# function that cleans the unprocessed text data
def tokenizer(text):
    """Clean a raw document string and split it into tokens.

    Steps: strip HTML-like tags, lowercase the text, collect emoticons,
    collapse every run of non-word characters into a single space, append
    the emoticons (with any '-' nose removed), split on whitespace and drop
    every token found in the module-level ``stop`` collection.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Word tokens in order of appearance, followed by the emoticons.
    """
    text = re.sub(r'<[^>]*>', '', text)
    lowered = text.lower()
    # The pattern spells the letter mouths as upper-case D / P but is applied
    # to lowercased text, so it has to match case-insensitively; otherwise
    # ':D' and ':P' can never be found.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', lowered,
                           flags=re.IGNORECASE)
    # Explicit ' ' separator: without it the first emoticon is glued onto the
    # last word whenever the cleaned text does not already end in a space.
    cleaned = (re.sub(r'[\W]+', ' ', lowered)
               + ' '
               + ' '.join(emoticons).replace('-', ''))
    return [w for w in cleaned.split() if w not in stop]


# generator function that reads in and returns one document at a time
def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a CSV file, one per line.

    The first line is treated as a header and skipped. Every following
    non-empty line is expected to end with ``,<d>`` where ``<d>`` is a single
    digit label; everything before that final comma is yielded unchanged as
    the text (surrounding quotes, if any, are kept).

    Parameters
    ----------
    path : str
        Path of the UTF-8 encoded CSV file.

    Yields
    ------
    tuple of (str, int)
        The document text and its integer label.

    Raises
    ------
    ValueError
        If the last character of a non-empty line is not a digit.
    """
    with open(path, 'r', encoding='utf-8') as csv_file:
        # Default of None: an empty file simply yields nothing instead of
        # leaking StopIteration out of the generator (a RuntimeError).
        next(csv_file, None)  # skip header
        for line in csv_file:
            # Strip the line terminator first so the slicing below also works
            # for a final row that has no trailing newline.
            line = line.rstrip('\r\n')
            if not line:
                continue  # tolerate blank lines
            yield line[:-2], int(line[-1])

# Sanity check: pull the first (text, label) pair from a fresh stream
next(stream_docs(path='./movie_data.csv'))


('"It is not surprising that this film was made by I\'m Kwon Taek at the time it was. He examined the early beauty and tragedy of Chosun Dynasty life in Seopyonje and delightfully explored a well-known Korean folk tale in Chunhyang, and these comprised his last two films. What is most surprising is that Chi Hwa Seon, his 2002 offering, is not presented in the pansori style of those previous two films.<br /><br />Nonetheless, the experienced hand of I\'m comes through. We explore together the life of a real person: a late nineteenth century Chosun Dynasty painter who rides on the edge of modernity but who is not a noble and who, because of that, causes a stir in contemporary Korean society with his fame and his public and artistic expressions of disdain for the old Korean noble class and his contempt for would-be Japanese ruling colonials alike. The painter, Chang Seung Up, known popularly as Oh Won (performed magnificently by Choi Min Sik, the famous star of Park Chan Wook\'s already l

In [2]:
# function that takes a document stream from the stream_docs function and
# returns a particular number of documents specified by the size parameter
def get_minibatch(doc_stream, size):
    """Pull up to ``size`` documents from ``doc_stream``.

    Parameters
    ----------
    doc_stream : iterator
        Iterator yielding ``(text, label)`` pairs.
    size : int
        Maximum number of documents to return.

    Returns
    -------
    tuple
        ``(docs, y)``: two parallel lists of texts and labels. When the
        stream ends part-way through a batch, the documents read so far are
        returned as a shorter batch instead of being thrown away.
        ``(None, None)`` is returned only when the stream is exhausted before
        a single document could be read.
    """
    docs, y = [], []
    for _ in range(size):
        try:
            text, label = next(doc_stream)
        except StopIteration:
            if not docs:
                return None, None
            break  # keep the partial final batch
        docs.append(text)
        y.append(label)
    return docs, y

In [3]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

# Hashing vectorizer with 2**21 feature buckets, using the custom tokenizer
vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)
# Logistic-regression loss. Current scikit-learn spells it 'log_loss'
# (formerly 'log') and no longer accepts the `n_iter` argument; the model is
# trained only through partial_fit, which does not use an iteration count.
clf = SGDClassifier(loss='log_loss', random_state=1)
# Single shared stream: training and evaluation cells consume it in order
doc_stream = stream_docs(path='./movie_data.csv')



In [4]:
import pyprind
# Out-of-core training: feed the classifier one mini-batch at a time.
# The batch count is defined once so the progress bar and the loop agree.
N_BATCHES = 45
BATCH_SIZE = 1000

pbar = pyprind.ProgBar(N_BATCHES)
classes = np.array([0, 1])  # full set of labels, passed to every partial_fit call
for _ in range(N_BATCHES):
    X_train, y_train = get_minibatch(doc_stream, size=BATCH_SIZE)
    if not X_train:
        # the stream ran dry before the planned number of batches
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:50


In [5]:
# Evaluate on the next (up to) 5000 documents of the stream, which the
# training loop has not consumed.
X_test, y_test = get_minibatch(doc_stream, size=5000)
if not X_test:
    # get_minibatch signals an exhausted stream with (None, None); fail with a
    # clear message instead of passing None on to the vectorizer.
    raise RuntimeError('document stream is exhausted: no documents left for evaluation')
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.878


In [6]:
# Continue training on the documents that were just used for evaluation
clf = clf.partial_fit(X_test, y_test)