In [3]:
import pyprind
import pandas as pd
import os
import numpy as np
# Set to True to rebuild ./movie_data.csv from the raw ./aclImdb directory
# tree; leave False to reuse the CSV that is read in the next cell.
readFromSource = False
if readFromSource:
    pbar = pyprind.ProgBar(50000)
    labels = {'pos':1, 'neg':0}
    # Collect rows in a plain list and build the DataFrame once at the end:
    # appending to a DataFrame inside the loop copies the whole frame on
    # every iteration, and DataFrame.append no longer exists in pandas 2.x.
    records = []

    for s in ('test','train'):
        for l in ('pos','neg'):
            path = './aclImdb/%s/%s' % (s,l)
            for file in os.listdir(path):
                with open(os.path.join(path,file),'r') as infile:
                    txt = infile.read()
                records.append([txt, labels[l]])
                pbar.update()

    df = pd.DataFrame(records, columns=['review','sentiment'])

    # Shuffle the rows with a fixed seed so the saved order is reproducible.
    np.random.seed(0)
    df = df.reindex(np.random.permutation(df.index))

    df.to_csv('./movie_data.csv', index=False)

In [4]:
# Load the cached reviews from the CSV that the optional rebuild step writes.
df = pd.read_csv('./movie_data.csv')

In [5]:
# Preview the first rows to confirm the review / sentiment columns loaded.
df.head()

Unnamed: 0,review,sentiment
0,"Man, I had my doubts. I love Kathy Bates, but ...",1
1,Mexican Werewolf in Texas is set in the small ...,0
2,I'm trying to decide if jumping into a wood ch...,0
3,"OK, I saw this in the theaters when it came ou...",1
4,I bought this movie and after I watched it I d...,0


# bag-of-words model

## Transforming words --> feature vectors

In [6]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# Bag-of-words vectorizer created with its default arguments.
count = CountVectorizer()

In [8]:
# Three toy documents used to illustrate the bag-of-words encoding.
docs = np.array([
    'The sun is shining.',
    'The weather is sweet.',
    'The sun is shining and the weather is sweet',
])

In [9]:
## Learn the vocabulary from docs and encode each
## document as a sparse vector of term counts.
bag = count.fit_transform(docs)



In [10]:
# Vocabulary terms in alphabetical order (the keys of vocabulary_).
# Written as a call so the cell runs on both Python 2 and Python 3.
print(sorted(count.vocabulary_))

[u'and', u'is', u'shining', u'sun', u'sweet', u'the', u'weather']


In [11]:
# Dense view of the counts: one row per document, one column per vocabulary term.
print(bag.toarray())

[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


In [12]:
# Second vectorizer restricted to 2-grams (ngram_range=(2,2)).
count2 = CountVectorizer(ngram_range=(2,2))

In [13]:
# Fit the 2-gram vocabulary on the same toy documents and encode them.
bag2 = count2.fit_transform(docs)

In [14]:
# 2-gram vocabulary in alphabetical order.
# Written as a call so the cell runs on both Python 2 and Python 3.
print(sorted(count2.vocabulary_))

[u'and the', u'is shining', u'is sweet', u'shining and', u'sun is', u'the sun', u'the weather', u'weather is']


In [15]:
# Dense view of the 2-gram counts, one row per document.
# Written as a call so the cell runs on both Python 2 and Python 3.
print(bag2.toarray())

[[0 1 0 0 1 1 0 0]
 [0 0 1 0 0 0 1 1]
 [1 1 1 1 1 1 1 1]]


## Assess word relevancy via tf-idf

In [16]:
from sklearn.feature_extraction.text import TfidfTransformer

In [17]:
# Transformer that is applied to the CountVectorizer output in the cells below.
tfidf = TfidfTransformer()

In [18]:
# Print array floats with two digits of precision to keep the output readable.
np.set_printoptions(precision=2)

In [19]:
# Count the terms in docs, re-weight the counts with tf-idf, and show the
# result as a dense array. Written as a call so the cell runs on both
# Python 2 and Python 3.
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[ 0.    0.43  0.56  0.56  0.    0.43  0.  ]
 [ 0.    0.43  0.    0.    0.56  0.43  0.56]
 [ 0.4   0.48  0.31  0.31  0.31  0.48  0.31]]


## Cleaning the text data

In [20]:
# Last 50 characters of the review at index label 50; the output shows
# leftover <br /> markup and punctuation that still need cleaning.
df.loc[50,'review'][-50:]

'cript, and even a Jack Nicholson<br /><br />cameo!'

In [21]:
import re

In [22]:
def preprocessor(text):
    """Strip markup and punctuation from a review while keeping emoticons.

    HTML tags are replaced by spaces, the text is lowercased, and every run
    of non-word characters collapses to a single space. Emoticons found in
    the tag-free (original-case) text are appended at the end, separated by
    single spaces, with any '-' nose removed.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text followed by the emoticons. Leading/trailing spaces
        produced by the clean-up are kept as they are.
    """
    ## remove html mark-up
    text = re.sub(r'<[^>]*>', ' ', text)
    ## find emoticons
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    ## remove all non-word characters and convert to lowercase
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    ## emoticons go to the end, without the nose character
    suffix = ' '.join(emoticons).replace('-', '')
    # Without a separator the first emoticon would be glued onto the last
    # word whenever the cleaned text ends in a word character
    # (e.g. 'great :) movie' -> 'great movie:)').
    if suffix and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + suffix

In [23]:
# Clean the same 50-character tail: markup and punctuation are gone and the text is lowercased.
preprocessor(df.loc[50,'review'][-50:])

'cript and even a jack nicholson cameo '

In [24]:
# Emoticons are moved to the end of the string and ':-)' loses its nose.
preprocessor('</a>This :) is :( a test :-)!')

' this is a test :) :( :)'

In [25]:
# Clean every review; this overwrites the raw text stored in df['review'].
df['review'] = df['review'].apply(preprocessor)

In [26]:
# Spot-check the stored review at index label 50 after cleaning.
df.loc[50,'review'][-51:]

'intelligent script and even a jack nicholson cameo '

## Tokenizing 

In [27]:
def tokenizer(text):
    """Split text on runs of whitespace and return the tokens as a list."""
    tokens = text.split()
    return tokens

In [28]:
# Plain whitespace tokenization keeps every inflected form as its own token.
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [29]:
from nltk.stem.porter import PorterStemmer

In [30]:
# Stemmer instance used by tokenizer_porter.
porter = PorterStemmer()

In [31]:
def tokenizer_porter(text):
    """Split text on whitespace and stem each token with the module-level `porter`.

    Returns the stems as a list, in the original token order.
    """
    stems = []
    for word in text.split():
        stems.append(porter.stem(word))
    return stems

In [32]:
# Stemmed tokens, e.g. 'running' -> 'run' and 'thus' -> 'thu' (see output).
tokenizer_porter('runners like running and thus they run')

[u'runner', u'like', u'run', u'and', u'thu', u'they', u'run']

In [33]:
import nltk

In [34]:
# Download the NLTK 'stopwords' package that stopwords.words() reads below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/thomas/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [35]:
from nltk.corpus import stopwords

In [36]:
# English stop-word list from NLTK; used to filter tokens later in the notebook.
stop = stopwords.words('english')

In [37]:
# Stem the sentence, then drop every token that appears in the stop-word list.
[w for w in tokenizer_porter('a runner likes running and runs a lot') if w not in stop] 

[u'runner', u'like', u'run', u'run', u'lot']

## Logistic Regression to classify movie reviews

In [38]:
## 25k training, 25k testing
## Slice by position: positional slices are half-open, so the first 25000
## rows go to training and the rest to testing with no overlap. Label-based
## df.loc[:25000] includes both endpoints, which put row 25000 in both sets.
X_train = df['review'].values[:25000]
y_train = df['sentiment'].values[:25000]
X_test = df['review'].values[25000:]
y_test = df['sentiment'].values[25000:]

In [39]:
## Find optimal hyperparameters via gridsearch with 5-fold CV

In [40]:
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [41]:
# Vectorizer for the grid-search pipeline. NOTE: this rebinds `tfidf`, which
# earlier in the notebook held a TfidfTransformer.
# lowercase=False / preprocessor=None: df['review'] was already cleaned and
# lowercased by preprocessor() above, so no further text clean-up is requested.
tfidf = TfidfVectorizer(strip_accents=None, 
                        lowercase=False,
                        preprocessor=None)

In [42]:
# Two grids over the same vectorizer / classifier options. The first keeps
# the vectorizer's own idf and norm settings; the second additionally sets
# use_idf=False and norm=None.
param_grid = [
    dict({'vect__ngram_range': [(1,1)],
          'vect__stop_words': [stop, None],
          'vect__tokenizer': [tokenizer,tokenizer_porter],
          'clf__penalty': ['l1', 'l2'],
          'clf__C': [1.0, 10.0, 100.0]},
         **overrides)
    for overrides in ({},
                      {'vect__use_idf':[False],
                       'vect__norm':[None]})
]

In [43]:
# Chain the vectorizer ('vect') and a logistic-regression classifier ('clf').
# The step names are the prefixes used by the keys in param_grid.
lr_tfidf = Pipeline([('vect',tfidf),
                     ('clf',
                      LogisticRegression(random_state=0))])

In [44]:
# Grid search over param_grid: 5-fold cross-validation scored by accuracy,
# with n_jobs=4 and progress messages enabled (verbose=1).
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',cv=5,
                           verbose=1,n_jobs=4)

In [45]:
# Run the search on the training split (240 fits per the log below).
# NOTE: the recorded run was interrupted (KeyboardInterrupt), so no fitted
# search results are available from this cell's saved output.
gs_lr_tfidf.fit(X_train,y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


KeyboardInterrupt: 

## Online Algorithms - SGD

In [48]:
import numpy as np
import re
from nltk.corpus import stopwords

In [84]:
def tokenizer(text):
    """Clean a raw review and return its tokens with stop words removed.

    NOTE: this redefines the whitespace-only `tokenizer` declared earlier in
    the notebook; from here on the name refers to this version. It relies on
    the module-level `stop` list.

    HTML tags are replaced by spaces, the text is lowercased, and runs of
    non-word characters act as separators. Emoticons found in the tag-free
    text are added as tokens of their own with any '-' nose removed.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        Word tokens followed by emoticon tokens, excluding entries of `stop`.
    """
    ## remove html mark-up
    text = re.sub(r'<[^>]*>', ' ', text)
    ## find emoticons in the original-case text: the pattern matches the
    ## upper-case D / P, so searching a lowercased copy can never find
    ## ':D' or ':P'
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    ## remove all non-word characters and convert to lowercase
    words = re.sub(r'[\W]+', ' ', text.lower()).split()
    ## emoticons become separate tokens (nose character removed) instead of
    ## being concatenated onto the text, where the first one could fuse with
    ## the last word
    candidates = words + [e.replace('-', '') for e in emoticons]
    tokenized = [w for w in candidates if w not in stop]
    return tokenized

In [85]:
## Generator function that reads in and returns one
## document at a time
def stream_docs(path):
    """Yield (text, label) pairs from a CSV file, one line at a time.

    The first line is treated as a header and skipped. Every other line is
    expected to end with ',<digit>': the final character is parsed as the
    integer label and everything before the comma is yielded verbatim as the
    text (surrounding CSV quotes are not removed).

    Parameters
    ----------
    path : str
        Path of the CSV file to read.

    Yields
    ------
    tuple of (str, int)
        The raw text field and its label.
    """
    with open(path, 'r') as infile:
        next(infile, None)  # skip header (tolerates an empty file)
        for line in infile:
            # Strip the line terminator first so the label is always the
            # last character, whether the line ends in '\n', '\r\n', or
            # nothing at all (last line of a file without trailing newline).
            line = line.rstrip('\r\n')
            if not line:
                continue  # ignore blank lines
            text, label = line[:-2], int(line[-1])
            yield text, label

In [86]:
# Peek at the first (text, label) pair using a throwaway generator.
next(stream_docs(path='./movie_data.csv'))

('"Man, I had my doubts. I love Kathy Bates, but I thought, how good can this be, I had never even heard of this thing...! You know, it was one of those things, we gave it ""20 minutes and we\'ll turn it off if it sucks"" and we were locked in from the get-go. This is a very winsome, fun movie. It\'s quirky, you know? I mean, you\'ve got a lounge singer, a murderer (and a believable one), you have farce, then Kathy Bates in all her acting splendor, Rupert Everett finally acting to his real potential, Dan Ackroyd, and a dwarf that will make you laugh out loud. I tell ya, you\'ll laugh/you\'ll cry. <br /><br />Maybe I had a weird week, but I think this film is on the level of Fried Green Tomatoes. If you don\'t like that movie, maybe you won\'t like this, but I think it was a great movie. I went out and bought the DVD."',
 1)

In [87]:
## take document stream from stream_docs and
## return particular number of documents specified by size
def get_minibatch(doc_stream,size):
    """Pull up to `size` (text, label) pairs from `doc_stream`.

    Parameters
    ----------
    doc_stream : iterator
        Iterator yielding (text, label) pairs, e.g. from stream_docs().
    size : int
        Maximum number of documents to take.

    Returns
    -------
    tuple
        (docs, y) as two parallel lists. If the stream runs out part-way,
        the documents collected so far are returned as a shorter batch
        rather than being dropped. (None, None) is returned only when the
        stream was already exhausted and nothing could be read.
    """
    docs,y=[],[]
    try:
        for _ in range(size):
            text,label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        # Keep a partially filled final batch; signal exhaustion only when
        # no document at all was available.
        if not docs:
            return None, None
    return docs, y

In [88]:
## HashingVectorizer is data-independent because it uses
## the hashing trick via the 32-bit MurmurHash3 algorithm
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

In [89]:
# Hashing vectorizer with 2**21 features. Tokenization is delegated to the
# notebook's `tokenizer` function, and no separate preprocessor is supplied.
vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)

In [90]:
# SGD-trained classifier with log loss; random_state is fixed for reproducibility.
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)

In [91]:
# Single generator over the CSV. The training loop and the later test batch
# both read from it, so they consume consecutive, non-overlapping documents.
doc_stream = stream_docs(path='./movie_data.csv')

In [92]:
## out-of-core learning
## Train incrementally on up to 45 mini-batches of 1000 documents each,
## so only one batch is vectorized at a time.
import pyprind
pbar = pyprind.ProgBar(45)
# Label set handed to partial_fit on every call.
classes = np.array([0,1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    # Stop early when get_minibatch returned no documents.
    if not X_train:
        break
    # X_train is rebound from a list of raw texts to its hashed feature matrix.
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train,y_train,classes=classes)
    pbar.update()

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:48


In [93]:
# Take the next 5000 documents from the same stream as the evaluation batch.
X_test, y_test = get_minibatch(doc_stream,size=5000)

In [94]:
# Vectorize the test documents with the same HashingVectorizer used for training.
X_test = vect.transform(X_test)

In [95]:
# Accuracy on the evaluation batch. Written as a call so the cell runs on
# both Python 2 and Python 3.
print('Accuracy: %.3f' % clf.score(X_test,y_test))

Accuracy: 0.865


In [96]:
# Update the model with the evaluation batch as well; the accuracy printed
# above was computed before this update.
clf = clf.partial_fit(X_test, y_test)