In [1]:
# http://bogotobogo.com/Algorithms/Machine_Learning_NLP_Sentiment_Analysis_1.php

In [2]:
# IMDB

In [1]:
import pyprind
import pandas as pd
import os
import io

basepath = './aclImdb'

labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)
df = pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in os.listdir(path):
            # For python2, use 'io.open', for Python3, just us 'open' 
            with io.open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()
df.columns = ['review', 'sentiment']

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:14:51


In [2]:
X = df.loc[:,:].values
X.shape

(50000, 2)

In [3]:
import numpy as np
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('./movie_data.csv', index=False, encoding='utf-8')

df = pd.read_csv('./movie_data.csv')
df.head(5)

Unnamed: 0,review,sentiment
0,"Master Kieslowsky came with an idea in 1993, t...",1
1,"4 Oscar winners, Karl Malden, Sally Field, Shi...",0
2,"Dull, flatly-directed ""comedy"" has zero laughs...",0
3,This movie was excellent. A sad truth to how c...,1
4,Words really can't describe how bad this film ...,0


In [4]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=1)
vectorizer

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [5]:
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
    ]
bag = vectorizer.fit_transform(corpus)
bag.shape, bag

((4, 9), <4x9 sparse matrix of type '<type 'numpy.int64'>'
 	with 19 stored elements in Compressed Sparse Row format>)

In [6]:
bag.toarray()

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 2, 1, 0, 1],
       [1, 0, 0, 0, 1, 0, 1, 1, 0],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]])

In [7]:
vectorizer.vocabulary_

{u'and': 0,
 u'document': 1,
 u'first': 2,
 u'is': 3,
 u'one': 4,
 u'second': 5,
 u'the': 6,
 u'third': 7,
 u'this': 8}

In [8]:
vectorizer.vocabulary_.get('third')

7

In [9]:
vectorizer.transform(['Something completely new.']).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [10]:
# tf-idf with a corpus document

In [11]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer(smooth_idf=False)
np.set_printoptions(precision=2)
tfidf.fit_transform(vectorizer.fit_transform(corpus)).toarray()

array([[ 0.  ,  0.43,  0.57,  0.43,  0.  ,  0.  ,  0.34,  0.  ,  0.43],
       [ 0.  ,  0.24,  0.  ,  0.24,  0.  ,  0.89,  0.19,  0.  ,  0.24],
       [ 0.56,  0.  ,  0.  ,  0.  ,  0.56,  0.  ,  0.24,  0.56,  0.  ],
       [ 0.  ,  0.43,  0.57,  0.43,  0.  ,  0.  ,  0.34,  0.  ,  0.43]])

In [12]:
# The weights of each feature computed by the fit method call 
# are stored in a model attribute:
tfidf.idf_

array([ 2.39,  1.29,  1.69,  1.29,  2.39,  2.39,  1.  ,  2.39,  1.29])

In [13]:
# tf-idf with a counts example

In [14]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(smooth_idf=False)
transformer   

TfidfTransformer(norm=u'l2', smooth_idf=False, sublinear_tf=False,
         use_idf=True)

In [15]:
counts = [[3, 0, 1],
          [2, 0, 0],
          [3, 0, 0],
          [4, 0, 0],
          [3, 2, 0],
          [3, 0, 2]]
tfidf = transformer.fit_transform(counts)
tfidf

<6x3 sparse matrix of type '<type 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [16]:
tfidf.toarray() 

array([[ 0.82,  0.  ,  0.57],
       [ 1.  ,  0.  ,  0.  ],
       [ 1.  ,  0.  ,  0.  ],
       [ 1.  ,  0.  ,  0.  ],
       [ 0.47,  0.88,  0.  ],
       [ 0.58,  0.  ,  0.81]])

In [17]:
transformer = TfidfTransformer()
transformer.fit_transform(counts).toarray()

array([[ 0.85,  0.  ,  0.52],
       [ 1.  ,  0.  ,  0.  ],
       [ 1.  ,  0.  ,  0.  ],
       [ 1.  ,  0.  ,  0.  ],
       [ 0.55,  0.83,  0.  ],
       [ 0.63,  0.  ,  0.78]])

In [18]:
transformer.idf_

array([ 1.  ,  2.25,  1.85])

In [19]:
# http://bogotobogo.com/Algorithms/Machine_Learning_NLP_Sentiment_Analysis_2.php

In [20]:
df.shape

(50000, 2)

In [21]:
df.head(5)

Unnamed: 0,review,sentiment
0,"Master Kieslowsky came with an idea in 1993, t...",1
1,"4 Oscar winners, Karl Malden, Sally Field, Shi...",0
2,"Dull, flatly-directed ""comedy"" has zero laughs...",0
3,This movie was excellent. A sad truth to how c...,1
4,Words really can't describe how bad this film ...,0


In [22]:
df[:5]

Unnamed: 0,review,sentiment
0,"Master Kieslowsky came with an idea in 1993, t...",1
1,"4 Oscar winners, Karl Malden, Sally Field, Shi...",0
2,"Dull, flatly-directed ""comedy"" has zero laughs...",0
3,This movie was excellent. A sad truth to how c...,1
4,Words really can't describe how bad this film ...,0


In [23]:
df

Unnamed: 0,review,sentiment
0,"Master Kieslowsky came with an idea in 1993, t...",1
1,"4 Oscar winners, Karl Malden, Sally Field, Shi...",0
2,"Dull, flatly-directed ""comedy"" has zero laughs...",0
3,This movie was excellent. A sad truth to how c...,1
4,Words really can't describe how bad this film ...,0
5,"I was the Production Accountant on this movie,...",1
6,"Well no, I tell a lie, this is in fact not the...",1
7,I've been a fan of Heaven's Gate since its fir...,1
8,"""Yes, Georgio"" is a light-hearted and enjoyabl...",1
9,This is quite a dull movie. Well-shot with rea...,1


In [24]:
# the last 300 characters from the first document
# in the reshuffled movie review
df.loc[0, 'review'][-300:]

"outstanding he got an Oscar nod for it (and deserved to win).<br /><br />Overall the movie is a perfect 10 and will be loved by people that love foreign cinema and people who don't. Don't Miss it.<br /><br />How did the awful Pulp Fiction beat ed this masterpiece at Cannes is beyond my comprehension"

In [85]:
# regex

import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    return text

In [86]:
preprocessor(df.loc[0, 'review'][-300:])

'outstanding he got an oscar nod for it and deserved to win overall the movie is a perfect 10 and will be loved by people that love foreign cinema and people who don t don t miss it how did the awful pulp fiction beat ed this masterpiece at cannes is beyond my comprehension'

In [87]:
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [88]:
# apply the preprocess function to all reviews
df['review'] = df['review'].apply(preprocessor)

In [89]:
def tokenizer(text):
    return text.split()

tokenizer('The true sign of intelligence is not knowledge but imagination.')

['The',
 'true',
 'sign',
 'of',
 'intelligence',
 'is',
 'not',
 'knowledge',
 'but',
 'imagination.']

In [90]:
# Porter stemming
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

tokenizer_porter('It has become appallingly obvious \
                   that our technology has exceeded our humanity.')

[u'It',
 u'ha',
 u'becom',
 u'appallingli',
 u'obviou',
 u'that',
 u'our',
 u'technolog',
 u'ha',
 u'exceed',
 u'our',
 u'humanity.']

In [91]:
# SnowballStemmer
from nltk.stem import SnowballStemmer
snowball = SnowballStemmer('english')

def tokenizer_snowball(text):
    return [snowball.stem(word) for word in text.split()]

tokenizer_snowball('It has become appallingly obvious \
                   that our technology has exceeded our humanity.')

['it',
 u'has',
 u'becom',
 u'appal',
 u'obvious',
 u'that',
 u'our',
 u'technolog',
 u'has',
 u'exceed',
 u'our',
 u'humanity.']

In [92]:
# LancasterStemmer
from nltk.stem.lancaster import LancasterStemmer
lancaster = LancasterStemmer()

def tokenizer_lancaster(text):
    return [lancaster.stem(word) for word in text.split()]

tokenizer_lancaster('It has become appallingly obvious \
                   that our technology has exceeded our humanity.')

['it',
 'has',
 'becom',
 'appal',
 u'obvy',
 'that',
 'our',
 'technolog',
 'has',
 u'excess',
 'our',
 'humanity.']

In [93]:
# Stop words

In [94]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/k/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [95]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
[w for w in tokenizer_porter('It has become appallingly obvious \
        that our technology has exceeded our humanity.')[:] if w not in stop]

[u'It',
 u'ha',
 u'becom',
 u'appallingli',
 u'obviou',
 u'technolog',
 u'ha',
 u'exceed',
 u'humanity.']

In [36]:
# # http://bogotobogo.com/Algorithms/Machine_Learning_NLP_Sentiment_Analysis_3.php

In [37]:
# Training a model for classification

In [38]:
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [39]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
             ]

lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy',
                           cv=5, verbose=1, n_jobs=-1)



In [40]:
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 59.9min
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed: 281.6min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 353.0min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=Tru...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__tokenizer': [<function tokenizer at 0x7faad3963668>, <function tokenizer_porter at 0x7faac492f140>], 'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0], 'vect__stop_words': [[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves...aac492f140>], 'vect__use_idf': [False], 'clf__C': [1.0, 10.0, 100.0], 'clf__penalty': ['l1', 'l2']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
    

In [42]:
gs_lr_tfidf.best_params_

{'clf__C': 10.0,
 'clf__penalty': 'l2',
 'vect__ngram_range': (1, 1),
 'vect__stop_words': None,
 'vect__tokenizer': <function __main__.tokenizer>}

In [44]:
gs_lr_tfidf.best_score_

0.89540418383264675

In [45]:
clf = gs_lr_tfidf.best_estimator_
clf.score(X_test, y_test)

0.89824000000000004

In [46]:
# k-fold cross-validation score

In [52]:
from sklearn.linear_model import LogisticRegression
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

np.random.seed(0)
np.set_printoptions(precision=6)
y = [np.random.randint(3) for i in range(20)]
X = (y + np.random.randn(20))

# The '-1' tells let's put as many rows as needed (same as X.T)
X = X.reshape(-1, 1)

X,y

(array([[ 1.220608],
        [-0.339496],
        [ 0.428373],
        [ 0.876537],
        [ 2.414377],
        [ 1.875949],
        [ 2.008157],
        [ 2.229887],
        [ 0.604894],
        [ 1.62716 ],
        [ 1.594561],
        [ 2.230434],
        [ 0.93509 ],
        [ 1.03102 ],
        [ 2.591243],
        [-0.782776],
        [ 0.555767],
        [ 0.654814],
        [ 0.118199],
        [ 0.557347]]),
 [0, 1, 0, 1, 1, 2, 0, 2, 0, 0, 0, 2, 1, 2, 2, 0, 1, 1, 1, 1])

In [54]:
cv5 = list(StratifiedKFold(n_splits=5, shuffle=False, random_state=0).split(X, y))
cv5

[(array([ 4,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]),
  array([0, 1, 2, 3, 5])),
 (array([ 0,  1,  2,  3,  5,  9, 10, 11, 13, 14, 15, 16, 17, 18, 19]),
  array([ 4,  6,  7,  8, 12])),
 (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8, 10, 12, 13, 14, 15, 18, 19]),
  array([ 9, 11, 16, 17])),
 (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 11, 12, 14, 15, 16, 17, 19]),
  array([10, 13, 18])),
 (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 16, 17, 18]),
  array([14, 15, 19]))]

In [55]:
cross_val_score(LogisticRegression(random_state=123), X, y, cv=cv5)

array([ 0.6     ,  0.4     ,  0.75    ,  0.333333,  0.666667])

In [56]:
# GridSearchCV with LogisticRegression estimator

from sklearn.model_selection import GridSearchCV

gscv = GridSearchCV(LogisticRegression(), {}, cv=cv5, verbose=3).fit(X, y) 

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................
[CV] ....................................... , score=0.600000 -   0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.400000 -   0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.750000 -   0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.333333 -   0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.666667 -   0.0s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s finished


In [57]:
# score from GridSearchCV
gscv.best_score_

0.55000000000000004

In [None]:
cross_val_score(LogisticRegression(), X, y, cv=cv5).mean()

0.55000000000000004

In [None]:
# http://www.bogotobogo.com/Algorithms/Machine_Learning_NLP_Sentiment_Analysis_4.php

In [75]:
import io
import numpy as np
import re
from nltk.corpus import stopwords

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

def stream_docs(path):
    with io.open(path, 'r', encoding='utf-8') as csv:
        next(csv)  # skip header
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label

In [76]:
next(stream_docs(path='./movie_data.csv'))

(u'"Master Kieslowsky came with an idea in 1993, the idea was to portrait how human relationship are in the world today, passing from Blue (a crafted visual masterpiece about a woman\'s life) from White (A visual comedy movie about marriage) and finally arriving to Red (A masterpiece dealing with human interaction).<br /><br />While I\'m not going to spoil the move I can easily say Red is the best movie from the 90\'s decade because it has one of the strongest messages in a script I have ever ever seen.<br /><br />The movie begins a little slow but finds it\'s rhythm early enough to keep you hooked through the whole movie.<br /><br />The performances are perfect, sublime. since the characters are completely realistic and they\'re not clich\xe9d in any way and one could expect no less from the actors and one doesn\'t get disappointed... seriously I believe Jean Louis Tringtignat deserved an Oscar nod at least.<br /><br />The music from Zbiegnew Preisner is amazing it\'s one of the best 

In [77]:
import pandas as pd
df = pd.read_csv('./movie_data.csv')
df.head(5)

Unnamed: 0,review,sentiment
0,"Master Kieslowsky came with an idea in 1993, t...",1
1,"4 Oscar winners, Karl Malden, Sally Field, Shi...",0
2,"Dull, flatly-directed ""comedy"" has zero laughs...",0
3,This movie was excellent. A sad truth to how c...,1
4,Words really can't describe how bad this film ...,0


In [104]:
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

In [105]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

vect = HashingVectorizer(decode_error='ignore', 
                         n_features=2**21,
                         preprocessor=None, 
                         tokenizer=tokenizer)

clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
doc_stream = stream_docs(path='./movie_data.csv')

In [106]:
import pyprind
pbar = pyprind.ProgBar(45)

classes = np.array([0, 1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:01:01


In [107]:
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
clf.score(X_test, y_test)

0.79879999999999995

In [109]:
clf = clf.partial_fit(X_test, y_test)
clf.score(X_test, y_test)

0.82499999999999996