In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
import spacy
import nltk
from time import time

In [2]:
# Two short example sentences used below to compare lemmatization with stemming.
phrases = ["The weather today is worse than yesterday",
           "education is what you have left over after forgetting everything you ever learnt"]




In [3]:
# spaCy English pipeline: used for tokenisation and for the lemma_/norm_ token attributes.
# NOTE(review): 'en' is a shortcut model name; presumably newer spaCy releases need the
# full package name instead — confirm against the installed spaCy version.
en_model = spacy.load('en')
# NLTK Porter stemmer: applied to the tokens produced by en_model.
stemmer = nltk.stem.PorterStemmer()

In [4]:
def stem_and_lemma(doc):
    """Print the lemma and the Porter stem of every token in ``doc``.

    ``doc`` is tokenised with the module-level ``en_model``. Lemmas are read
    from each token's ``lemma_`` attribute; stems are obtained by passing the
    lower-cased ``norm_`` of each token to the module-level ``stemmer``.
    Nothing is returned: the two lists are only printed.
    """
    tokens = en_model(doc)

    print("Lemmatization:")
    lemmas = [tok.lemma_ for tok in tokens]
    print(lemmas)

    print("Stemming:")
    stems = [stemmer.stem(tok.norm_.lower()) for tok in tokens]
    print(stems)

In [8]:
# Compare lemmas and stems on the first example sentence.
stem_and_lemma(phrases[0])

Lemmatization:
['the', 'weather', 'today', 'be', 'bad', 'than', 'yesterday']
Stemming:
['the', 'weather', 'today', 'is', 'wors', 'than', 'yesterday']


In [5]:
# Compare lemmas and stems on the second example sentence.
stem_and_lemma(phrases[1])

Lemmatization:
['education', 'be', 'what', '-PRON-', 'have', 'leave', 'over', 'after', 'forget', 'everything', '-PRON-', 'ever', 'learn']
Stemming:
['educ', 'is', 'what', 'you', 'have', 'left', 'over', 'after', 'forget', 'everyth', 'you', 'ever', 'learnt']


In [9]:
# Load the labelled reviews from a tab-separated file. Later cells read the
# `review` (text) and `sentiment` (label) columns of this frame.
data = pd.read_csv("data/labeledTrainData.tsv", delimiter="\t")

In [15]:
def simple_split(data,y,length,split_mark=0.7):
    """Cut ``data`` and ``y`` at one position into a leading and a trailing part.

    No shuffling is performed: the first ``n`` items form the training part and
    the remaining items form the test part.

    Parameters
    ----------
    data, y : sliceable objects whose slices expose ``.copy()``
        Features and targets; both are cut at the same position.
    length : int
        Number of items, used only when ``split_mark`` is a fraction.
    split_mark : float or int, default 0.7
        A value strictly between 0 and 1 is treated as the fraction of
        ``length`` that goes to the training part; any other value is
        truncated to an int and used directly as the cut position.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``, each an independent copy.
    """
    is_fraction = 0. < split_mark < 1.0
    n = int(split_mark * length) if is_fraction else int(split_mark)

    head = slice(None, n)
    tail = slice(n, None)
    return data[head].copy(), data[tail].copy(), y[head].copy(), y[tail].copy()

In [16]:
def sentiment_analysis(data, random_state=None):
    """Tune, evaluate and return a TF-IDF + logistic-regression pipeline.

    The frame is split positionally by ``simple_split`` (default mark 0.7, no
    shuffling). A randomized search over the regularisation strength ``C`` and
    the n-gram range is cross-validated (5 folds, 12 sampled candidates) on the
    training part, and the winning pipeline is evaluated on the held-out part.
    Progress, scores and the confusion matrix are printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a ``review`` column (input text) and a ``sentiment``
        column (target labels).
    random_state : int or None, optional
        Forwarded to ``RandomizedSearchCV`` so the sampled parameter
        combinations can be reproduced. The default ``None`` keeps the
        previous, unseeded behaviour.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The best pipeline found, fitted on the whole training part, with the
        steps ``tfidfvectorizer`` and ``logisticregression``.
    """
    d_train,d_test,y_train,y_test = simple_split(data,data.sentiment,len(data))
    print(d_train.shape,d_test.shape,y_train.shape,y_test.shape)

    search_pipe = make_pipeline(TfidfVectorizer(min_df=5, norm=None),LogisticRegression())
    start = time()
    param_dist = {"logisticregression__C": [0.001, 0.01, 0.1, 1, 10, 100],
                  "tfidfvectorizer__ngram_range": [(1, 1), (1, 2), (1, 3)]
                 }
    model = RandomizedSearchCV(search_pipe,param_dist, cv=5, n_iter=12,
                               random_state=random_state)
    model.fit(d_train.review, y_train)
    print('RandomizedSearchCV Training took {} minutes'.format((time() - start)/60.))
    print("Best cross-validation score: {:.2f}".format(model.best_score_))
    print("Best parameters:\n{}".format(model.best_params_))

    # With the default refit=True the search has already refitted the winning
    # pipeline on the complete training part, so rebuilding and fitting an
    # identical pipeline again would only repeat that work.
    best_pipe = model.best_estimator_

    # Predict once and derive both the accuracy and the confusion matrix from
    # the same predictions, so the test reviews are vectorised only once.
    pred_logreg = best_pipe.predict(d_test.review)
    test_accuracy = np.mean(pred_logreg == np.asarray(y_test))
    print("Test set score: {:.3f}".format(test_accuracy))
    confusion = confusion_matrix(y_test, pred_logreg)
    print("Confusion matrix:\n{}".format(confusion))
    return best_pipe
    
    

In [17]:
def lemmatize(doc):
    """Return ``doc`` rewritten as its token lemmas joined by single spaces.

    The text is tokenised with the module-level ``en_model`` and each token
    contributes its ``lemma_`` attribute.
    """
    return ' '.join(token.lemma_ for token in en_model(doc))

In [18]:
def stemmatize(doc):
    """Return ``doc`` rewritten as its token stems joined by single spaces.

    The text is tokenised with the module-level ``en_model``; the lower-cased
    ``norm_`` of each token is passed through the module-level ``stemmer``.
    """
    stems = []
    for token in en_model(doc):
        stems.append(stemmer.stem(token.norm_.lower()))
    return ' '.join(stems)

In [19]:
# Replace every review with its lemmatized text and report the wall-clock time.
# NOTE: this overwrites data['review'] in place, so the raw text is lost and
# re-running this cell without reloading `data` would lemmatize text that has
# already been lemmatized. The recorded run below took about 99 minutes.
start = time()
data['review'] = data['review'].apply(lemmatize)
print('Lemmatization took {} minutes'.format((time() - start)/60.))


Lemmatization took 99.19152934948603 minutes


In [20]:
# Train and evaluate on the lemmatized reviews; `lemm` is the fitted pipeline.
lemm = sentiment_analysis(data)

(17500, 3) (7500, 3) (17500,) (7500,)
RandomizedSearchCV Training took 42.26202932993571 minutes
Best cross-validation score: 0.90
Best parameters:
{'tfidfvectorizer__ngram_range': (1, 3), 'logisticregression__C': 0.001}
Test set score: 0.900
Confusion matrix:
[[3346  393]
 [ 355 3406]]


In [24]:
# Reload the raw reviews: data['review'] was overwritten with lemmatized text
# above, so the stemming experiment has to start from the original file again.
data = pd.read_csv("data/labeledTrainData.tsv", delimiter="\t")

In [25]:
# Replace every review with its stemmed text and report the wall-clock time.
# NOTE: this overwrites data['review'] in place; re-running this cell without
# reloading `data` would stem text that has already been stemmed. The recorded
# run below took about 106 minutes.
start = time()
data['review'] = data['review'].apply(stemmatize)
print('Stemmatization took {} minutes'.format((time() - start)/60.))

Stemmatization took 105.80064667065939 minutes


In [26]:
# Train and evaluate on the stemmed reviews; `stem` is the fitted pipeline.
stem = sentiment_analysis(data)

(17500, 3) (7500, 3) (17500,) (7500,)
RandomizedSearchCV Training took 40.914124596118924 minutes
Best cross-validation score: 0.90
Best parameters:
{'tfidfvectorizer__ngram_range': (1, 3), 'logisticregression__C': 0.01}
Test set score: 0.903
Confusion matrix:
[[3378  361]
 [ 365 3396]]


In [27]:
# `lemm` was trained on lemmatized text and `stem` on stemmed text, so a raw
# review must go through the same normalisation before it is classified;
# otherwise its tokens do not match the vocabulary the model was fitted on.
review = ["This movie is not that good"]
print(lemm.predict([lemmatize(text) for text in review])[0])
print(stem.predict([stemmatize(text) for text in review])[0])

0
0


In [28]:
# `lemm` was trained on lemmatized text and `stem` on stemmed text, so a raw
# review must go through the same normalisation before it is classified;
# otherwise its tokens do not match the vocabulary the model was fitted on.
review = ["This movie is not that bad"]
print(lemm.predict([lemmatize(text) for text in review])[0])
print(stem.predict([stemmatize(text) for text in review])[0])

0
0


In [29]:
# `lemm` was trained on lemmatized text and `stem` on stemmed text, so a raw
# review must go through the same normalisation before it is classified;
# otherwise its tokens do not match the vocabulary the model was fitted on.
review = ["I was going to say something awesome or great or good, but I can't because the movie is so bad."]
print(lemm.predict([lemmatize(text) for text in review])[0])
print(stem.predict([stemmatize(text) for text in review])[0])

0
0
