<a href="https://colab.research.google.com/github/Chinmay-47/NLP_Udemy/blob/master/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [81]:
#Data

from bs4 import BeautifulSoup
def _load_review_texts(path):
    """Parse the review file at ``path`` and return its <review_text> elements.

    The file is read inside a ``with`` block so the handle is closed as soon
    as parsing is done, instead of being left for the garbage collector.
    """
    with open(path) as review_file:
        soup = BeautifulSoup(review_file.read(), features="html5lib")
    # find_all is the current name of the legacy findAll alias.
    return soup.find_all('review_text')


negative_reviews = _load_review_texts('negative.review')
positive_reviews = _load_review_texts('positive.review')
unlabelled_reviews = _load_review_texts('unlabeled.review')

In [82]:
#Clean Up to prepare for training
import nltk
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources used by the tokenizer and the lemmatizer.
# NOTE(review): newer NLTK releases appear to need the 'punkt_tab' resource for
# word_tokenize -- confirm against the installed version.
nltk.download('punkt')
nltk.download('wordnet')

wordnet_lemmatizer = WordNetLemmatizer()

# One stopword per line; trailing whitespace (including the newline) is stripped.
# The file is read inside a `with` block so the handle is closed afterwards.
with open('stopwords.txt') as stopwords_file:
    stopwords = set(w.rstrip() for w in stopwords_file)

def tokenizer(a):
    """Turn the string ``a`` into a list of lemmatized tokens.

    The text is lower-cased and split with NLTK's word tokenizer. Tokens that
    are in ``stopwords`` or have two characters or fewer are dropped, and the
    survivors are lemmatized with ``wordnet_lemmatizer``.
    """
    words = nltk.tokenize.word_tokenize(a.lower())
    return [
        wordnet_lemmatizer.lemmatize(word)
        for word in words
        if word not in stopwords and len(word) > 2
    ]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [83]:
def _tokenize_and_index(reviews, vocabulary, raw_texts):
    """Tokenize ``reviews`` and register their tokens in ``vocabulary``.

    For every review, its ``.text`` is appended to ``raw_texts`` and tokenized
    with ``tokenizer``. Each token not yet in ``vocabulary`` is assigned the
    next free integer index. Both ``vocabulary`` and ``raw_texts`` are
    modified in place.

    Returns the list of token lists, one per review, in input order.
    """
    tokenized = []
    for review in reviews:
        raw_texts.append(review.text)
        tokens = tokenizer(review.text)
        tokenized.append(tokens)
        for token in tokens:
            if token not in vocabulary:
                # Indices are handed out densely from 0, so the current size
                # of the mapping is always the next unused index.
                vocabulary[token] = len(vocabulary)
    return tokenized


unique_words = {}
original_reviews = []

# Positives are processed first, so original_reviews holds all positive texts
# followed by all negative texts.
positive_tokenized = _tokenize_and_index(positive_reviews, unique_words, original_reviews)
negative_tokenized = _tokenize_and_index(negative_reviews, unique_words, original_reviews)

# Next free vocabulary index; kept as a module-level name as before.
index = len(unique_words)

print("len(unique_words):", len(unique_words))
#key = unique tokens and value=index

len(unique_words): 11106


In [88]:
import numpy as np

def tokens_to_vector(tokens, label, vocab=None):
    """Build a normalised bag-of-words row with the label in the last slot.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one review; every token must be a key of ``vocab``.
    label : number
        Value stored in the final element of the returned vector.
    vocab : dict, optional
        Mapping token -> column index. Defaults to the module-level
        ``unique_words``.

    Returns
    -------
    numpy.ndarray of length ``len(vocab) + 1``
        Token counts divided by the total count, followed by ``label``.
        When ``tokens`` is empty the feature part is all zeros (previously the
        division by a zero total produced NaNs).

    Raises
    ------
    KeyError
        If a token is not present in ``vocab``.
    """
    if vocab is None:
        vocab = unique_words
    x = np.zeros(len(vocab) + 1)
    for t in tokens:
        x[vocab[t]] += 1
    # The label slot is still 0 here, so the sum is the number of tokens counted.
    total = x.sum()
    if total > 0:
        x = x / total
    x[-1] = label
    return x

N = len(positive_tokenized) + len(negative_tokenized)

# One row per review: the feature columns followed by a trailing label column.
data = np.zeros((N, len(unique_words) + 1))

# Positive reviews (label 1) fill the first rows, negative reviews (label 0)
# the remaining ones.
labelled_reviews = [(review_tokens, 1) for review_tokens in positive_tokenized]
labelled_reviews += [(review_tokens, 0) for review_tokens in negative_tokenized]

for row, (review_tokens, review_label) in enumerate(labelled_reviews):
    data[row, :] = tokens_to_vector(review_tokens, review_label)

from sklearn.utils import shuffle

# Shuffle the rows and the raw texts together so they stay aligned; the fixed
# seed makes the ordering reproducible across runs.
data, original_reviews = shuffle(data, original_reviews, random_state=42)

from sklearn.model_selection import train_test_split

# The last column of `data` is the label; everything before it is the features.
# A fixed seed keeps the 67/33 split identical across runs.
Xtrain, XTest, Ytrain, Ytest = train_test_split(
    data[:, :-1], data[:, -1], test_size=0.33, shuffle=True, random_state=42
)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

# Three classifiers trained on the same split. The ensemble models are seeded
# so their scores are reproducible across runs.
model = LogisticRegression()
model1 = AdaBoostClassifier(random_state=42)
model2 = RandomForestClassifier(random_state=42)

# Fit and report each model under its own name. The last pair of lines used to
# be printed as "model1" although it reports model2's scores.
for model_name, classifier in (("model", model), ("model1", model1), ("model2", model2)):
    classifier.fit(Xtrain, Ytrain)
    print("Training accuracy of %s : " % model_name, classifier.score(Xtrain, Ytrain))
    print("Test accuracy of %s : " % model_name, classifier.score(XTest, Ytest))

Training accuracy of model :  0.7947761194029851
Test accuracy of model :  0.7303030303030303
Training accuracy of model1 :  0.841044776119403
Test accuracy of model1 :  0.7151515151515152
Training accuracy of model1 :  1.0
Test accuracy of model1 :  0.793939393939394


In [92]:
# let's look at the weights for each word
# try it with different threshold values!
from future.utils import iteritems
threshold = 0.5
# Coefficients of the logistic regression, indexed by vocabulary column.
coefficients = model.coef_[0]
for word, column in unique_words.items():
    weight = coefficients[column]
    # Print only the words whose weight magnitude exceeds the threshold.
    if abs(weight) > threshold:
        print(word, weight)


bad -0.6603978121335308
day -0.5149190071734071
've 0.5365274108819938
sound 0.7261732691078957
lot 0.5874070608735378
n't -1.6758082610989142
easy 1.4480397975037347
quality 1.0396383800022515
card -0.5425538062698667
item -0.6649913255729275
perfect 0.6099835606159258
fast 0.6810597999933578
price 2.2046421736354995
money -0.6436933539717562
memory 0.5618716697414786
picture 0.5214866693483268
buy -0.6535091808293393
happy 0.5082706768739143
pretty 0.572200993659159
highly 0.7575087461447255
support -0.7163096321596831
little 0.7912586956264775
returned -0.5453165603473059
excellent 0.881130558524569
love 0.8198512416887568
week -0.618128239111465
size 0.5229419245550312
using 0.5576442928495882
ipod -0.592987027282843
poor -0.684190870068703
then -0.9504226025513846
tried -0.6098365162978384
try -0.6048710295322687
photo 0.5238503479523994
speaker 1.0178414393568573
broken -0.531591179209445
paper 0.599725154989537
return -1.0227898979035286
waste -0.6628778237089521


In [93]:
# check misclassified examples
# Every column except the trailing label column, for all rows of `data`.
all_features = data[:, :-1]
P = model.predict_proba(all_features)[:, 1]  # p(y = 1 | x)
preds = model.predict(all_features)

In [97]:
# since there are many, just print the "most" wrong samples
# Track the positive review with the lowest p(y=1|x) below 0.5 and the
# negative review with the highest p(y=1|x) above 0.5.
minP_whenYis1, maxP_whenYis0 = 1, 0
wrong_positive_review = wrong_negative_review = None
wrong_positive_prediction = wrong_negative_prediction = None

true_labels = data[:, -1]
for review_text, predicted, p, y in zip(original_reviews, preds, P, true_labels):
    if y == 1:
        # Positive review that the model scored below 0.5.
        if p < 0.5 and p < minP_whenYis1:
            wrong_positive_review = review_text
            wrong_positive_prediction = predicted
            minP_whenYis1 = p
    elif y == 0:
        # Negative review that the model scored above 0.5.
        if p > 0.5 and p > maxP_whenYis0:
            wrong_negative_review = review_text
            wrong_negative_prediction = predicted
            maxP_whenYis0 = p

print("Most wrong positive review (prob = %s, pred = %s):" % (minP_whenYis1, wrong_positive_prediction))
print(wrong_positive_review)
print("Most wrong negative review (prob = %s, pred = %s):" % (maxP_whenYis0, wrong_negative_prediction))
print(wrong_negative_review)

Most wrong positive review (prob = 0.42810195908084964, pred = 0.0):

the product arrive to me in very good condition, will recommend this product to anyone who needs blank cd

Most wrong negative review (prob = 0.5890300620915938, pred = 1.0):

Like the forward/back buttons but they are located too far back on the body for comfort.  Like the scroll wheel and side-to-side scroll ability.  Pointer flickers a lot sometimes and often disappears over certain buttons.  Logitech website gives no help.
Comes with an installation CD for a Logitech MX320 !!!  Be prepared for a 50MB download to install the MX400.  Almost makes Microsot look good

