# Sentiment Analysis

In [1]:
import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup

In [5]:
# Seed NumPy's global RNG so the subsampling shuffle below (and any later
# np.random call in the notebook) is reproducible under Restart & Run All.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

lemmatizer = WordNetLemmatizer()

# English stop-word list from NLTK; the preprocessing step filters tokens against it.
stopwords = nltk.corpus.stopwords.words('english')


def _parse_review_file(path):
    """Read the review file at `path` and return it parsed by BeautifulSoup.

    The file is read inside a `with` block so its handle is closed as soon as
    the contents are in memory, instead of being left for the garbage collector.
    """
    with open(path) as review_file:
        # NOTE(review): no parser is named, so BeautifulSoup picks whichever one
        # is installed; consider pinning one so results do not vary by machine.
        return BeautifulSoup(review_file.read())


positive_reviews = _parse_review_file('positive.reviews')
positive = positive_reviews.findAll('review_text')  # review_text is a tag name in the XML file

negative_reviews = _parse_review_file('negative.reviews')
negative = negative_reviews.findAll('review_text')  # review_text is a tag name in the XML file

# There may be more positive reviews than negative ones. To balance the two
# classes, shuffle the positive reviews and keep only as many of them as there
# are negative reviews.
np.random.shuffle(positive)
positive = positive[:len(negative)]

In [6]:
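# the function below performs the following steps, in this order:
# 1. convert the text to lower case
# 2. tokenize it into words
# 3. drop tokens that are two characters or shorter
# 4. lemmatize the remaining tokens
# 5. remove stop words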

def preprocessing(text):
    """Return the cleaned list of tokens for one review.

    The text is lower-cased and word-tokenized; each token longer than two
    characters is lemmatized, and lemmas found in the stop-word list are
    discarded. Token order is preserved.
    """
    cleaned = []
    for raw_token in nltk.tokenize.word_tokenize(text.lower()):
        # Very short tokens are dropped before lemmatization.
        if len(raw_token) <= 2:
            continue
        lemma = lemmatizer.lemmatize(raw_token)
        # Stop words are checked on the lemma, not on the raw token.
        if lemma in stopwords:
            continue
        cleaned.append(lemma)
    return cleaned
 
word_index_map = {}   # token -> column index, assigned in order of first appearance

positive_tokens = []  # one token list per positive review
negative_tokens = []  # one token list per negative review

# Walk the positive reviews first, then the negative ones, so indices are handed
# out in that order. Each review's tokens are stored in the matching list and
# every previously unseen token receives the next free index.
for reviews, token_store in ((positive, positive_tokens), (negative, negative_tokens)):
    for message in reviews:
        tokens = preprocessing(message.text)
        token_store.append(tokens)
        for token in tokens:
            # len() is evaluated before insertion, so a new token gets index == current size
            word_index_map.setdefault(token, len(word_index_map))

current_index = len(word_index_map)  # next unused index (equals the vocabulary size)
            
            


In [11]:
# Sanity check: show the first tokenized review of each class
print(positive_tokens[0])
print(negative_tokens[0])
# Peek at the ten most recently added vocabulary entries (token -> index)
dictionary = dict(list(word_index_map.items())[-10:])
dictionary

['ha', 'helped', 'significantly', 'drawing', 'computer', 'program', 'adobe', 'illustrator', 'adobe', 'photoshop', 'macromedia', 'flash', 'drawing', 'mouse', 'like', 'drawing', 'brick', 'buy', 'piece', 'equipment', 'need', 'draw', 'computer', 'also', 'work', 'well', 'overall', 'mouse', 'computer', 'even', 'come', 'mouse', 'prefer', 'use', 'instead', 'pen', 'plug', 'right', 'usb', 'drive', 'work', 'almost', 'instantly']
['con', 'tip', 'extremely', 'easy', 'carpet', 'lot', 'cd', 'stacked', 'top', 'poorly', 'designed', 'vertical', 'rack', 'doesnt', 'individual', 'slot', 'cd', 'want', 'bottom', 'stack', 'basically', 'pull', 'whole', 'stack', 'get', 'putting', 'together', 'wa', 'pain', 'one', 'bought', 'break', 'piece', 'metal', 'fit', 'guide', 'hole', 'again..poorly', 'designed', '...', 'doesnt', 'even', 'fit', 'cd', 'well', 'gap', 'ca', 'loose', 'fitting', 'pro', '...', '...', '...', 'guess', 'hold', 'lot', 'cd', '...']


{'pasted': 11271,
 'recourse': 11272,
 'cruddy': 11273,
 'no-no': 11274,
 'volume/on/off': 11275,
 'streamlined': 11276,
 'slippery': 11277,
 'blister': 11278,
 'coushing': 11279,
 'menetioned': 11280}

In [45]:
# Convert each review's token list into a numeric word-count vector

def token_to_vector(tokens, label, vocabulary=None):
    """Build a bag-of-words count vector for one review, with its label appended.

    The label travels in the same array as the features so that rows can be
    shuffled together before being split into X and y.

    Args:
        tokens: iterable of preprocessed tokens for a single review.
        label: class label written into the last slot of the vector.
        vocabulary: optional mapping of token -> column index. When omitted,
            the notebook-level `word_index_map` is used, which matches the
            original behaviour.

    Returns:
        1-D float array of length ``len(vocabulary) + 1``. Position ``i`` holds
        the raw (un-normalized) number of occurrences of the token whose index
        is ``i``; the final position holds ``label``.

    Tokens that are not in the vocabulary are ignored, so text that was not
    part of the vocabulary-building pass can still be vectorized.
    """
    if vocabulary is None:
        vocabulary = word_index_map
    x = np.zeros(len(vocabulary) + 1)  # +1 slot reserved for the label
    for t in tokens:
        i = vocabulary.get(t)
        if i is None:
            # Out-of-vocabulary token: no column exists for it, so skip it.
            continue
        x[i] += 1
    x[-1] = label
    return x

N = len(positive_tokens) + len(negative_tokens)

# One row per review: len(word_index_map) count columns followed by the label column
data = np.zeros((N, len(word_index_map) + 1))

# Positive reviews (label 1) fill the first rows, negative reviews (label 0) the rest
labelled_reviews = (
    [(tokens, 1) for tokens in positive_tokens]
    + [(tokens, 0) for tokens in negative_tokens]
)
for row, (tokens, label) in enumerate(labelled_reviews):
    data[row, :] = token_to_vector(tokens, label)

# Shuffle rows in place so both classes are mixed before splitting
np.random.shuffle(data)

x, y = data[:, :-1], data[:, -1]

# Hold out the last 100 shuffled rows for evaluation
HOLDOUT_ROWS = 100
X_train, X_test = x[:-HOLDOUT_ROWS], x[-HOLDOUT_ROWS:]
y_train, y_test = y[:-HOLDOUT_ROWS], y[-HOLDOUT_ROWS:]

model = LogisticRegression()
model.fit(X_train, y_train)
print("Classification Rate: ", model.score(X_test, y_test))




Classification Rate:  0.84


In [47]:
# Print every token whose learned weight exceeds the threshold in magnitude.
# Negative weights indicate negative sentiment; positive weights indicate
# positive sentiment.
threshold = 0.5
coefficients = model.coef_[0]  # one coefficient per vocabulary column
for word, index in word_index_map.items():
    weight = coefficients[index]
    if abs(weight) > threshold:
        print(word, weight)

buy -0.585213145226766
piece -0.5227247977488222
even -0.506327464844218
almost -0.9459152156763782
may -0.5352083957475829
two -0.519129495532396
using 0.5145291944404375
result -0.739435475317519
great 1.2023169340047057
worth 0.7258537031507079
spare 0.6780456238938027
description -0.6241466449386905
easy 0.8877437050878285
radio -0.564120638429105
maybe -0.928969838840537
pleased 0.6560972647200137
bit 0.6706353337641017
easier 0.5890360419560926
value 0.723050626812834
unless -0.765848246579005
good 0.51152714406278
price 1.0384605515795275
fast 1.1001318647074947
item -0.7255955058800693
last 0.5729288513878242
month -0.5132222742837925
often -0.8339698070371449
big 0.508115982346513
lot 0.6074495605973464
allows 0.7240916074339978
without 0.5235699066772582
something -0.5766878784952915
minor 0.6206273208852686
jack -0.7522943128462749
want 0.7930058964474374
past 0.5083144291147803
quickly 0.580766968241641
used 0.6648221773472033
highly 1.368429872766604
point -0.8980155621041