<a href="https://colab.research.google.com/github/Bob-Gohardani/nlp-ml/blob/main/sentimentDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
import numpy as np

In [2]:
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup

In [4]:
wordnet_lemmatizer = WordNetLemmatizer()
stopwords = set(w.rstrip() for w in open('stopwords.txt'))

In [7]:
# get positive and negtive reviews from the source file:
positive_reviews = BeautifulSoup(open('positive.review.txt').read())
positive_reviews = positive_reviews.find_all('review_text')

negative_reviews = BeautifulSoup(open('negative.review.txt').read())
negative_reviews = negative_reviews.find_all('review_text')

In [27]:
positive_reviews[0]

<review_text>
We bought this unit from Sam's Club right before a trip to Disney World. (8 hours in the car with a toddler--eek!)  However, it was/is perfect!  We still use it for trips around town when we're in massive traffic or our son is being really fussy.  The case that keeps the dvd player on the back of the seat needs a little work, but we figured out a way to make it work for us.  Also, the remote doesn't work well, but we don't really need to use it as a DVD usually lasts long enough between stops
</review_text>

In [8]:
# shuffle positive reviews and select same number of them as negative reviews so our results are balanced
np.random.shuffle(positive_reviews)
positive_reviews = positive_reviews[:len(negative_reviews)]

In [9]:
# tokenzie a review (given string as input and returns list of tokens)

def my_tokenizer(s):
  s = s.lower()
  tokens = nltk.tokenize.word_tokenize(s)
  tokens = [t for t in tokens if len(t)>2]
  tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens]
  tokens = [t for t in tokens if t not in stopwords]

  return tokens

In [10]:
# this dict maps each token/word to an index. basically each word in here is a feature of our logistic regression model
word_index_map = {}
current_index = 0

positive_tokenized = []
negative_tokenized = []

In [16]:
# nltk.download('punkt')
# nltk.download('wordnet')

for review in positive_reviews:
  tokens = my_tokenizer(review.text)
  positive_tokenized.append(tokens)
  for token in tokens:
    # there wouldn't be any repeated word in the word_index_map
    if token not in word_index_map:
      word_index_map[token] = current_index
      current_index += 1

In [29]:
positive_tokenized[0][:5]

['bought', 'this', 'unit', 'sam', 'club']

In [35]:
print(word_index_map['gift'])
print(word_index_map['club'])

927
4


In [17]:
for review in negative_reviews:
    tokens = my_tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

In [20]:
# bag of words tokens to vectors
def tokens_to_vectors(tokens, label):
  x = np.zeros(len(word_index_map)+1)
  for t in tokens:
    i = word_index_map[t] # get the index
    x[i] += 1 # update how many times a word has been repeated in the review

  x = x / x.sum() # normalize each vector based how many tokens are inside it
  x[-1] = label
  return x

In [36]:
N = len(positive_reviews) + len(negative_reviews)
N

2000

In [37]:
# create data matrix with N rows (number of examples) and column number of word_index_map plus the label (positive or negative)
data = np.zeros((N, len(word_index_map)+1))
i = 0

data

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [39]:
# each tokens inlcudes a review string
for tokens in positive_tokenized:
    xy = tokens_to_vectors(tokens, 1)
    data[i,:] = xy
    i += 1

for tokens in negative_tokenized:
  xy = tokens_to_vectors(tokens, 0)
  data[i,:] = xy
  i += 1

np.random.shuffle(data)

In [43]:
data

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ],
       [0.        , 0.07142857, 0.        , ..., 0.        , 0.        ,
        1.        ],
       [0.        , 0.02941176, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.03448276, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.03125   , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.01190476, 0.01190476, 0.10714286, ..., 0.        , 0.        ,
        0.        ]])

In [23]:
X = data[:, :-1]  # features
Y = data[:, -1] # label

In [24]:
# last 100 rows are the test (validation) 
Xtrain = X[:-100,]
Ytrain = Y[:-100,]

Xtest = X[-100:,]
Ytest = Y[-100:,]

In [44]:
model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print(model.score(Xtest, Ytest))

0.64


In [45]:
# check which words have the max impact
threshold = 0.5
for word, index in word_index_map.items():
  weight = model.coef_[0][index]
  if weight > threshold or weight < -1 * threshold:
    print(word, weight)

unit -0.722835078723575
hour -0.5344933871278992
perfect 1.0304273371442798
little 0.9646876272212312
doe -1.181034004049814
n't -2.0929078882810073
sound 1.1277127741615791
love 1.2198897304462806
lot 0.7137096135601921
speaker 0.919028070097317
wa -1.611451203033656
comfortable 0.6148809744293265
space 0.5524968806600744
feature 0.5107338897864697
buy -0.7551888209080694
pro 0.508430015504002
pretty 0.6576082798276508
you 1.2371267590594643
home 0.5591928633709934
quality 1.4627812684503678
cable 0.8838747190749062
price 2.65106186709478
excellent 1.377225510274656
highly 1.03792378565054
recommend 0.6070020583198741
picture 0.5804226470325667
using 0.6453629018734126
paper 0.521995792130177
tried -0.8910856012133294
easy 1.8404046989039844
time -0.6481961841570271
then -1.1123280783610467
try -0.6874011388939075
ha 0.6279606183648166
money -1.036971673205191
cheap -0.5048604506545629
've 0.7522093178934673
happy 0.5930365753675056
bit 0.6690181155751854
memory 0.9223747757629065
loo