In [1]:
import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from bs4 import BeautifulSoup
import os
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [2]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dhruv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dhruv\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\dhruv\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
wordnet_lemmatizer = WordNetLemmatizer()

In [4]:
stopwords = set()
with open(os.path.join('data', 'stopwords.txt')) as f :  
    for line in f :
        stopwords.add(line.strip())
print('Length of stopwords',len(stopwords))

Length of stopwords 420


In [5]:
def tokenize(s) :
    s = s.lower()
    s = nltk.tokenize.word_tokenize(s)
    s = [token for token in s if len(token) > 2]
    s = [wordnet_lemmatizer.lemmatize(token) for token in s]
    s = [token for token in s if token not in stopwords]
    return s

In [6]:
positive_reviews = BeautifulSoup(open(os.path.join('data', 'positive.review')))
positive_reviews = positive_reviews.findAll('review_text')

negative_reviews = BeautifulSoup(open(os.path.join('data', 'negative.review')))
negative_reviews = negative_reviews.findAll('review_text')

print(f'Length of postive reviews : {len(positive_reviews)}\nLength of negative reviews : {len(negative_reviews)}')

Length of postive reviews : 1000
Length of negative reviews : 1000


In [7]:
word_index = {}
postive_tokenized = []
negative_tokenized = []
i = 0

for p in positive_reviews :
    tokenized = tokenize(p.text)
    postive_tokenized.append(tokenized)
    for token in tokenized :
        if token not in word_index :
            word_index[token] = i
            i += 1
            
for n in negative_reviews :
    tokenized = tokenize(n.text)
    negative_tokenized.append(tokenized)
    for token in tokenized :
        if token not in word_index :
            word_index[token] = i
            i += 1

In [32]:
positive_vectorized = np.zeros((len(postive_tokenized), len(word_index)))
negative_vectorized = np.zeros((len(negative_tokenized), len(word_index)))

for idx, p in enumerate(postive_tokenized) :
    for word in p :
        positive_vectorized[idx, word_index[word]] += 1
        break

for idx, n in enumerate(negative_tokenized) :
    for word in n :
        negative_vectorized[idx, word_index[word]] += 1

data = np.vstack([
    positive_vectorized,
    negative_vectorized
])


data = data / data.sum(axis = 1, keepdims = True)
data = np.concatenate([
    data,
    np.vstack((np.ones(shape=(positive_vectorized.shape[0], 1)), np.zeros(shape=(positive_vectorized.shape[0], 1))))
], axis = 1)
print(data.shape)

(2000, 10948)


In [33]:
np.random.shuffle(data)

In [34]:
X_train, X_test, y_train, y_test = data[:-100,:-1], data[-100:,:-1], data[:-100,-1], data[-100:,-1]

In [35]:
regression = LogisticRegression()
regression.fit(X_train, y_train)
regression.score(X_test, y_test)

0.72

In [36]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt.score(X_test, y_test)

0.97

In [43]:
threshold = 0.9
for word, index in word_index.items():
    weight = regression.coef_[0][index]
    if weight > threshold or weight < -threshold:
        print(word, weight)

purchased 2.4095278956410526
unit 0.9621390836337539
day -1.2778596848106434
've 2.420227820043411
month -0.9757694465872293
pro 1.521876623237843
simple 1.5076418531325149
sound 1.1552669151413373
lot 1.1756754340759135
easy 1.4423939875077614
card 0.9274843814874687
recently 2.0430073331236946
product 1.5338585442401902
wa 1.4594995462163982
perfect 1.0112816726973073
money -0.9926794108197323
review 1.1847095387141227
bought 2.8184873424555072
canon 1.314237742380012
read 1.0686542463915658
happy 1.6088703078053095
device 0.9039359343374037
note 1.003626466377106
pleased 1.5201338266875166
sandisk 1.3665461244847865
earphone 1.2282702910643308
support -1.025166598570388
little 1.6203760366572217
excellent 1.9646653679539872
love 1.8995315727169282
found 1.4011457925806983
keyboard 1.1457816575693827
using 1.7457138918495538
logitech 0.9375487526831355
mini 0.993497078635716
nice 0.9626599208392497
then -2.0467166184226318
people 1.01015625757233
printer 1.1272800986050857
hour -0.91