<a href="https://colab.research.google.com/github/AlvinChiew/MachineLearning/blob/main/NLP_SentimentalAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Modules

In [1]:
import nltk
import numpy as np
from sklearn.utils import shuffle

from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup

In [2]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Load Files

In [3]:
!wget https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/nlp_class/stopwords.txt
!wget https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/nlp_class/electronics/positive.review
!wget https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/nlp_class/electronics/unlabeled.review
!wget https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/nlp_class/electronics/negative.review

--2021-05-14 11:29:58--  https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/nlp_class/stopwords.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2488 (2.4K) [text/plain]
Saving to: ‘stopwords.txt.1’


2021-05-14 11:29:58 (44.3 MB/s) - ‘stopwords.txt.1’ saved [2488/2488]

--2021-05-14 11:29:58--  https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/nlp_class/electronics/positive.review
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1105010 (1.1M) [text/plain]
Saving to

In [4]:
stop_words = None

with open('stopwords.txt') as f:
    stop_words = set(w.rstrip() for w in f)     # use set to get unique list of vocab

with open('positive.review') as f:
    positive_reviews = BeautifulSoup(f.read()).findAll('review_text')

with open('negative.review') as f:
    negative_reviews = BeautifulSoup(f.read()).findAll('review_text')



In [5]:
np.random.seed(4)

if len(positive_reviews) > len(negative_reviews):
    np.random.shuffle(positive_reviews)
    positive_reviews = positive_reviews[:len(negative_reviews)]
else:
    np.random.shuffle(negative_reviews)
    negative_reviews = negative_reviews[:len(positive_reviews)]

N = len(positive_reviews) +  len(negative_reviews)  # Number of samples

In [6]:
lemmatizer = WordNetLemmatizer()    # group same words e.g. cats & cat to reduce vocab size

# convert sentence to tokens and filter out uninterested word
def tokenize(s):
    s = s.lower()
    tokens = nltk.tokenize.word_tokenize(s)
    tokens = [t for t in tokens if len(t) > 2]      # only accept word with more than 2 letters
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    tokens = [t for t in tokens if t not in stop_words]
    return tokens

In [7]:
word_idx_map = {}   # word-index mapping
last_idx = 0        # track index increment by word

positive_tokenized = []
negative_tokenized = []

# associate index to each vocab & tokenize sentences
def learn_vocab(reviews, last_idx):
    tokenized = []
    for review in reviews:
        tokens = tokenize(review.text)
        tokenized.append(tokens)
        for token in tokens:
            if token not in word_idx_map:
                word_idx_map[token] = last_idx
                last_idx += 1
    return tokenized, last_idx

In [8]:
positive_tokenized, last_idx = learn_vocab(positive_reviews, last_idx)
negative_tokenized, last_idx = learn_vocab(negative_reviews, last_idx)

In [9]:
def tokens_to_vector(tokens, label):
    X = np.zeros(len(word_idx_map) + 1)      # vector size is # vocabs + 1 label
    for t in tokens:
        i = word_idx_map[t]
        X[i] += 1
    X = X/X.sum()                   # normalize word count in a sentence, so that all counts (vectors) sum up to 1
    X[-1] = label
    return X

def fill_data(tokenized_reviews, last_idx, label):
    for tokens in tokenized_reviews:
        Xy = tokens_to_vector(tokens, label)
        data[last_idx,:] = Xy
        last_idx += 1
    return data, last_idx

In [10]:
data = np.zeros((N, len(word_idx_map) + 1))      # (# samples, # vocabs)
last_idx = 0

data, last_idx = fill_data(positive_tokenized, last_idx, 1)
data, last_idx = fill_data(negative_tokenized, last_idx, 0)

In [11]:
np.random.shuffle(data)
data

array([[0.01639344, 0.01639344, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ],
       [0.        , 0.08333333, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ],
       [0.04347826, 0.04347826, 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [12]:
X_train = data[:-100,:-1]   # split last 100 rows as test set; last column is label
X_test = data[-100:,:-1]
y_train = data[:-100, -1] 
y_test = data[-100:, -1]

In [13]:
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

0.7878947368421053
0.77


In [15]:
# Peek into word with high weightage in Logistic Reg Model

THRESHOLD = 0.8
for word, idx in word_idx_map.items():
    weight = model.coef_[0][idx]
    if weight > THRESHOLD or weight < -THRESHOLD:
        print(f"{word:<10}{weight}")

# sign in weightage = positive / negative sentimental

# Further Effort:
    # optimize threshold point in Logistic Regression            
    # use more complex classification model or deep learning with RNN
    # expand categories from binary to a scale of 3 or 5 points, middle be neutral
    # increase sample size

unit      -0.8867396209383875
cable     0.8353727327798643
sound     1.0468014306205844
you       0.946293471324069
n't       -1.9560568679723216
easy      1.826783929118966
quality   1.506312866080753
item      -0.9013745300673538
wa        -1.487048488648309
perfect   1.033557004339358
fast      0.8956123965812748
price     2.635150400751305
money     -1.0460526667278023
memory    0.9896817080086341
buy       -0.8988341609032072
doe       -1.1774862573408704
highly    0.930849641469327
support   -0.8413442978671349
little    0.9255377587475565
excellent 1.3864532071684454
love      1.2649520177015412
poor      -0.8133698564336692
then      -1.1154776158565054
tried     -0.806217824199422
speaker   0.9323279547383146
return    -1.1439994196174428
waste     -0.9945234711043558
