<a href="https://colab.research.google.com/github/ArsenalHail/NLP-Sentiment-Analysis-with-SGD/blob/main/NLP_Sentiment_Analysis_with_SGD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import nltk
import math
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import random
import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

nltk.download('punkt')
ex_df = pd.read_excel('CS173-published-sheet.xlsx')
sentence_df = pd.DataFrame(ex_df, columns=['Sadness Sentences','Joy Sentences','Sadness + Joy Sentences','Sadness + Joy + Fear Sentences'])
ps = PorterStemmer()
sentence_df.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,Sadness Sentences,Joy Sentences,Sadness + Joy Sentences,Sadness + Joy + Fear Sentences
0,The devastating news of the child's abduction ...,It was a sunny summer morning and the laughter...,"When I visited my old childhood home, I felt a...",The parents watched their son leave for colleg...
1,The wretched people have chosen wrongly.,The youth are filled with zeal.,"They have advanced to the next round, despite ...",After seeing the group’s admirable performance...
2,"She was not fond of graveyards, let alone the\...",Ms. Smith taught the lesson to fidelity. The o...,"Despite finally winning, I feel robbed of clos...",So begins an endless journey with no destinati...
3,I’m feeling very anxious about going back to w...,The women’s soccer team was proud that they ac...,Although my heart aches because I will never b...,She starts to weep because she knows that I mi...
4,Sometimes I feel like something is wrong with ...,It's crazy to see that Jane and Mike were able...,It must have been tough to find out all the de...,It really makes you wonder what people are cap...


In [None]:
## Setup the NRC dataset for features
!mkdir -p data
!wget -nc https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/upshot-trump-emolex/data/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt -P data

filepath = "data/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt"
emolex_df = pd.read_csv(filepath,  names=["word", "emotion", "association"], skiprows=45, sep='\t', keep_default_na=False)

emolex_words = emolex_df.pivot(index='word', columns='emotion', values='association').reset_index()
emolex_words.head()

File ‘data/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt’ already there; not retrieving.



emotion,word,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust
0,aback,0,0,0,0,0,0,0,0,0,0
1,abacus,0,0,0,0,0,0,0,0,0,1
2,abandon,0,0,0,1,0,1,0,1,0,0
3,abandoned,1,0,0,1,0,1,0,1,0,0
4,abandonment,1,0,0,1,0,1,0,1,1,0


In [None]:
## Apply stemming to the nrc lexicons
def stemmer(text):
  if pd.notnull(text):
    stemmed_word = ps.stem(text)
    return stemmed_word
  else:
    return text

emolex_words['word'] = emolex_words['word'].apply(stemmer)
emolex_words.head()

emotion,word,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust
0,aback,0,0,0,0,0,0,0,0,0,0
1,abacu,0,0,0,0,0,0,0,0,0,1
2,abandon,0,0,0,1,0,1,0,1,0,0
3,abandon,1,0,0,1,0,1,0,1,0,0
4,abandon,1,0,0,1,0,1,0,1,1,0


In [None]:
## Tokenize and stem the sentence_df
def stemTokenizer(text):
    if pd.notnull(text):
      tokens = word_tokenize(text.lower())
      stemmed_tokens = [ps.stem(token) for token in tokens]
      return stemmed_tokens
    else:
      return text

for column in sentence_df:
  sentence_df[column] = sentence_df[column].apply(stemTokenizer)
sentence_df.head()

Unnamed: 0,Sadness Sentences,Joy Sentences,Sadness + Joy Sentences,Sadness + Joy + Fear Sentences
0,"[the, devast, news, of, the, child, 's, abduct...","[it, wa, a, sunni, summer, morn, and, the, lau...","[when, i, visit, my, old, childhood, home, ,, ...","[the, parent, watch, their, son, leav, for, co..."
1,"[the, wretch, peopl, have, chosen, wrongli, .]","[the, youth, are, fill, with, zeal, .]","[they, have, advanc, to, the, next, round, ,, ...","[after, see, the, group, ’, s, admir, perform,..."
2,"[she, wa, not, fond, of, graveyard, ,, let, al...","[ms., smith, taught, the, lesson, to, fidel, ....","[despit, final, win, ,, i, feel, rob, of, clos...","[so, begin, an, endless, journey, with, no, de..."
3,"[i, ’, m, feel, veri, anxiou, about, go, back,...","[the, women, ’, s, soccer, team, wa, proud, th...","[although, my, heart, ach, becaus, i, will, ne...","[she, start, to, weep, becaus, she, know, that..."
4,"[sometim, i, feel, like, someth, is, wrong, wi...","[it, 's, crazi, to, see, that, jane, and, mike...","[it, must, have, been, tough, to, find, out, a...","[it, realli, make, you, wonder, what, peopl, a..."


In [None]:
## 2.2 - Features

# new method, store features for individual sentences
sadness_features = []
joy_features = []
sadness_joy_features = []
sadness_joy_fear_features = []
emo_features = [sadness_features, joy_features, sadness_joy_features, sadness_joy_fear_features]

emo_counter = 0 # keeps track of if adding to sad, joy, or multi cols
for i in sentence_df:
  print(i)
  for j in sentence_df[i]:
    if j is not np.nan:
      print(j)
      x1 = 0
      x2 = 0
      x3 = 0
      for k in j:
        x3 += 1
        if k in emolex_words['word'].values:
          #print(k)
          x1 += emolex_words.loc[emolex_words['word'] == k, 'joy'].values[0]
          x2 += emolex_words.loc[emolex_words['word'] == k, 'sadness'].values[0]
          #print(emolex_words.loc[emolex_words['word'] == k, 'sadness'].values[0])
      emo_features[emo_counter].append([x1,x2,x3])
      print([x1,x2,x3])
  emo_counter += 1
  print("\n")
  print(len(emo_features[0]))

Sadness Sentences
['the', 'devast', 'news', 'of', 'the', 'child', "'s", 'abduct', 'left', 'a', 'solemn', 'shadow', 'over', 'the', 'famili', 'for', 'the', 'next', 'month', '.']
[1, 2, 20]
['the', 'wretch', 'peopl', 'have', 'chosen', 'wrongli', '.']
[0, 2, 7]
['she', 'wa', 'not', 'fond', 'of', 'graveyard', ',', 'let', 'alon', 'the', 'mausoleum', '.', 'a', 'sudden', 'nois', 'startl', 'her', ',', 'and', 'she', 'stifl', 'a', 'gasp', '.']
[1, 1, 24]
['i', '’', 'm', 'feel', 'veri', 'anxiou', 'about', 'go', 'back', 'to', 'work', 'next', 'week', '.', 'after', 'peopl', 'found', 'out', 'that', 'i', '’', 'm', 'a', 'recov', 'alcohol', ',', 'they', 'all', 'start', 'to', 'treat', 'me', 'differ', '.']
[2, 1, 34]
['sometim', 'i', 'feel', 'like', 'someth', 'is', 'wrong', 'with', 'me', 'becaus', 'i', 'feel', 'lone', 'even', 'though', 'i', 'am', 'surround', 'by', 'peopl', '.']
[0, 1, 21]
['the', 'abandon', 'child', 'look', 'up', 'with', 'sad', 'eye', ',', 'and', 'i', 'felt', 'helpless', '.']
[1, 3, 14]
['

In [None]:
## 3.1 Logistic Regression Classifier

def posLogRegression(value):
  # Utilizes sigmoid to get score for the positive class
  value = np.clip(value, -600, 600)
  sigmoid = 1 / (1 + np.exp(-(value)))
  return sigmoid

In [None]:
## 3.2 Cross-entropy Loss

def LCE(y, x, w = 0, b = 0):
  y = np.clip(y, 1e-10, 1 - 1e-10)
  dot_prod = (x[0]*w[0]) + (x[1]*w[1]) + (x[2]*w[2])
  sigmoid = posLogRegression((dot_prod)+b)
  #print("feature count =",x,",sigmoid =",sigmoid)
  loss = -( (y)*(np.log(sigmoid)) + (1-y)*(np.log(1-sigmoid)) )
  return loss

#print("Loss =",LCE(1, emo_features[1][0][0]+emo_features[2][0][0]+emo_features[3][0][0])) # loss for positive sentences

In [None]:
x_set_train = [] # set of all emo-column sentences for training and validation - 160 total
y_set_train = [] # emotion assignment for y for each sentence

x_set_valid = []
y_set_valid = []

for i in range(0,len(emo_features)):
  for j in range(0, len(emo_features[i])-20):
    x_set_train.append(emo_features[i][j])
    if i == 0:
      y_set_train.append([0])
    elif i == 1:
      y_set_train.append([1])
    else:
      x_set_train.append(emo_features[i][j]) # double count
      y_set_train.append([0])
      y_set_train.append([1])

print(x_set_train,"\n",y_set_train,"\n")

for i in range(0,len(emo_features)):
  for j in range(30, len(emo_features[i])-10):
    x_set_valid.append(emo_features[i][j])
    if i == 0:
      y_set_valid.append([0])
    elif i == 1:
      y_set_valid.append([1])
    else:
      x_set_valid.append(emo_features[i][j])
      y_set_valid.append([0])
      y_set_valid.append([1])

print(x_set_valid,"\n",y_set_valid,"\n")
print(len(x_set_train),len(y_set_train),len(x_set_train),len(y_set_train))

[[1, 2, 20], [0, 2, 7], [1, 1, 24], [2, 1, 34], [0, 1, 21], [1, 3, 14], [0, 2, 16], [0, 2, 16], [1, 1, 22], [0, 5, 28], [0, 5, 28], [1, 4, 24], [0, 1, 23], [0, 1, 14], [0, 3, 8], [0, 3, 18], [1, 3, 13], [0, 0, 32], [1, 5, 21], [1, 2, 45], [0, 1, 18], [1, 3, 26], [0, 1, 7], [1, 5, 21], [2, 2, 29], [3, 2, 36], [0, 4, 24], [1, 1, 32], [0, 5, 45], [2, 0, 25], [2, 0, 7], [2, 0, 18], [3, 0, 30], [2, 1, 22], [1, 0, 18], [4, 1, 22], [2, 0, 21], [3, 0, 21], [7, 0, 33], [14, 0, 66], [3, 0, 41], [5, 0, 38], [1, 0, 20], [2, 0, 11], [3, 0, 16], [3, 0, 31], [4, 1, 30], [12, 0, 48], [6, 0, 37], [1, 0, 26], [4, 0, 23], [2, 1, 11], [4, 0, 27], [2, 0, 31], [5, 0, 27], [5, 1, 27], [1, 1, 20], [11, 0, 63], [2, 1, 41], [2, 1, 41], [1, 1, 13], [1, 1, 13], [2, 3, 10], [2, 3, 10], [2, 2, 34], [2, 2, 34], [2, 3, 33], [2, 3, 33], [1, 1, 14], [1, 1, 14], [2, 2, 24], [2, 2, 24], [1, 1, 12], [1, 1, 12], [2, 3, 31], [2, 3, 31], [3, 2, 68], [3, 2, 68], [3, 2, 39], [3, 2, 39], [4, 2, 38], [4, 2, 38], [4, 2, 45], [4, 

In [None]:
## 4.1 SGD
def stochasticGradientDescent(x_set_train, y_set_train, x_set_valid, y_set_valid, learning_rate, epochs=300):

  np.random.seed(42) # DISABLE THIS LINE if you don't want to use a seed
  best_list = [] # stores the best loss, learning rate, and theta of each epoch
  loss_list = []
  theta_list = []

  # zip, shuffle the list, then unzip
  for epoch in range(epochs):
    shuffle_train = list(zip(x_set_train, y_set_train))
    np.random.shuffle(shuffle_train)
    x_set_train, y_set_train = list(zip(*shuffle_train))

    # Gradient Descent
    theta = [0, 0, 0, 0]

    for i in range(0,len(x_set_train)):
      dot_prod = (x_set_train[i][0] * theta[0]) + (x_set_train[i][1] * theta[1]) + (x_set_train[i][2] * theta[2]) # x1*theta1 + x2*theta2 + x3*theta3

      gradient = [0,0,0,0] # initialize gradient
      gradient[0] = (posLogRegression(dot_prod + theta[3]) - y_set_train[i][0]) * x_set_train[i][0] # (sigmoid(w*x)-y) * x1
      gradient[1] = (posLogRegression(dot_prod + theta[3]) - y_set_train[i][0]) * x_set_train[i][1] # (sigmoid(w*x)-y) * x2
      gradient[2] = (posLogRegression(dot_prod + theta[3]) - y_set_train[i][0]) * x_set_train[i][2] # (sigmoid(w*x)-y) * x3
      gradient[3] = (posLogRegression(dot_prod + theta[3]) - y_set_train[i][0])

      theta_new = [theta[0]-(learning_rate * gradient[0]),  theta[1]-(learning_rate * gradient[1]),  theta[2]-(learning_rate * gradient[2]), theta[3]-(learning_rate * gradient[3])] # using 0.00001
      theta = theta_new # theta1 = theta0 - (learning rate)(gradient)
    #print("Theta =",theta)

    # Loss (LCE)
    mean_sum = 0
    mean_count = 0

    for i in range(0,len(x_set_valid)):
      mean_sum += LCE(y_set_valid[i][0], x_set_valid[i], theta[slice(3)], theta[-1]) #y, x, weights, slice
      mean_count += 1
    loss_list.append(mean_sum/mean_count)
    theta_list.append(theta)
    #print("Loss =",mean_sum/mean_count)

  # After all the epochs are done
  return( [learning_rate, min(loss_list), theta_list[loss_list.index(min(loss_list))]] )



learning_rate_post = []
loss_post = []
theta_post = []

learning_rates = [0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5]
for lr in learning_rates:
  triple = stochasticGradientDescent(x_set_train, y_set_train, x_set_valid, y_set_valid, lr)
  learning_rate_post.append(triple[0])
  loss_post.append(triple[1])
  theta_post.append(triple[2])
print("Lowest loss =",min(loss_post),"of learning rate",learning_rate_post[np.argmin(loss_post)],"and theta",theta_post[np.argmin(loss_post)])

SGD_theta = theta_post[np.argmin(loss_post)]

  loss = -( (y)*(np.log(sigmoid)) + (1-y)*(np.log(1-sigmoid)) )


Lowest loss = 0.5943424469678538 of learning rate 0.01 and theta [0.2614151827045442, -0.38593558446464193, 0.003963059448084824, -0.06697975593672056]


In [None]:
# 5.1 Test Set with confusion matrix

predict_list = []
actual_list = []

x_set_test = []
y_set_test = []

# prep the test set
for i in range(0,len(emo_features)):
  for j in range(40, len(emo_features[i])):
    x_set_test.append(emo_features[i][j])
    if i == 0:
      y_set_test.append([0])
    elif i == 1:
      y_set_test.append([1])
    else:
      x_set_test.append(emo_features[i][j])
      y_set_test.append([0])
      y_set_test.append([1])
#print(x_set_test,"\n",y_set_test)



for i in range(0, len(x_set_test)):
  actual_list.append(y_set_test[i])
  mod_val_test = posLogRegression( (x_set_test[i][0]*SGD_theta[0])+(x_set_test[i][1]*SGD_theta[1])+(x_set_test[i][2]*SGD_theta[2]) + SGD_theta[3])
  #print(mod_val_test)
  if mod_val_test >= .5:
    predict_list.append(1)
  else:
    predict_list.append(0)


cm = confusion_matrix(actual_list, predict_list)

print("Confusion Matrix: Columns are the predicted values, Rows are the actual values\n Top Left is Sadness, bottom right is Joy")
print(cm)

Confusion Matrix: Columns are the predicted values, Rows are the actual values
 Top Left is Sadness, bottom right is Joy
[[17  8]
 [ 8 17]]


In [None]:
## 5.2 Calculations

TP = cm[1][1]
TN = cm[0][0]
FP = cm[0][1]
FN = cm[1][0]
print("TP =",TP,", TN =",TN,", FP =",FP,", FN =",FN)

# Accuracy = (TP + TN) / (TP + TN + FP + FN)
print("Accuracy =", (TP + TN) / (TP + TN + FP + FN) )
# Precision = TP ÷ (TP + FP)
print("Precision =", TP / (TP + FP) )
# Recall = TP ÷ (TP + FN)
print("Recall =", TP / (TP + FN) )
# F1 Score
category_index = 5
print("F1 =", f1_score(actual_list, predict_list, average='weighted', labels=np.unique(predict_list)))

TP = 17 , TN = 17 , FP = 8 , FN = 8
Accuracy = 0.68
Precision = 0.68
Recall = 0.68
F1 = 0.68


In [None]:
## 5.3

# A better implementation feature would probably be word2vec, but since the data has sentences with mixed classification.
# However, it can still help to get better meaning or context of the sentence, compared to the context of the original word. This gives a more accurate classification overall as a feature,
# and increases overall performance by using low density vectors from aggregate word vectors, and better precision shows a lower loss from validation.
# Other things to keep in mind is not just occurance of words, but their strength. Instead of simply the association of words represented by X1 and X2, having their association multiplied
# by some scale value can help to provide a more accurate classification while not increasing performance or increasing the loss