In [4]:
import torch
import numpy as np
import pandas as pd
import matplotlib as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.optim as opt
import matplotlib.pyplot as plt
from torch import Tensor
from tqdm.notebook import tqdm
from scipy.ndimage import gaussian_filter1d
from sklearn.model_selection import train_test_split
from nltk.corpus import wordnet
import nltk

In [5]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [6]:
# function that returns all synonyms to word using WordNet
def get_synonyms(word):
  synonyms = []

  for syn in wordnet.synsets(word):
      for i in syn.lemmas():
          synonyms.append(i.name())

  return list(set(synonyms))

In [7]:
get_synonyms("person")

['person', 'somebody', 'someone', 'mortal', 'individual', 'soul']

In [8]:
# Make data directory if it doesn't exist
!mkdir -p data
# Downloading NRC-emotion-lexicon
!wget -nc https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/upshot-trump-emolex/data/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt -P data

--2023-03-26 17:37:25--  https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/upshot-trump-emolex/data/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt
Resolving nyc3.digitaloceanspaces.com (nyc3.digitaloceanspaces.com)... 162.243.189.2
Connecting to nyc3.digitaloceanspaces.com (nyc3.digitaloceanspaces.com)|162.243.189.2|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2581050 (2.5M) [text/plain]
Saving to: ‘data/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt’


2023-03-26 17:37:26 (38.3 MB/s) - ‘data/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt’ saved [2581050/2581050]



In [9]:
filepath = "data/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt"
emolex_df = pd.read_csv(filepath,  names=["word", "emotion", "association"], skiprows=45, sep='\t', keep_default_na=False)
emolex_words = emolex_df.pivot(index='word', columns='emotion', values='association').reset_index()
emolex_words.head()

emotion,word,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust
0,aback,0,0,0,0,0,0,0,0,0,0
1,abacus,0,0,0,0,0,0,0,0,0,1
2,abandon,0,0,0,1,0,1,0,1,0,0
3,abandoned,1,0,0,1,0,1,0,1,0,0
4,abandonment,1,0,0,1,0,1,0,1,1,0


In [10]:
emotions = emolex_words.columns[1:].to_numpy()
emotions

array(['anger', 'anticipation', 'disgust', 'fear', 'joy', 'negative',
       'positive', 'sadness', 'surprise', 'trust'], dtype=object)

In [11]:
word_to_emotions = {}

# for each word in NRC-emotion-lexicon dataset 
# and for its synonyms save emotion vector
for _, row in emolex_words.iterrows():
  word = row['word']
  ems = row[emotions].to_numpy()
  syns = get_synonyms(word)

  for syn in syns:
    word_to_emotions[syn] = ems

len(word_to_emotions)

35318

## Preprocessing

In [12]:
from sklearn import preprocessing

df = pd.read_csv('fb_sentiment.csv')
del df[df.columns[0]]
df = df.rename(columns={"FBPost": "text", "Label": "target"})
oneHot = preprocessing.OneHotEncoder()
oneHot.fit(df["target"].to_numpy().reshape(-1, 1))
cols = oneHot.transform(df["target"].to_numpy().reshape(-1, 1)).toarray().T

for i in range(len(oneHot.categories_[0])):
  df[oneHot.categories_[0][i]] = cols[i]

del df["target"]
df.head()

Unnamed: 0,text,N,O,P
0,Drug Runners and a U.S. Senator have somethin...,0.0,1.0,0.0
1,"Heres a single, to add, to Kindle. Just read t...",0.0,1.0,0.0
2,If you tire of Non-Fiction.. Check out http://...,0.0,1.0,0.0
3,Ghost of Round Island is supposedly nonfiction.,0.0,1.0,0.0
4,Why is Barnes and Nobles version of the Kindle...,1.0,0.0,0.0


In [13]:
import re

# preprocessing each fb post
def preprocess(s: str) -> str:
  s = s.lower()

  # removing urls
  url_regex = r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})'
  s = re.sub(url_regex, "__URL__", s)

  # removing links to users
  s = re.sub(r'@[A-Za-z0-9]+', "__AT_USER__", s)

  # replacing emojis with equivalent words
  s = re.sub(r':\)', " cheer ", s)
  s = re.sub(r':-\)', " cheer ", s)
  s = re.sub("<3", " love ", s)
  s = re.sub(":\(", " bad ", s)

  # removing numbers
  s = re.sub(r'\d', '', s)

  return s

df["text"] = df["text"].apply(preprocess)
df.head()

Unnamed: 0,text,N,O,P
0,drug runners and a u.s. senator have somethin...,0.0,1.0,0.0
1,"heres a single, to add, to kindle. just read t...",0.0,1.0,0.0
2,if you tire of non-fiction.. check out __URL__,0.0,1.0,0.0
3,ghost of round island is supposedly nonfiction.,0.0,1.0,0.0
4,why is barnes and nobles version of the kindle...,1.0,0.0,0.0


In [14]:
X_text, y3 = df["text"].to_numpy(), df[["N", "O", "P"]].to_numpy()
X_text.shape, y3.shape

((1000,), (1000, 3))

In [15]:
y1 = []
for i in range(y3.shape[0]):
  y1.append(np.where(y3[i] == 1))
y1 = np.array(y1).reshape((-1))
y1.shape

(1000,)

In [16]:
X_text_train, X_text_test, y_train, y_test = train_test_split(X_text, y1, test_size=0.1)

In [17]:
(y_test == 0).sum(), (y_test == 1).sum(), (y_test == 2).sum()

(13, 26, 61)

## Emotion mining

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# extracting tf-idf features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X_text_train)
X = X.toarray()

X.shape

(900, 2597)

In [20]:
X_test = vectorizer.transform(X_text_test).toarray()
X_test.shape

(100, 2597)

In [21]:
cnt = 0
for word in vectorizer.get_feature_names_out():
  if not (word in word_to_emotions):
      cnt += 1
print("Number of words without emotion:", cnt)

Number of words without emotion: 1155


In [29]:
def get_emotions(X):
  X_emotions = []
  all_words = vectorizer.get_feature_names_out()

  # iterating over posts
  for i in tqdm(range(X.shape[0])):
    
    # total emotion on fb post
    sum_emotions = np.zeros(emotions.shape[0])
    
    for j in range(X.shape[1]):
      word = all_words[j]
      if X[i, j] > 0 and word in word_to_emotions:
          sum_emotions += np.array(word_to_emotions[word].astype(int))
    
    # normilizing
    if sum_emotions.sum() > 0:
      normalized_sum = sum_emotions / np.linalg.norm(sum_emotions)
    else:
      normalized_sum = np.zeros_like(sum_emotions)
    
    X_emotions.append(normalized_sum)

  return np.array(X_emotions)

In [30]:
X_ems_train = get_emotions(X)
X_ems_test = get_emotions(X_test)
X_ems_train.shape, X_ems_test.shape

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

((900, 10), (100, 10))

In [31]:
# unite tf-idf features and emotion vectors
X_conc_train = np.concatenate((X, X_ems_train), axis=1)
X_conc_test = np.concatenate((X_test, X_ems_test), axis=1)

X_conc_train.shape, X_conc_test.shape

((900, 2607), (100, 2607))

In [59]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.metrics import fbeta_score, make_scorer, accuracy_score

clf = SVC(C=10, class_weight='balanced')
clf.fit(X_conc_train, y_train)

print("f1 score:", f1_score(clf.predict(X_conc_test), y_test, average='macro'))
print("accuracy_score", accuracy_score(clf.predict(X_conc_test), y_test))

f1 score: 0.6626756066411239
accuracy_score 0.8


In [48]:
# Analysing errors
cnt = 0
error_types = np.zeros((3, 3))
for i in range(X_conc_test.shape[0]):
  pred = clf.predict(X_conc_test[i].reshape(1, -1))
  error_types[int(y_test[i]), int(pred[0])] += 1
  cnt += y_test[i] != pred[0]
print("Errors: ", cnt)
for i in range(3):
  for j in range(3):
    print("True value:", i, "Predicted:", j, "Number of samples:", error_types[i, j])

Errors:  20
True value: 0 Predicted: 0 Number of samples: 3.0
True value: 0 Predicted: 1 Number of samples: 6.0
True value: 0 Predicted: 2 Number of samples: 4.0
True value: 1 Predicted: 0 Number of samples: 0.0
True value: 1 Predicted: 1 Number of samples: 21.0
True value: 1 Predicted: 2 Number of samples: 5.0
True value: 2 Predicted: 0 Number of samples: 0.0
True value: 2 Predicted: 1 Number of samples: 5.0
True value: 2 Predicted: 2 Number of samples: 56.0
