In [1]:
import numpy as np
import nltk
nltk.download('punkt')
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\a_lolooh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Setup 

#### this class is purely for organisation and ease of access, instead of messy nested array access , we just create a posts object with attributes ( the post itself, its emotion, its sentiment )
                

In [2]:
from nltk.tokenize import word_tokenize as tokenize
class Posts : 
    def __init__(self, post, emotion, sentiment):
        self.post = post
        self.emotion = emotion
        self.sentiment = sentiment
        self.tokens = list(tokenize(post))
        
    def print(self):
        print(f"post: {self.post} emotion: {self.emotion} sentiment: {self.sentiment}")
        
        

## Loading file 

In [3]:
import json
import gzip

posts = []

# the file given to us is a gzip which has a json inside it, so we need to unzip first and then load the json file 

with gzip.open('goemotions.json.gz', 'r') as f: # unzipping
    data = json.loads(f.read(), encoding="utf-8") # loading json
    for line in data:
        posts.append(Posts(line[0],line[1],line[2])) #creating the object and appending to the list 
        
# basically posts is a list of objects where each object has its info as attributes ( see above )

# PART 3 Word2Vec

In [4]:
import gensim
import gensim.downloader as api
import nltk 
import joblib
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\a_lolooh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## 3.1 Downloading Model

In [5]:
model = api.load('word2vec-google-news-300')

## 3.2 - 3.3 Tokenization and Embeddings
#### Tokens were implicitly generated during object instantiation

In [6]:
embeddings = []
hit = 0
miss = 0 

for i, post in enumerate(posts):
    A = [] 
    ll = post.tokens
    for token in ll:
        hit += 1
        try:
            A.append(model[token])
        except KeyError:
            miss+=1
            continue
    if not A: # if no word2vec fill in zeroes 
        post.embedding = np.zeros(300, dtype='int')
        continue
    np_arr = np.vstack(A)
    avg = np.average(np_arr, axis=0)
    post.embedding = np.copy(avg)


In [7]:
print (f"Number of tokens is : {hit}")

Number of tokens is : 2642139


### Train/Test Split

In [9]:
from sklearn.model_selection import train_test_split
x = np.array([x.embedding for x in posts])
y_emotion = np.array([x.emotion for x in posts])
y_sentiment = np.array([x.sentiment for x in posts])

trainX, testX, trainY_emotion,  testY_emotion = train_test_split(x, y_emotion, test_size=0.2, random_state = 42) 
trainX, testX, trainY_sent,  testY_sent = train_test_split(x, y_sentiment, test_size=0.2, random_state = 42) 


## 3.4 Computing Hit Rate

In [16]:
trainxHit, testxHit, trainYHit,  testYHit = train_test_split(np.array([x.post for x in posts]), y_emotion, test_size=0.2, random_state = 42) 


In [17]:
hit_train = 0
miss_train = 0 

for post in trainXHit:
    for token in tokenize(post):
        dummyA = []
        hit_train += 1
        try:
            dummyA.append(model[token])
        except KeyError:
            miss_train+=1
            continue

In [18]:
hit_test = 0
miss_test = 0 

for post in testxHit:
    for token in tokenize(post):
        dummyA = []
        hit_test += 1
        try:
            dummyA.append(model[token])
        except KeyError:
            miss_test+=1
            continue

In [19]:
print(f"Hit rate for Training Set: {((hit_train-miss_train)/hit_train)} ")
print(f"Hit rate for Test Set: {((hit_test-miss_test)/hit_test)} ")
print(f"Overall Hit rate: {((hit-miss)/hit)} ")



Hit rate for Training Set: 1.0 
Hit rate for Test Set: 0.7741150983877424 
Overall Hit rate: 0.7745073215300179 


## 3.5 BASE MLP

### Emotion

In [20]:
from sklearn.neural_network import MLPClassifier
from sklearn import metrics

mlp_emotion = MLPClassifier(random_state=2, max_iter=20).fit(trainX, trainY_emotion)
y_pred_base_emotion = mlp_emotion.predict(testX)

In [21]:
mlp_emotion_score = mlp_emotion.score(testX,testY_emotion)
mlp_emotion_score 

0.41851938074729367

### Sentiment

In [22]:
mlp_sent = MLPClassifier(random_state=2, max_iter=20).fit(trainX, trainY_sent)
y_pred_base_sent = mlp_sent.predict(testX)

In [24]:
mlp_sent_score = mlp_sent.score(testX,testY_sent)
mlp_sent_score 

0.5504306832731929

## 3.6 TOP MLP

### Emotion

In [25]:
mlp_gs = MLPClassifier(max_iter=2)
parameter_space = {
    'solver': ["adam", "sgd"],
    'hidden_layer_sizes' : [(10,5),(15,10)],
    'activation' : ["relu", "tanh", "identity"]
}
from sklearn.model_selection import GridSearchCV
clf_emotion = GridSearchCV(mlp_gs, parameter_space, n_jobs=-1, cv=5, verbose = 2)
clf_emotion.fit(trainX, trainY_emotion) # X is train samples and y is the corresponding labels

Fitting 5 folds for each of 12 candidates, totalling 60 fits


GridSearchCV(cv=5, estimator=MLPClassifier(max_iter=2), n_jobs=-1,
             param_grid={'activation': ['relu', 'tanh', 'identity'],
                         'hidden_layer_sizes': [(10, 5), (15, 10)],
                         'solver': ['adam', 'sgd']},
             verbose=2)

In [26]:
print('Best parameters found:\n', clf_emotion.best_params_)
y_pred_top_emotion = clf_emotion.predict(testX)

Best parameters found:
 {'activation': 'tanh', 'hidden_layer_sizes': (15, 10), 'solver': 'adam'}


### Sentiment

In [27]:
clf_sent = GridSearchCV(mlp_gs, parameter_space, n_jobs=-1, cv=5, verbose = 2)
top_sent = clf_sent.fit(trainX, trainY_sent) # X is train samples and y is the corresponding labels

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [28]:
print('Best parameters found:\n', clf_sent.best_params_)
y_pred_top_sent = clf_sent.predict(testX)

Best parameters found:
 {'activation': 'relu', 'hidden_layer_sizes': (15, 10), 'solver': 'adam'}


## 3.6 Performance

In [29]:
from sklearn.metrics import confusion_matrix , ConfusionMatrixDisplay
from sklearn.metrics import classification_report

In [30]:
def write_to_file(conf,clas, file, title): # expects conf. matrix, classification report, file path and title as 
    with open(file, 'a') as f: # "a" appends text to file
        f.write(f"{title}\n\nConfusion Matrix: \n{conf}\n\nCLassification Report: \n{clas}\n\n\n")
    return None

#### BASE MLP

In [31]:


base_matrix_emotion = confusion_matrix(testY_emotion,y_pred_base_emotion)
base_clas_emotion = classification_report(testY_emotion, y_pred_base_emotion)

write_to_file(base_matrix_emotion,base_clas_emotion, 'performance.txt','WORD2VEC BASE MLP - EMOTION')

base_matrix_sent = confusion_matrix(testY_sent,y_pred_base_sent)
base_clas_sent = classification_report(testY_sent, y_pred_base_sent)

write_to_file(base_matrix_sent,base_clas_sent, 'performance.txt','WORD2VEC BASE MLP - SENTIMENT')

#### TOP MLP

In [32]:

top_matrix_emotion = confusion_matrix(testY_emotion,y_pred_top_emotion)
top_clas_emotion = classification_report(testY_emotion, y_pred_top_emotion)

write_to_file(top_matrix_emotion,top_clas_emotion, 'performance.txt','WORD2VEC TOP MLP - EMOTION')

top_matrix_sent = confusion_matrix(testY_sent,y_pred_top_sent)
top_clas_sent = classification_report(testY_sent, y_pred_top_sent)

write_to_file(base_matrix_sent,base_clas_sent, 'performance.txt','WORD2VEC TOP MLP - SENTIMENT')

## Model 2

In [33]:
model2 = api.load("glove-wiki-gigaword-50")

In [34]:
hit2 = 0
miss2 = 0 

for i, post in enumerate(posts):
    A = []
    ll = post.tokens
    for token in ll:
        hit2 += 1
        try:
            A.append(model2[token])
        except KeyError:
            miss2+=1
            continue
    if not A: # if no word2vec fill in zeroes 
        post.embedding2 = np.zeros(50, dtype='int')
        continue
    np_arr = np.vstack(A)
    avg = np.average(np_arr, axis=0)
    post.embedding2 = np.copy(avg)


In [35]:
x2 = np.array([x.embedding2 for x in posts])


trainX2, testX2, trainY2_emotion,  testY2_emotion = train_test_split(x2, y_emotion, test_size=0.2, random_state = 42) 
trainX2, testX2, trainY2_sent,  testY2_sent = train_test_split(x2, y_sentiment, test_size=0.2, random_state = 42) 

#### Emotion

In [36]:


mlp_emotion2 = MLPClassifier(random_state=2, max_iter=20)
mlp_emotion2.fit(trainX2, trainY2_emotion)
y_pred_base_emotion2 = mlp_emotion2.predict(testX2)

In [38]:
y_pred_base_emotion2 = mlp_emotion2.predict(testX2)
mlp_emotion_score2 = mlp_emotion2.score(testX2,testY2_emotion)
mlp_emotion_score2 

0.35167617273891283

#### Sentiment

In [39]:
mlp_sent2 = MLPClassifier(random_state=2, max_iter=20).fit(trainX2, trainY2_sent)


y_pred_base_sent2 = mlp_sent2.predict(testX2)

In [40]:
mlp_sent_score2 = mlp_sent2.score(testX2,testY2_sent)
mlp_sent_score2 

0.46592364101967176

#### Performance

In [41]:
base_matrix_emotion2 = confusion_matrix(testY2_emotion,y_pred_base_emotion2)
base_clas_emotion2 = classification_report(testY_emotion, y_pred_base_emotion2)

write_to_file(base_matrix_emotion2,base_clas_emotion2, 'performance.txt','WORD2VEC MODEL 2 BASE MLP - EMOTION')

base_matrix_sent2 = confusion_matrix(testY2_sent,y_pred_base_sent2)
base_clas_sent2 = classification_report(testY2_sent, y_pred_base_sent2)

write_to_file(base_matrix_sent2,base_clas_sent2, 'performance.txt','WORD2VEC MODEL 2 BASE MLP - SENTIMENT')

## Model 3

In [None]:
model3 = api.load("fasttext-wiki-news-subwords-300")

In [None]:
hit3 = 0
miss3 = 0 

for i, post in enumerate(posts):
    A = []
    ll = post.tokens
    for token in ll:
        hit3 += 1
        try:
            A.append(model3[token])
        except KeyError:
            miss3+=1
            continue
    if not A: # if no word2vec fill in zeroes 
        post.embedding3 = np.zeros(300, dtype='int')
        continue
    np_arr = np.vstack(A)
    avg = np.average(np_arr, axis=0)
    post.embedding3 = np.copy(avg)


In [None]:
x3 = np.array([x.embedding3 for x in posts])


trainX3, testX3, trainY3_emotion,  testY3_emotion = train_test_split(x3, y_emotion, test_size=0.2, random_state = 42) 
trainX3, testX3, trainY3_sent,  testY3_sent = train_test_split(x3, y_sentiment, test_size=0.2, random_state = 42) 



#### Emotion

In [None]:


mlp_emotion3 = MLPClassifier(random_state=2, max_iter=20)
mlp_emotion3.fit(trainX3, trainY3_emotion)

In [None]:
y_pred_base_emotion3 = mlp_emotion3.predict(testX3)
mlp_emotion_score3 = mlp_emotion3.score(testX3,testY3_emotion)
mlp_emotion_score3 

#### Sentiment 

In [None]:


mlp_sent3 = MLPClassifier(random_state=2, max_iter=20)
mlp_sent3.fit(trainX3, trainY3_sent)


In [None]:
y_pred_base_sent3 = mlp_sent3.predict(testX3)
mlp_sent_score3 = mlp_sent3.score(testX3,testY3_sent)
mlp_sent_score3 

#### Performance

In [None]:
base_matrix_emotion3 = confusion_matrix(testY3_emotion,y_pred_base_emotion3)
base_clas_emotion3 = classification_report(testY_emotion, y_pred_base_emotion3)

write_to_file(base_matrix_emotion3,base_clas_emotion3, 'performance.txt','WORD2VEC MODEL 3 BASE MLP - EMOTION')

base_matrix_sent3 = confusion_matrix(testY3_sent,y_pred_base_sent3)
base_clas_sent3 = classification_report(testY3_sent, y_pred_base_sent3)

write_to_file(base_matrix_sent3,base_clas_sent3, 'performance.txt','WORD2VEC MODEL 3 BASE MLP - SENTIMENT')