Packages

In [82]:
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt # didn't use, can be deleted
import tensorflow as tf
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [83]:
import nltk
# Tokenizer data ("punkt_tab") for separating text into words.
# NOTE(review): the pooling helpers in this notebook tokenize with
# nltk.RegexpTokenizer, which presumably does not rely on punkt_tab —
# confirm whether this download is still needed.
nltk.download('punkt_tab')
# English stop-word corpus, used to drop insignificant words before embedding.
nltk.download('stopwords')
# Held as a set; the pooling helpers test every token for membership in it.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\brian\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\brian\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Data Cleaning

In [84]:
# Load the labelled emotion dataset from emotions.csv (path is relative to the
# notebook's working directory) and preview the first five rows.
emotions_data = pd.read_csv("emotions.csv", encoding = "UTF-8")
emotions_data.head(5)

Unnamed: 0,text,label
0,i just feel really helpless and heavy hearted,4
1,ive enjoyed being able to slouch about relax a...,0
2,i gave up my internship with the dmrg and am f...,4
3,i dont know i feel so lost,0
4,i am a kindergarten teacher and i am thoroughl...,4


In [85]:
# (rows, columns) of the loaded emotion dataset
emotions_data.shape

(416809, 2)

In [86]:
# Count missing values per column; non-zero counts would need cleaning first
emotions_data.isnull().sum()

text     0
label    0
dtype: int64

In [87]:
# Class balance: number of rows for each value of the 'label' column
emotions_data['label'].value_counts()

label
1    141067
0    121187
3     57317
4     47712
2     34554
5     14972
Name: count, dtype: int64

In [88]:
# Downsample every class to the size of the rarest one so that all labels end
# up with the same number of rows.
min_count = emotions_data['label'].value_counts().min()
# 'label' first, then every other column. The grouping column is listed
# explicitly, presumably so that it stays in the sampled output — verify
# against the pandas version in use.
selected_cols = ['label'] + [col for col in emotions_data.columns if col != 'label']
# Draw min_count rows from each label group with a fixed seed (the same seed is
# applied to every group), then drop the index produced by groupby/apply.
emotions_data = emotions_data.groupby('label')[selected_cols].apply(lambda x: x.sample(min_count, random_state=18)).reset_index(drop=True)
# Sanity check: every label should now have exactly min_count rows.
emotions_data['label'].value_counts()

label
0    14972
1    14972
2    14972
3    14972
4    14972
5    14972
Name: count, dtype: int64

In [89]:
# Preview the first ten rows of the downsampled frame
emotions_data.head(10)

Unnamed: 0,label,text
0,0,i always feel like abby is being punished for ...
1,0,i used to feel devastated when someone critici...
2,0,i feel like i ve lost someone utterly irreplac...
3,0,i go and feel morose about this
4,0,i had been feeling out my more submissive side...
5,0,i lacked just reinforced my feelings of guilt ...
6,0,i was for the first time since i came over her...
7,0,i had so many peo ple to be grate ful for so w...
8,0,i love the sunset because after a tiring dayti...
9,0,im ashamed to admit it but i do have that comp...


Word2Vec Word Embedding

In [9]:
# Load the pretrained Word2Vec vectors used to embed sentences.
# The file name suggests the 300-dimensional GoogleNews vectors; the path is
# relative to the notebook's working directory.
path = "GoogleNews-vectors-negative300.bin.gz"
model = KeyedVectors.load_word2vec_format(path, binary=True)
# Number of entries in the loaded vocabulary.
# NOTE(review): vocab_size is not referenced by any later cell shown here —
# confirm it is still needed.
vocab_size = len(model.index_to_key)

In [90]:
# Stop-word bank: common English function words followed by spoken fillers.
# Written as one whitespace-separated string and split into a list of words.
# NOTE(review): the pooling helpers in this notebook filter with `stop_words`,
# not with this list — confirm whether it is still used anywhere.
meaningless_bank = """
    i me my myself we our ours ourselves you your yours yourself
    yourselves he him his himself she her hers herself it its itself
    they them their theirs themselves what which who whom this that
    these those am is are was were be been being have has had
    having do does did doing a an the and but if or because
    as until while of at by for with about against between into
    through during before after above below to from up down in out
    on off over under again further then once here there when where
    why how all any both each few more most other some such no
    nor not only own same so than too very s t can will just
    don should now uh um well like okay yeah oh ah er hm hmm
""".split()

In [91]:
# Tokenizer used by the mean-pooling helper: the r'\w+' pattern keeps runs of
# word characters, so punctuation and whitespace never become tokens.
tokenizer = nltk.RegexpTokenizer(r'\w+')
def embedSentence_meanPooling(row):
    """Embed a sentence as the average of its word vectors.

    The sentence is split with the module-level ``tokenizer``. Tokens that are
    in ``stop_words`` or missing from the Word2Vec ``model`` are ignored.

    Parameters
    ----------
    row : str
        Raw sentence text.

    Returns
    -------
    numpy.ndarray
        1-D float64 array of length ``model.vector_size``. All zeros when no
        token survives the filtering.
    """
    # Size the accumulator from the loaded model rather than assuming 300
    # dimensions, so a different embedding file works without edits.
    total = np.zeros(model.vector_size, dtype=np.float64)
    kept = 0
    for token in tokenizer.tokenize(row):
        # Skip stop words and out-of-vocabulary tokens.
        if token in stop_words or token not in model:
            continue
        total += model.get_vector(token)
        kept += 1
    # max(1, kept) avoids dividing by zero for sentences with no usable token.
    return total / max(1, kept)

In [92]:
# Tokenizer used by the max-pooling helper; same r'\w+' pattern as the one
# created for mean pooling (this assignment rebinds the same global name).
tokenizer = nltk.RegexpTokenizer(r'\w+')
def embedSentence_maxPooling(row):
    """Embed a sentence as the element-wise maximum of its word vectors.

    The sentence is split with the module-level ``tokenizer``. Tokens that are
    in ``stop_words`` or missing from the Word2Vec ``model`` are ignored.

    Parameters
    ----------
    row : str
        Raw sentence text.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.vector_size`` holding, per feature, the
        largest value seen across the kept tokens. When no token survives the
        filtering a float32 zero vector is returned, matching what
        ``embedSentence_meanPooling`` yields for the same input, so the result
        can always be stacked into a numeric matrix.
    """
    pooled = None
    for token in tokenizer.tokenize(row):
        # Skip stop words and out-of-vocabulary tokens.
        if token in stop_words or token not in model:
            continue
        word_vec = model.get_vector(token)
        if pooled is None:
            # Copy so the in-place maximum below never writes into the
            # model's own storage.
            pooled = word_vec.copy()
        else:
            # Vectorised per-feature maximum.
            np.maximum(pooled, word_vec, out=pooled)
    if pooled is None:
        return np.zeros(model.vector_size, dtype=np.float32)
    return pooled

In [93]:
# Replace each sentence in the "text" column with its mean-pooled Word2Vec vector.
# The column is overwritten in place, so embed only while it still holds raw
# strings. This keeps the cell safe to re-run: a second pass would otherwise
# hand ndarrays to the tokenizer and fail.
if emotions_data["text"].map(lambda value: isinstance(value, str)).all():
    emotions_data["text"] = emotions_data["text"].apply(embedSentence_meanPooling)
emotions_data.head(5)

Unnamed: 0,label,text
0,0,"[0.03204345703125, 0.037322998046875, 0.078208..."
1,0,"[0.059901646205357144, 0.04827880859375, 0.070..."
2,0,"[0.08094889322916667, 0.021000162760416666, 0...."
3,0,"[0.083984375, 0.06870524088541667, -0.04882812..."
4,0,"[-0.015956333705357144, 0.076171875, -0.036080..."


In [94]:
# Model inputs and targets.
# X: the "text" column as an array of per-sentence vectors (assumes the
#    embedding cell has already replaced the raw strings).
# y: the integer emotion labels.
X = emotions_data["text"].values
y = emotions_data["label"].values

In [101]:
# Stack the per-row vectors into one 2-D float32 matrix of shape
# (n_samples, embedding_dim) that can be fed to TensorFlow.
X = np.stack(X).astype(np.float32)
X.shape

(89832, 300)

In [121]:
# Two-stage split into train / validation / test sets (fixed seed for reproducibility).
# Stage 1 holds out 40% of the rows as (X_temp, y_temp); the other 60% are the training set.
# Stage 2 splits that hold-out 60/40, so overall the data is roughly
# 60% train, 24% validation and 16% test.
# NOTE(review): if an even validation/test split was intended, stage 2 would
# need test_size=0.5 — confirm.
# NOTE(review): no `stratify=` is passed, so class counts inside each split
# may drift from the balanced totals — confirm this is acceptable.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=0)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.4, random_state=0)

In [122]:
# Small feed-forward classifier: one 12-unit ReLU hidden layer feeding a
# 6-unit softmax output (one unit per emotion label).
tf_model = tf.keras.Sequential()
tf_model.add(tf.keras.layers.Dense(12, activation='relu'))
# Regularisation layers that were tried at this position and left disabled:
#   tf.keras.layers.Dropout(0.15)         # randomly drops activations to curb overfitting
#   tf.keras.layers.BatchNormalization()  # normalises the previous layer's outputs for more stable training
tf_model.add(tf.keras.layers.Dense(6, activation='softmax'))

In [123]:
# Configure training: Adam optimizer with learning rate 0.001, sparse
# categorical cross-entropy loss (targets are integer class ids rather than
# one-hot vectors), and accuracy as the reported metric.
tf_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                loss='sparse_categorical_crossentropy',
                metrics=["accuracy"])

In [124]:
# Baseline: evaluate the network on the training split before any fitting.
# With 6 balanced classes, chance level is accuracy ~ 1/6 (0.167) and
# loss ~ ln(6) (1.79), which is what an untrained model should report.
tf_model.evaluate(X_train, y_train)
# tf_model.evaluate(X, y) # using the original data is fine

[1m1685/1685[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 684us/step - accuracy: 0.1730 - loss: 1.7953


[1.79551100730896, 0.17373235523700714]

In [125]:
# Train for 10 epochs with mini-batches of 16 samples, reporting loss and
# accuracy on the validation split after every epoch.
tf_model.fit(X_train, y_train, batch_size=16, epochs = 10, validation_data=(X_valid, y_valid))

Epoch 1/10
[1m3369/3369[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.5143 - loss: 1.3509 - val_accuracy: 0.6770 - val_loss: 0.9062
Epoch 2/10
[1m3369/3369[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.6846 - loss: 0.8783 - val_accuracy: 0.6934 - val_loss: 0.8437
Epoch 3/10
[1m3369/3369[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.7057 - loss: 0.8229 - val_accuracy: 0.7048 - val_loss: 0.8155
Epoch 4/10
[1m3369/3369[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.7100 - loss: 0.8038 - val_accuracy: 0.7062 - val_loss: 0.8002
Epoch 5/10
[1m3369/3369[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.7116 - loss: 0.7904 - val_accuracy: 0.7106 - val_loss: 0.7956
Epoch 6/10
[1m3369/3369[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.7192 - loss: 0.7707 - val_accuracy: 0.7097 - val_loss: 0.7834
Epoch 7/10
[1m3

<keras.src.callbacks.history.History at 0x19c967e2ab0>

In [131]:
# Try the classifier on a single hand-written sentence.
# Label ids: sadness (0), joy (1), love (2), anger (3), fear (4), surprise (5)
sample_sentence = np.asarray(embedSentence_meanPooling("easy lemons"), dtype=np.float32)
# Add a leading batch axis: predict() expects (n_samples, embedding_dim).
sample_sentence = sample_sentence[np.newaxis, :]
# Index of the highest-probability class for each row of the batch.
predicted = tf_model.predict(sample_sentence).argmax(axis=1)
predicted

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step


array([0], dtype=int64)

In [127]:
# Final check: loss and accuracy on the held-out test split, which was used
# for neither fitting nor validation.
tf_model.evaluate(X_test, y_test)

[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 687us/step - accuracy: 0.7197 - loss: 0.7575


[0.7539156675338745, 0.7197022438049316]