Packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import nltk
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [2]:
#fetch the NLTK resources referenced by this notebook (nltk itself is imported in the imports cell)
#NOTE(review): the sentence embedder below tokenizes with a regular expression and no stopword
#filtering is visible in this notebook, so these downloads may be unnecessary - confirm before removing
#separating words
nltk.download('punkt_tab')
#removing insignificant words
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\brian\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\brian\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Data Cleaning

In [3]:
#read the raw emotions dataset from the csv file and preview its first five rows
emotions_data = pd.read_csv(
    "emotions.csv",
    encoding="UTF-8",
)
emotions_data.head()

Unnamed: 0,text,label
0,i just feel really helpless and heavy hearted,4
1,ive enjoyed being able to slouch about relax a...,0
2,i gave up my internship with the dmrg and am f...,4
3,i dont know i feel so lost,0
4,i am a kindergarten teacher and i am thoroughl...,4


In [4]:
#dimensions of the emotion data as (number of rows, number of columns)
emotions_data.shape

(416809, 2)

In [5]:
#count missing entries per column (all zeros means nothing needs to be dropped or imputed)
emotions_data.isna().sum()

text     0
label    0
dtype: int64

In [6]:
#number of rows per class in the 'label' column, to see how imbalanced the classes are
emotions_data.loc[:, 'label'].value_counts()

label
1    141067
0    121187
3     57317
4     47712
2     34554
5     14972
Name: count, dtype: int64

In [7]:
#downsample: shrink every class to the size of the rarest class so the labels are balanced
min_count = emotions_data['label'].value_counts().min()

#list 'label' first and select it explicitly so it is kept as a regular column in each group
selected_cols = ['label']
selected_cols += [col for col in emotions_data.columns if col != 'label']

def _sample_group(group):
    """Draw min_count rows from one label group with a fixed seed."""
    return group.sample(min_count, random_state=18)

emotions_data = (
    emotions_data
    .groupby('label')[selected_cols]
    .apply(_sample_group)
    .reset_index(drop=True)
)
emotions_data['label'].value_counts()

label
0    14972
1    14972
2    14972
3    14972
4    14972
5    14972
Name: count, dtype: int64

In [8]:
#preview the first ten rows of the balanced dataset
emotions_data.head(n=10)

Unnamed: 0,label,text
0,0,i always feel like abby is being punished for ...
1,0,i used to feel devastated when someone critici...
2,0,i feel like i ve lost someone utterly irreplac...
3,0,i go and feel morose about this
4,0,i had been feeling out my more submissive side...
5,0,i lacked just reinforced my feelings of guilt ...
6,0,i was for the first time since i came over her...
7,0,i had so many peo ple to be grate ful for so w...
8,0,i love the sunset because after a tiring dayti...
9,0,im ashamed to admit it but i do have that comp...


Word2Vec Word Embedding

In [9]:
#load the pretrained Word2Vec vectors (binary word2vec format) used to vectorize sentences
path = "GoogleNews-vectors-negative300.bin.gz"
model = KeyedVectors.load_word2vec_format(fname=path, binary=True)
#number of distinct keys (tokens) the pretrained model knows
vocab_size = len(model.index_to_key)

In [10]:
#tokenizer for the mean-pooling embedder: every run of word characters becomes one token
tokenizer = nltk.tokenize.RegexpTokenizer(pattern=r'\w+')
def embedSentence_meanPooling(row, keyed_vectors=None, word_tokenizer=None):
    """Embed a sentence as the mean of the vectors of its known tokens.

    Parameters
    ----------
    row : str
        Sentence to embed.
    keyed_vectors : optional
        Embedding lookup supporting ``token in obj``, ``obj.get_vector(token)``
        and ``obj.vector_size``. Defaults to the notebook-level ``model``.
    word_tokenizer : optional
        Object exposing ``tokenize(text)``. Defaults to the notebook-level
        ``tokenizer``.

    Returns
    -------
    numpy.ndarray
        float64 vector of length ``keyed_vectors.vector_size``. Tokens missing
        from the embedding lookup are skipped; if no token is found the result
        is all zeros.
    """
    #fall back to the notebook globals so existing single-argument calls keep working
    if keyed_vectors is None:
        keyed_vectors = model
    if word_tokenizer is None:
        word_tokenizer = tokenizer
    #separate the sentence into word tokens
    tokens = word_tokenizer.tokenize(row)
    #float accumulator sized from the embedding itself rather than a hard-coded 300
    vector = np.zeros(keyed_vectors.vector_size, dtype=np.float64)
    count = 0
    for token in tokens:
        if token in keyed_vectors: #only adds tokens present in the model
            vector += keyed_vectors.get_vector(token)
            count += 1
    #max(1, count) avoids a division by zero when no token was recognised
    return vector / max(1, count)

In [11]:
#replace every sentence in the "text" column with its mean-pooled embedding vector
#NOTE: the raw text is overwritten, so re-running this cell would pass vectors (not strings) to the embedder
sentence_vectors = emotions_data["text"].apply(embedSentence_meanPooling)
emotions_data["text"] = sentence_vectors

In [12]:
#model input (sentence embeddings) and target (emotion label) as numpy arrays
X = emotions_data["text"].to_numpy()
y = emotions_data["label"].to_numpy()

In [13]:
#stack the per-sentence vectors into one 2-D float32 array (one row per sentence) to feed into tensorflow
X = np.stack(X, axis=0).astype(np.float32)
X.shape

(89832, 300)

In [14]:
#split X, y into training (60%), validation (24%) and test (16%) sets:
#first hold out 40% of the rows, then divide that hold-out 60/40 into validation/test
split_options = dict(test_size=0.4, random_state=0)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, **split_options)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, **split_options)

In [15]:
#builds tensorflow model
#seed the Python, NumPy and TensorFlow random generators so that weight initialisation
#is repeatable across kernel restarts (no seed was set for tensorflow before)
tf.keras.utils.set_random_seed(0)
#small feed-forward classifier: one 12-unit ReLU hidden layer followed by a
#6-way softmax output, one unit per emotion label (0-5)
tf_model = tf.keras.Sequential([
                                tf.keras.layers.Dense(12, activation='relu'),
                                tf.keras.layers.Dense(6, activation='softmax')
                                ])

In [16]:
#configure training: Adam optimizer, sparse categorical cross-entropy (labels are integers), accuracy metric
adam = tf.keras.optimizers.Adam(learning_rate=0.001)
tf_model.compile(
    optimizer=adam,
    loss='sparse_categorical_crossentropy',
    metrics=["accuracy"],
)

In [20]:
#report loss and accuracy on the training split
#NOTE(review): this cell sits before the fit cell but was executed after it (In [20] vs In [18]);
#on a fresh top-to-bottom run it would score an untrained model. It also measures the training
#data, while X_test / y_test are never used in this notebook - confirm which split was intended
tf_model.evaluate(x=X_train, y=y_train)

[1m1685/1685[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 682us/step - accuracy: 0.7075 - loss: 0.8036


[0.8056967854499817, 0.707805335521698]

In [18]:
#train for 10 epochs in mini-batches of 16, reporting validation loss/accuracy after every epoch
tf_model.fit(
    x=X_train,
    y=y_train,
    batch_size=16,
    epochs=10,
    validation_data=(X_valid, y_valid),
)

Epoch 1/10
[1m3369/3369[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.4218 - loss: 1.5228 - val_accuracy: 0.6003 - val_loss: 1.0666
Epoch 2/10
[1m3369/3369[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.6402 - loss: 1.0075 - val_accuracy: 0.6616 - val_loss: 0.9418
Epoch 3/10
[1m3369/3369[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.6671 - loss: 0.9222 - val_accuracy: 0.6668 - val_loss: 0.9059
Epoch 4/10
[1m3369/3369[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.6787 - loss: 0.8827 - val_accuracy: 0.6776 - val_loss: 0.8793
Epoch 5/10
[1m3369/3369[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.6843 - loss: 0.8686 - val_accuracy: 0.6847 - val_loss: 0.8646
Epoch 6/10
[1m3369/3369[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.6952 - loss: 0.8434 - val_accuracy: 0.6799 - val_loss: 0.8673
Epoch 7/10
[1m3

<keras.src.callbacks.history.History at 0x1856debbec0>

In [21]:
#test on one sample sentence: sadness (0), joy (1), love (2), anger (3), fear (4), and surprise (5)
sample_vector = embedSentence_meanPooling("god i would happy dying right now")
#the network expects a batch, so add a leading batch dimension of size one
sample_sentence = sample_vector.astype(np.float32)[np.newaxis, :]
class_probabilities = tf_model.predict(sample_sentence)
#index of the most probable class for each row of the batch
predicted = class_probabilities.argmax(axis=1)
predicted

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step


array([1], dtype=int64)