Packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [2]:
import nltk
# Tokenizer data used for separating text into words
nltk.download('punkt_tab')
# Stopword lists used for removing insignificant words
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\brian\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\brian\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

Data Cleaning

In [45]:
# Load the emotions dataset from CSV (read as UTF-8) and preview the first 5 rows
emotions_data = pd.read_csv("emotions.csv", encoding = "UTF-8")
emotions_data.head(5)

Unnamed: 0,text,label
0,i just feel really helpless and heavy hearted,4
1,ive enjoyed being able to slouch about relax a...,0
2,i gave up my internship with the dmrg and am f...,4
3,i dont know i feel so lost,0
4,i am a kindergarten teacher and i am thoroughl...,4


In [14]:
# Shape of the emotions data as (rows, columns)
emotions_data.shape

(416809, 2)

In [15]:
# Count missing values in each column of the dataset
emotions_data.isnull().sum()

text     0
label    0
dtype: int64

In [36]:
# Class distribution: number of rows for each value of the 'label' column
emotions_data['label'].value_counts()

label
1    141067
0    121187
3     57317
4     47712
2     34554
5     14972
Name: count, dtype: int64

In [48]:
# Downsample: balance the classes by drawing, from every label, exactly as
# many rows as the rarest label has.
min_count = emotions_data['label'].value_counts().min()
# 'label' first, then every other column in its existing order
selected_cols = ['label'] + [col for col in emotions_data.columns if col != 'label']
balanced_parts = []
for label_value, label_rows in emotions_data.groupby('label'):
    # Each label is sampled with the same fixed seed
    balanced_parts.append(
        label_rows[selected_cols].sample(min_count, random_state=18)
    )
# Stack the per-label samples (in label order) and renumber the rows from 0
emotions_data = pd.concat(balanced_parts, ignore_index=True)
emotions_data['label'].value_counts()

label
0    14972
1    14972
2    14972
3    14972
4    14972
5    14972
Name: count, dtype: int64

In [49]:
# Preview the first 10 rows of the current emotions data
emotions_data.head(10)

Unnamed: 0,label,text
0,0,i feel like the saddle got in the way or inhib...
1,0,i feel failure and discontent when they are no...
2,0,i am sticky to you sending me cutesy futsy mes...
3,0,i only feel its aching and sore
4,0,i feel that sometimes my lessons are too borin...
5,0,i feel kind of dull these days
6,0,i loved him with all my heart and i feel empty...
7,0,i feel like we broke up but skipping the actua...
8,0,i already feel ugly here on the mission and i ...
9,0,i am feeling rather stressed out fed up and ov...


Word2Vec Word Embedding

In [50]:
# Load the pretrained Word2Vec vectors (binary word2vec format) that are used
# to vectorize sentences
path = "GoogleNews-vectors-negative300.bin.gz"
model = KeyedVectors.load_word2vec_format(path, binary=True)
# Number of words in the pretrained vocabulary
vocab_size = len(model.index_to_key)

In [51]:
# Tokenizer for mean pooling: the pattern \w+ keeps runs of word characters,
# so punctuation and whitespace are dropped
tokenizer = nltk.RegexpTokenizer(r'\w+')
def embedSentence_meanPooling(row, embeddings=None, word_tokenizer=None):
    """Embed a sentence as the mean of its known word vectors.

    Parameters
    ----------
    row : str
        The sentence to embed.
    embeddings : optional
        Word-vector lookup supporting ``token in embeddings``,
        ``embeddings.get_vector(token)`` and ``embeddings.vector_size``.
        Defaults to the notebook-level ``model``.
    word_tokenizer : optional
        Object with a ``tokenize(text)`` method returning a list of tokens.
        Defaults to the notebook-level ``tokenizer``.

    Returns
    -------
    numpy.ndarray
        1-D float64 vector of length ``embeddings.vector_size``. It is all
        zeros when none of the sentence's tokens are present in the
        embeddings.
    """
    # Fall back to the notebook globals so `.apply(embedSentence_meanPooling)`
    # keeps working unchanged.
    if embeddings is None:
        embeddings = model
    if word_tokenizer is None:
        word_tokenizer = tokenizer

    # Size the accumulator from the embeddings instead of hard-coding 300,
    # so the function works with vectors of any dimensionality.
    total = np.zeros(embeddings.vector_size, dtype=np.float64)
    count = 0
    for token in word_tokenizer.tokenize(row):
        if token in embeddings:  # only tokens present in the embeddings count
            total += embeddings.get_vector(token)
            count += 1
    # max(1, count) avoids dividing by zero when no token was recognised
    return total / max(1, count)

In [52]:
# Replace every sentence in the 'text' column with its mean-pooled embedding.
# NOTE: this overwrites the raw text, so re-running this cell without first
# reloading the data would pass vectors, not strings, to the tokenizer.
emotions_data["text"] = emotions_data["text"].apply(embedSentence_meanPooling)

In [53]:
# Set model input (X: the 'text' column, now holding embeddings) and
# output (y: the 'label' column)
X = emotions_data["text"].values
y = emotions_data["label"].values

In [54]:
# Stack the per-sentence vectors held in X into a single float32 array
# (one row per sentence) so it can be converted to a tensor
X = np.stack(X).astype(np.float32)

In [55]:
# Split X, y into training, validation and test sets.
# First split: 60% training, 40% held out (X_temp / y_temp).
# Second split: 40% of the held-out part becomes the test set and the other
# 60% the validation set, i.e. roughly 60% / 24% / 16% train / valid / test.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=0)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.4, random_state=0)

In [56]:
# Seed Python, NumPy and TensorFlow so weight initialisation and training
# shuffles are repeatable when the notebook is re-run from a fresh kernel
# (the sampling and the data split are already seeded).
tf.keras.utils.set_random_seed(0)

# Build the TensorFlow model: one hidden ReLU layer with 12 units followed by
# a 6-unit softmax output layer
tf_model = tf.keras.Sequential([
                                tf.keras.layers.Dense(12, activation='relu'),
                                tf.keras.layers.Dense(6, activation='softmax')
                                ])

In [57]:
# Configure training: Adam optimizer, sparse categorical cross-entropy loss
# and accuracy as the reported metric
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
tf_model.compile(
    optimizer=adam_optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=["accuracy"],
)

In [58]:
# Baseline: loss and accuracy of the not-yet-trained model on the training set
# (with six roughly balanced classes, accuracy near 1/6 is expected)
tf_model.evaluate(X_train, y_train)

[1m1685/1685[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 747us/step - accuracy: 0.1686 - loss: 1.7987


[1.7988133430480957, 0.16805507242679596]

In [59]:
# Train for 5 epochs with batches of 16, reporting loss and accuracy on the
# validation split after every epoch
tf_model.fit(X_train, y_train, batch_size=16, epochs = 5, validation_data=(X_valid, y_valid))

Epoch 1/5
[1m3369/3369[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - accuracy: 0.4011 - loss: 1.5107 - val_accuracy: 0.5992 - val_loss: 1.0860
Epoch 2/5
[1m3369/3369[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.6155 - loss: 1.0460 - val_accuracy: 0.6548 - val_loss: 0.9498
Epoch 3/5
[1m3369/3369[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.6545 - loss: 0.9377 - val_accuracy: 0.6669 - val_loss: 0.9083
Epoch 4/5
[1m3369/3369[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.6771 - loss: 0.8864 - val_accuracy: 0.6846 - val_loss: 0.8730
Epoch 5/5
[1m3369/3369[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.6834 - loss: 0.8651 - val_accuracy: 0.6862 - val_loss: 0.8571


<keras.src.callbacks.history.History at 0x17acac29550>