In [129]:
import string
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [130]:
dataset = pd.read_csv("../lab2/emotion.csv").query("Emotion == 'happy' or Emotion == 'sadness'")[:2000]
le = LabelEncoder()
dataset["Emotion"] = le.fit_transform(dataset["Emotion"])

In [131]:
def process_text(text):
  result = [char for char in text if char not in string.punctuation]
  result = "".join(result)
  result = [word.lower() for word in result.split() if word.lower() not in stopwords.words("english")]
  result = " ".join(result)
  return result

dataset["Text"] = dataset['Text'].apply(process_text)

In [132]:
dataset

Unnamed: 0,Text,Emotion
0,didnt feel humiliated,1
1,go feeling hopeless damned hopeful around some...,1
5,ive feeling little burdened lately wasnt sure,1
8,petronas years feel petronas performed well ma...,0
10,feel like make suffering seeing mean something,1
...,...,...
3193,ive feeling far perfect area motherhood,0
3195,try let anxiety show make feel unwelcome,1
3197,feel peaceful secure independent,0
3199,feeling lil groggy cough medicine,1


In [133]:
X = dataset["Text"]
Y = dataset["Emotion"]
print(X.shape, Y.shape)

(2000,) (2000,)


In [134]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [135]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(X_train.tolist())

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

max_sequence_length = 100
X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=max_sequence_length)
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=max_sequence_length)

print(len(X_train), len(X_test))

1600 400


In [152]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=10000, output_dim=64))
model.add(tf.keras.layers.LSTM(128))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [153]:
model.compile(
  optimizer='adam',
  loss='binary_crossentropy',
  metrics=['accuracy'],
)

In [154]:
print(X_train, Y_train)
model.fit(X_train, Y_train, epochs=1)

[[   0    0    0 ... 1598  205 1599]
 [   0    0    0 ...   42   63   43]
 [   0    0    0 ...    7 1602  650]
 ...
 [   0    0    0 ...   44 4026 4027]
 [   0    0    0 ...    1    3   60]
 [   0    0    0 ...  150   49  280]] 1569    0
387     0
1319    0
1131    0
694     0
       ..
1814    0
2079    1
1398    0
2355    1
1808    1
Name: Emotion, Length: 1600, dtype: int32


<keras.src.callbacks.History at 0x205198ff610>

In [165]:
Y_pred = model.predict(X_test)
print(Y_pred)

[[0.3965581 ]
 [0.4302944 ]
 [0.4226237 ]
 [0.37898216]
 [0.39453062]
 [0.40522707]
 [0.4799567 ]
 [0.41123104]
 [0.3878377 ]
 [0.47646946]
 [0.45063698]
 [0.408675  ]
 [0.4095886 ]
 [0.4783295 ]
 [0.44520617]
 [0.40026647]
 [0.4150681 ]
 [0.3963551 ]
 [0.41112778]
 [0.3065006 ]
 [0.399223  ]
 [0.3319161 ]
 [0.4164219 ]
 [0.41803825]
 [0.47933966]
 [0.40872833]
 [0.47852328]
 [0.3393287 ]
 [0.41251767]
 [0.447223  ]
 [0.38623315]
 [0.43859246]
 [0.34046125]
 [0.42735866]
 [0.38359764]
 [0.38275346]
 [0.42187616]
 [0.3482417 ]
 [0.41920337]
 [0.38613674]
 [0.33125386]
 [0.40271327]
 [0.3729375 ]
 [0.39513293]
 [0.4311806 ]
 [0.47714975]
 [0.40238526]
 [0.4101881 ]
 [0.4349767 ]
 [0.36076468]
 [0.4223596 ]
 [0.4614678 ]
 [0.40392098]
 [0.42276874]
 [0.41313565]
 [0.45927402]
 [0.41054904]
 [0.3889573 ]
 [0.4208138 ]
 [0.48325807]
 [0.41502064]
 [0.41793674]
 [0.4416853 ]
 [0.48050368]
 [0.48125494]
 [0.38278577]
 [0.43578303]
 [0.3990578 ]
 [0.41073835]
 [0.38401297]
 [0.3681938 ]
 [0.41

In [174]:
Y_pred = np.round(Y_pred)
print(accuracy_score(Y_test, Y_pred))

0.5825
