In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import random
import re                                  
import string  

import nltk
from nltk.corpus import stopwords          
from nltk.stem import PorterStemmer        
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

from sklearn.model_selection import train_test_split

In [2]:
simplified_emotions = pd.read_csv(r"C:\Users\domin\Desktop\Year 2 Block C\2023-24c-fai2-adsai-DominikSzewczyk224180\Datasets\simplified_emotions.csv")

In [3]:
X_train, X_test, y_train, y_test = train_test_split(simplified_emotions['sentence'], simplified_emotions['emotion'], test_size=0.2, random_state=42)

In [4]:
labelencoder = LabelEncoder()
y_train = labelencoder.fit_transform(y_train)
y_test = labelencoder.transform(y_test)

y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

In [5]:
def sentence_processor(sentence):
    sentence = re.sub('https?\S+|#+', '', sentence)

    sentence = re.sub('#', '', sentence)

    tokenizer = word_tokenize

    processed_sentence = tokenizer(sentence)

    stopwords_english = set(stopwords.words('english'))
    processed_sentence = [word for word in processed_sentence if word.lower() not in stopwords_english]

    processed_sentence = [word for word in processed_sentence if word.lower() not in string.punctuation]

    stemmer = PorterStemmer()
    processed_sentence = [stemmer.stem(word) for word in processed_sentence]

    return processed_sentence

def sentence_processor_df(df):
    processed_sentences_list = []
    for sentence in df:
        processed_sentence = sentence_processor(sentence)
        processed_sentences_list.append(processed_sentence)
    return processed_sentences_list

In [6]:
X_train = sentence_processor_df(X_train)

train_sentences_str = []
for token in X_train:
    train_sentences_str.append(' '.join(token)) 

X_test = sentence_processor_df(X_test)

test_sentences_str = []
for token in X_test:
    test_sentences_str.append(' '.join(token))

In [7]:
tokenizer = Tokenizer(filters = '')
tokenizer.fit_on_texts(train_sentences_str)

In [8]:
vocab_size = len(train_sentences_str)
print(vocab_size)

77384


In [9]:
training_sequences = tokenizer.texts_to_sequences(train_sentences_str)
training_padded = pad_sequences(training_sequences, padding='post',maxlen = 30) 

test_sequences = tokenizer.texts_to_sequences(test_sentences_str)
test_padded = pad_sequences(test_sequences, padding='post',maxlen = 30)

In [10]:
X_train = training_padded
X_test = test_padded

---


In [11]:
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, SimpleRNN, LSTM

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=30))
model.add(LSTM(units=128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dense(units=2, activation='sigmoid'))

model.summary()

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 30, 100)           7738400   
                                                                 
 lstm (LSTM)                 (None, 128)               117248    
                                                                 
 dense (Dense)               (None, 256)               33024     
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 dense_2 (Dense)             (None, 2)                 258       
                                                                 
Total params: 7,921,826
Trainable params: 7,921,826
Non-

In [12]:
history = model.fit(X_train, y_train, epochs=3, batch_size=16, validation_data = (X_test,y_test))

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [13]:
from sklearn.metrics import accuracy_score, f1_score

y_pred = model.predict(X_test)

y_pred_labels = np.argmax(y_pred, axis=1)
y_test_labels = np.argmax(y_test, axis=1)

accuracy = accuracy_score(y_test_labels, y_pred_labels)
f1 = f1_score(y_test_labels, y_pred_labels, average='macro')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.7396877907577794
F1 Score: 0.42518421678155455


kaggle

In [14]:
kaggle_test = pd.read_csv(r"C:\Users\domin\Desktop\Year 2 Block C\2023-24c-fai2-adsai-DominikSzewczyk224180\Datasets\test.csv", sep = "\t")

In [15]:
tokenized_test_sentences = tokenizer.texts_to_sequences(kaggle_test['sentence'])
padded_test_sentences = pad_sequences(tokenized_test_sentences, maxlen=30)

predictions = model.predict(padded_test_sentences)

predicted_labels = ['happiness' if pred.argmax() == 1 else 'other' for pred in predictions]

results_df = pd.DataFrame({'id': range(len(kaggle_test)), 'emotion': predicted_labels})

results_df.to_csv('predictions_LSTM_model.csv', index=False)

