In [95]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import random
import re                                  
import string  

import nltk
from nltk.corpus import stopwords          
from nltk.stem import PorterStemmer        
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

from sklearn.model_selection import train_test_split

In [96]:
df_emotions = pd.read_csv(r"C:\Users\domin\Desktop\Year 2 Block C\2023-24c-fai2-adsai-DominikSzewczyk224180\Datasets\emotion_data_merged.csv")

In [97]:
df_emotions = df_emotions [df_emotions ['emotion'] != 'neutral']
df_emotions.head()

Unnamed: 0,sentence,emotion
0,What?,surprise
3,Hey!,happiness
5,Where?!,surprise
8,"No, I know!",surprise
13,Well! Well! Well! Joey Tribbiani! So you came ...,surprise


In [98]:
X_train, X_test, y_train, y_test = train_test_split(df_emotions['sentence'], df_emotions['emotion'], test_size=0.2, random_state=42)

In [99]:
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.2, random_state=42)

In [100]:
labelencoder = LabelEncoder()
y_train = labelencoder.fit_transform(y_train)
y_test = labelencoder.transform(y_test)
y_val = labelencoder.transform(y_val)

y_train = to_categorical(y_train)
y_test = to_categorical(y_test)
y_val = to_categorical(y_val)

In [101]:
def sentence_processor(sentence):
    sentence = re.sub('https?\S+|#+', '', sentence)

    sentence = re.sub('#', '', sentence)

    tokenizer = word_tokenize

    processed_sentence = tokenizer(sentence)

    stopwords_english = set(stopwords.words('english'))
    processed_sentence = [word for word in processed_sentence if word.lower() not in stopwords_english]

    processed_sentence = [word for word in processed_sentence if word.lower() not in string.punctuation]

    stemmer = PorterStemmer()
    processed_sentence = [stemmer.stem(word) for word in processed_sentence]

    return processed_sentence

def sentence_processor_df(df):
    processed_sentences_list = []
    for sentence in df:
        processed_sentence = sentence_processor(sentence)
        processed_sentences_list.append(processed_sentence)
    return processed_sentences_list



In [102]:
X_train = sentence_processor_df(X_train)

train_sentences_str = []
for token in X_train:
    train_sentences_str.append(' '.join(token)) 

X_test = sentence_processor_df(X_test)

test_sentences_str = []
for token in X_test:
    test_sentences_str.append(' '.join(token))

X_val = sentence_processor_df(X_val)

val_sentences_str = []
for token in X_val:
    val_sentences_str.append(' '.join(token))

In [103]:
tokenizer = Tokenizer(filters = '')
tokenizer.fit_on_texts(train_sentences_str)

In [104]:
vocab_size = len(train_sentences_str)
print(vocab_size)

38692


In [105]:
training_sequences = tokenizer.texts_to_sequences(train_sentences_str)
training_padded = pad_sequences(training_sequences, padding='post',maxlen = 30) 

test_sequences = tokenizer.texts_to_sequences(test_sentences_str)
test_padded = pad_sequences(test_sequences, padding='post',maxlen = 30)

val_sequences = tokenizer.texts_to_sequences(val_sentences_str)
val_padded = pad_sequences(val_sequences, padding='post',maxlen = 30)

In [106]:
X_train = training_padded
X_test = test_padded
X_val = val_padded

In [107]:
num_classes = 6
max_length = 30

In [108]:
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=2, input_length=max_length))
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))


model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [109]:
history = model.fit(X_train, y_train, epochs=3, batch_size=16, validation_data=(X_val, y_val))

Epoch 1/3


Epoch 2/3
Epoch 3/3


In [110]:
from sklearn.metrics import accuracy_score, f1_score

y_pred = model.predict(X_test)

y_pred_labels = np.argmax(y_pred, axis=1)
y_test_labels = np.argmax(y_test, axis=1)

accuracy = accuracy_score(y_test_labels, y_pred_labels)
f1 = f1_score(y_test_labels, y_pred_labels, average='macro')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5896872576893254
F1 Score: 0.3607902788859847


kaggle

In [111]:
kaggle_test_df = pd.read_csv(r"C:\Users\domin\Desktop\Year 2 Block C\2023-24c-fai2-adsai-DominikSzewczyk224180\Datasets\test.csv", sep = "\t")

In [112]:
kaggle_test = sentence_processor_df(kaggle_test_df['sentence'])

In [113]:
kaggle_test1 = []
for token in kaggle_test:
    kaggle_test1.append(' '.join(token))

In [114]:
kaggle_test_sequences = tokenizer.texts_to_sequences(kaggle_test1)
kaggle_test_padded = pad_sequences(kaggle_test_sequences, maxlen=30, padding='post')

In [115]:
y_pred_kaggle = model.predict(kaggle_test_padded)



In [116]:
y_pred_kaggle = np.argmax(y_pred_kaggle, axis=1)
y_pred_kaggle = labelencoder.inverse_transform(y_pred_kaggle)

In [117]:
predictions = pd.DataFrame({'id': kaggle_test_df['id'], 'emotion': y_pred_kaggle})
predictions.to_csv('predictions_CNN.csv', index=False, sep=',')