## Empathy Emotion and Personality Detection using Deep Learning
### 7120CEM CW2
WASSA 2023 Shared Task on Empathy Emotion and Personality Detection in Interactions (* 
Including regression problems and classification problems):  
- Website: https://codalab.lisn.upsaclay.fr/competitions/11167 
- Summary paper: https://aclanthology.org/2023.wassa-1.44/ 

1. Import Libraries

In [1]:
import string
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# nltk.download('wordnet')

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


2. Load data, preprocess and split

In [29]:
data = 'data/WASSA23_conv_level_with_labels_train.tsv'
df = pd.read_table(data, header=0)
new_col = []
for names in df.columns:
    new_col.append(names.strip())
df.columns = new_col
dataset = df.drop(["conversation_id", "turn_id", "speaker_number", "article_id", "speaker_id", "essay_id"], axis=1, inplace=True)

X_data, y_data = df.loc[:, 'text'], df.drop('text', axis=1)
X_train, X_test, y_train , y_test = train_test_split(X_data, y_data, train_size=0.8)
#reset index of training examples
X_train, X_test = X_train.reset_index(drop=True), X_test.reset_index(drop=True)
y_train, y_test = y_train.reset_index(drop=True), y_test.reset_index(drop=True)

df.head()

Unnamed: 0,text,EmotionalPolarity,Emotion,Empathy
0,I feel very sad for the people. ...,2.0,3.0,3.3333
1,It's terrible. Not only the people but the ani...,2.0,4.0,3.3333
2,I felt really sorry for the sister that now ha...,2.0,3.6667,2.6667
3,"Yeah, it's going to be tough but i am sure she...",0.6667,3.0,2.0
4,"Yeah, we never know what we can do unless we a...",0.3333,2.3333,1.3333


3. process word data into numbers
- tokenization
- remove stop word and punctuatuons, numbers
- lematization
- vectorization

In [30]:
"""Preprocesses a sentence for natural language processing tasks.

This function performs the following steps:
    1. Tokenizes the sentence into individual words.
    2. Removes stop words (common words with little meaning) from the tokens.
    3. Removes punctuation marks from the tokens.
    4. Lemmatizes the tokens (reduces words to their base form).
    5. Joins the preprocessed tokens back into a sentence string.

Args:
	sentence: The input sentence to be preprocessed (string).

Returns:
	The preprocessed sentence string.
"""
def word_preprocessor(sentence):
    stop_words = set(stopwords.words('english'))
    punctuations = set(string.punctuation)
    lem = WordNetLemmatizer().lemmatize
    sentence = word_tokenize(sentence)
    sentence = [word for word in sentence if word not in stop_words]
    sentence = [word for word in sentence if word not in punctuations]
    sentence_str = ' '.join(sentence)
    sentence = lem(sentence_str)
    return sentence 

In [31]:
X_train = X_train.apply(word_preprocessor)
X_test = X_test.apply(word_preprocessor)

#convert labels to array
X_train, X_test = np.array(X_train), np.array(X_test)
y_train, y_test = np.array(y_train[['EmotionalPolarity', 'Emotion', 'Empathy']]), np.array(y_test[['EmotionalPolarity', 'Emotion', 'Empathy']])

vectorizer = CountVectorizer(max_features=8000, stop_words='english', lowercase=True)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)
X_train_vec = X_train_vec.toarray()
X_test_vec = X_test_vec.toarray()


4. Create and Train model

In [33]:
# Create a sequential model
EEPD_Model = Sequential()
EEPD_Model.add(Dense(800, activation='relu', input_dim=X_train_vec.shape[1]))
EEPD_Model.add(Dense(400, activation='relu'))
EEPD_Model.add(Dense(200, activation='relu'))
EEPD_Model.add(Dense(3, activation='softmax'))

# Compile the model
EEPD_Model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])

# Train the model
EEPD_Model.fit(X_train_vec, y_train, epochs=20, batch_size=128, validation_data=(X_test_vec, y_test))

# Evaluate the model
loss, accuracy = EEPD_Model.evaluate(X_test_vec, y_test)
print(f'|Test loss: \t{loss:.4f}')
print(f'|Test accuracy: \t{accuracy:.4f}')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m234/234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 80ms/step - accuracy: 0.5324 - loss: 2.9654 - val_accuracy: 0.5467 - val_loss: 2.8716
Epoch 2/20
[1m234/234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 86ms/step - accuracy: 0.6669 - loss: 2.9259 - val_accuracy: 0.5404 - val_loss: 2.8755
Epoch 3/20
[1m234/234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 90ms/step - accuracy: 0.7351 - loss: 2.8742 - val_accuracy: 0.5467 - val_loss: 2.8742
Epoch 4/20
[1m234/234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 86ms/step - accuracy: 0.8029 - loss: 2.8630 - val_accuracy: 0.5450 - val_loss: 2.8766
Epoch 5/20
[1m234/234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 88ms/step - accuracy: 0.8179 - loss: 2.8993 - val_accuracy: 0.5444 - val_loss: 2.8760
Epoch 6/20
[1m234/234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 75ms/step - accuracy: 0.8397 - loss: 2.7785 - val_accuracy: 0.5359 - val_loss: 2.8782
Epoch 7/20
[1m2