This notebook contains the deep learning implementation of the emotion classification model for chat messages.

In [4]:
import pandas as pd

# Location of the chat/emotion CSV, relative to this notebook
csv_path = "../data-preparation/chat_emotions.csv"

# Read the dataset into a DataFrame and show its first rows
df = pd.read_csv(csv_path)
preview = df.head()
display(preview)


Unnamed: 0,text,emotion
0,i cant do my kyc because nigerian banks are no...,sadness
1,you don t need to add bank account to complete...,neutral
2,verification will be automatic if you ve done ...,neutral
3,iam wait hour not complete kyc,sadness
4,it can take longer after working hour please s...,neutral


In [None]:

# --- Quick exploratory overview of the loaded dataset ---

# Overall size and per-column dtypes
print("DataFrame shape:", df.shape)
print("\nData types:\n", df.dtypes)

# Label balance: absolute counts, then fractions of all rows (normalize=True)
emotion_column = df['emotion']
emotion_counts = emotion_column.value_counts()
emotion_percentage = emotion_column.value_counts(normalize=True)
print("\nEmotion label distribution:\n", emotion_counts)
print("\nEmotion label percentage distribution:\n", emotion_percentage)

# Number of null entries in each column
missing_values = df.isnull().sum()
print("\nMissing values:\n", missing_values)


def _char_count(value):
    """Return the number of characters in ``value``; non-string values count as 0."""
    return len(value) if isinstance(value, str) else 0


# Character length of every message, stored as a new column for later filtering
df['text_length'] = df['text'].apply(_char_count)
print("\nText length statistics:\n", df['text_length'].describe())


In [None]:

# --- Remove duplicates, unlabeled rows and length outliers ---

# Compare rows on their content only. The frame also carries the 'Unnamed: 0'
# column read from the CSV, which has a different value on every previewed row
# (it looks like a saved row index); including it in the comparison would make
# every row look unique and hide real duplicates.
content_columns = ['text', 'emotion']
num_duplicates = df.duplicated(subset=content_columns).sum()
print("Number of duplicate rows:", num_duplicates)
df = df.drop_duplicates(subset=content_columns)

# Rows without a label cannot be used for supervised training
df = df.dropna(subset=['emotion'])

# Keep messages whose character length lies in [15, 2000] (both ends inclusive).
# Relies on the 'text_length' column computed in the exploration cell; rows with
# a non-string text have length 0 there and are therefore dropped here as well.
condition = df['text_length'].between(15, 2000)
num_outliers = int((~condition).sum())
print("Number of outliers:", num_outliers)

# .copy() gives an independent frame so that adding columns in later cells
# does not trigger pandas' SettingWithCopyWarning.
df = df[condition].copy()

print("Shape of dataframe after cleaning:", df.shape)


In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.preprocessing import LabelEncoder


# Fetch the NLTK stopword corpus only when it is not already installed locally,
# so re-running the notebook does not need network access every time.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')


# Resources shared by every preprocess_text call; created on first use so the
# stopword corpus is read once instead of once per token.
_STOP_WORDS = None
_STEMMER = None


def preprocess_text(text):
    """Normalize one message for modelling.

    Steps, in order: lowercase the text, delete every character that is not
    a-z, 0-9 or whitespace, split on whitespace, drop NLTK English stopwords,
    Porter-stem the remaining tokens and join them with single spaces.
    Stopwords are matched after punctuation has been removed.

    Parameters
    ----------
    text : str
        Raw message. Any non-string value (for example NaN from an empty
        CSV cell) is treated as an empty message.

    Returns
    -------
    str
        The space-joined stemmed tokens; "" when nothing remains or when
        ``text`` is not a string.
    """
    global _STOP_WORDS, _STEMMER

    # Guard first: calling .lower() on NaN/None would raise AttributeError.
    if not isinstance(text, str):
        return ""

    if _STOP_WORDS is None:
        # A frozenset gives O(1) membership tests.
        _STOP_WORDS = frozenset(stopwords.words('english'))
    if _STEMMER is None:
        _STEMMER = PorterStemmer()

    cleaned = re.sub(r'[^a-z0-9\s]', '', text.lower())
    return " ".join(
        _STEMMER.stem(token)
        for token in cleaned.split()
        if token not in _STOP_WORDS
    )


# Store the normalized version of every message next to the original text
df['text_processed'] = df['text'].map(preprocess_text)

# Encode the emotion names as integer ids; the fitted encoder stays available
# as `label_encoder`
label_encoder = LabelEncoder()
label_encoder.fit(df['emotion'])
df['emotion_encoded'] = label_encoder.transform(df['emotion'])

# Show raw vs. processed text and label vs. encoded label for the first rows
preview_columns = ['text', 'text_processed', 'emotion', 'emotion_encoded']
display(df[preview_columns].head())


In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns

# --- Figure 1: number of rows per emotion label ---
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='emotion', data=df, ax=ax)
ax.set_title('Distribution of Emotion Labels')
ax.set_xlabel('Emotion')
ax.set_ylabel('Count')
plt.show()

# --- Figure 2: word cloud built from all preprocessed messages ---
all_text = " ".join(df['text_processed'])
cloud_options = dict(background_color='white', width=800, height=400)
wordcloud = WordCloud(**cloud_options).generate(all_text)
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
ax.set_title('Most Frequent Words')
plt.show()

# --- Figure 3: histogram of token counts per preprocessed message ---
# Token count = number of whitespace-separated pieces in the processed text
df['text_processed_length'] = [len(doc.split()) for doc in df['text_processed']]
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['text_processed_length'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Preprocessed Text Length')
ax.set_xlabel('Number of Words')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
from imblearn.over_sampling import RandomOverSampler


# --- Balance the classes by randomly duplicating minority-class rows ---
# NOTE(review): `df_resampled` is not referenced by any other cell visible in
# this notebook, and it is built from the full dataset (before any train/test
# split) - confirm whether it is still needed before using it for training.

# Only the model input and the encoded label take part in the resampling
df_balanced = df[['text_processed', 'emotion_encoded']]

# Fixed random_state so the same rows are duplicated on every run
ros = RandomOverSampler(random_state=42)

X_resampled, y_resampled = ros.fit_resample(
    df_balanced[['text_processed']], df_balanced['emotion_encoded']
)

# pd.concat(axis=1) aligns its inputs on index labels. Give both pieces a fresh
# 0..n-1 index first so texts and labels are paired strictly by position,
# whatever index the sampler returned.
df_resampled = pd.concat(
    [pd.DataFrame(X_resampled, columns=['text_processed']).reset_index(drop=True),
     pd.Series(y_resampled, name='emotion_encoded').reset_index(drop=True)],
    axis=1
)

print("Resampled emotion distribution:")
print(df_resampled['emotion_encoded'].value_counts())


In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit an encoder on the emotion names, then encode every row as an integer id
label_encoder = LabelEncoder().fit(df['emotion'])
y_dl = label_encoder.transform(df['emotion'])

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.text import Tokenizer  # Added import
from tensorflow.keras.preprocessing.sequence import pad_sequences  # Added import
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
import numpy as np


print("GPU Available:", tf.config.list_physical_devices('GPU'))

# Seed the NumPy and TensorFlow global generators so that weight
# initialisation, dropout and batch shuffling start from the same state on
# every run.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Prepare data
texts = df['text_processed'].tolist()
max_features = 20000  # vocabulary cap: tokenizer num_words and embedding input_dim
max_len = 150         # every sequence is padded/truncated to this many tokens

# Integer-encode the emotion names; label_encoder.classes_ maps ids back to names
label_encoder = LabelEncoder()
y_dl = label_encoder.fit_transform(df['emotion'])

# Split row positions BEFORE fitting the tokenizer. Sizes, seed and
# stratification are the same as when the padded matrix itself was split:
# 20% test, then 15% of the remainder as validation.
row_positions = np.arange(len(texts))
train_val_idx, test_idx = train_test_split(
    row_positions, test_size=0.2, random_state=SEED, stratify=y_dl
)
train_idx, val_idx = train_test_split(
    train_val_idx, test_size=0.15, random_state=SEED, stratify=y_dl[train_val_idx]
)

# Learn the vocabulary from training rows only, so no word statistics from the
# validation/test messages leak into the model input; unseen words map to '<OOV>'.
tokenizer = Tokenizer(num_words=max_features, oov_token='<OOV>')
tokenizer.fit_on_texts([texts[i] for i in train_idx])
sequences = tokenizer.texts_to_sequences(texts)
X_seq = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

X_train_dl, X_val_dl, X_test_dl = X_seq[train_idx], X_seq[val_idx], X_seq[test_idx]
y_train_dl, y_val_dl, y_test_dl = y_dl[train_idx], y_dl[val_idx], y_dl[test_idx]

# Calculate class weights ('balanced': inversely proportional to class frequency
# in the training labels). Key the dict by the actual class id rather than by
# position so the mapping stays correct even if an id is missing from training.
train_classes = np.unique(y_train_dl)
balanced_weights = compute_class_weight('balanced', classes=train_classes, y=y_train_dl)
class_weights = {int(cls): float(w) for cls, w in zip(train_classes, balanced_weights)}


# One output unit per emotion class known to the label encoder
num_classes = len(label_encoder.classes_)
l2_penalty = tf.keras.regularizers.l2(0.01)

# Layer stack, in order: token embedding -> bidirectional LSTM over the whole
# sequence -> batch norm -> 1D convolution -> global max pool -> two dense
# layers with dropout -> softmax over the classes
layer_stack = [
    layers.Embedding(input_dim=max_features, output_dim=256, input_length=max_len),
    layers.Bidirectional(layers.LSTM(128, return_sequences=True)),
    layers.BatchNormalization(),
    layers.Conv1D(64, 3, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation='relu', kernel_regularizer=l2_penalty),
    layers.Dropout(0.5),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(num_classes, activation='softmax'),
]
model = models.Sequential(layer_stack)

# Sparse categorical cross-entropy: the targets are integer class ids
optimizer = optimizers.Adam(learning_rate=0.001)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

model.summary()


# Both callbacks watch the validation loss:
#  - early_stop ends training after 5 epochs without improvement and restores
#    the best weights seen
#  - lr_scheduler multiplies the learning rate by 0.2 after 2 such epochs
early_stop = EarlyStopping(restore_best_weights=True, patience=5, monitor='val_loss')
lr_scheduler = ReduceLROnPlateau(patience=2, factor=0.2, monitor='val_loss')
training_callbacks = [early_stop, lr_scheduler]

# Train for at most 30 epochs with per-class loss weights
history = model.fit(
    x=X_train_dl,
    y=y_train_dl,
    validation_data=(X_val_dl, y_val_dl),
    class_weight=class_weights,
    callbacks=training_callbacks,
    batch_size=64,
    epochs=30,
)


from sklearn.metrics import classification_report

# Loss and accuracy on the held-out test split
loss, accuracy = model.evaluate(X_test_dl, y_test_dl)
print(f"Test accuracy: {accuracy:.4f}")

# Predicted class id = index of the largest softmax probability per row
y_pred = model.predict(X_test_dl).argmax(axis=1)

# Pass `labels` explicitly so the report always has one row per known class,
# in the same order as `target_names`. Without it, classification_report
# raises when a class is missing from both y_test_dl and y_pred.
print("\nClassification Report:")
print(classification_report(
    y_test_dl,
    y_pred,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
))
