In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf
import numpy as np

# Read the pickled training frame from disk.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
df = pd.read_pickle("merged_training.pkl")

# Turn the emotion strings into integer ids. Fitting and transforming are done
# as two explicit steps on the same column.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
label_encoder.fit(df['emotions'])
df['emotions_encoded'] = label_encoder.transform(df['emotions'])

# Emotion names as learned by the encoder, kept for decoding predictions later.
emotions = [*label_encoder.classes_]

# Features are the raw text; targets are the encoded emotion ids.
X, y = df['text'], df['emotions_encoded']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and batch
# shuffling are repeatable between runs (same seed as the train/test split).
# Some GPU kernels may still introduce small run-to-run differences.
tf.keras.utils.set_random_seed(42)

# Build the word -> integer-id vocabulary from the training texts only, then
# encode both splits with that single vocabulary.
tokenizer_cnn = Tokenizer()
tokenizer_cnn.fit_on_texts(X_train)
X_train_sequences_cnn = tokenizer_cnn.texts_to_sequences(X_train)
X_test_sequences_cnn = tokenizer_cnn.texts_to_sequences(X_test)

# Pad the training sequences to the longest training text. The test sequences
# are padded/truncated to that SAME length: without an explicit maxlen they
# would be padded to the longest *test* text, which need not match the input
# length the model is built with below.
X_train_padded_cnn = pad_sequences(X_train_sequences_cnn, padding='post')
X_test_padded_cnn = pad_sequences(X_test_sequences_cnn, maxlen=X_train_padded_cnn.shape[1], padding='post')

# Create and train the CNN model:
#   embedding -> 1D convolution -> global max pooling -> softmax over emotions.
cnn_model = Sequential()
# +1 because Tokenizer word ids start at 1; id 0 is the padding value.
cnn_model.add(Embedding(input_dim=len(tokenizer_cnn.word_index) + 1, output_dim=100, input_length=X_train_padded_cnn.shape[1]))
cnn_model.add(Conv1D(128, 5, activation='relu'))
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dense(len(emotions), activation='softmax'))
# Sparse loss because y_train holds integer class ids, not one-hot vectors.
cnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
cnn_model.fit(X_train_padded_cnn, y_train, epochs=3)


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x1880f417c90>

In [5]:
# Encode the test texts with the tokenizer that was fitted when the model was
# trained. Re-creating and re-fitting a tokenizer here would repeat the work and
# overwrite `tokenizer_cnn`; the embedding layer only makes sense with the exact
# word ids it was trained on, so the existing tokenizer is reused.
X_test_sequences_cnn = tokenizer_cnn.texts_to_sequences(X_test)
# Pad/truncate to the training sequence length so the shape matches the model input.
X_test_padded_cnn = pad_sequences(X_test_sequences_cnn, maxlen=X_train_padded_cnn.shape[1], padding='post')

# Evaluate the CNN model on the test dataset.
# evaluate() returns [loss, accuracy] for the metrics the model was compiled with.
cnn_scores = cnn_model.evaluate(X_test_padded_cnn, y_test)

# Extract metrics
cnn_loss = cnn_scores[0]
cnn_accuracy = cnn_scores[1]

# Make predictions on the test dataset: predict() yields one probability per
# class, and argmax picks the most likely class id for each sample.
y_pred_cnn = cnn_model.predict(X_test_padded_cnn)
y_pred_cnn_classes = np.argmax(y_pred_cnn, axis=1)

# Calculate precision, recall, and F1-score using scikit-learn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# 'weighted' averages the per-class scores weighted by class support.
precision_cnn = precision_score(y_test, y_pred_cnn_classes, average='weighted')
recall_cnn = recall_score(y_test, y_pred_cnn_classes, average='weighted')
f1_cnn = f1_score(y_test, y_pred_cnn_classes, average='weighted')

# Print evaluation metrics
print("CNN Model Evaluation:")
print(f"Loss: {cnn_loss:.4f}")
print(f"Accuracy: {cnn_accuracy:.4f}")
print(f"Precision: {precision_cnn:.4f}")
print(f"Recall: {recall_cnn:.4f}")
print(f"F1-Score: {f1_cnn:.4f}")

# Detailed per-class report, labelled with the original emotion names.
report = classification_report(y_test, y_pred_cnn_classes, target_names=emotions)
print("\nClassification Report:\n", report)


CNN Model Evaluation:
Loss: 0.1383
Accuracy: 0.9315
Precision: 0.9335
Recall: 0.9315
F1-Score: 0.9296

Classification Report:
               precision    recall  f1-score   support

       anger       0.97      0.90      0.93     11339
        fear       0.85      0.93      0.89      9376
         joy       0.92      0.99      0.95     28247
        love       0.98      0.70      0.82      6853
     sadness       0.96      0.98      0.97     24504
    surprise       0.85      0.71      0.77      3043

    accuracy                           0.93     83362
   macro avg       0.92      0.87      0.89     83362
weighted avg       0.93      0.93      0.93     83362

