**Emotions**<br>
`0` : Sadness<br>
`1` : Joy<br>
`2` : Love<br>
`3` : Anger<br>
`4` : Fear<br>
`5` : Surprise

### 1. **Loading the Datasets**

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical
from tensorflow import keras

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

import warnings
warnings.filterwarnings('ignore')



In [2]:
emotion_df = pd.read_csv(r"E:\PROJECTS\NLP Project\Data\emotions.csv")
emotion_df.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,i just feel really helpless and heavy hearted,4
1,1,ive enjoyed being able to slouch about relax a...,0
2,2,i gave up my internship with the dmrg and am f...,4
3,3,i dont know i feel so lost,0
4,4,i am a kindergarten teacher and i am thoroughl...,4


### 2. **Data Preprocessing**

In [3]:
emotion_df.drop(columns=['Unnamed: 0'], inplace=True)
emotion_df.head()

Unnamed: 0,text,label
0,i just feel really helpless and heavy hearted,4
1,ive enjoyed being able to slouch about relax a...,0
2,i gave up my internship with the dmrg and am f...,4
3,i dont know i feel so lost,0
4,i am a kindergarten teacher and i am thoroughl...,4


In [5]:
# Check for null values
emotion_df.isna().sum()

text     0
label    0
dtype: int64

---

In [6]:
emotion_df.shape

(416809, 2)

In [7]:
# Check glass distribution
emotion_df['label'].value_counts()

label
1    141067
0    121187
3     57317
4     47712
2     34554
5     14972
Name: count, dtype: int64

In [None]:
# Balancing all classes in the emotions dataset
e_df = pd.DataFrame()
for i in range(6):
    temp_df = emotion_df[emotion_df['label'] == i].sample(14972, replace=True)
    e_df = pd.concat([e_df, temp_df])

emotion_df = e_df.copy()

In [9]:
emotion_df['label'].value_counts()

label
0    14972
1    14972
2    14972
3    14972
4    14972
5    14972
Name: count, dtype: int64

In [10]:
emotion_df.shape

(89832, 2)

In [11]:
emotion_df.head()

Unnamed: 0,text,label
109391,i do not feel a shamed i m your hell i m your ...,0
311212,i feel terrible and helpless about it,0
404591,i really enjoy bendis snappy dialogue in a lot...,0
92662,i cant risk hurting their feelings because i l...,0
56468,i want to help her by i feel so helpless,0


---

In [None]:
# randomly shuffle the dataset
emotion_df = emotion_df.sample(frac=1, random_state=42)

In [None]:
# resetting the indexes after shuffling
emotion_df.reset_index(drop=True, inplace=True)

In [14]:
emotion_df.head()

Unnamed: 0,text,label
0,i have seen thus far but i m not sure why but ...,5
1,i could really have guessed this after unpleas...,1
2,i feel irritated because no one else washes th...,3
3,i always feel a little funny writing posts on ...,5
4,i feel slightly dazed and tired and angry but ...,5


In [15]:
emotion_df['label'].unique()

array([5, 1, 3, 4, 0, 2], dtype=int64)

### 3. **Stopwords Removal**

In [16]:
stopwords = set(stopwords.words('english'))

In [17]:
# Stopwords Removal function
def remove_stopwords(text, stopwords_set=stopwords):
    tokens = nltk.word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.lower() not in stopwords_set]
    return ' '.join(filtered_tokens)

In [18]:
emotion_df['text'] = emotion_df['text'].apply(remove_stopwords)

In [19]:
emotion_df.head()

Unnamed: 0,text,label
0,seen thus far sure feeling impressed,5
1,could really guessed unpleasant childhood expe...,1
2,feel irritated one else washes dishes,3
3,always feel little funny writing posts blog goes,5
4,feel slightly dazed tired angry normal emotion...,5


### 4. **Tokenizer**

In [20]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(emotion_df['text'])

In [21]:
emotion_sequences = tokenizer.texts_to_sequences(emotion_df['text'])

In [40]:
import pickle

save_path = r"E:\PROJECTS\NLP Project\tokenizer.pkl"

with open(save_path, "wb") as f:
    pickle.dump(tokenizer, f)

In [22]:
emotion_df['text'].iloc[1]

'could really guessed unpleasant childhood experiments toothpaste yep curiousity chilled cat id assumed shampoo supposed feel pleasant silly'

In [23]:
emotion_sequences[1:2]

[[28,
  5,
  4260,
  1048,
  1580,
  10101,
  11921,
  8865,
  14741,
  7283,
  1498,
  144,
  4102,
  3945,
  594,
  1,
  839,
  1197]]

In [24]:
max_length = 50
emotion_padded = pad_sequences(emotion_sequences, maxlen=max_length, padding='post')

In [25]:
emotion_padded[1:2]

array([[   28,     5,  4260,  1048,  1580, 10101, 11921,  8865, 14741,
         7283,  1498,   144,  4102,  3945,   594,     1,   839,  1197,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0]])

In [26]:
# Generating labels in numpy array format
emotion_labels = np.array(emotion_df['label'])

### 5. **Train-Test Split**

In [27]:
# Split data into 70% train, 30% temp (for valid+test)
emotion_train, emotion_temp, emotion_labels_train, emotion_labels_temp = train_test_split(
    emotion_padded, emotion_labels, test_size=0.3, random_state=42)


# Split remaining into 50% valid, 50% test (of the 30%)
emotion_valid, emotion_test, emotion_labels_valid, emotion_labels_test = train_test_split(
    emotion_temp, emotion_labels_temp, test_size=0.5, random_state=42)


print(f"Emotion - Train: {emotion_train.shape[0]}, Valid: {emotion_valid.shape[0]}, Test: {emotion_test.shape[0]}")

Emotion - Train: 62882, Valid: 13475, Test: 13475


### 6. **One-hot Encoding** 

In [28]:
# One-hot encode all labels (train, valid, test)
emotion_labels_train_encoded = to_categorical(emotion_labels_train, num_classes=6)
emotion_labels_valid_encoded = to_categorical(emotion_labels_valid, num_classes=6)
emotion_labels_test_encoded = to_categorical(emotion_labels_test, num_classes=6)

In [29]:
emotion_labels_train_encoded.shape

(62882, 6)

### 7. **Standard LSTM**

In [30]:
model_lstm = models.Sequential([
    layers.Input(shape=(max_length,), name='Input_Layer'),
    layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, name='Embedding_Layer'),

    # Regularization layer
    layers.SpatialDropout1D(0.3, name='Spatial_Dropout_Layer'),

    layers.Bidirectional(
        layers.LSTM(64, return_sequences=False, name='LSTM_Layer1'),
        name='Bidirectional_Layer1'
    ),

    layers.Dense(64, activation='relu', name='Dense_Layer1'),
    layers.Dropout(0.3, name='Dropout_Layer1'),

    layers.Dense(6, activation='softmax', name='Output_Layer')
])

model_lstm.compile(
    loss='categorical_crossentropy',
    optimizer=keras.optimizers.Adam(1e-3),
    metrics=['accuracy']
)

model_lstm.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Embedding_Layer (Embedding)  (None, 50, 128)          4270720   
                                                                 
 Spatial_Dropout_Layer (Spat  (None, 50, 128)          0         
 ialDropout1D)                                                   
                                                                 
 Bidirectional_Layer1 (Bidir  (None, 128)              98816     
 ectional)                                                       
                                                                 
 Dense_Layer1 (Dense)        (None, 64)                8256      
                                                                 
 Dropout_Layer1 (Dropout)    (None, 64)                0         
                                                                 
 Output_Layer (Dense)        (None, 6)                 3

#### 7.1 **Data Training(LSTM)**

In [31]:
history_lstm = model_lstm.fit(
    emotion_train,
    emotion_labels_train_encoded,
    validation_data=(emotion_valid, emotion_labels_valid_encoded),
    epochs=2,
    batch_size=4,
    verbose=1
)

Epoch 1/2
Epoch 2/2
Epoch 2/2


In [32]:
model_lstm.save(r'E:\PROJECTS\NLP Project\models\emotion_lstm_model.h5')

#### 7.2 **Data Evaluation(LSTM)**

In [33]:
# Evaluate on test data
test_results = model_lstm.evaluate(emotion_test, emotion_labels_test_encoded, verbose=1)

print("\n" + "="*50)
print("Test Results:")
print("="*50)
print(f"Test Loss: {test_results[0]:.4f}")
print(f"Test Accuracy: {test_results[1]:.4f}")
print("="*50)


Test Results:
Test Loss: 0.1216
Test Accuracy: 0.9501

Test Results:
Test Loss: 0.1216
Test Accuracy: 0.9501


In [34]:
# Get predictions on test data
test_predictions = model_lstm.predict(emotion_test)

# Extract predicted labels
emotion_pred_labels = np.argmax(test_predictions, axis=1)

# Calculate accuracy
from sklearn.metrics import accuracy_score, classification_report

emotion_accuracy = accuracy_score(emotion_labels_test, emotion_pred_labels)

print("\n" + "="*50)
print("Per-Task Accuracy on Test Set:")
print("="*50)
print(f"Emotion Accuracy: {emotion_accuracy:.4f}")
print("="*50)

print("\n\nDetailed Classification Report - Emotion:")
print(classification_report(emotion_labels_test, emotion_pred_labels))


Per-Task Accuracy on Test Set:
Emotion Accuracy: 0.9501


Detailed Classification Report - Emotion:
              precision    recall  f1-score   support

           0       0.99      0.95      0.97      2185
           1       0.99      0.91      0.95      2255
           2       0.93      0.99      0.96      2288
           3       0.95      0.95      0.95      2206
           4       0.93      0.90      0.91      2290
           5       0.91      1.00      0.95      2251

    accuracy                           0.95     13475
   macro avg       0.95      0.95      0.95     13475
weighted avg       0.95      0.95      0.95     13475


Per-Task Accuracy on Test Set:
Emotion Accuracy: 0.9501


Detailed Classification Report - Emotion:
              precision    recall  f1-score   support

           0       0.99      0.95      0.97      2185
           1       0.99      0.91      0.95      2255
           2       0.93      0.99      0.96      2288
           3       0.95      0.95     

### 8. **TextCNN**

In [35]:
inputs = layers.Input(shape=(max_length,), name='Input_Layer')

# Embedding (same dims as your LSTM)
x = layers.Embedding(input_dim=len(tokenizer.word_index) + 1,
                     output_dim=128,
                     input_length=max_length,
                     name='Embedding_Layer')(inputs)

# Spatial dropout like in your LSTM
x = layers.SpatialDropout1D(0.3, name='Spatial_Dropout_Layer')(x)

# Parallel Conv1D layers with different kernel sizes
conv_pools = []
for ks in (3, 4, 5):
    conv = layers.Conv1D(filters=128, kernel_size=ks, activation='relu', padding='valid')(x)
    pool = layers.GlobalMaxPooling1D()(conv)
    conv_pools.append(pool)

# Concatenate pooled features
if len(conv_pools) > 1:
    x = layers.concatenate(conv_pools, name='Concat_Conv_GlobalMaxPool')
else:
    x = conv_pools[0]

# Dense head similar capacity to your LSTM's dense
x = layers.Dense(64, activation='relu', name='Dense_Layer1')(x)
x = layers.Dropout(0.3, name='Dropout_Layer1')(x)

# Output
outputs = layers.Dense(6, activation='softmax', name='Output_Layer')(x)

model_cnn = models.Model(inputs=inputs, outputs=outputs, name='TextCNN_Functional')

model_cnn.compile(
    loss='categorical_crossentropy',
    optimizer=keras.optimizers.Adam(1e-3),
    metrics=['accuracy']
)

model_cnn.summary()

Model: "TextCNN_Functional"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Input_Layer (InputLayer)       [(None, 50)]         0           []                               
                                                                                                  
 Embedding_Layer (Embedding)    (None, 50, 128)      4270720     ['Input_Layer[0][0]']            
                                                                                                  
 Spatial_Dropout_Layer (Spatial  (None, 50, 128)     0           ['Embedding_Layer[0][0]']        
 Dropout1D)                                                                                       
                                                                                                  
 conv1d (Conv1D)                (None, 48, 128)      49280       ['Spatial_Dropou

#### 8.1 **Data Training(TextCNN)**

In [36]:
history_cnn = model_cnn.fit(
    emotion_train,
    emotion_labels_train_encoded,
    validation_data=(emotion_valid, emotion_labels_valid_encoded),
    epochs=2,
    batch_size=4,
    verbose=1
)


Epoch 1/2
Epoch 2/2
Epoch 2/2


In [37]:
model_cnn.save(r'E:\PROJECTS\NLP Project\models\emotion_textcnn_model.h5')

#### 8.2 **Data Evaluation(TextCNN)**

In [38]:
# Evaluate on test data
test_results = model_cnn.evaluate(emotion_test, emotion_labels_test_encoded, verbose=1)

print("\n" + "="*50)
print("Test Results:")
print("="*50)
print(f"Test Loss: {test_results[0]:.4f}")
print(f"Test Accuracy: {test_results[1]:.4f}")
print("="*50)


Test Results:
Test Loss: 0.1585
Test Accuracy: 0.9422

Test Results:
Test Loss: 0.1585
Test Accuracy: 0.9422


In [39]:
# Get predictions on test data
test_predictions = model_cnn.predict(emotion_test)

# Extract predicted labels
emotion_pred_labels = np.argmax(test_predictions, axis=1)

# Calculate accuracy
from sklearn.metrics import accuracy_score, classification_report

emotion_accuracy = accuracy_score(emotion_labels_test, emotion_pred_labels)

print("\n" + "="*50)
print("Per-Task Accuracy on Test Set:")
print("="*50)
print(f"Emotion Accuracy: {emotion_accuracy:.4f}")
print("="*50)

print("\n\nDetailed Classification Report - Emotion:")
print(classification_report(emotion_labels_test, emotion_pred_labels))


Per-Task Accuracy on Test Set:
Emotion Accuracy: 0.9422


Detailed Classification Report - Emotion:
              precision    recall  f1-score   support

           0       0.96      0.95      0.96      2185
           1       0.99      0.89      0.94      2255
           2       0.93      0.99      0.96      2288
           3       0.95      0.95      0.95      2206
           4       0.92      0.89      0.91      2290
           5       0.91      0.99      0.94      2251

    accuracy                           0.94     13475
   macro avg       0.94      0.94      0.94     13475
weighted avg       0.94      0.94      0.94     13475


Per-Task Accuracy on Test Set:
Emotion Accuracy: 0.9422


Detailed Classification Report - Emotion:
              precision    recall  f1-score   support

           0       0.96      0.95      0.96      2185
           1       0.99      0.89      0.94      2255
           2       0.93      0.99      0.96      2288
           3       0.95      0.95     