In [8]:
# Mount the user's Google Drive inside the Colab VM at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
import os

# Run the rest of the notebook from the Drive root directory.
drive_root = '/content/gdrive/MyDrive'
os.chdir(drive_root)

# Show the active working directory as a sanity check.
print(os.getcwd())

/content/gdrive/MyDrive


In [12]:
# Load the emotions dataset; the relative path resolves against the current working directory.
emo = pd.read_csv('emotions.csv')

In [13]:
# Preview the first rows of the dataset.
emo.head()

Unnamed: 0,text,label
0,i just feel really helpless and heavy hearted,4
1,ive enjoyed being able to slouch about relax a...,0
2,i gave up my internship with the dmrg and am f...,4
3,i dont know i feel so lost,0
4,i am a kindergarten teacher and i am thoroughl...,4


In [14]:
# Label encoding: 0 = sadness, 1 = joy, 2 = love, 3 = anger, 4 = fear, 5 = surprise
# List the distinct label values that actually occur in the data.
emo['label'].unique()

array([4, 0, 2, 1, 5, 3])

In [17]:
# Count rows per label, most frequent first, to gauge the class imbalance.
emo['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,141067
0,121187
3,57317
4,47712
2,34554
5,14972


In [18]:
# Summarize column dtypes and non-null counts.
emo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416809 entries, 0 to 416808
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    416809 non-null  object
 1   label   416809 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 6.4+ MB


In [19]:
# Per-label counts ordered by label id; dropna=False would also surface missing labels as their own row.
emo['label'].value_counts(dropna=False).sort_index()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,121187
1,141067
2,34554
3,57317
4,47712
5,14972


In [20]:
# Same per-label counts as the previous cell, printed as plain text.
print(emo['label'].value_counts(dropna=False).sort_index())

label
0    121187
1    141067
2     34554
3     57317
4     47712
5     14972
Name: count, dtype: int64


In [21]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
# Features are the raw texts; the target is the integer emotion label.
X = emo['text']
y = emo['label']

# --- 2. Train/Test Split (Stratified) ---
# Stratifying on y keeps the imbalanced class proportions the same in both sets
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25, # 75% train / 25% test
    random_state=42,
    stratify=y
)

In [23]:
# --- 3. Initialize and Fit the TF-IDF Vectorizer ---
# Baseline settings: lowercase the text, drop English stop words, use
# unigrams and bigrams, and cap the vocabulary at 10,000 features.
tfidf_params = dict(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 2),
    max_features=10000,
)
tfidf = TfidfVectorizer(**tfidf_params)

# Learn the vocabulary and IDF weights from the training split only, then
# project the test split onto that same fitted vocabulary.
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

In [24]:
# --- Output Summary ---
# Report the matrix shapes and the vocabulary size. The cap shown in the last
# line is read from the vectorizer so the message cannot drift from the
# actual setting; the constant messages are plain strings (no f-prefix needed).
print("✅ Data split completed.")
print("✅ TF-IDF Vectorization complete.")
print(f"Training Data Shape: {X_train_vec.shape}")
print(f"Testing Data Shape: {X_test_vec.shape}")
print(f"Total Features (vocabulary size): {X_train_vec.shape[1]} (limited to {tfidf.max_features})")

✅ Data split completed.
✅ TF-IDF Vectorization complete.
Training Data Shape: (312606, 10000)
Testing Data Shape: (104203, 10000)
Total Features (vocabulary size): 10000 (limited to 10000)


In [25]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score

In [26]:
# --- 1. Initialize and Train the MNB Model ---
# Baseline: Multinomial Naive Bayes with default hyperparameters,
# fitted on the TF-IDF training matrix.
print("Starting Multinomial Naive Bayes training...")
mnb_model = MultinomialNB().fit(X_train_vec, y_train)
print("MNB training complete.")

Starting Multinomial Naive Bayes training...
MNB training complete.


In [27]:
# --- 2. Make Predictions ---
# Predict emotion labels for the held-out test set from its TF-IDF matrix
y_pred_mnb = mnb_model.predict(X_test_vec)

In [28]:
# --- 3. Evaluate Performance ---
# Weighted F1 averages the per-class F1 scores weighted by class support, so it is
# dominated by the large classes. To judge the rare classes, look at the per-class
# rows and the macro average of a classification report as well.
weighted_f1 = f1_score(y_test, y_pred_mnb, average='weighted')

In [29]:
# Show the headline weighted F1 followed by the per-class breakdown.
mnb_report = classification_report(y_test, y_pred_mnb)
print("\n--- MNB Model Performance on Test Set ---")
print(f"Weighted F1-Score (CRITICAL METRIC): {weighted_f1:.4f}")
print("\nClassification Report (Detail by Class):")
print(mnb_report)


--- MNB Model Performance on Test Set ---
Weighted F1-Score (CRITICAL METRIC): 0.8750

Classification Report (Detail by Class):
              precision    recall  f1-score   support

           0       0.90      0.95      0.93     30297
           1       0.84      0.97      0.90     35267
           2       0.91      0.59      0.72      8639
           3       0.94      0.85      0.89     14329
           4       0.88      0.81      0.84     11928
           5       0.89      0.47      0.62      3743

    accuracy                           0.88    104203
   macro avg       0.89      0.77      0.82    104203
weighted avg       0.88      0.88      0.87    104203



In [30]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, f1_score
from sklearn.multiclass import OneVsRestClassifier

In [31]:
# --- 1. Initialize and Train the Linear SVC Model ---

# LinearSVC suits this large, sparse, high-dimensional data (10,000 TF-IDF features).
# Key parameters:
# 1. class_weight='balanced': weights classes inversely to their frequency, so errors on rare classes (Surprise/Love) cost more.
# 2. dual=False: preferred when n_samples > n_features, which holds here (~312k training rows vs 10k features).
# 3. max_iter: raised to give the solver more room to converge on this much data.
# 4. random_state: for reproducibility.

# LinearSVC handles multiclass problems natively using a One-vs-Rest strategy.
lsvc_model = LinearSVC(
    class_weight='balanced',
    dual=False,
    max_iter=1500,
    random_state=42
)

In [32]:
# Fit the class-weighted Linear SVC on the TF-IDF training matrix.
print("Starting Linear SVC training with class weighting...")
lsvc_model.fit(X_train_vec, y_train)
print("Linear SVC training complete.")

# --- 2. Make Predictions ---
# Predict emotion labels for the held-out test set from its TF-IDF matrix
y_pred_lsvc = lsvc_model.predict(X_test_vec)

Starting Linear SVC training with class weighting...
Linear SVC training complete.


In [33]:
# --- 3. Evaluate Performance ---
# Calculate the F1-score and generate the full classification report.
# The score gets its own model-specific name so it does not overwrite the MNB
# result held in `weighted_f1`; both stay available for comparison.
weighted_f1_lsvc = f1_score(y_test, y_pred_lsvc, average='weighted')

print("\n--- Linear SVC Model Performance on Test Set ---")
print(f"Weighted F1-Score (CRITICAL METRIC): {weighted_f1_lsvc:.4f}")
print("\nClassification Report (Detail by Class):")
print(classification_report(y_test, y_pred_lsvc))


--- Linear SVC Model Performance on Test Set ---
Weighted F1-Score (CRITICAL METRIC): 0.9097

Classification Report (Detail by Class):
              precision    recall  f1-score   support

           0       0.96      0.93      0.95     30297
           1       0.96      0.90      0.93     35267
           2       0.75      0.91      0.82      8639
           3       0.90      0.93      0.91     14329
           4       0.86      0.85      0.86     11928
           5       0.70      0.85      0.77      3743

    accuracy                           0.91    104203
   macro avg       0.86      0.90      0.87    104203
weighted avg       0.91      0.91      0.91    104203



In [34]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# --- Configuration ---
MAX_WORDS = 20000        # Vocabulary cap passed to the Tokenizer (num_words) and to the model builder as the Embedding input_dim
MAX_LEN = 50             # Sequence length in tokens; longer texts are truncated, shorter ones are padded
EMBEDDING_DIM = 100      # Size of the dense vector for each word (embedding)
NUM_CLASSES = len(np.unique(y_train)) # Number of distinct labels in the training split (6 for this dataset)

In [35]:
# --- 1. Tokenization ---
# Build the word -> integer index from the training texts only; words outside the
# kept vocabulary are represented by the "<unk>" out-of-vocabulary token.
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<unk>")
tokenizer.fit_on_texts(X_train)

# Convert each text into a list of integer word ids using that fitted index
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

In [36]:
# --- 2. Padding and Truncation ---
# Bring every sequence to exactly MAX_LEN ids: short ones get zeros appended,
# long ones lose their tail. Both splits share the same settings.
pad_kwargs = dict(maxlen=MAX_LEN, padding='post', truncating='post')
X_train_padded = pad_sequences(X_train_sequences, **pad_kwargs)
X_test_padded = pad_sequences(X_test_sequences, **pad_kwargs)

In [37]:
# --- 3. One-Hot Encode the Target ---
# categorical_crossentropy expects one-hot rows, so encode both label vectors
# with the same number of classes.
y_train_ohe, y_test_ohe = [
    to_categorical(labels, num_classes=NUM_CLASSES)
    for labels in (y_train, y_test)
]

In [38]:
# Confirm the shapes of the arrays that will be fed to the network.
summary_lines = [
    "✅ Deep Learning Preprocessing Complete.",
    f"X_train_padded shape: {X_train_padded.shape}",
    f"y_train_ohe shape: {y_train_ohe.shape}",
]
print("\n".join(summary_lines))

✅ Deep Learning Preprocessing Complete.
X_train_padded shape: (312606, 50)
y_train_ohe shape: (312606, 6)


In [39]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, f1_score
import numpy as np

# --- Configuration (from previous step) ---
# MAX_WORDS = 20000
# MAX_LEN = 50
# EMBEDDING_DIM = 100
# NUM_CLASSES = 6 (derived from y_train)

In [40]:
def create_lstm_model(max_words, max_len, embedding_dim, num_classes):
    """Build and compile a single-layer LSTM text classifier.

    Architecture: Embedding -> LSTM(128) -> Dense(64, relu) -> Dropout(0.3)
    -> Dense(num_classes, softmax), compiled with Adam and categorical
    cross-entropy, so targets must be one-hot encoded.

    Args:
        max_words: Vocabulary size; used as the Embedding ``input_dim``.
        max_len: Length of each padded input sequence.
        embedding_dim: Size of each word vector produced by the Embedding layer.
        num_classes: Number of output classes (units in the softmax layer).

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()

    # 0. Explicit input spec. This replaces Embedding's `input_length` argument,
    #    which Keras 3 deprecates, and builds the model up front so that
    #    `summary()` can report output shapes and parameter counts before training.
    model.add(tf.keras.Input(shape=(max_len,)))

    # 1. Embedding Layer: Converts integers into dense vectors (Word Embeddings)
    model.add(Embedding(
        input_dim=max_words,        # Vocabulary size
        output_dim=embedding_dim    # Size of the dense word vector
    ))

    # 2. LSTM Layer: Learns temporal dependencies and context (word order)
    # NOTE(review): a non-zero recurrent_dropout is documented to rule out the fused
    # cuDNN LSTM kernel on GPU, which may explain slow epochs — confirm it is worth it.
    model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))

    # 3. Dense Classification Layers
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))

    # 4. Output Layer: one unit per class, softmax for a probability distribution
    model.add(Dense(num_classes, activation='softmax'))

    # Compile the model
    model.compile(
        optimizer='adam',
        # Categorical cross-entropy matches one-hot encoded multiclass targets
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

# Build the model from the notebook-level configuration constants.
lstm_model = create_lstm_model(MAX_WORDS, MAX_LEN, EMBEDDING_DIM, NUM_CLASSES)
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
lstm_model.summary()



None


In [41]:
# Early stopping: halt training once validation loss stops improving and roll the
# model back to the weights from its best epoch
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3, # Stop after 3 epochs with no improvement
    restore_best_weights=True
)

In [42]:
print("\nStarting LSTM Model Training...")
# Train on the padded sequences with one-hot targets; the early-stopping callback
# may end the run before all epochs are used.
history = lstm_model.fit(
    X_train_padded,
    y_train_ohe,
    epochs=10,                 # Upper bound on the number of epochs
    batch_size=128,
    validation_split=0.1,      # Hold out 10% of the training data for validation
    callbacks=[early_stopping],
    verbose=1
)
print("LSTM training complete.")


Starting LSTM Model Training...
Epoch 1/10
[1m2199/2199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m659s[0m 295ms/step - accuracy: 0.3334 - loss: 1.5844 - val_accuracy: 0.3415 - val_loss: 1.5675
Epoch 2/10
[1m2199/2199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m644s[0m 293ms/step - accuracy: 0.3381 - loss: 1.5681 - val_accuracy: 0.5053 - val_loss: 0.9611
Epoch 3/10
[1m2199/2199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m637s[0m 290ms/step - accuracy: 0.7839 - loss: 0.5227 - val_accuracy: 0.9378 - val_loss: 0.1008
Epoch 4/10
[1m2199/2199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m637s[0m 290ms/step - accuracy: 0.9371 - loss: 0.1100 - val_accuracy: 0.9395 - val_loss: 0.0947
Epoch 5/10
[1m2199/2199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m641s[0m 292ms/step - accuracy: 0.9396 - loss: 0.0977 - val_accuracy: 0.9392 - val_loss: 0.0930
Epoch 6/10
[1m2199/2199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m688s[0m 294ms/step - accuracy: 0.9403 - loss: 0.0939 -

In [43]:
# --- 3. Final Evaluation ---

# Class-probability matrix for the test set
y_pred_probas = lstm_model.predict(X_test_padded)

# Collapse the probability rows and the one-hot target rows to integer class ids
# so they can be passed to the sklearn metrics
y_pred_lstm = y_pred_probas.argmax(axis=1)
y_test_labels = y_test_ohe.argmax(axis=1)

[1m3257/3257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 25ms/step


In [44]:
# Score the LSTM predictions against the true test labels.
lstm_report = classification_report(y_test_labels, y_pred_lstm)
weighted_f1_lstm = f1_score(y_test_labels, y_pred_lstm, average='weighted')

# Headline metric first, then the per-class breakdown.
print("\n--- LSTM Model Performance on Test Set ---")
print(f"Weighted F1-Score (CRITICAL METRIC): {weighted_f1_lstm:.4f}")
print("\nClassification Report (Detail by Class):")
print(lstm_report)


--- LSTM Model Performance on Test Set ---
Weighted F1-Score (CRITICAL METRIC): 0.9389

Classification Report (Detail by Class):
              precision    recall  f1-score   support

           0       0.99      0.95      0.97     30297
           1       0.97      0.93      0.95     35267
           2       0.80      0.91      0.85      8639
           3       0.93      0.95      0.94     14329
           4       0.85      0.98      0.91     11928
           5       0.89      0.72      0.80      3743

    accuracy                           0.94    104203
   macro avg       0.91      0.91      0.91    104203
weighted avg       0.94      0.94      0.94    104203



In [45]:
import joblib
import os
import json
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

# --- Configuration (Ensure tokenizer and lstm_model are available) ---
# This section relies on the `tokenizer` and `lstm_model` objects created earlier in the notebook.

MODEL_DIR = 'lstm_deployment_artifacts'
KERAS_MODEL_FILENAME = 'emotion_lstm_model.keras'
TOKENIZER_FILENAME = 'emotion_tokenizer.json'
FEATURES_FILENAME = 'emotion_features.joblib' # Despite the name, this file holds the label-id -> emotion-name mapping

# Create the deployment directory (relative to the current working directory) if it does not exist yet
os.makedirs(MODEL_DIR, exist_ok=True)

In [46]:
# 1. Save the Keras Model (Structure and Weights)
# The native Keras format is selected by the '.keras' file extension. The explicit
# `save_format` argument is deprecated in Keras 3, so it is no longer passed.
tf.keras.models.save_model(
    lstm_model,
    os.path.join(MODEL_DIR, KERAS_MODEL_FILENAME)
)



In [47]:
# 2. Save the Tokenizer Vocabulary
# NOTE(review): Tokenizer.to_json() already returns a JSON string, so json.dumps() encodes it a
# second time and the file holds one JSON string literal wrapping the tokenizer JSON.
# A loader therefore has to decode once first, e.g. tokenizer_from_json(json.load(f)) —
# confirm the deployment code reads the file this way before changing the format.
tokenizer_json = tokenizer.to_json()
with open(os.path.join(MODEL_DIR, TOKENIZER_FILENAME), 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer_json, ensure_ascii=False))

In [48]:
# 3. Save the Class Mapping (For converting 0-5 back to emotion names)
# Maps each integer label id to its emotion name; stored next to the model so
# predictions can be turned back into readable labels.
emotion_labels = {
    0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'
}
joblib.dump(emotion_labels, os.path.join(MODEL_DIR, FEATURES_FILENAME))


print("✅ LSTM Deployment Artifacts successfully saved.")
print(f"Model saved to: {os.path.join(MODEL_DIR, KERAS_MODEL_FILENAME)}")
print(f"Tokenizer saved to: {os.path.join(MODEL_DIR, TOKENIZER_FILENAME)}")
# Take the folder name from MODEL_DIR so the reminder stays correct if the directory is renamed.
print(f"Remember to download the '{MODEL_DIR}' folder.")

✅ LSTM Deployment Artifacts successfully saved.
Model saved to: lstm_deployment_artifacts/emotion_lstm_model.keras
Tokenizer saved to: lstm_deployment_artifacts/emotion_tokenizer.json
Remember to download the 'lstm_deployment_artifacts' folder.
