In [22]:
# Load the "dair-ai/emotion" dataset; later cells read its
# "train", "validation" and "test" splits.
from datasets import load_dataset

dataset = load_dataset(path="dair-ai/emotion")

In [4]:
# Export every split of the dataset to its own CSV file in the working directory;
# the analysis cell further down reloads these files with pandas.
# index=False is passed so that no row-index column is written (the keyword is
# presumably forwarded to the underlying pandas CSV writer — TODO confirm).
dataset["train"].to_csv("train_emotion.csv", index=False)
dataset["validation"].to_csv("val_emotion.csv", index=False)
# Last expression of the cell: its return value is the number the notebook echoes
# after the progress bars (presumably the size of the written file — TODO confirm).
dataset["test"].to_csv("test_emotion.csv", index=False)

Creating CSV from Arrow format:   0%|          | 0/16 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

201185

In [2]:
#Analyzing the dataset
import pandas as pd

test_df = pd.read_csv("test_emotion.csv")
train_df = pd.read_csv("train_emotion.csv")
val_df = pd.read_csv("val_emotion.csv")

# Show the class distribution of the 'label' column over all three splits combined.
# The labels are pooled first and counted once: adding the per-split value_counts()
# with `+` aligns on the label index, so a label that is absent from any one split
# would come out as NaN (and the whole result would be upcast to float).
all_labels = pd.concat(
    [train_df['label'], val_df['label'], test_df['label']],
    ignore_index=True,
)
print(all_labels.value_counts())


label
1    6761
0    5797
3    2709
4    2373
2    1641
5     719
Name: count, dtype: int64


In [None]:
# List the emotion name behind every integer class id
label_feature = dataset['train'].features['label']
label_names = label_feature.names
for class_id, emotion_name in enumerate(label_names):
    print(f"{class_id} → {emotion_name}")


0 → sadness
1 → joy
2 → love
3 → anger
4 → fear
5 → surprise


In [17]:
#importing libraries
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from imblearn.combine import SMOTEENN
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.metrics import classification_report, accuracy_score

In [5]:
#Text Cleaning
def clean_text(text):
    """Normalise a raw text sample before vectorisation.

    The text is lower-cased, every character other than the ASCII letters
    ``a``-``z`` and whitespace is dropped (punctuation, digits, accented
    letters, emoji), and runs of whitespace are collapsed to a single space.

    Args:
        text: The raw sample. Missing values (``None`` or a float ``NaN``,
            which is what pandas yields for an empty CSV cell) are treated
            as empty text; any other non-string value is converted with
            ``str()`` first.

    Returns:
        str: The cleaned text, possibly the empty string.
    """
    # `text != text` is true only for NaN, so no extra import is needed.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    text = re.sub(r"[^a-z\s]", "", text)  # remove punctuation and numbers
    text = re.sub(r"\s+", " ", text).strip()  # remove extra spaces
    return text

# Clean the text column of every split in place, with the same rules for all three.
for split_df in (train_df, val_df, test_df):
    split_df['text'] = split_df['text'].apply(clean_text)

In [6]:
#Label Encoding
le = LabelEncoder()
y_train = le.fit_transform(train_df['label'])  # Fit on train
y_val = le.transform(val_df['label'])         # Transform only
y_test = le.transform(test_df['label'])

In [None]:
# TF-IDF vectorization: the vectorizer is fitted on the training text only, and
# the validation/test text is transformed with that already-fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_df['text'])
X_val, X_test = (vectorizer.transform(split_df['text']) for split_df in (val_df, test_df))

In [8]:
# Class imbalance: resample the training set with SMOTE-ENN
# (SMOTE over-sampling followed by Edited-Nearest-Neighbours cleaning).
# Only the training data is resampled; random_state pins the seed.
smote_enn = SMOTEENN(random_state=42)
resampled = smote_enn.fit_resample(X_train, y_train)
X_train_balanced, y_train_balanced = resampled


In [None]:
# Compare the training matrix size (rows, features) before and after resampling
shape_report = {
    "Original train shape:": X_train.shape,
    "Balanced train shape:": X_train_balanced.shape,
}
for caption, matrix_shape in shape_report.items():
    print(caption, matrix_shape)

Original train shape: (16000, 15186)
Balanced train shape: (22554, 15186)


In [11]:
# 1. Fit a logistic-regression classifier on the resampled training data.
# NOTE(review): the training set was already rebalanced by SMOTEENN, and
# class_weight='balanced' re-weights the classes on top of that — confirm the
# double correction is intended.
lr_settings = dict(max_iter=1000, class_weight='balanced', random_state=42)
lr_model = LogisticRegression(**lr_settings)
lr_model.fit(X_train_balanced, y_train_balanced)

In [12]:
# 2. Run the fitted model on the validation and test features
#    (these matrices were vectorized but never resampled)
y_val_pred, y_test_pred = (lr_model.predict(features) for features in (X_val, X_test))

In [18]:
# 3. Evaluate performance: per-class report plus overall accuracy for each held-out split
# The report rows are labelled with the encoder's classes rendered as strings.
target_names = list(map(str, le.classes_))

evaluation_runs = (
    ("📊 Validation Set Evaluation:", y_val, y_val_pred),
    ("\n📊 Test Set Evaluation:", y_test, y_test_pred),
)
for heading, y_true, y_pred in evaluation_runs:
    print(heading)
    print(classification_report(y_true, y_pred, target_names=target_names))
    print("Accuracy:", accuracy_score(y_true, y_pred))

📊 Validation Set Evaluation:
              precision    recall  f1-score   support

           0       0.85      0.77      0.81       550
           1       0.99      0.39      0.56       704
           2       0.47      0.98      0.64       178
           3       0.63      0.93      0.75       275
           4       0.55      0.85      0.67       212
           5       0.67      0.93      0.78        81

    accuracy                           0.69      2000
   macro avg       0.69      0.81      0.70      2000
weighted avg       0.79      0.69      0.68      2000

Accuracy: 0.694

📊 Test Set Evaluation:
              precision    recall  f1-score   support

           0       0.87      0.76      0.81       581
           1       0.98      0.36      0.53       695
           2       0.40      0.99      0.57       159
           3       0.60      0.92      0.72       275
           4       0.61      0.88      0.72       224
           5       0.56      0.91      0.69        66

    accu

In [28]:
import joblib

# Persist the three fitted objects that inference needs (classifier, vectorizer,
# label encoder); the next cell loads them back from these files.

# Save Logistic Regression model
joblib.dump(lr_model, 'emotion_model.pkl')

# Save TF-IDF Vectorizer
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

# Save Label Encoder
# (last expression of the cell, so its return value — the list of written
# file names — is what the notebook displays)
joblib.dump(le, 'label_encoder.pkl')


['label_encoder.pkl']

In [27]:
# Restore the persisted model, vectorizer and label encoder from disk.
# joblib.load unpickles its input, so only load files written by this notebook
# or by another trusted source.
lr_model, vectorizer, le = (
    joblib.load(artifact_path)
    for artifact_path in ('emotion_model.pkl', 'tfidf_vectorizer.pkl', 'label_encoder.pkl')
)

# Example prediction: the sample goes through the same cleaning and
# vectorization steps as the training text.
sample_text = ["I'm feeling really great today!"]
cleaned_text = [clean_text(raw_sample) for raw_sample in sample_text]
X_sample = vectorizer.transform(cleaned_text)
pred = lr_model.predict(X_sample)
emotion = le.inverse_transform(pred)

# NOTE: the encoder was fitted on the integer label ids, so this prints the
# class id rather than the emotion name.
print("Predicted emotion:", emotion[0])


Predicted emotion: 1
