In [None]:
from datasets import load_dataset

# Name under which the corpus is requested; the later cells read its 'train' split.
DATASET_ID = "go_emotions"
dataset = load_dataset(DATASET_ID)


In [None]:
import pandas as pd

# Materialise the training split as a DataFrame so it can be explored with pandas
train_split = dataset["train"]
df = pd.DataFrame(train_split)

# Peek at the first rows
df.head()


In [None]:
# Emotion names taken from the dataset's feature metadata; indexed below by integer label id
label_names = dataset["train"].features["labels"].feature.names


def _decode_label_ids(label_ids):
    """Return the emotion names for one row's sequence of integer label ids."""
    return [label_names[label_id] for label_id in label_ids]


# Store the human-readable labels alongside the raw ids
df["decoded_labels"] = df["labels"].apply(_decode_label_ids)
df[["text", "decoded_labels"]].head()


In [None]:
from collections import Counter
import matplotlib.pyplot as plt

# Collapse the per-row label lists into one flat sequence of label ids
all_labels = [label_id for row_labels in df["labels"] for label_id in row_labels]
label_counts = Counter(all_labels)

# Re-key the tallies by emotion name instead of integer id
label_counts_named = {
    label_names[label_id]: count for label_id, count in label_counts.items()
}

# Draw the bar chart through the explicit Figure/Axes interface
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(label_counts_named.keys(), label_counts_named.values())
ax.tick_params(axis="x", labelrotation=45)  # tilt the emotion names on the x axis
ax.set_title("Emotion Label Frequencies")
ax.set_xlabel("Emotion")
ax.set_ylabel("Count")
ax.grid(axis="y")  # horizontal guide lines only
fig.tight_layout()
plt.show()


## 🧠 Step 1: Vectorize the Text Using TF-IDF
We'll convert the journal text entries into numeric vectors using `TfidfVectorizer`, which transforms text into a matrix of term importance.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with the vocabulary limited to at most 5000 features
vectorizer = TfidfVectorizer(max_features=5000)

# Fit on the text column and encode it as a TF-IDF matrix in a single call
corpus = df["text"]
X = vectorizer.fit_transform(corpus)

# Sanity check on the resulting matrix dimensions
print("TF-IDF matrix shape:", X.shape)


## 🎯 Step 2: Prepare Multi-Label Targets (Y)
We’ll transform each row's `decoded_labels` list into a binary indicator matrix (one column per emotion) for multi-label classification.

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Fit the binarizer on the decoded label lists and encode them as the target matrix Y
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(df["decoded_labels"])

# Report the dimensions, then preview the first rows with one column per class
print("Target shape:", Y.shape)
target_preview = pd.DataFrame(Y, columns=mlb.classes_)
target_preview.head()


## 🔍 Step 3: Train a Logistic Regression Model
We'll train a simple multi-label logistic regression using `OneVsRestClassifier`.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Train-test split on the RAW text rather than on a TF-IDF matrix fitted on all
# rows: fitting the vectorizer before splitting lets the vocabulary and IDF
# weights see the held-out text, which leaks test information into evaluation.
text_train, text_test, Y_train, Y_test = train_test_split(
    df['text'], Y, test_size=0.2, random_state=42
)

# Re-fit the existing vectorizer (same settings) on the training text only, then
# apply that fitted transform to the held-out text. After this cell `vectorizer`
# is the one that matches `model`.
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Train model: one independent logistic regression per emotion column of Y
model = OneVsRestClassifier(LogisticRegression(max_iter=1000))
model.fit(X_train, Y_train)

# Predict the binary label matrix for the held-out rows
Y_pred = model.predict(X_test)

# Report per-emotion precision/recall/F1. zero_division=0 scores a class with no
# predicted (or no true) positives as 0 without emitting a warning per class.
print(classification_report(Y_test, Y_pred, target_names=mlb.classes_, zero_division=0))


## 💾 Step 4: Save the Trained Model and Label Encoder
We'll save the trained model, the fitted TF-IDF vectorizer, and the MultiLabelBinarizer using `joblib` so we can reuse them in our API.

In [1]:
import joblib
import os

# Resolve every fitted object up front. If the training cells have not been run
# in this kernel, fail here -- before creating the directory or writing a partial
# set of files -- with a message that says what to do.
try:
    artifacts = {
        "../models/logistic_model.pkl": model,         # trained classifier
        "../models/tfidf_vectorizer.pkl": vectorizer,  # fitted TF-IDF vectorizer
        "../models/label_binarizer.pkl": mlb,          # fitted label binarizer
    }
except NameError as exc:
    raise NameError(
        f"{exc}. Run the vectorizing, label-encoding and training cells above "
        "before saving."
    ) from exc

# Create models directory if it doesn't exist
os.makedirs("../models", exist_ok=True)

# Persist each artifact to its own file
for artifact_path, fitted_object in artifacts.items():
    joblib.dump(fitted_object, artifact_path)

print("✅ Model, vectorizer, and label encoder saved successfully.")


NameError: name 'model' is not defined