# NLP Emotion Classification with DistilBERT
This notebook contains the full workflow for loading the dataset, preprocessing text, tokenizing using DistilBERT, fine-tuning a classifier, evaluating it, and testing predictions.

Each code block is explained in detail.

## 1. Install Required Libraries
We install HuggingFace Transformers, Datasets, Accelerate, and visualization tools.

In [None]:
!pip install -U transformers accelerate datasets bertviz 

## 2. Load the Emotion Dataset
We load the `dair-ai/emotion` dataset from HuggingFace containing 6 emotion labels.

In [None]:
from datasets import load_dataset

# Load the dataset by its full Hub repository id instead of the bare legacy name.
emotion = load_dataset('dair-ai/emotion')
# Show one raw training example to see what a record looks like.
print(emotion['train'][0])

## 3. Convert Dataset to Pandas for Exploration
Exploratory Data Analysis helps us understand distributions and text length.

In [None]:
import pandas as pd

# Export the training split to a DataFrame without changing the output format
# of `emotion` itself, so later cells do not depend on a format reset having run.
df = emotion['train'].to_pandas()
df.head()

## 4. Visualize Label Distribution

In [None]:
import matplotlib.pyplot as plt

# Count the training rows per label id and draw the counts as a bar chart.
label_counts = df['label'].value_counts()
fig, ax = plt.subplots()
label_counts.plot.bar(ax=ax)
ax.set_title('Label Frequency')
plt.show()

## 5. Load Tokenizer and Tokenize Text
DistilBERT requires numerical input (token IDs + attention masks).

In [None]:
import os
# Presumably meant to silence the Hugging Face Hub symlink warning before the
# tokenizer files are downloaded.
# NOTE(review): `datasets` is imported in an earlier cell; if the Hub client reads
# this variable when it is first imported, setting it here may be too late —
# confirm, and move it to the first code cell if so.
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'

from transformers import AutoTokenizer
# Checkpoint name used for the tokenizer here and reused when the model is loaded.
model_ckpt = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

def tokenize(batch):
    """Tokenize the ``'text'`` entries of ``batch`` with the notebook's tokenizer.

    Padding and truncation are both enabled. The tokenizer's output is
    returned unchanged.
    """
    texts = batch['text']
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded

## 6. Apply Tokenization to Entire Dataset

In [None]:
# Undo any custom output format set on the dataset before mapping over it.
emotion.reset_format()
# Apply `tokenize` to the dataset in batches and keep the encoded result.
emotions_encoded = emotion.map(function=tokenize, batched=True)
emotions_encoded

## 7. Load DistilBERT for Sequence Classification
We use a pretrained model with a classification head.

In [None]:
import torch
from transformers import AutoModelForSequenceClassification
import warnings
warnings.filterwarnings('ignore')

# The classifier needs one output per class name declared on the label feature.
num_labels = len(emotion['train'].features['label'].names)
# Use CUDA when PyTorch reports it as available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Load the pretrained checkpoint with a classification head, then move it to `device`.
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels)
model = model.to(device)

## 8. Define Training Arguments and Metrics

In [None]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a prediction object.

    ``pred`` must expose ``label_ids`` (the reference labels) and
    ``predictions`` (per-class scores); the predicted class is the argmax
    over the last axis of ``predictions``.

    Returns a dict with the keys ``'accuracy'`` and ``'f1'``.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    accuracy = accuracy_score(y_true, y_hat)
    weighted_f1 = f1_score(y_true, y_hat, average='weighted')
    return {'accuracy': accuracy, 'f1': weighted_f1}

# Hyperparameters for the fine-tuning run, collected in one place before being
# handed to TrainingArguments.
training_config = {
    'output_dir': 'distilbert-emotion',
    'num_train_epochs': 2,
    'per_device_train_batch_size': 64,
    'per_device_eval_batch_size': 64,
    'eval_strategy': 'epoch',
    'learning_rate': 2e-5,
    'weight_decay': 0.01,
}
training_args = TrainingArguments(**training_config)

## 9. Train the Model

In [8]:
import inspect

# Newer transformers releases take the tokenizer through `processing_class`
# (the `tokenizer` keyword is deprecated); older releases only know `tokenizer`.
# Pick whichever keyword the installed Trainer accepts so the cell runs on both.
trainer_params = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = 'processing_class' if 'processing_class' in trainer_params else 'tokenizer'

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=emotions_encoded['train'],
    eval_dataset=emotions_encoded['validation'],
    compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer}
)

trainer.train()

KeyboardInterrupt: 

## 10. Evaluate on Test Set

In [None]:
# Run the trainer's prediction loop over the encoded test split and show its metrics.
test_results = trainer.predict(test_dataset=emotions_encoded['test'])
test_results.metrics

## 11. Test Model on Custom Sentence

In [None]:
text = 'im so happy today!'
inputs = tokenizer(text, return_tensors='pt', truncation=True).to(device)

# Put the model in evaluation mode explicitly: if training was interrupted or
# this cell is run on its own, the model could still be in training mode and
# dropout would make the prediction vary between runs.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Highest-scoring class index, mapped back to its label name.
pred = torch.argmax(outputs.logits, dim=1).item()
emotion['train'].features['label'].names[pred]

## 12. Evaluate The Model

In [None]:
# Evaluate on test set: classification report + confusion matrix (counts and normalized)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

# Run predictions on the test set (uses the Trainer)
pred_out = trainer.predict(emotions_encoded['test'])
y_true = pred_out.label_ids
y_pred = np.argmax(pred_out.predictions, axis=1)
label_names = emotion['train'].features['label'].names

# Basic metrics
print("Accuracy:", accuracy_score(y_true, y_pred))
print("Weighted F1:", f1_score(y_true, y_pred, average='weighted'))
print("\nClassification Report:\n")
print(classification_report(y_true, y_pred, target_names=label_names, digits=4))

# Confusion matrix (counts). Passing every label id keeps the matrix square and
# aligned with `label_names` even if some class never appears in y_true/y_pred.
cm = confusion_matrix(y_true, y_pred, labels=np.arange(len(label_names)))
fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_names, yticklabels=label_names, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix (counts)')
plt.tight_layout()
plt.show()

# Confusion matrix Normalized: each row is divided by its number of true
# examples; a row with no examples stays all zeros instead of becoming NaN.
row_totals = cm.sum(axis=1, keepdims=True)
cmn = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals > 0)
fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(cmn, annot=True, fmt='.2f', cmap='Blues', xticklabels=label_names, yticklabels=label_names, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix (normalized)')
plt.tight_layout()
plt.show()


## 13. Test Model on Custom Sentences


In [None]:
import torch
import numpy as np

label_names = emotion['train'].features['label'].names

def predict_text(text):
    """Classify ``text`` with the fine-tuned model.

    Returns a ``(pred_idx, probs)`` tuple: ``probs`` is the softmax of the
    logits for the first input as a NumPy array, and ``pred_idx`` is the
    integer position of its largest entry.
    """
    model.eval()
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(device)
    with torch.no_grad():
        logits = model(**encoded).logits
    # Normalise the logits per row, then keep only the first row.
    all_probs = torch.softmax(logits, dim=1).cpu().numpy()
    probs = all_probs[0]
    return int(np.argmax(probs)), probs

# A few hand-written sentences to sanity-check the classifier.
examples = [
    "I am extremely happy today!",
    "I feel so sad and alone.",
    "That was embarrassing and humiliating."
]

# Print the predicted label and the full probability breakdown for each sentence.
for sentence in examples:
    label_id, class_probs = predict_text(sentence)
    print("Text:", sentence)
    print(" Predicted:", label_names[label_id], f"(label {label_id})")
    print(" Probabilities:")
    for class_name, prob in zip(label_names, class_probs):
        print(f"  - {class_name}: {prob:.4f}")
    print()


## 14. Test Model on Customer Reviews

In [None]:
# Analyze customer reviews (CSV) with the trained model
# Place a CSV named 'customer_reviews.csv' in the notebook folder with a column named 'review'.
import os
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
from tqdm import tqdm


def _predict_review_probs(texts, batch_size=32):
    """Return class probabilities for a non-empty list of review strings.

    The texts are tokenized and scored in chunks of ``batch_size`` with the
    model in evaluation mode and gradients disabled.

    Returns a float64 NumPy array with one row per text, holding the softmax
    of that text's logits.
    """
    model.eval()
    chunks = []
    for start in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[start:start + batch_size]
        enc = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True).to(device)
        with torch.no_grad():
            out = model(**enc)
        chunks.append(torch.softmax(out.logits, dim=1).cpu().numpy())
    # float64 keeps the values written to the CSV as plain Python-style floats.
    return np.concatenate(chunks).astype(np.float64)


csv_path = 'customer_reviews.csv'  # change this path if needed
if not os.path.exists(csv_path):
    print(f"File not found: {csv_path}\nPlease upload a CSV with a column named 'review' and re-run this cell.")
else:
    df_reviews = pd.read_csv(csv_path)
    if 'review' not in df_reviews.columns:
        print("CSV must contain a column named 'review'. Columns found:\n", df_reviews.columns.tolist())
    else:
        # A missing review would otherwise be stringified to the literal text
        # 'nan' and classified as if a customer had written it, so skip those rows.
        n_missing = int(df_reviews['review'].isna().sum())
        if n_missing:
            print(f"Skipping {n_missing} row(s) with a missing review.")
            df_reviews = df_reviews.dropna(subset=['review']).reset_index(drop=True)

        if df_reviews.empty:
            # Nothing to score; the plotting and summary steps need at least one row.
            print("No reviews to classify.")
        else:
            texts = df_reviews['review'].astype(str).tolist()
            probs_all = _predict_review_probs(texts)

            df_reviews['pred_label'] = probs_all.argmax(axis=1)
            label_names = emotion['train'].features['label'].names
            df_reviews['pred_label_name'] = df_reviews['pred_label'].apply(lambda x: label_names[int(x)])

            # Add probability columns for each class
            for idx, name in enumerate(label_names):
                df_reviews[f'prob_{name}'] = probs_all[:, idx]

            # Summary distribution
            dist = df_reviews['pred_label_name'].value_counts()
            print('\nPredicted label distribution:\n')
            print(dist)

            # Plot distribution
            plt.figure(figsize=(8,5))
            dist.plot.bar()
            plt.title('Predicted Label Distribution (customer reviews)')
            plt.xlabel('Label')
            plt.ylabel('Count')
            plt.tight_layout()
            plt.show()

            # Save results
            out_csv = 'customer_reviews_with_predictions.csv'
            df_reviews.to_csv(out_csv, index=False)
            print(f"Saved predictions to {out_csv}")

            # Show a few examples per label
            for name in label_names:
                subset = df_reviews[df_reviews['pred_label_name'] == name]
                if len(subset) > 0:
                    print(f"\nExamples predicted as '{name}': (showing up to 3)")
                    display(subset[['review', f'prob_{name}']].head(3))
                else:
                    print(f"\nNo examples predicted as '{name}'")