# Sentiment Model Training (Complete)

A compact, reproducible pipeline for binary sentiment classification using **TF-IDF + Logistic Regression**.

**What this does:**
1) Load dataset from `datasets/sentiment_training.json` (or use a tiny fallback).
2) Stratified train/validation/test split.
3) Small hyperparameter search over C, n-gram range, and `min_df`.
4) Evaluate with accuracy, F1, ROC-AUC, confusion matrix.
5) Save model + summary JSON to `outputs/`.
6) Provide `infer(texts)` helper and quick error analysis.

In [None]:
# Imports & setup
import os, json, uuid
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import joblib

import matplotlib.pyplot as plt

# Where artifacts are written (directory is created if missing) and where the
# training data is expected to live.
OUT_DIR = Path('outputs')
OUT_DIR.mkdir(exist_ok=True)
DATA_PATH = Path('datasets/sentiment_training.json')

# Seed NumPy's global RNG and mint a random identifier used to tag this run's
# output files.
np.random.seed(42)
RUN_ID = str(uuid.uuid4())
print('RUN_ID:', RUN_ID)

## 1) Load data
Expected JSON schema (as in repo):
```json
{ "text": ["great", ...], "label": [1, 0, ...] }
```
The expanded version also includes extra top-level metadata fields; only `text` and `label` are read, so both variants load.

In [None]:
# Load the corpus into two parallel lists, `texts` and `labels`, then wrap
# them in a DataFrame with columns 'text' and 'label'.
if DATA_PATH.exists():
    with open(DATA_PATH, 'r', encoding='utf-8') as f:
        payload = json.load(f)
    # Only the dict-of-parallel-lists schema is supported. Extra top-level
    # keys (e.g. metadata) are simply ignored; any other shape is rejected.
    if isinstance(payload, dict) and 'text' in payload and 'label' in payload:
        texts, labels = payload['text'], payload['label']
    else:
        raise ValueError('Unsupported JSON format for sentiment_training.json')
    # Fail fast on malformed columns: pandas would silently broadcast a
    # scalar against a list, producing a plausible-looking but wrong frame.
    if not (isinstance(texts, list) and isinstance(labels, list)):
        raise ValueError("'text' and 'label' must both be JSON arrays")
    if len(texts) != len(labels):
        raise ValueError(
            f"'text' and 'label' differ in length: {len(texts)} vs {len(labels)}"
        )
else:
    # Fallback tiny dataset. It is balanced (10 per class) on purpose: the
    # stratified 80/10/10 split below needs at least two rows of each class
    # in the 20% holdout, and the 5-fold CV needs at least five per class in
    # the training part. A 6-row fallback cannot satisfy either.
    fallback = [
        ('great product', 1), ('bad experience', 0),
        ('okay', 1), ('terrible quality', 0),
        ('excellent support', 1), ('meh', 0),
        ('great service', 1), ('bad product', 0),
        ('excellent quality', 1), ('terrible support', 0),
        ('love this product', 1), ('poor quality', 0),
        ('works great', 1), ('awful experience', 0),
        ('very good experience', 1), ('very bad service', 0),
        ('good support', 1), ('broken and useless', 0),
        ('fast and helpful support', 1), ('slow and unhelpful support', 0),
    ]
    texts = [text for text, _ in fallback]
    labels = [label for _, label in fallback]

df = pd.DataFrame({'text': texts, 'label': labels})
print(df.head(), '\nRows:', len(df))

## 2) Train/Val/Test split (stratified)

In [None]:
# Targets are cast to int; inputs are the raw texts coerced to str.
y = np.asarray(df['label']).astype(int)
X = df['text'].astype(str).values

# Two-stage stratified split: hold out 20%, then cut that holdout in half,
# giving 80% train, 10% val, 10% test.
X_train, X_tmp, y_train, y_tmp = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_tmp, y_tmp, stratify=y_tmp, test_size=0.5, random_state=42
)
print('Split sizes:', *(len(part) for part in (X_train, X_val, X_test)))

## 3) Pipeline + hyperparameter search
We keep the search small for speed. Tune **C**, **n-grams**, and **min_df**; use class balancing for robustness on small sets.

In [None]:
# TF-IDF features feeding a class-balanced logistic regression.
vectorizer = TfidfVectorizer()
classifier = LogisticRegression(
    solver='liblinear', class_weight='balanced', max_iter=2000
)
pipe = Pipeline([('tfidf', vectorizer), ('lr', classifier)])

# Grid: 2 n-gram ranges x 2 min_df values x 3 regularisation strengths.
param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__min_df': [1, 2],
    'lr__C': [0.5, 1.0, 2.0],
}

# 5-fold cross-validated search on the training split, scored by F1.
search = GridSearchCV(
    pipe,
    param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=0,
)
search.fit(X_train, y_train)
best = search.best_estimator_
print('Best params:', search.best_params_)

# Score the selected pipeline on the validation split.
val_pred = best.predict(X_val)
if hasattr(best, 'predict_proba'):
    val_proba = best.predict_proba(X_val)[:, 1]
else:
    val_proba = None
val_report = classification_report(y_val, val_pred, digits=3)
print('Validation report:\n', val_report)

# ROC-AUC is computed only when probabilities exist and exactly two classes
# are present in y_val; otherwise NaN is reported.
if val_proba is not None and len(np.unique(y_val)) == 2:
    val_auc = roc_auc_score(y_val, val_proba)
else:
    val_auc = float('nan')
print('Validation ROC-AUC:', round(val_auc, 3))

## 4) Final test evaluation
We report accuracy, precision/recall/F1, ROC-AUC, and confusion matrix. Plots are optional for quick inspection.

In [None]:
# Predictions (and positive-class probabilities, when the model exposes them)
# on the held-out test split.
test_pred = best.predict(X_test)
test_proba = None
if hasattr(best, 'predict_proba'):
    test_proba = best.predict_proba(X_test)[:, 1]

print('Test report:\n', classification_report(y_test, test_pred, digits=3))
cm = confusion_matrix(y_test, test_pred)
print('Confusion matrix:\n', cm)

# The ROC curve and AUC are produced only when probabilities exist and exactly
# two classes are present in y_test.
if test_proba is not None and len(np.unique(y_test)) == 2:
    auc = roc_auc_score(y_test, test_proba)
    fpr, tpr, _ = roc_curve(y_test, test_proba)
    print('ROC-AUC:', round(auc, 3))
    # Optional quick plot: model curve plus the dashed diagonal for reference.
    fig, ax = plt.subplots()
    ax.plot(fpr, tpr, label=f'AUC={auc:.3f}')
    ax.plot([0, 1], [0, 1], '--')
    ax.set_xlabel('FPR')
    ax.set_ylabel('TPR')
    ax.set_title('ROC Curve')
    ax.legend(loc='lower right')
    fig.tight_layout()
    plt.show()

## 5) Save model + summary
Artifacts go to `outputs/` so you can track different runs by `RUN_ID`.

In [None]:
# Local import: `timezone` is not among the names imported at the top.
from datetime import timezone

# Persist the selected pipeline, tagged with this run's id.
model_path = OUT_DIR / f'sentiment_model_{RUN_ID}.joblib'
joblib.dump(best, model_path)
print('Saved model →', model_path)

# Build each report dict once and read the fields we need from it.
val_report_dict = classification_report(y_val, val_pred, output_dict=True)
test_report_dict = classification_report(y_test, test_pred, output_dict=True)

# Macro-averaged F1 on validation. Read the 'f1-score' entry specifically:
# the 'macro avg' dict also holds precision, recall and the support count, so
# averaging all of its values does not yield an F1. Reported as 0.0 when
# y_val contains a single class.
if len(np.unique(y_val)) > 1:
    val_f1_macro = float(val_report_dict['macro avg']['f1-score'])
else:
    val_f1_macro = 0.0

summary = {
    'run_id': RUN_ID,
    # Timezone-aware UTC timestamp (datetime.utcnow() is deprecated), written
    # with the same trailing 'Z' as before.
    'timestamp_utc': datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'),
    'train_size': len(X_train), 'val_size': len(X_val), 'test_size': len(X_test),
    'best_params': search.best_params_,
    'val_f1_macro': val_f1_macro,
    'test_report': test_report_dict,
}
# AUC is added only when probabilities exist and y_test has exactly two classes.
if test_proba is not None and len(np.unique(y_test)) == 2:
    summary['test_auc'] = float(roc_auc_score(y_test, test_proba))

summary_path = OUT_DIR / f'sentiment_model_summary_{RUN_ID}.json'
with open(summary_path, 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2)
print('Saved summary →', summary_path)

## 6) Inference helper + error analysis
A small `infer()` wrapper and a peek at up to 10 misclassified test examples, ordered from least to most confident (closest to the 0.5 decision boundary first).

In [None]:
def infer(texts, model=None, threshold=0.5):
    """Classify raw texts with the saved sentiment model.

    Args:
        texts: A single string or any iterable of strings. Iterables are
            materialised once, so generators are safe to pass.
        model: Optional fitted estimator exposing ``predict_proba``. When
            omitted, the model is loaded from ``model_path`` on every call;
            pass an already-loaded model to avoid repeated disk reads.
        threshold: Positive-class probability at or above which the
            prediction is 1. Defaults to 0.5.

    Returns:
        A list of ``(text, pred, prob)`` tuples, where ``prob`` is the second
        column of ``predict_proba`` and ``pred`` is 0 or 1. Empty input
        yields an empty list.
    """
    # A bare string would otherwise be treated as an iterable of characters.
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        # Nothing to score; skip the model load and the predict call.
        return []
    if model is None:
        # NOTE: joblib artifacts are pickle-based; only load files you trust.
        model = joblib.load(model_path)
    probs = model.predict_proba(texts)[:, 1]
    preds = (probs >= threshold).astype(int)
    return list(zip(texts, preds.tolist(), probs.tolist()))

# Demo
print(infer(['excellent support and fast response', 'buggy app with terrible UX']))

# Error analysis on test set: list up to 10 misclassified examples, starting
# with those whose probability is closest to the 0.5 decision boundary.
if test_proba is not None:
    margins = np.abs(test_proba - 0.5)
    mis_idx = np.where(test_pred != y_test)[0]
    hardest = sorted(mis_idx, key=lambda i: margins[i])[:10]
    examples = [
        {
            'text': X_test[i],
            'true': int(y_test[i]),
            'pred': int(test_pred[i]),
            'prob': float(test_proba[i]),
        }
        for i in hardest
    ]
    # A bare expression inside an `if` is never displayed, so bind the table
    # and print it explicitly. Fixed columns keep the header when it is empty.
    errors_df = pd.DataFrame(examples, columns=['text', 'true', 'pred', 'prob'])
    if errors_df.empty:
        print('No misclassified test examples.')
    else:
        print(errors_df)