# 🔧 Notebook 3: Nominal Models
## Final Project - Ordinal vs Nominal Sentiment Analysis
### Atharv Chaudhary

---

**Purpose:** Train and evaluate NOMINAL classification models.

**Models:**
1. Multinomial Naive Bayes
2. Logistic Regression (Multinomial)

**Input:** `amazon_electronics_cleaned.csv`

**Output:** `nominal_results.csv`, confusion matrices

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, 
    mean_absolute_error, 
    f1_score, 
    classification_report,
    confusion_matrix
)

# Notebook-wide configuration.
# The plot theme and the RNG seed are independent of each other; the theme is
# applied first, then the seed is fixed so repeated runs are reproducible.
plt.style.use('seaborn-v0_8-whitegrid')

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

print("‚úÖ Libraries imported")

## Step 1: Load Data

In [None]:
# Read the cleaned review dataset from the working directory.
df = pd.read_csv('amazon_electronics_cleaned.csv')
print(f"‚úÖ Loaded {df.shape[0]:,} reviews")

# Number of reviews per rating, ordered by rating value rather than frequency.
print("\nüìä Rating Distribution:")
rating_counts = df['rating'].value_counts().sort_index()
print(rating_counts)

## Step 2: Feature Extraction (TF-IDF)

In [None]:
# ============================================================================
# TF-IDF FEATURE EXTRACTION
# ============================================================================
# Converts the 'text' column into a sparse TF-IDF matrix (X) and extracts the
# 'rating' column as the label array (y).

print("=" * 70)
print("TF-IDF FEATURE EXTRACTION")
print("=" * 70)

# Configuration
MAX_FEATURES = 5000
NGRAM_RANGE = (1, 2)  # Unigrams and bigrams

vectorizer = TfidfVectorizer(
    max_features=MAX_FEATURES,
    stop_words='english',
    ngram_range=NGRAM_RANGE,
    min_df=5,      # ignore terms found in fewer than 5 documents
    max_df=0.95    # ignore terms found in more than 95% of documents
)

print(f"\nSettings:")
print(f"   Max features: {MAX_FEATURES}")
print(f"   N-gram range: {NGRAM_RANGE}")

# A review whose text is empty is written to CSV as an empty field, which
# pandas reads back as NaN. TfidfVectorizer raises on NaN documents, so every
# entry is normalised to a (possibly empty) string before vectorising.
texts = df['text'].fillna('').astype(str)

# NOTE(review): the vectorizer is fitted on the full dataset, i.e. before the
# train/test split, so the vocabulary and IDF weights also see the test
# reviews. Consider fitting on the training portion only.
X = vectorizer.fit_transform(texts)
y = df['rating'].values

print(f"\n‚úÖ TF-IDF matrix: {X.shape}")

## Step 3: Train/Test Split

In [None]:
# ============================================================================
# TRAIN/TEST SPLIT
# ============================================================================
# 80/20 hold-out split. Stratifying on y keeps the rating proportions the same
# in both partitions; the fixed random_state makes the split repeatable.

split_options = dict(test_size=0.2, random_state=RANDOM_STATE, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f"‚úÖ Training set: {n_train:,} samples")
print(f"‚úÖ Test set: {n_test:,} samples")

## Step 4: Helper Functions

In [None]:
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

def evaluate_model(y_true, y_pred, model_name):
    """Score a set of predictions, print a summary, and return the scores.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        model_name: Display name used in the printed header and stored under
            the 'model' key of the result.

    Returns:
        dict with keys 'model', 'encoding' (always 'Nominal'), 'accuracy',
        'mae', 'f1_macro' and 'f1_weighted'.
    """
    scores = {
        'model': model_name,
        'encoding': 'Nominal',
        'accuracy': accuracy_score(y_true, y_pred),
        'mae': mean_absolute_error(y_true, y_pred),
        'f1_macro': f1_score(y_true, y_pred, average='macro'),
        'f1_weighted': f1_score(y_true, y_pred, average='weighted'),
    }

    acc = scores['accuracy']
    rule = '=' * 55

    print(f"\n{rule}")
    print(f"üìä {model_name}")
    print(rule)
    print(f"Accuracy:      {acc:.4f} ({acc * 100:.2f}%)")
    print(f"MAE:           {scores['mae']:.4f}")
    print(f"F1 (macro):    {scores['f1_macro']:.4f}")
    print(f"F1 (weighted): {scores['f1_weighted']:.4f}")

    return scores


def calculate_error_rates(y_true, y_pred):
    """Split misclassifications into adjacent and severe errors.

    Both rates are fractions of the *misclassified* samples (not of all
    samples): "adjacent" means the prediction is exactly 1 away from the true
    label, "severe" means it is 2 or more away.

    Args:
        y_true: Array-like of true numeric labels (list, tuple, NumPy array
            or pandas Series).
        y_pred: Array-like of predicted numeric labels, positionally aligned
            with ``y_true``.

    Returns:
        Tuple ``(adjacent, severe)`` of Python floats. ``(0.0, 0.0)`` when
        there are no misclassifications (including empty input).
    """
    # Coerce up front so plain sequences work (a list != list comparison
    # yields a single bool, not an element-wise mask) and so pandas Series
    # are compared by position rather than by index label.
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)

    wrong = true_arr != pred_arr
    n_wrong = wrong.sum()
    if n_wrong == 0:
        return 0.0, 0.0

    # Subtract in float so unsigned integer labels cannot wrap around
    # (e.g. uint8 1 - 2 would otherwise become 255).
    distances = np.abs(
        true_arr[wrong].astype(float) - pred_arr[wrong].astype(float)
    )
    adjacent = float((distances == 1).sum() / n_wrong)
    severe = float((distances >= 2).sum() / n_wrong)

    return adjacent, severe


print("‚úÖ Helper functions defined")

## Step 5: Model 1 - Multinomial Naive Bayes

In [None]:
# ============================================================================
# MODEL 1: MULTINOMIAL NAIVE BAYES
# ============================================================================

banner = "=" * 70
print("\n" + banner)
print("üîß MODEL 1: Multinomial Naive Bayes (Nominal)")
print(banner)
print("\nTreats classes as UNORDERED categories.")
print("Formula: P(Y=k|x) ‚àù P(Y=k) √ó Œ† P(x‚±º|Y=k)")

# Fit with alpha=1.0 (Laplace smoothing), then label the held-out reviews.
nb_model = MultinomialNB(alpha=1.0).fit(X_train, y_train)
nb_pred = nb_model.predict(X_test)

# Headline metrics; evaluate_model also prints them.
nb_results = evaluate_model(y_test, nb_pred, "Naive Bayes")

# Break the misclassifications down by distance from the true rating and
# store both rates alongside the other metrics.
nb_adjacent, nb_severe = calculate_error_rates(y_test, nb_pred)
nb_results.update(adjacent_error=nb_adjacent, severe_error=nb_severe)

print("\nError Analysis:")
print(f"   Adjacent Error Rate (¬±1): {nb_adjacent:.2%}")
print(f"   Severe Error Rate (¬±2+):  {nb_severe:.2%}")

In [None]:
# Per-class precision / recall / F1 for Naive Bayes on the test set,
# formatted to four decimal places.
print("\nüìã Classification Report - Naive Bayes:")
nb_report = classification_report(y_test, nb_pred, digits=4)
print(nb_report)

## Step 6: Model 2 - Logistic Regression (Multinomial)

In [None]:
# ============================================================================
# MODEL 2: LOGISTIC REGRESSION (MULTINOMIAL)
# ============================================================================
# Trains a softmax (multinomial) logistic regression on the TF-IDF features,
# predicts the test set, and records the same metrics as for Naive Bayes.

print("\n" + "="*70)
print("üîß MODEL 2: Logistic Regression (Nominal - Multinomial)")
print("="*70)
print("\nUses softmax, treats classes as UNORDERED.")
print("Formula: P(Y=k|x) = exp(w‚Çñ·µÄx + b‚Çñ) / Œ£ exp(w‚±º·µÄx + b‚±º)")

# Train
# `multi_class` is intentionally not passed: it is deprecated since
# scikit-learn 1.5 and scheduled for removal. With the lbfgs solver and more
# than two classes the estimator already fits the multinomial (softmax)
# model, so the fitted model is unchanged.
# NOTE(review): n_jobs is documented as parallelising one-vs-rest fits, so it
# presumably has no effect on a multinomial fit - confirm before relying on it.
lr_model = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    random_state=RANDOM_STATE,
    n_jobs=-1
)
lr_model.fit(X_train, y_train)

# Predict
lr_pred = lr_model.predict(X_test)

# Evaluate (evaluate_model prints the metrics and returns them as a dict)
lr_results = evaluate_model(y_test, lr_pred, "Logistic Regression")

# Error analysis: share of misclassifications that are off by exactly one
# rating vs. off by two or more.
lr_adjacent, lr_severe = calculate_error_rates(y_test, lr_pred)
lr_results['adjacent_error'] = lr_adjacent
lr_results['severe_error'] = lr_severe

print(f"\nError Analysis:")
print(f"   Adjacent Error Rate (¬±1): {lr_adjacent:.2%}")
print(f"   Severe Error Rate (¬±2+):  {lr_severe:.2%}")

In [None]:
# Per-class precision / recall / F1 for Logistic Regression on the test set,
# formatted to four decimal places.
print("\nüìã Classification Report - Logistic Regression:")
lr_report = classification_report(y_test, lr_pred, digits=4)
print(lr_report)

## Step 7: Confusion Matrices

In [None]:
# ============================================================================
# CONFUSION MATRICES
# ============================================================================
# Draws one heatmap per model side by side and saves the figure as a PNG.

# Single source of truth for the class order. Passing it to confusion_matrix
# guarantees a 5x5 matrix whose rows/columns line up with the tick labels even
# if some rating never occurs in the test labels or in a model's predictions.
RATING_LABELS = [1, 2, 3, 4, 5]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Naive Bayes
cm_nb = confusion_matrix(y_test, nb_pred, labels=RATING_LABELS)

# Logistic Regression
cm_lr = confusion_matrix(y_test, lr_pred, labels=RATING_LABELS)

# (axis, matrix, colour map, title, accuracy) for each panel
panels = [
    (axes[0], cm_nb, 'Blues', 'Naive Bayes (Nominal)',
     nb_results["accuracy"]),
    (axes[1], cm_lr, 'Greens', 'Logistic Regression (Nominal)',
     lr_results["accuracy"]),
]

for ax, cm, cmap, title, acc in panels:
    sns.heatmap(cm, annot=True, fmt='d', cmap=cmap, ax=ax,
                xticklabels=RATING_LABELS, yticklabels=RATING_LABELS)
    ax.set_xlabel('Predicted Rating', fontsize=11)
    ax.set_ylabel('Actual Rating', fontsize=11)
    ax.set_title(f'{title}\nAccuracy: {acc:.2%}',
                 fontsize=12, fontweight='bold')

plt.tight_layout()
plt.savefig('confusion_matrices_nominal.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n‚úÖ Saved: confusion_matrices_nominal.png")

## Step 8: Save Results

In [None]:
# ============================================================================
# SAVE RESULTS
# ============================================================================

# One row per model, columns taken from the metric dictionaries.
nominal_results = pd.DataFrame([nb_results, lr_results])

summary_rule = "=" * 70
print("\n" + summary_rule)
print("üìä NOMINAL MODELS SUMMARY")
print(summary_rule)
summary_table = nominal_results.to_string(index=False)
print(summary_table)

# Persist the metrics table (without the row index).
nominal_results.to_csv('nominal_results.csv', index=False)
print("\n‚úÖ Saved: nominal_results.csv")

In [None]:
# Write the true test labels next to each model's predictions so later
# analysis can reuse them without retraining.
prediction_columns = {
    'actual': y_test,
    'nb_pred': nb_pred,
    'lr_pred': lr_pred,
}
predictions_df = pd.DataFrame(prediction_columns)
predictions_df.to_csv('nominal_predictions.csv', index=False)
print("‚úÖ Saved: nominal_predictions.csv")

In [None]:
# Download files
# In Google Colab, offer the generated artefacts as browser downloads.
# Outside Colab the import fails and the files simply stay on disk.
try:
    from google.colab import files
except ImportError:
    # Only the "not running in Colab" case is expected here; a bare `except`
    # would also hide KeyboardInterrupt and genuine programming errors.
    print("Files saved locally")
else:
    for artefact in ('nominal_results.csv', 'confusion_matrices_nominal.png'):
        try:
            files.download(artefact)
        except Exception as exc:
            # Best-effort: the file has already been written to disk, so a
            # failed browser download is reported rather than raised.
            print(f"Could not download {artefact} ({exc}); file saved locally")

---
## ✅ Summary

**Nominal Models Trained:**

| Model | Accuracy | MAE | Adjacent Error | Severe Error |
|-------|----------|-----|----------------|---------------|
| Naive Bayes | See above | See above | See above | See above |
| Logistic Regression | See above | See above | See above | See above |

**Key Observation:** Both models treat ratings as unordered categories, potentially missing ordinal structure.

**Next:** Run `4_Models_Ordinal.ipynb` to compare with ordinal methods.