# In [None]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import (confusion_matrix, f1_score, accuracy_score, 
                             classification_report, roc_curve, roc_auc_score, silhouette_score, adjusted_rand_score)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.preprocessing import Normalizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize

# =============================================================================
# 1. Data Loading and Preprocessing
# =============================================================================
# Load both datasets; stop with a non-zero exit status if either file is missing.
try:
    train_df = pd.read_csv('Training.csv')
    test_df = pd.read_csv('Test.csv')
except FileNotFoundError:
    print("Error: One or both CSV files not found.")
    # `exit` is a convenience object injected by the `site` module for
    # interactive sessions and is not guaranteed to exist (e.g. `python -S`).
    # Raising SystemExit has the same effect and always works.
    raise SystemExit(1)

# Fill missing review text with empty strings so text preprocessing only
# ever receives str values
train_df['reviewText'] = train_df['reviewText'].fillna('')
test_df['reviewText'] = test_df['reviewText'].fillna('')

# -------------------------------
# Aggressive Text Preprocessing
# -------------------------------
def preprocess_text(text):
    """Normalize a review string before vectorization.

    The text is lowercased, every character other than ``a-z``, ``0-9`` or
    whitespace is replaced by a space, and runs of whitespace are collapsed
    to a single space with leading/trailing whitespace removed.

    Args:
        text: The raw review text (a ``str``).

    Returns:
        The normalized string; empty if nothing alphanumeric remains.
    """
    # Anything that is not a lowercase ASCII letter, digit or whitespace
    # becomes a separator.
    alnum_only = re.sub(r'[^a-z0-9\s]', ' ', text.lower())
    # Splitting on whitespace and re-joining collapses runs and trims the ends.
    return ' '.join(alnum_only.split())

# Normalized review text, used as the input of the TF-IDF vectorizer
train_df['clean_text'] = train_df['reviewText'].map(preprocess_text)
test_df['clean_text'] = test_df['reviewText'].map(preprocess_text)

# Additional numeric feature: review length (number of whitespace-separated
# tokens in the normalized text)
train_df['review_length'] = train_df['clean_text'].map(lambda text: len(text.split()))
test_df['review_length'] = test_df['clean_text'].map(lambda text: len(text.split()))

# =============================================================================
# 2. Feature Extraction and Combination
# =============================================================================
# TF-IDF with bigrams
# TF-IDF over unigrams and bigrams. The vocabulary is fitted on the training
# reviews only and then applied unchanged to the test reviews.
tfidf = TfidfVectorizer(stop_words='english',
                        max_features=30000,
                        min_df=5,
                        max_df=0.95,
                        ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(train_df['clean_text'])
X_test_tfidf = tfidf.transform(test_df['clean_text'])

# Review length as an (n_samples, 1) sparse column. Reshaping before the
# conversion produces CSR directly, whereas transposing a CSR row vector
# produces CSC.
X_train_length = csr_matrix(train_df['review_length'].to_numpy().reshape(-1, 1))
X_test_length = csr_matrix(test_df['review_length'].to_numpy().reshape(-1, 1))

# Combine TF-IDF features with the numeric feature. CSR is requested
# explicitly: without `format`, hstack may return a COO matrix, which cannot
# be indexed or row-sliced and must be converted by every consumer.
X_train_combined = hstack([X_train_tfidf, X_train_length], format='csr')
X_test_combined = hstack([X_test_tfidf, X_test_length], format='csr')

# =============================================================================
# 3. Evaluation Strategy
# =============================================================================


# Pick the data used for evaluation: the provided test set when it carries
# ratings, otherwise a stratified 20% hold-out taken from the training data.
if 'overall' not in test_df.columns:
    X_train_used, X_test_used, y_train_used, y_test_used = train_test_split(
        X_train_combined,
        train_df['overall'],
        test_size=0.2,
        stratify=train_df['overall'],
        random_state=42,
    )
    print("Test dataset does not contain labels. Using a hold-out split from training data for evaluation.")
else:
    X_train_used, y_train_used = X_train_combined, train_df['overall']
    X_test_used, y_test_used = X_test_combined, test_df['overall']
    print("Test dataset contains labels. Using provided test set for evaluation.")

# =============================================================================
# 4. Utility Functions for Plotting
# =============================================================================
def plot_confusion_matrix(cm, title='Confusion Matrix'):
    """Show a confusion matrix as a blue heat map in a new figure.

    Args:
        cm: Square 2-D array of counts (rows = true labels, columns =
            predicted labels).
        title: Text shown above the plot.

    Both axes are ticked with the positional indices ``0 .. n-1`` of the
    matrix, not with the original class labels.
    """
    fig, ax = plt.subplots()
    image = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    ax.set_title(title)
    fig.colorbar(image, ax=ax)

    # One tick per row/column, labelled with its index
    positions = np.arange(cm.shape[0])
    ax.set_xticks(positions)
    ax.set_xticklabels(positions)
    ax.set_yticks(positions)
    ax.set_yticklabels(positions)

    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
    plt.show()

def plot_roc_curve(fpr, tpr, auc_score, title='ROC Curve'):
    """Show a single ROC curve in a new figure.

    Args:
        fpr: False positive rates (x values of the curve).
        tpr: True positive rates (y values of the curve).
        auc_score: Area under the curve, shown in the legend with two decimals.
        title: Text shown above the plot.
    """
    _, ax = plt.subplots()
    ax.plot(fpr, tpr, label=f'AUC = {auc_score:.2f}')
    # Dashed diagonal as the chance-level reference
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(title)
    ax.legend(loc='lower right')
    plt.show()

# =============================================================================
# 5. Binary Classification Function
# =============================================================================
def _binary_decision_scores(model, X):
    """Return continuous positive-class scores of a fitted binary classifier.

    Uses ``decision_function`` when the estimator provides one and the
    positive-class column of ``predict_proba`` otherwise. Errors raised by
    either method propagate instead of being silently swallowed.
    """
    if hasattr(model, 'decision_function'):
        return model.decision_function(X)
    return model.predict_proba(X)[:, 1]


def run_binary_classification(cutoff, X_train, y_train, X_test, y_test):
    """
    Train and evaluate binary classifiers for one rating cutoff.

    Ratings are converted to binary labels (1 if overall > cutoff, else 0).
    Logistic Regression, Linear SVM and Random Forest are each tuned with a
    5-fold stratified search scored by macro F1; the best estimator of each
    search is then evaluated on the test data. Metrics are printed and the
    confusion matrix and ROC curve are plotted.

    Args:
        cutoff: Rating threshold; ratings strictly above it are labelled 1.
        X_train: Training feature matrix.
        y_train: Training ratings (must support ``> cutoff`` and ``.astype``).
        X_test: Evaluation feature matrix.
        y_test: Evaluation ratings, same kind as ``y_train``.

    Returns:
        dict mapping model name to a dict with the keys ``best_params``,
        ``confusion_matrix``, ``accuracy``, ``macro_f1`` and ``roc_auc``.
        ``roc_auc`` is NaN when the evaluation labels contain a single class.
    """
    y_train_bin = (y_train > cutoff).astype(int)
    y_test_bin = (y_test > cutoff).astype(int)
    # ROC/AUC is only defined when both classes occur in the evaluation labels.
    roc_defined = np.unique(y_test_bin).size == 2

    models = {
        'Logistic Regression': {
            'model': LogisticRegression(max_iter=3000, class_weight='balanced'),
            'params': {'C': [0.01, 0.1, 1, 10, 100]}
        },
        'Linear SVM': {
            'model': LinearSVC(max_iter=1000, class_weight='balanced'),
            'params': {'C': [0.01, 0.1, 1, 10]}
        },
        'Random Forest': {
            'model': RandomForestClassifier(random_state=42),
            'params': {'n_estimators': [50, 100, 150],
                       'max_depth': [None, 10, 20]}
        }
    }

    results = {}
    print(f"\n=== Binary Classification (Cutoff = {cutoff}) ===")
    for name, mp in models.items():
        print(f"\n--- {name} ---")
        # Every grid here holds fewer than 10 combinations. Capping n_iter at
        # the grid size evaluates the same candidates without the
        # "parameter space is smaller than n_iter" warning.
        n_candidates = int(np.prod([len(values) for values in mp['params'].values()]))
        grid = RandomizedSearchCV(mp['model'],
                                  mp['params'],
                                  n_iter=min(10, n_candidates),
                                  cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
                                  scoring='f1_macro',
                                  n_jobs=-1,
                                  random_state=42)
        grid.fit(X_train, y_train_bin)
        best_model = grid.best_estimator_
        y_pred = best_model.predict(X_test)

        cm_ = confusion_matrix(y_test_bin, y_pred)
        acc = accuracy_score(y_test_bin, y_pred)
        f1_macro = f1_score(y_test_bin, y_pred, average='macro')

        # Compute ROC and AUC (skipped when only one class is present)
        if roc_defined:
            scores = _binary_decision_scores(best_model, X_test)
            fpr, tpr, _ = roc_curve(y_test_bin, scores)
            auc_val = roc_auc_score(y_test_bin, scores)
        else:
            print("ROC AUC is undefined: evaluation labels contain a single class.")
            auc_val = float('nan')

        print("Best hyperparameters:", grid.best_params_)
        print("Confusion Matrix:\n", cm_)
        print("Accuracy:", acc)
        print("Macro F1:", f1_macro)
        print("ROC AUC:", auc_val)
        plot_confusion_matrix(cm_, title=f'{name} (Cutoff {cutoff}) Confusion Matrix')
        if roc_defined:
            plot_roc_curve(fpr, tpr, auc_val, title=f'{name} (Cutoff {cutoff}) ROC Curve')

        results[name] = {
            'best_params': grid.best_params_,
            'confusion_matrix': cm_,
            'accuracy': acc,
            'macro_f1': f1_macro,
            'roc_auc': auc_val
        }
    return results

# Run binary classification for cutoffs 1, 2, 3, and 4
# Maps each rating cutoff to the per-model results of its binary task
binary_results = {
    cutoff: run_binary_classification(
        cutoff, X_train_used, y_train_used, X_test_used, y_test_used)
    for cutoff in (1, 2, 3, 4)
}

# =============================================================================
# 6. Multiclass Classification Function with Ensembles
# =============================================================================
def _multiclass_scores(model, X):
    """Return per-class continuous scores of a fitted multiclass classifier.

    Uses ``decision_function`` when the estimator provides one and
    ``predict_proba`` otherwise. Errors raised by either method propagate
    instead of being silently swallowed.
    """
    if hasattr(model, 'decision_function'):
        return model.decision_function(X)
    return model.predict_proba(X)


def _evaluate_ensemble(name, ensemble, X_train, y_train, X_test, y_test, labels):
    """Fit an ensemble, print its metrics, plot its confusion matrix.

    Returns a dict with the keys ``accuracy`` and ``macro_f1``.
    """
    print(f"\n--- {name} ---")
    ensemble.fit(X_train, y_train)
    y_pred = ensemble.predict(X_test)

    # Passing the label list keeps the matrix shape fixed even when a class
    # is absent from the evaluation labels and predictions.
    cm_ = confusion_matrix(y_test, y_pred, labels=labels)
    acc = accuracy_score(y_test, y_pred)
    f1_macro = f1_score(y_test, y_pred, average='macro')
    print(f"{name} Accuracy:", acc)
    print(f"{name} Macro F1:", f1_macro)
    print(f"{name} Classification Report:\n", classification_report(y_test, y_pred))
    plot_confusion_matrix(cm_, title=f'{name} (Multiclass) Confusion Matrix')
    return {'accuracy': acc, 'macro_f1': f1_macro}


def run_multiclass_classification(X_train, y_train, X_test, y_test):
    """
    Evaluates multiclass classification (ratings 1-5) using:
      - Logistic Regression
      - Random Forest
      - Voting Ensemble (soft voting)
      - Stacking Ensemble (with RidgeClassifier as meta-model)

    The two single models are tuned with a 5-fold stratified search scored by
    macro F1; for each, the confusion matrix, accuracy, macro F1 and a
    classification report are printed and one-vs-rest ROC curves are plotted.
    The ensembles use fixed hyperparameters (not the tuned ones) and report
    the same metrics without ROC curves.

    Args:
        X_train: Training feature matrix.
        y_train: Training ratings; must provide ``.unique()``.
        X_test: Evaluation feature matrix.
        y_test: Evaluation ratings.

    Returns:
        dict mapping model name to its metrics: ``accuracy`` and ``macro_f1``
        for every model, plus ``best_params`` for the two tuned models.

    Note:
        The per-class ROC code indexes score columns by class position, so it
        assumes at least three classes in ``y_train``.
    """
    classes = sorted(y_train.unique())
    y_test_bin = label_binarize(y_test, classes=classes)

    models = {
        'Logistic Regression': {
            'model': LogisticRegression(max_iter=3000, solver='lbfgs'),
            'params': {'C': [0.01, 0.1, 1, 10, 100]}
        },
        'Random Forest': {
            'model': RandomForestClassifier(random_state=42),
            'params': {'n_estimators': [50, 100, 150],
                       'max_depth': [None, 10, 20]}
        }
    }

    results = {}
    print("\n=== Multiclass Classification (Ratings 1-5) ===")
    for name, mp in models.items():
        print(f"\n--- {name} ---")
        # Both grids hold fewer than 10 combinations. Capping n_iter at the
        # grid size evaluates the same candidates without the
        # "parameter space is smaller than n_iter" warning.
        n_candidates = int(np.prod([len(values) for values in mp['params'].values()]))
        grid = RandomizedSearchCV(mp['model'],
                                  mp['params'],
                                  n_iter=min(10, n_candidates),
                                  cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
                                  scoring='f1_macro',
                                  n_jobs=-1,
                                  random_state=42)
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_
        y_pred = best_model.predict(X_test)

        # Fixed label order keeps the matrix aligned with `classes`
        cm_ = confusion_matrix(y_test, y_pred, labels=classes)
        acc = accuracy_score(y_test, y_pred)
        f1_macro = f1_score(y_test, y_pred, average='macro')

        print("Best hyperparameters:", grid.best_params_)
        print("Confusion Matrix:\n", cm_)
        print("Accuracy:", acc)
        print("Macro F1:", f1_macro)
        print("Classification Report:\n", classification_report(y_test, y_pred))
        plot_confusion_matrix(cm_, title=f'{name} (Multiclass) Confusion Matrix')

        # Compute and plot one-vs-rest ROC curves for each class. Score
        # columns follow the sorted class order, which matches `classes`.
        scores = _multiclass_scores(best_model, X_test)
        # Start a fresh figure so the curves never land on a previous plot
        plt.figure()
        for i, cls in enumerate(classes):
            # A ROC curve needs both positives and negatives for the class
            if np.unique(y_test_bin[:, i]).size < 2:
                print(f"Class {cls}: ROC curve skipped (class absent from or "
                      f"exclusive in the evaluation labels).")
                continue
            fpr, tpr, _ = roc_curve(y_test_bin[:, i], scores[:, i])
            auc_val = roc_auc_score(y_test_bin[:, i], scores[:, i])
            plt.plot(fpr, tpr, label=f'Class {cls} (AUC = {auc_val:.2f})')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'{name} (Multiclass) ROC Curves')
        plt.legend(loc='lower right')
        plt.show()

        results[name] = {
            'best_params': grid.best_params_,
            'accuracy': acc,
            'macro_f1': f1_macro
        }

    # Base learners shared by both ensembles. The ensemble classes clone
    # their estimators, so the same unfitted instances can be reused.
    lr = LogisticRegression(max_iter=3000, solver='lbfgs', C=1)
    rf = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=None)

    # --- Voting Ensemble ---
    voting_ensemble = VotingClassifier(estimators=[('lr', lr), ('rf', rf)], voting='soft')
    results['Voting Ensemble'] = _evaluate_ensemble(
        'Voting Ensemble', voting_ensemble, X_train, y_train, X_test, y_test, classes)

    # --- Stacking Ensemble ---
    stacking_ensemble = StackingClassifier(estimators=[('lr', lr), ('rf', rf)],
                                           final_estimator=RidgeClassifier(),
                                           cv=5)
    results['Stacking Ensemble'] = _evaluate_ensemble(
        'Stacking Ensemble', stacking_ensemble, X_train, y_train, X_test, y_test, classes)

    return results

# Evaluate the multiclass models on the same train/evaluation data as the
# binary tasks
multiclass_results = run_multiclass_classification(
    X_train=X_train_used,
    y_train=y_train_used,
    X_test=X_test_used,
    y_test=y_test_used,
)

# =============================================================================
# 7. Clustering
# =============================================================================
def run_clustering_tuned_cosine(X, n_components_grid=(50, 100, 200, 300),
                                k_values=(4, 5, 6, 7, 8)):
    """
    Grid-search a Truncated SVD + KMeans pipeline by cosine silhouette score.

    For every ``n_components`` the data is reduced with Truncated SVD and each
    row is L2-normalized (on unit vectors, squared Euclidean distance is a
    monotone function of cosine distance, so KMeans effectively clusters by
    cosine similarity). KMeans is then run for every ``k`` and scored with the
    silhouette coefficient using the cosine metric. Each combination's score
    is printed, followed by the best parameters and best score.

    Args:
        X: Feature matrix of shape (n_samples, n_features); sparse input is
            accepted.
        n_components_grid: SVD dimensionalities to try.
        k_values: Cluster counts to try.

    Returns:
        Tuple ``(best_clusters, best_score, best_params)``: the label array of
        the best run, its silhouette score, and a dict with the keys
        ``n_components`` and ``k``. When no combination is feasible for the
        given data, this is ``(None, -1, {})``.
    """
    n_samples, n_features = X.shape
    best_score = -1
    best_params = {}
    best_clusters = None
    for n_components in n_components_grid:
        # The SVD cannot produce more components than there are features
        if n_components > n_features:
            print(f"Skipping n_components={n_components}: only {n_features} features available")
            continue
        svd = TruncatedSVD(n_components=n_components, random_state=42)
        X_reduced = svd.fit_transform(X)
        # copy=False normalizes the freshly created reduced matrix in place
        normalizer = Normalizer(copy=False)
        X_normalized = normalizer.fit_transform(X_reduced)
        for k in k_values:
            # The silhouette score needs at most n_samples - 1 clusters
            if k > n_samples - 1:
                print(f"Skipping k={k}: only {n_samples} samples available")
                continue
            kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
            clusters = kmeans.fit_predict(X_normalized)
            # ...and at least two distinct labels
            if np.unique(clusters).size < 2:
                print(f"Skipping n_components={n_components}, k={k}: fewer than two clusters found")
                continue
            # Use cosine distance for silhouette_score
            sil = silhouette_score(X_normalized, clusters, metric="cosine")
            print(f"n_components={n_components}, k={k}, silhouette (cosine)={sil:.4f}")
            if sil > best_score:
                best_score = sil
                best_params = {'n_components': n_components, 'k': k}
                best_clusters = clusters
    print("\nBest clustering parameters (cosine):", best_params)
    print("Best silhouette score (cosine):", best_score)
    return best_clusters, best_score, best_params

# Cluster the TF-IDF representation of the test reviews (without the
# review-length column)
(clusters_cosine,
 sil_score_cosine,
 best_params_cosine) = run_clustering_tuned_cosine(X=X_test_tfidf)

# =============================================================================
# 8. Create Submission CSV Files for Each Task
# =============================================================================


# We'll loop over cutoffs [1,2,3,4] and create a submission file for each.
for cutoff in (1, 2, 3, 4):
    print(f"\nCreating submission file for binary classification (cutoff = {cutoff})")

    # Hyperparameters chosen for Logistic Regression during the evaluation run
    best_params = binary_results[cutoff]['Logistic Regression']['best_params']

    # Binary labels for the complete training set: 1 when the rating is
    # strictly above the cutoff
    y_train_bin_full = train_df['overall'].gt(cutoff).astype(int)

    # Refit Logistic Regression on all training rows with the tuned C
    final_model_bin = LogisticRegression(
        C=best_params['C'],
        class_weight='balanced',
        solver='lbfgs',
        max_iter=3000,
    )
    final_model_bin.fit(X_train_combined, y_train_bin_full)

    # Positive-class probabilities, turned into labels at a fixed threshold
    threshold = 0.5
    test_probs_bin = final_model_bin.predict_proba(X_test_combined)[:, 1]
    test_preds_bin = (test_probs_bin >= threshold).astype(int)

    # One row per test review; the id is the row index of the test frame
    filename_bin = f'submission_binary_cutoff{cutoff}.csv'
    submission_bin = pd.DataFrame({
        'id': test_df.index,
        f'binary_split_{cutoff}': test_preds_bin,
    })
    submission_bin.to_csv(filename_bin, index=False)
    print(f"Submission file created: {filename_bin}")

# ----- For Multiclass Classification -----
print("\nCreating submission file for multiclass classification")

# Refit Logistic Regression on all training rows with the C value selected
# by the multiclass hyperparameter search
best_params_multi = multiclass_results['Logistic Regression']['best_params']
final_model_multi = LogisticRegression(
    C=best_params_multi['C'],
    solver='lbfgs',
    max_iter=3000,
)
final_model_multi.fit(X_train_combined, train_df['overall'])

# Predicted overall rating for every test review
test_preds_multi = final_model_multi.predict(X_test_combined)

# One row per test review; the id is the row index of the test frame
submission_multi = pd.DataFrame({'id': test_df.index}).assign(overall=test_preds_multi)
submission_multi.to_csv('submission_multiclass.csv', index=False)
print("Submission file created: submission_multiclass.csv")
