# Atul Anant (MDS202314) @ AML ASSIGNMENT 1

### Importing required packages

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

### Loading data

In [2]:
def load_csv_data(file_path: str) -> pd.DataFrame:
    """Read the CSV at ``file_path`` with pandas and return the DataFrame."""
    return pd.read_csv(file_path)

In [20]:
def prepare_features_labels(df: pd.DataFrame, feature_col: str = "message", label_col: str = "label"):
    """Pull the feature column and the label column out of ``df``.

    Parameters
    ----------
    df : DataFrame holding both columns.
    feature_col : name of the column returned as the features.
    label_col : name of the column returned as the labels.

    Returns
    -------
    (features, labels) : the two selected columns, in that order.
    """
    features, labels = df[feature_col], df[label_col]
    return features, labels

### Fitting model

In [21]:
def fit_model(model, X_train, y_train):
    """Call ``model.fit(X_train, y_train)`` and hand back the same ``model`` object.

    The return value of ``fit`` itself is ignored; the caller always gets the
    object that was passed in.
    """
    estimator = model
    estimator.fit(X_train, y_train)
    return estimator

In [22]:
def score_model(model, X, y) -> float:
    """Return the value of ``model.score(X, y)`` unchanged."""
    result = model.score(X, y)
    return result

In [23]:
def evaluate_model_predictions(y_true, y_pred, title: str = ""):
    """Print accuracy, classification report and confusion matrix for a set of predictions.

    Parameters
    ----------
    y_true : ground-truth labels.
    y_pred : predicted labels to compare against ``y_true``.
    title : text shown in the banner line that precedes the metrics.

    Nothing is returned; all results go to stdout.
    """
    print(f"\n=== Evaluation: {title} ===")

    # Each metric is computed right before it is printed, so earlier lines
    # still appear if a later metric fails.
    acc = accuracy_score(y_true, y_pred)
    print("Accuracy:", acc)

    report = classification_report(y_true, y_pred)
    print("Classification Report:\n", report)

    matrix = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:\n", matrix)

In [12]:
def validate_model(model, X_val, y_val) -> float:
    """
    Score an already-fitted model on the validation split.
    Returns (float) whatever ``model.score(X_val, y_val)`` reports.
    """
    val_score = model.score(X_val, y_val)
    return val_score

### Train and evaluate

In [15]:
def train_and_evaluate(model, X_train, y_train, X_val, y_val):
    """Fit ``model`` on the training split and report metrics on train and validation.

    Parameters
    ----------
    model : estimator or Pipeline exposing ``fit``, ``predict`` and ``score``.
    X_train, y_train : data the model is fitted on.
    X_val, y_val : held-out data used only for scoring.

    Returns
    -------
    model : the fitted model (the same object that was passed in).
    (train_acc, val_acc) : the values of ``model.score`` on the two splits.
    """
    # Fit on train
    model = fit_model(model, X_train, y_train)

    # Score on train
    train_acc = score_model(model, X_train, y_train)
    y_train_pred = model.predict(X_train)

    # Score on validation
    val_acc = score_model(model, X_val, y_val)
    y_val_pred = model.predict(X_val)

    # A Pipeline lists its stages in ``steps``; the banner names the final
    # stage. A bare estimator has no ``steps`` attribute, so fall back to the
    # model's own class instead of raising AttributeError after training.
    steps = getattr(model, "steps", None)
    final_estimator = steps[-1][1] if steps else model

    # Evaluate
    print(f"\n=== Model: {type(final_estimator).__name__} ===")
    print("--- Train Evaluation ---")
    evaluate_model_predictions(y_train, y_train_pred, title="Train")

    print("--- Validation Evaluation ---")
    evaluate_model_predictions(y_val, y_val_pred, title="Validation")

    print(f"Train Accuracy: {train_acc:.4f}, Validation Accuracy: {val_acc:.4f}")

    return model, (train_acc, val_acc)

### Hyper-parameter tuning

In [16]:
def fine_tune_hyperparams(model, param_grid, X_train, y_train, X_val, y_val,
                          *, cv=3, scoring='accuracy', n_jobs=-1):
    """Grid-search ``param_grid`` for ``model`` and report the winner's validation score.

    Parameters
    ----------
    model : estimator or Pipeline to tune.
    param_grid : parameter grid passed straight to ``GridSearchCV``.
    X_train, y_train : data the cross-validated search is run on.
    X_val, y_val : held-out data; used only to print the best model's score.
    cv : cross-validation setting forwarded to ``GridSearchCV`` (default 3).
    scoring : scoring setting forwarded to ``GridSearchCV`` (default 'accuracy').
    n_jobs : parallelism setting forwarded to ``GridSearchCV`` (default -1).

    Returns
    -------
    best_model : ``best_estimator_`` of the finished search.
    best_score : ``best_score_`` of the search, i.e. the cross-validation
        score. The validation score is printed but NOT returned.
    """
    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring=scoring, n_jobs=n_jobs)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    best_score = grid_search.best_score_

    print("\n=== Hyperparameter Tuning ===")
    print("Best parameters:", grid_search.best_params_)
    print(f"Best CV score: {best_score:.4f}")

    # Evaluate best_model on the validation set
    val_acc = best_model.score(X_val, y_val)
    print(f"Validation Accuracy of best model: {val_acc:.4f}")

    return best_model, best_score

### Scoring the benchmark models on the test set

In [17]:
def score_benchmark_models_on_test(models, X_test, y_test):
    """
    Scores each fitted benchmark model on the test set, prints its metrics,
    and selects the best one based on test accuracy.

    Parameters
    ----------
    models : dict
        A dictionary of {model_name: fitted_model, ...}
        e.g. {
           'naive_bayes': nb_pipeline,
           'log_reg': lr_pipeline,
           'svm': svm_pipeline
        }
        Any number of entries is accepted.
    X_test, y_test : test data

    Returns
    -------
    best_model_name : str
        Name of the best-performing model. On a tie the model that appears
        first in ``models`` wins. ``None`` if ``models`` is empty.
    best_accuracy : float
        Accuracy of the best-performing model (0.0 if ``models`` is empty).
    """
    best_model_name = None
    best_accuracy = 0.0

    for model_name, model in models.items():
        y_pred = model.predict(X_test)
        test_acc = accuracy_score(y_test, y_pred)

        print(f"\n=== {model_name.upper()} on Test Set ===")
        print("Test Accuracy:", test_acc)
        print("Classification Report:\n", classification_report(y_test, y_pred))
        print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

        # Always accept the first model: with a strict ``>`` against the 0.0
        # starting value, a run where every model scores 0.0 would otherwise
        # report no winner at all.
        if best_model_name is None or test_acc > best_accuracy:
            best_accuracy = test_acc
            best_model_name = model_name

    print(f"\nBest model on Test Set: {best_model_name} with accuracy={best_accuracy:.4f}")
    return best_model_name, best_accuracy

### Main workflow

In [24]:
if __name__ == "__main__":
    # Read the three pre-made splits from disk.
    train_df = load_csv_data(r"D:\sem4\Aml\Assignment1\data_splits\train.csv")
    val_df = load_csv_data(r"D:\sem4\Aml\Assignment1\data_splits\val.csv")
    test_df = load_csv_data(r"D:\sem4\Aml\Assignment1\data_splits\test.csv")

    # Discard, in place, every row whose message is missing.
    for split_df in (train_df, val_df, test_df):
        split_df.dropna(subset=["message"], inplace=True)

    # Separate the text column (features) from the label column per split.
    X_train, y_train = prepare_features_labels(train_df, feature_col="message", label_col="label")
    X_val, y_val = prepare_features_labels(val_df, feature_col="message", label_col="label")
    X_test, y_test = prepare_features_labels(test_df, feature_col="message", label_col="label")

    # Three benchmark pipelines: TF-IDF features feeding a different classifier.
    nb_pipeline = Pipeline(steps=[("tfidf", TfidfVectorizer()), ("clf", MultinomialNB())])
    lr_pipeline = Pipeline(steps=[("tfidf", TfidfVectorizer()), ("clf", LogisticRegression(max_iter=1000))])
    svm_pipeline = Pipeline(steps=[("tfidf", TfidfVectorizer()), ("clf", SVC())])

    # Fit each pipeline on train and report train / validation metrics.
    print("\n========== Train & Evaluate: Naive Bayes ==========")
    nb_model, nb_scores = train_and_evaluate(nb_pipeline, X_train, y_train, X_val, y_val)
    nb_train_acc, nb_val_acc = nb_scores

    print("\n========== Train & Evaluate: Logistic Regression ==========")
    lr_model, lr_scores = train_and_evaluate(lr_pipeline, X_train, y_train, X_val, y_val)
    lr_train_acc, lr_val_acc = lr_scores

    print("\n========== Train & Evaluate: SVM ==========")
    svm_model, svm_scores = train_and_evaluate(svm_pipeline, X_train, y_train, X_val, y_val)
    svm_train_acc, svm_val_acc = svm_scores

    # Compare the three fitted models on the test split.
    all_models = dict(naive_bayes=nb_model, log_reg=lr_model, svm=svm_model)
    best_model_name, best_test_acc = score_benchmark_models_on_test(all_models, X_test, y_test)

    print(f"\n=== Best model on Test is '{best_model_name}' with test accuracy={best_test_acc:.4f} ===")



=== Model: MultinomialNB ===
--- Train Evaluation ---

=== Evaluation: Train ===
Accuracy: 0.9773135669362084
Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99      3854
           1       1.00      0.83      0.91       598

    accuracy                           0.98      4452
   macro avg       0.99      0.92      0.95      4452
weighted avg       0.98      0.98      0.98      4452

Confusion Matrix:
 [[3854    0]
 [ 101  497]]
--- Validation Evaluation ---

=== Evaluation: Validation ===
Accuracy: 0.9640933572710951
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       483
           1       1.00      0.73      0.84        74

    accuracy                           0.96       557
   macro avg       0.98      0.86      0.91       557
weighted avg       0.97      0.96      0.96       557

Confusion Matrix:
 [[483   0]
 [ 20  54]]
Train A