### Assignment 1 - Training and Evaluation (MDS202414)

In [22]:
#function to load the data, drop na and convert spam label to 1 and ham label to 0

import pandas as pd

def load_splits(train_path, val_path, test_path, label_col="label", pos_label="spam"):
    """Read the train/validation/test CSVs and split each into features and labels.

    Rows containing a missing value in any column are discarded. The label
    column is encoded as 1 where it equals ``pos_label`` and 0 otherwise, and
    it is removed from the returned feature frames.

    Args:
        train_path, val_path, test_path: CSV locations passed to ``pd.read_csv``.
        label_col: name of the column holding the class label.
        pos_label: label value that is mapped to 1.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test)``.
    """
    # Read every file first, then clean, so a bad path fails before any processing
    raw_frames = [pd.read_csv(csv_path) for csv_path in (train_path, val_path, test_path)]
    clean_frames = [frame.dropna() for frame in raw_frames]

    pieces = []
    for frame in clean_frames:
        target = (frame[label_col] == pos_label).astype(int)
        features = frame.drop(columns=[label_col])
        pieces.extend((features, target))

    return tuple(pieces)


In [23]:
X_train, y_train, X_val, y_val, X_test, y_test = load_splits("train_set.csv","validation_set.csv","test_set.csv")

In [31]:
#function to train and get score for model

from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score

def train_and_eval(model, X_train, y_train, X_val, y_val):
    """Fit ``model`` on the training split and score it on the validation split.

    Returns:
        dict with "accuracy" always present. "roc_auc" and "pr_auc" are added
        when the model exposes ``predict_proba`` (positive-class column is
        used) or, failing that, ``decision_function``.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_val)

    # Continuous scores are needed for the ranking metrics; prefer probabilities
    scores = None
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(X_val)[:, 1]
    elif hasattr(model, "decision_function"):
        scores = model.decision_function(X_val)

    metrics = {"accuracy": accuracy_score(y_val, predicted)}
    if scores is None:
        return metrics

    metrics["roc_auc"] = roc_auc_score(y_val, scores)
    metrics["pr_auc"] = average_precision_score(y_val, scores)
    return metrics


In [27]:
#function for hyperparameter tuning using the score function used in above function

from itertools import product
import numpy as np

def tune_model(model_class, param_grid, X_train, y_train, X_val, y_val, metric="pr_auc"):
    """Exhaustive grid search scored on a held-out validation split.

    Every combination of the values in ``param_grid`` is used to construct
    ``model_class(**params)``, which is then fitted and scored by
    ``train_and_eval``. The combination with the highest ``metric`` wins;
    ties keep the earliest combination.

    Args:
        model_class: estimator class called with the grid values as kwargs.
        param_grid: dict mapping parameter name -> iterable of candidate values.
            An empty dict evaluates the estimator once with its defaults.
        X_train, y_train: data the candidates are fitted on.
        X_val, y_val: data the candidates are scored on.
        metric: key of the metrics dict used for selection.

    Returns:
        ``(best_model, best_params, best_score)``. If some parameter has no
        candidate values, nothing is evaluated and ``(None, None, -inf)`` is
        returned.

    Raises:
        KeyError: if ``metric`` is not among the metrics computed for a model
            (e.g. a ranking metric for a model without continuous scores).
    """
    best_score = -np.inf
    best_params = None
    best_model = None

    # Avoid `zip(*param_grid.items())`, which cannot be unpacked for an empty
    # grid. product() of zero iterables yields one empty tuple, so an empty
    # grid still evaluates the default-constructed estimator.
    keys = list(param_grid)
    values = [param_grid[key] for key in keys]

    for combo in product(*values):
        params = dict(zip(keys, combo))
        model = model_class(**params)

        metrics = train_and_eval(model, X_train, y_train, X_val, y_val)
        if metric not in metrics:
            raise KeyError(
                f"metric {metric!r} is not available for {model_class.__name__}; "
                f"computed metrics: {sorted(metrics)}"
            )
        score = metrics[metric]

        print(f"{model_class.__name__} params={params} metrics={metrics}")

        if score > best_score:
            best_score = score
            best_params = params
            best_model = model

    return best_model, best_params, best_score


In [40]:
#get score on test data, for final comparison between models

import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score

def evaluate_on_test(model_class, best_params, X_train, y_train, X_val, y_val, X_test, y_test):
    """Refit the chosen configuration on train + validation and score it on test.

    Args:
        model_class: estimator class, constructed as ``model_class(**best_params)``.
        best_params: dict of constructor keyword arguments.
        X_train, X_val: feature matrices with the same number of columns; dense
            array-likes or scipy sparse matrices.
        y_train, y_val: labels for the two splits.
        X_test, y_test: held-out data used only for scoring.

    Returns:
        dict with "accuracy" always present. "roc_auc" and "pr_auc" are added
        when the model exposes ``predict_proba`` (positive-class column is
        used) or, failing that, ``decision_function``.
    """
    # Imported here so the dense path does not depend on a notebook-level import
    from scipy import sparse

    # np.vstack does not stack scipy sparse matrices row-wise (it wraps them in
    # an object array), so sparse features need scipy's own vstack.
    if sparse.issparse(X_train) or sparse.issparse(X_val):
        X_train_full = sparse.vstack([X_train, X_val], format="csr")
    else:
        X_train_full = np.vstack([X_train, X_val])
    y_train_full = np.concatenate([y_train, y_val])

    model = model_class(**best_params)
    model.fit(X_train_full, y_train_full)

    y_pred = model.predict(X_test)

    # Continuous scores for the ranking metrics; prefer probabilities
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_test)
    else:
        y_score = None

    results = {"accuracy": accuracy_score(y_test, y_pred)}

    if y_score is not None:
        results["roc_auc"] = roc_auc_score(y_test, y_score)
        results["pr_auc"] = average_precision_score(y_test, y_score)

    return results


In [37]:
#setting up the hyperparameter space for the 3 models

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Candidate models: display name -> (estimator class, grid of constructor kwargs).
# Each grid entry maps a parameter name to the list of values to try.
models = {
    "LogReg": (
        LogisticRegression,
        dict(C=[0.01, 0.1, 1, 10], solver=["liblinear"], max_iter=[1000]),
    ),
    "NaiveBayes": (
        MultinomialNB,
        dict(alpha=[0.1, 0.5, 1, 2]),
    ),
    "LinearSVM": (
        LinearSVC,
        dict(C=[0.01, 0.1, 1, 10], max_iter=[5000]),
    ),
}


In [38]:
#scaling data -> dividing each feature by its training-set standard deviation (no mean-centering)
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# scaler to every split so validation/test statistics never influence it.
scaler = StandardScaler(with_mean=False).fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)


In [41]:
#finding best hyper-parameters
# Per model: grid-search on the validation split, then refit the winning
# configuration and record its test-set metrics under the model's name.
results = {}

for name, (model_class, param_grid) in models.items():
    print("\n==============================")
    print("TUNING:", name)

    # Selection metric is PR-AUC (spam -> PR-AUC best)
    best_model, best_params, best_score = tune_model(
        model_class, param_grid, X_train, y_train, X_val, y_val, metric="pr_auc"
    )
    print(f"BEST {name}: params={best_params}, val_score={best_score}")

    test_metrics = evaluate_on_test(
        model_class, best_params, X_train, y_train, X_val, y_val, X_test, y_test
    )
    results[name] = dict(best_params=best_params, test_metrics=test_metrics)



TUNING: LogReg
LogisticRegression params={'C': 0.01, 'solver': 'liblinear', 'max_iter': 1000} metrics={'accuracy': 0.972488038277512, 'roc_auc': np.float64(0.9609720798408008), 'pr_auc': np.float64(0.9337120656590089)}
LogisticRegression params={'C': 0.1, 'solver': 'liblinear', 'max_iter': 1000} metrics={'accuracy': 0.9712918660287081, 'roc_auc': np.float64(0.9605861424350238), 'pr_auc': np.float64(0.933600426169063)}
LogisticRegression params={'C': 1, 'solver': 'liblinear', 'max_iter': 1000} metrics={'accuracy': 0.972488038277512, 'roc_auc': np.float64(0.9606826267864682), 'pr_auc': np.float64(0.9351292102410739)}
LogisticRegression params={'C': 10, 'solver': 'liblinear', 'max_iter': 1000} metrics={'accuracy': 0.9712918660287081, 'roc_auc': np.float64(0.9622504974974371), 'pr_auc': np.float64(0.9360148391560262)}
BEST LogReg: params={'C': 10, 'solver': 'liblinear', 'max_iter': 1000}, val_score=0.9360148391560262

TUNING: NaiveBayes
MultinomialNB params={'alpha': 0.1} metrics={'accura



LinearSVC params={'C': 0.1, 'max_iter': 5000} metrics={'accuracy': 0.9712918660287081, 'roc_auc': np.float64(0.9626967376228668), 'pr_auc': np.float64(0.9355868138749771)}




LinearSVC params={'C': 1, 'max_iter': 5000} metrics={'accuracy': 0.965311004784689, 'roc_auc': np.float64(0.9840439003799071), 'pr_auc': np.float64(0.9664385381908788)}
LinearSVC params={'C': 10, 'max_iter': 5000} metrics={'accuracy': 0.9665071770334929, 'roc_auc': np.float64(0.9888801784960501), 'pr_auc': np.float64(0.9743866213819193)}
BEST LinearSVM: params={'C': 10, 'max_iter': 5000}, val_score=0.9743866213819193


In [42]:
#evaluating on test data to compare the 3 models
import pandas as pd

# One row per model, one column per test metric
df_results = pd.DataFrame(
    dict((name, res["test_metrics"]) for name, res in results.items())
).transpose()

print(df_results)


            accuracy   roc_auc    pr_auc
LogReg      0.976105  0.978362  0.959598
NaiveBayes  0.958184  0.973498  0.875465
LinearSVM   0.972521  0.989101  0.947018


The results suggest:
1. Logistic Regression gives best results overall.
2. LinearSVM has the highest ROC-AUC but a lower PR-AUC than Logistic Regression, which suggests it produces more false positives among its highest-scored messages.
3. NaiveBayes seems to be overfitting on the train/validation data, leading to the lowest test accuracy and PR-AUC of the three models.