### Arka Roy- MDS202311

### Train.ipynb

## Importing necessary libraries

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


### Function to load csv data

In [2]:
# Function to load CSV data 
def load_csv_data(file_path: str) -> pd.DataFrame:
    if not os.path.exists(file_path):
        print(f"Error: File not found at {file_path}")
        return None
    try:
        df = pd.read_csv(file_path)
        print(f"Loaded dataset: {file_path} with {df.shape[0]} rows and {df.shape[1]} columns.")
        return df
    except Exception as e:
        print(f"Error loading dataset {file_path}: {e}")
        return None

### Function to extract features and labels

In [3]:
# Function to extract features and labels
def prepare_features_labels(df: pd.DataFrame, feature_col: str = "message", label_col: str = "label"):
    return df[feature_col], df[label_col]

### Function to train a model

In [4]:
# Function to train a model
def fit_model(model, X_train, y_train):
    model.fit(X_train, y_train)
    return model

### Function to evaluate predictions

In [5]:
# Function to evaluate predictions
def evaluate_model_predictions(y_true, y_pred, title: str = ""):
    print(f"\n=== Evaluation: {title} ===")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Classification Report:\n", classification_report(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))

### Function to train and evaluate models

In [6]:
# Function to train and evaluate models
def train_and_evaluate(model, X_train, y_train, X_val, y_val):
    model = fit_model(model, X_train, y_train)

    # Predictions
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)

    print(f"\n=== Model: {type(model.steps[-1][1]).__name__} ===")
    
    print("--- Train Evaluation ---")
    evaluate_model_predictions(y_train, y_train_pred, title="Train")

    print("--- Validation Evaluation ---")
    evaluate_model_predictions(y_val, y_val_pred, title="Validation")

    return model


### Function to compare models on test set

In [7]:
# Function to compare models on test set
def score_benchmark_models_on_test(models, X_test, y_test):
    best_model_name = None
    best_accuracy = 0.0

    for model_name, model in models.items():
        y_pred = model.predict(X_test)
        test_acc = accuracy_score(y_test, y_pred)

        print(f"\n=== {model_name.upper()} on Test Set ===")
        print("Test Accuracy:", test_acc)
        print("Classification Report:\n", classification_report(y_test, y_pred))
        print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

        if test_acc > best_accuracy:
            best_accuracy = test_acc
            best_model_name = model_name

    print(f"\nBest model on Test Set: {best_model_name} with accuracy={best_accuracy:.4f}")
    return best_model_name, best_accuracy

### Main Workflow

In [10]:
import os
if __name__ == "__main__":
    # Correct file paths
    data_directory = "/Users/arkaroy/Downloads/sms+spam+collection/data_splits"
    
    train_file = os.path.join(data_directory, "train.csv")
    val_file = os.path.join(data_directory, "validation.csv")  # Corrected filename
    test_file = os.path.join(data_directory, "test.csv")

    # Load datasets
    train_df = load_csv_data(train_file)
    val_df = load_csv_data(val_file)
    test_df = load_csv_data(test_file)


    # Drop missing messages safely
    for df, name in zip([train_df, val_df, test_df], ["Train", "Validation", "Test"]):
        if "message" in df.columns:
            df.dropna(subset=["message"], inplace=True)
            print(f"{name} dataset after removing NaNs: {df.shape[0]} rows")
        else:
            print(f"Error: Column 'message' not found in {name} dataset.")
            exit()

    # Prepare features and labels
    def prepare_features_labels(df, feature_col="message", label_col="label"):
        return df[feature_col], df[label_col]

    X_train, y_train = prepare_features_labels(train_df)
    X_val, y_val = prepare_features_labels(val_df)
    X_test, y_test = prepare_features_labels(test_df)

    # Define models
    models = {
        "naive_bayes": Pipeline([("tfidf", TfidfVectorizer()), ("clf", MultinomialNB())]),
        "log_reg": Pipeline([("tfidf", TfidfVectorizer()), ("clf", LogisticRegression(max_iter=1000))]),
        "svm": Pipeline([("tfidf", TfidfVectorizer()), ("clf", SVC())])
    }

    # Train and evaluate models
    trained_models = {}
    for model_name, pipeline in models.items():
        print(f"\n========== Train & Evaluate: {model_name.upper()} ==========")
        pipeline.fit(X_train, y_train)
        trained_models[model_name] = pipeline

    # Test evaluation function
    def score_benchmark_models_on_test(models, X_test, y_test):
        best_model_name = None
        best_accuracy = 0.0

        for model_name, model in models.items():
            y_pred = model.predict(X_test)
            test_acc = accuracy_score(y_test, y_pred)

            print(f"\n=== {model_name.upper()} on Test Set ===")
            print("Test Accuracy:", test_acc)
            print("Classification Report:\n", classification_report(y_test, y_pred))
            print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

            if test_acc > best_accuracy:
                best_accuracy = test_acc
                best_model_name = model_name

        print(f"\nBest model on Test Set: {best_model_name} with accuracy={best_accuracy:.4f}")
        return best_model_name, best_accuracy

    # Run evaluation
    best_model_name, best_test_acc = score_benchmark_models_on_test(trained_models, X_test, y_test)

    print(f"\n=== Best model on Test is '{best_model_name}' with accuracy={best_test_acc:.4f} ===")


Loaded dataset: /Users/arkaroy/Downloads/sms+spam+collection/data_splits/train.csv with 4457 rows and 2 columns.
Loaded dataset: /Users/arkaroy/Downloads/sms+spam+collection/data_splits/validation.csv with 557 rows and 2 columns.
Loaded dataset: /Users/arkaroy/Downloads/sms+spam+collection/data_splits/test.csv with 558 rows and 2 columns.
Train dataset after removing NaNs: 4452 rows
Validation dataset after removing NaNs: 557 rows
Test dataset after removing NaNs: 557 rows




=== NAIVE_BAYES on Test Set ===
Test Accuracy: 0.9658886894075404
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       482
           1       1.00      0.75      0.85        75

    accuracy                           0.97       557
   macro avg       0.98      0.87      0.92       557
weighted avg       0.97      0.97      0.96       557

Confusion Matrix:
 [[482   0]
 [ 19  56]]

=== LOG_REG on Test Set ===
Test Accuracy: 0.9694793536804