<a href="https://colab.research.google.com/github/22Himanshu/AppliedMachineLearning/blob/main/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Himanshu (MDS202328)

## Applied_ML_Spam_Classifier_1

#### Importing Libraries and Mounting Google Drive

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import os
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read by path.
drive.mount('/content/drive')

Mounted at /content/drive


In [28]:
def load_splits(data_dir):
    """Read the train, validation and test splits stored as CSV files.

    Args:
        data_dir: Directory containing ``train.csv``, ``validation.csv`` and
            ``test.csv``.

    Returns:
        A ``(train, val, test)`` tuple of DataFrames, in that order.
    """
    # Files are read lazily in the order listed, then unpacked into three names.
    train, val, test = (
        pd.read_csv(os.path.join(data_dir, filename))
        for filename in ('train.csv', 'validation.csv', 'test.csv')
    )
    return train, val, test

def vectorize_data(train, val, test):
    """Converts text data into TF-IDF vectors.

    The vectorizer is fitted on the training messages only and then applied
    unchanged to the validation and test messages. Missing messages (NaN/None)
    are treated as empty documents instead of being passed to the vectorizer.

    Args:
        train: DataFrame with ``message`` and ``label`` columns.
        val: DataFrame with ``message`` and ``label`` columns.
        test: DataFrame with ``message`` and ``label`` columns.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test, vectorizer)`` where
        the ``X_*`` values are the vectorizer outputs, the ``y_*`` values are
        the untouched ``label`` columns, and ``vectorizer`` is the fitted
        ``TfidfVectorizer``.
    """
    def _messages(frame):
        # An empty CSV cell is read back by pandas as NaN, and the vectorizer
        # rejects NaN documents; map missing values to '' and force str so
        # every document is a plain string. The input frame is not modified.
        return frame['message'].fillna('').astype(str)

    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(_messages(train))
    X_val = vectorizer.transform(_messages(val))
    X_test = vectorizer.transform(_messages(test))
    return X_train, X_val, X_test, train['label'], val['label'], test['label'], vectorizer

def fit_model(X_train, y_train, model):
    """Train ``model`` in place on the given data and hand it back.

    Args:
        X_train: Training features passed to ``model.fit``.
        y_train: Training targets passed to ``model.fit``.
        model: Any object exposing ``fit(X, y)``.

    Returns:
        The same ``model`` object that was passed in (not the return value
        of ``fit``).
    """
    estimator = model
    estimator.fit(X_train, y_train)
    return estimator

def score_model(model, X, y):
    """Predict on ``X`` and compare the predictions against ``y``.

    Args:
        model: Fitted object exposing ``predict(X)``.
        X: Features to predict on.
        y: True labels for ``X``.

    Returns:
        ``(accuracy, report)``: the result of ``accuracy_score`` and of
        ``classification_report`` for the predictions.
    """
    predictions = model.predict(X)
    accuracy = accuracy_score(y, predictions)
    report = classification_report(y, predictions)
    return accuracy, report

def evaluate_model(model, X_train, y_train, X_val, y_val):
    """Score ``model`` on the training and validation splits and print a summary.

    Prints the training accuracy, the validation accuracy and the validation
    classification report, each prefixed with the model's string form.

    Args:
        model: Fitted model to score.
        X_train: Training features.
        y_train: Training labels.
        X_val: Validation features.
        y_val: Validation labels.

    Returns:
        ``(train_acc, val_acc)``.
    """
    # The training report is computed by score_model but is not displayed.
    train_acc, _train_report = score_model(model, X_train, y_train)
    val_acc, val_report = score_model(model, X_val, y_val)

    accuracies = (train_acc, val_acc)
    print(f"{model} Training Accuracy:", accuracies[0])
    print(f"{model} Validation Accuracy:", accuracies[1])
    print(f"{model} Validation Report:\n", val_report)
    return accuracies

def fine_tune_model(X_train, y_train, X_val, y_val):
    """Grid-search the ``C`` parameter of a logistic regression.

    Runs a 3-fold ``GridSearchCV`` over ``C`` in ``[0.1, 1, 10, 100]`` on the
    training data, prints the best parameters, and returns the best estimator.

    Args:
        X_train: Training features used for the cross-validated search.
        y_train: Training labels used for the cross-validated search.
        X_val: Accepted for signature symmetry; not used by this function.
        y_val: Accepted for signature symmetry; not used by this function.

    Returns:
        ``best_estimator_`` of the fitted grid search.
    """
    regularization_grid = {'C': [0.1, 1, 10, 100]}
    search = GridSearchCV(
        estimator=LogisticRegression(),
        param_grid=regularization_grid,
        cv=3,
    )
    search.fit(X_train, y_train)
    print("Best parameters:", search.best_params_)
    return search.best_estimator_

def benchmark_models(X_train, y_train, X_val, y_val, X_test, y_test):
    """Trains and evaluates multiple models, selecting the best one.

    Each candidate (MultinomialNB, LogisticRegression, SVC, all with default
    settings) is fitted on the training split and evaluated on the training
    and validation splits. The candidate with the highest validation accuracy
    is kept (the earliest candidate wins ties) and is then scored once on the
    test split; its test accuracy and report are printed.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_val: Validation features used for model selection.
        y_val: Validation labels used for model selection.
        X_test: Test features, used only for the final report.
        y_test: Test labels, used only for the final report.

    Returns:
        The fitted model with the best validation accuracy.
    """
    models = {
        'Naive Bayes': MultinomialNB(),
        'Logistic Regression': LogisticRegression(),
        'SVM': SVC()
    }
    # Start below any reachable accuracy: with a starting value of 0 and a
    # strict ">" comparison, candidates that all score 0.0 would leave
    # best_model as None and the test scoring below would fail.
    best_model, best_acc = None, float("-inf")
    for model in models.values():
        model = fit_model(X_train, y_train, model)
        _, val_acc = evaluate_model(model, X_train, y_train, X_val, y_val)
        print("\n-----------------------------------------------------------------")
        print("\n")
        if val_acc > best_acc:
            best_acc = val_acc
            best_model = model
    test_acc, test_report = score_model(best_model, X_test, y_test)
    print(f"The best model is {best_model}")
    print("Best Model Test Accuracy:", test_acc)
    print("Test Report:\n", test_report)
    return best_model

In [29]:
if __name__ == "__main__":
    # Directory on the mounted Google Drive that holds train.csv,
    # validation.csv and test.csv.
    data_dir = "/content/drive/MyDrive/Applied_ML/data"
    # Read the three pre-made splits from disk.
    train, val, test = load_splits(data_dir)
    # Fit TF-IDF on the training messages only, then transform all three splits.
    X_train, X_val, X_test, y_train, y_val, y_test, vectorizer = vectorize_data(train, val, test)
    # Fit each candidate classifier, keep the one with the best validation
    # accuracy, and print its test-set accuracy and report.
    best_model = benchmark_models(X_train, y_train, X_val, y_val, X_test, y_test)

MultinomialNB() Training Accuracy: 0.9738461538461538
MultinomialNB() Validation Accuracy: 0.9557416267942583
MultinomialNB() Validation Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.98       724
           1       1.00      0.67      0.80       112

    accuracy                           0.96       836
   macro avg       0.98      0.83      0.89       836
weighted avg       0.96      0.96      0.95       836


-----------------------------------------------------------------


LogisticRegression() Training Accuracy: 0.9733333333333334
LogisticRegression() Validation Accuracy: 0.9677033492822966
LogisticRegression() Validation Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       724
           1       1.00      0.76      0.86       112

    accuracy                           0.97       836
   macro avg       0.98      0.88      0.92       836
weighted avg       0.97  