In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC


from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import precision_recall_curve, auc
from sklearn.metrics import average_precision_score

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the three pre-split datasets (train / validation / test) from the working directory.
train_df, val_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "validate.csv", "test.csv")
)

In [3]:
# Discard rows that lack either the message text or its label, in every split.
train_df, val_df, test_df = (
    split.dropna(subset=["text", "label"])
    for split in (train_df, val_df, test_df)
)

In [4]:
# Separate the "text" column (model input) from the "label" column (target) per split.
X_train, y_train = train_df["text"], train_df["label"]
X_val, y_val = val_df["text"], val_df["label"]
X_test, y_test = test_df["text"], test_df["label"]

In [5]:
import mlflow
import mlflow.sklearn


# Model 1 - Multinomial Naive Bayes

In [6]:
# Baseline pipeline: TF-IDF features followed by a Multinomial Naive Bayes classifier.
mnb_model = Pipeline(steps=[
    (
        "tfidf",
        TfidfVectorizer(
            stop_words="english",
            ngram_range=(1, 2),  # unigrams and bigrams, so phrases like "free offer" become features
            max_features=5000,
        ),
    ),
    ("model", MultinomialNB()),
])

In [7]:
mlflow.set_experiment("Spam_Text_Models")

# Fit the baseline MultinomialNB pipeline, score it on the validation split and
# record the metric, the parameters and the fitted model in one MLflow run.
with mlflow.start_run(run_name="MultinomialNB"):

    mnb_model.fit(X_train, y_train)

    # AUCPR is computed from ranking scores, not hard predictions, so take the
    # second predict_proba column.
    # NOTE(review): assumes the second class (classes_[1]) is the positive/spam label — confirm.
    probs = mnb_model.predict_proba(X_val)[:, 1]

    aucpr = average_precision_score(y_val, probs)

    # log metric
    mlflow.log_metric("AUCPR", aucpr)

    # log parameters — read from the vectorizer itself so the logged values
    # cannot drift from the pipeline definition
    tfidf_step = mnb_model.named_steps["tfidf"]
    mlflow.log_param("max_features", tfidf_step.max_features)
    mlflow.log_param("ngram_range", ",".join(map(str, tfidf_step.ngram_range)))
    mlflow.log_param("model_type", "MultinomialNB")

    # save model (keyword `name=`, matching the sweep cells; the positional
    # form binds to the older `artifact_path` argument)
    mlflow.sklearn.log_model(mnb_model, name="MNB_model")

    print("MultinomialNB AUCPR:", aucpr)


2026/02/16 00:50:38 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.schemas
2026/02/16 00:50:38 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.tables
2026/02/16 00:50:38 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.types
2026/02/16 00:50:38 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.constraints
2026/02/16 00:50:38 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.defaults
2026/02/16 00:50:38 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.comments
2026/02/16 00:50:39 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/02/16 00:50:39 INFO alembic.runtime.migration: Will assume non-transactional DDL.


MultinomialNB AUCPR: 0.9559890275793826


In [8]:
# Sweep the TF-IDF vocabulary size for MultinomialNB; each setting gets its own MLflow run.
for size in [3000, 5000]:

    model = Pipeline([
        ("tfidf", TfidfVectorizer(max_features=size, ngram_range=(1,2), stop_words="english")),
        ("model", MultinomialNB())
    ])

    with mlflow.start_run(run_name=f"MNB_{size}"):

        model.fit(X_train, y_train)

        # second predict_proba column is used as the ranking score for AUCPR
        probs = model.predict_proba(X_val)[:,1]
        aucpr = average_precision_score(y_val, probs)

        mlflow.log_param("max_features", size)
        # also log the fixed settings so sweep runs carry the same parameter
        # set as the baseline run in this experiment
        mlflow.log_param("ngram_range", "1,2")
        mlflow.log_param("model_type", "MultinomialNB")
        mlflow.log_metric("AUCPR", aucpr)

        mlflow.sklearn.log_model(model, name="MNB")

        print(f"MNB {size} → AUCPR: {aucpr:.4f}")


MNB 3000 → AUCPR: 0.9644
MNB 5000 → AUCPR: 0.9560


In [9]:
# Score the baseline MultinomialNB pipeline on the held-out test split.
test_probs = mnb_model.predict_proba(X_test)[:, 1]  # second predict_proba column
test_mnb_aucpr = average_precision_score(y_true=y_test, y_score=test_probs)

print("Test AUCPR:", test_mnb_aucpr)


Test AUCPR: 0.9660944794920839


# Model 2 - Logistic Regression

In [10]:
# Same TF-IDF front end as the Naive Bayes pipeline, with Logistic Regression on top.
log_model = Pipeline(steps=[
    (
        "tfidf",
        TfidfVectorizer(
            stop_words="english",
            ngram_range=(1, 2),
            max_features=5000,
        ),
    ),
    ("model", LogisticRegression(max_iter=1000)),
])

In [11]:
mlflow.set_experiment("Spam_Text_Models")

# Fit the baseline Logistic Regression pipeline, score it on the validation
# split and record the metric, the parameters and the fitted model in MLflow.
with mlflow.start_run(run_name="LogisticRegression"):

    log_model.fit(X_train, y_train)

    # AUCPR is computed from ranking scores, so take the second predict_proba column.
    # NOTE(review): assumes the second class (classes_[1]) is the positive/spam label — confirm.
    probs = log_model.predict_proba(X_val)[:, 1]

    aucpr = average_precision_score(y_val, probs)

    # log metric
    mlflow.log_metric("AUCPR", aucpr)

    # log parameters — read from the vectorizer itself so the logged values
    # cannot drift from the pipeline definition
    tfidf_step = log_model.named_steps["tfidf"]
    mlflow.log_param("max_features", tfidf_step.max_features)
    mlflow.log_param("ngram_range", ",".join(map(str, tfidf_step.ngram_range)))
    mlflow.log_param("model_type", "LogisticRegression")

    # save model (keyword `name=`, matching the sweep cells; the positional
    # form binds to the older `artifact_path` argument)
    mlflow.sklearn.log_model(log_model, name="LR_model")

    print("LogisticRegression AUCPR:", aucpr)




LogisticRegression AUCPR: 0.9741479641191884


In [12]:
# Sweep the inverse regularisation strength C for Logistic Regression; one MLflow run per value.
for c in [1, 5]:

    model = Pipeline([
        ("tfidf", TfidfVectorizer(max_features=5000, ngram_range=(1,2), stop_words="english")),
        # max_iter matches the baseline LogisticRegression pipeline (1000) so C is
        # the only setting that differs between runs; with warnings silenced, a
        # solver stopping early at a lower cap would go unnoticed.
        ("model", LogisticRegression(C=c, max_iter=1000))
    ])

    with mlflow.start_run(run_name=f"LR_C{c}"):

        model.fit(X_train, y_train)

        # second predict_proba column is used as the ranking score for AUCPR
        probs = model.predict_proba(X_val)[:,1]
        aucpr = average_precision_score(y_val, probs)

        mlflow.log_param("C", c)
        # also log the fixed settings so sweep runs carry the same parameter
        # set as the baseline run in this experiment
        mlflow.log_param("max_features", 5000)
        mlflow.log_param("ngram_range", "1,2")
        mlflow.log_param("model_type", "LogisticRegression")
        mlflow.log_metric("AUCPR", aucpr)

        mlflow.sklearn.log_model(model, name="LogReg")

        print(f"LR C={c} → AUCPR: {aucpr:.4f}")


LR C=1 → AUCPR: 0.9741
LR C=5 → AUCPR: 0.9790


In [13]:
# Score the baseline Logistic Regression pipeline on the held-out test split.
test_probs = log_model.predict_proba(X_test)[:, 1]  # second predict_proba column
test_log_aucpr = average_precision_score(y_true=y_test, y_score=test_probs)

print("Test AUCPR:", test_log_aucpr)


Test AUCPR: 0.973632545256588


# Model 3 - Linear SVM (LinearSVC)

In [14]:
# Same TF-IDF front end as the other pipelines, with a linear SVM (default settings) on top.
svm_model = Pipeline(steps=[
    (
        "tfidf",
        TfidfVectorizer(
            stop_words="english",
            ngram_range=(1, 2),
            max_features=5000,
        ),
    ),
    ("model", LinearSVC()),
])

In [15]:
mlflow.set_experiment("Spam_Text_Models")

# Fit the baseline LinearSVC pipeline, score it on the validation split and
# record the metric, the parameters and the fitted model in one MLflow run.
with mlflow.start_run(run_name="LinearSVC"):

    svm_model.fit(X_train, y_train)

    # LinearSVC exposes no predict_proba; AUCPR only needs a ranking, so the
    # signed decision_function margins serve as the scores here.
    scores = svm_model.decision_function(X_val)

    aucpr = average_precision_score(y_val, scores)

    # log metric
    mlflow.log_metric("AUCPR", aucpr)

    # log parameters — read from the vectorizer itself so the logged values
    # cannot drift from the pipeline definition
    tfidf_step = svm_model.named_steps["tfidf"]
    mlflow.log_param("max_features", tfidf_step.max_features)
    mlflow.log_param("ngram_range", ",".join(map(str, tfidf_step.ngram_range)))
    mlflow.log_param("model_type", "LinearSVC")

    # save model (keyword `name=`, matching the sweep cells; the positional
    # form binds to the older `artifact_path` argument)
    mlflow.sklearn.log_model(svm_model, name="SVM_model")

    print("LinearSVC AUCPR:", aucpr)




LinearSVC AUCPR: 0.9795351567999901


In [16]:
# Compare LinearSVC without and with balanced class weights; one MLflow run per setting.
for weight in [None, "balanced"]:

    model = Pipeline([
        ("tfidf", TfidfVectorizer(max_features=5000, ngram_range=(1,2), stop_words="english")),
        ("model", LinearSVC(class_weight=weight))
    ])

    with mlflow.start_run(run_name=f"SVM_weight_{weight}"):

        model.fit(X_train, y_train)

        # decision_function margins are used as the ranking scores for AUCPR
        scores = model.decision_function(X_val)
        aucpr = average_precision_score(y_val, scores)

        mlflow.log_param("class_weight", weight)
        # also log the fixed settings so sweep runs carry the same parameter
        # set as the baseline run in this experiment
        mlflow.log_param("max_features", 5000)
        mlflow.log_param("ngram_range", "1,2")
        mlflow.log_param("model_type", "LinearSVC")
        mlflow.log_metric("AUCPR", aucpr)

        mlflow.sklearn.log_model(model, name="LinearSVC")

        print(f"SVM weight={weight} → AUCPR: {aucpr:.4f}")


SVM weight=None → AUCPR: 0.9795
SVM weight=balanced → AUCPR: 0.9761


In [17]:
# Score the baseline LinearSVC pipeline on the held-out test split.
# Despite its name, test_probs holds decision_function margins here, not probabilities.
test_probs = svm_model.decision_function(X_test)
test_svm_aucpr = average_precision_score(y_true=y_test, y_score=test_probs)

print("Test AUCPR:", test_svm_aucpr)


Test AUCPR: 0.9864359173601204


In [18]:
# Side-by-side printout of the three test-split AUCPR values computed in earlier cells.
print("\nAUCPR Comparison")
print("---------------------")
for model_caption, test_score in (
    ("MultinomialNB:", test_mnb_aucpr),
    ("LogisticRegression:", test_log_aucpr),
    ("LinearSVC:", test_svm_aucpr),
):
    print(model_caption, test_score)


AUCPR Comparison
---------------------
MultinomialNB: 0.9660944794920839
LogisticRegression: 0.973632545256588
LinearSVC: 0.9864359173601204


In [19]:
experiment_name = "Spam_Text_Models"

# Look the experiment up by name; a missing experiment is reported, not raised.
experiment = mlflow.get_experiment_by_name(experiment_name)

if experiment is None:
    print(f"Experiment '{experiment_name}' not found")
else:
    runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

    # search_runs column -> leaderboard heading
    leaderboard_columns = {
        "tags.mlflow.runName": "Model",
        "metrics.AUCPR": "AUCPR",
        "start_time": "Run Time",
    }

    # Metric/tag columns only exist once at least one run has logged them, so an
    # empty experiment would otherwise fail with a KeyError on the selection below.
    missing_columns = [col for col in leaderboard_columns if col not in runs.columns]

    if missing_columns:
        print(f"No runs with a logged AUCPR found in experiment '{experiment_name}'")
    else:
        # keep useful columns, rename them, sort best → worst
        results = (
            runs[list(leaderboard_columns)]
            .rename(columns=leaderboard_columns)
            .sort_values(by="AUCPR", ascending=False)
        )

        print("\n🏆 Model Leaderboard (Validation AUCPR)")
        print("="*50)
        print(results.to_string(index=False))



🏆 Model Leaderboard (Validation AUCPR)
              Model    AUCPR                         Run Time
    SVM_weight_None 0.979535 2026-02-15 19:21:51.253000+00:00
          LinearSVC 0.979535 2026-02-15 19:21:41.926000+00:00
              LR_C5 0.978991 2026-02-15 19:21:36.289000+00:00
SVM_weight_balanced 0.976067 2026-02-15 19:22:00.448000+00:00
              LR_C1 0.974148 2026-02-15 19:21:26.803000+00:00
 LogisticRegression 0.974148 2026-02-15 19:21:15.961000+00:00
           MNB_3000 0.964423 2026-02-15 19:20:57.952000+00:00
           MNB_5000 0.955989 2026-02-15 19:21:08.995000+00:00
      MultinomialNB 0.955989 2026-02-15 19:20:39.988000+00:00


In [20]:
# No experiment is passed, so MLflow searches the currently active experiment.
runs = mlflow.search_runs()

# Show only the run name and its logged AUCPR, highest score first.
print(
    runs.loc[:, ["tags.mlflow.runName", "metrics.AUCPR"]]
        .sort_values("metrics.AUCPR", ascending=False)
)


   tags.mlflow.runName  metrics.AUCPR
1      SVM_weight_None       0.979535
2            LinearSVC       0.979535
3                LR_C5       0.978991
0  SVM_weight_balanced       0.976067
4                LR_C1       0.974148
5   LogisticRegression       0.974148
7             MNB_3000       0.964423
6             MNB_5000       0.955989
8        MultinomialNB       0.955989
