In [1]:
import pandas as pd
import numpy as np
import optuna
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.utils import shuffle
import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# =========================================
# 1Ô∏è‚É£ Load Cleaned Dataset
# =========================================
df = pd.read_csv("cleaned_spam_dataset.csv")

# Ensure no missing text
df["clean_text"] = df["clean_text"].astype(str).fillna("")
df = shuffle(df, random_state=42)

X = df["clean_text"]
y = df["spam"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=3000, stop_words="english")
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# =========================================
# 2Ô∏è‚É£ Define Objective Function for Optuna
# =========================================
def objective(trial):
    model_name = trial.suggest_categorical(
        "model", ["NaiveBayes", "LogisticRegression", "RandomForest", "XGBoost"]
    )
    
    if model_name == "NaiveBayes":
        alpha = trial.suggest_float("alpha", 0.1, 2.0, step=0.1)
        model = MultinomialNB(alpha=alpha)
    
    elif model_name == "LogisticRegression":
        c = trial.suggest_float("C", 0.01, 10.0, log=True)
        penalty = trial.suggest_categorical("penalty", ["l2"])
        model = LogisticRegression(C=c, penalty=penalty, solver="lbfgs", max_iter=2000)
    
    elif model_name == "RandomForest":
        n_estimators = trial.suggest_int("n_estimators", 100, 500)
        max_depth = trial.suggest_int("max_depth", 5, 30)
        min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            random_state=42,
            n_jobs=-1
        )
    
    elif model_name == "XGBoost":
        n_estimators = trial.suggest_int("n_estimators", 100, 500)
        max_depth = trial.suggest_int("max_depth", 3, 10)
        learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
        subsample = trial.suggest_float("subsample", 0.5, 1.0)
        model = XGBClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            subsample=subsample,
            eval_metric="logloss",
            use_label_encoder=False,
            random_state=42
        )

    # Train and evaluate
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    # We‚Äôll maximize F1-score (you can change to accuracy if you want)
    f1 = f1_score(y_test, y_pred)
    return f1


# =========================================
# 3Ô∏è‚É£ Run Optuna Optimization
# =========================================
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=25, show_progress_bar=True)

# =========================================
# 4Ô∏è‚É£ Results
# =========================================
print("\nüèÅ Optimization finished!")
print("Best Model:", study.best_params["model"])
print("Best Parameters:", study.best_params)
print("Best F1 Score:", round(study.best_value, 4))

# =========================================
# 5Ô∏è‚É£ Retrain Best Model on Full Dataset
# =========================================
best_params = study.best_params
best_model_name = best_params["model"]

if best_model_name == "NaiveBayes":
    model = MultinomialNB(alpha=best_params["alpha"])

elif best_model_name == "LogisticRegression":
    model = LogisticRegression(C=best_params["C"], penalty="l2", solver="lbfgs", max_iter=2000)

elif best_model_name == "RandomForest":
    model = RandomForestClassifier(
        n_estimators=best_params["n_estimators"],
        max_depth=best_params["max_depth"],
        min_samples_split=best_params["min_samples_split"],
        random_state=42,
        n_jobs=-1
    )

elif best_model_name == "XGBoost":
    model = XGBClassifier(
        n_estimators=best_params["n_estimators"],
        max_depth=best_params["max_depth"],
        learning_rate=best_params["learning_rate"],
        subsample=best_params["subsample"],
        eval_metric="logloss",
        use_label_encoder=False,
        random_state=42
    )

model.fit(X_train_tfidf, y_train)
print("‚úÖ Best model retrained successfully!")

# =========================================
# 6Ô∏è‚É£ Save Best Model and Vectorizer
# =========================================
with open("best_spam_model.pkl", "wb") as f:
    pickle.dump(model, f)

with open("best_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

print("üíæ Best model and vectorizer saved successfully!")


[I 2025-11-09 18:40:47,529] A new study created in memory with name: no-name-78cfc9c2-b379-4f76-bc17-29ea269b66a0
Best trial: 1. Best value: 0.90216:   4%|‚ñç         | 1/25 [00:00<00:11,  2.04it/s]

[I 2025-11-09 18:40:48,022] Trial 0 finished with value: 0.6978998384491115 and parameters: {'model': 'RandomForest', 'n_estimators': 212, 'max_depth': 16, 'min_samples_split': 5}. Best is trial 0 with value: 0.6978998384491115.
[I 2025-11-09 18:40:48,029] Trial 1 finished with value: 0.9021601016518425 and parameters: {'model': 'NaiveBayes', 'alpha': 0.1}. Best is trial 1 with value: 0.9021601016518425.


Best trial: 2. Best value: 0.902724:  12%|‚ñà‚ñè        | 3/25 [00:08<01:09,  3.17s/it]

[I 2025-11-09 18:40:56,234] Trial 2 finished with value: 0.9027237354085603 and parameters: {'model': 'XGBoost', 'n_estimators': 408, 'max_depth': 5, 'learning_rate': 0.04632537647154444, 'subsample': 0.6581426803566224}. Best is trial 2 with value: 0.9027237354085603.


Best trial: 4. Best value: 0.904824:  16%|‚ñà‚ñå        | 4/25 [00:09<00:45,  2.18s/it]

[I 2025-11-09 18:40:56,549] Trial 3 finished with value: 0.28205128205128205 and parameters: {'model': 'RandomForest', 'n_estimators': 138, 'max_depth': 7, 'min_samples_split': 9}. Best is trial 2 with value: 0.9027237354085603.
[I 2025-11-09 18:40:56,558] Trial 4 finished with value: 0.9048239895697523 and parameters: {'model': 'NaiveBayes', 'alpha': 1.0}. Best is trial 4 with value: 0.9048239895697523.


Best trial: 6. Best value: 0.911652:  24%|‚ñà‚ñà‚ñç       | 6/25 [00:09<00:23,  1.24s/it]

[I 2025-11-09 18:40:57,121] Trial 5 finished with value: 0.7083333333333334 and parameters: {'model': 'RandomForest', 'n_estimators': 249, 'max_depth': 17, 'min_samples_split': 8}. Best is trial 4 with value: 0.9048239895697523.
[I 2025-11-09 18:40:57,128] Trial 6 finished with value: 0.911651728553137 and parameters: {'model': 'NaiveBayes', 'alpha': 0.5}. Best is trial 6 with value: 0.911651728553137.


Best trial: 7. Best value: 0.915:  36%|‚ñà‚ñà‚ñà‚ñå      | 9/25 [00:21<00:51,  3.24s/it]   

[I 2025-11-09 18:41:09,221] Trial 7 finished with value: 0.915 and parameters: {'model': 'XGBoost', 'n_estimators': 403, 'max_depth': 10, 'learning_rate': 0.12924366441412957, 'subsample': 0.6988389061009636}. Best is trial 7 with value: 0.915.
[I 2025-11-09 18:41:09,235] Trial 8 finished with value: 0.8959156785243741 and parameters: {'model': 'NaiveBayes', 'alpha': 1.5000000000000002}. Best is trial 7 with value: 0.915.
[I 2025-11-09 18:41:09,251] Trial 9 finished with value: 0.8941798941798942 and parameters: {'model': 'NaiveBayes', 'alpha': 1.7000000000000002}. Best is trial 7 with value: 0.915.


Best trial: 7. Best value: 0.915:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 11/25 [00:33<00:50,  3.61s/it]

[I 2025-11-09 18:41:21,303] Trial 10 finished with value: 0.9072681704260651 and parameters: {'model': 'XGBoost', 'n_estimators': 489, 'max_depth': 9, 'learning_rate': 0.23009828749516437, 'subsample': 0.8825255796823629}. Best is trial 7 with value: 0.915.
[I 2025-11-09 18:41:21,351] Trial 11 finished with value: 0.0 and parameters: {'model': 'LogisticRegression', 'C': 0.025174969210228212, 'penalty': 'l2'}. Best is trial 7 with value: 0.915.


Best trial: 13. Best value: 0.92132:  64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 16/25 [00:43<00:21,  2.39s/it]

[I 2025-11-09 18:41:30,687] Trial 12 finished with value: 0.9086357947434293 and parameters: {'model': 'XGBoost', 'n_estimators': 360, 'max_depth': 9, 'learning_rate': 0.18906229807767225, 'subsample': 0.5039001902905458}. Best is trial 7 with value: 0.915.
[I 2025-11-09 18:41:30,739] Trial 13 finished with value: 0.9213197969543148 and parameters: {'model': 'LogisticRegression', 'C': 8.973975056762294, 'penalty': 'l2'}. Best is trial 13 with value: 0.9213197969543148.
[I 2025-11-09 18:41:30,787] Trial 14 finished with value: 0.9185750636132316 and parameters: {'model': 'LogisticRegression', 'C': 8.26667590009963, 'penalty': 'l2'}. Best is trial 13 with value: 0.9213197969543148.
[I 2025-11-09 18:41:30,831] Trial 15 finished with value: 0.9185750636132316 and parameters: {'model': 'LogisticRegression', 'C': 8.503586720080055, 'penalty': 'l2'}. Best is trial 13 with value: 0.9213197969543148.


Best trial: 13. Best value: 0.92132:  76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 19/25 [00:43<00:09,  1.54s/it]

[I 2025-11-09 18:41:30,882] Trial 16 finished with value: 0.9185750636132316 and parameters: {'model': 'LogisticRegression', 'C': 9.885681105961336, 'penalty': 'l2'}. Best is trial 13 with value: 0.9213197969543148.
[I 2025-11-09 18:41:30,925] Trial 17 finished with value: 0.8906455862977603 and parameters: {'model': 'LogisticRegression', 'C': 1.0545173620612953, 'penalty': 'l2'}. Best is trial 13 with value: 0.9213197969543148.
[I 2025-11-09 18:41:30,963] Trial 18 finished with value: 0.8894736842105263 and parameters: {'model': 'LogisticRegression', 'C': 1.0632486495935003, 'penalty': 'l2'}. Best is trial 13 with value: 0.9213197969543148.
[I 2025-11-09 18:41:31,023] Trial 19 finished with value: 0.9097938144329897 and parameters: {'model': 'LogisticRegression', 'C': 3.0158185893473703, 'penalty': 'l2'}. Best is trial 13 with value: 0.9213197969543148.


Best trial: 13. Best value: 0.92132:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 24/25 [00:43<00:00,  1.31it/s]

[I 2025-11-09 18:41:31,078] Trial 20 finished with value: 0.73125 and parameters: {'model': 'LogisticRegression', 'C': 0.17840388716277414, 'penalty': 'l2'}. Best is trial 13 with value: 0.9213197969543148.
[I 2025-11-09 18:41:31,133] Trial 21 finished with value: 0.9199491740787802 and parameters: {'model': 'LogisticRegression', 'C': 8.824976519480057, 'penalty': 'l2'}. Best is trial 13 with value: 0.9213197969543148.
[I 2025-11-09 18:41:31,182] Trial 22 finished with value: 0.9213197969543148 and parameters: {'model': 'LogisticRegression', 'C': 9.170153637892644, 'penalty': 'l2'}. Best is trial 13 with value: 0.9213197969543148.
[I 2025-11-09 18:41:31,226] Trial 23 finished with value: 0.9097938144329897 and parameters: {'model': 'LogisticRegression', 'C': 2.7255924668800877, 'penalty': 'l2'}. Best is trial 13 with value: 0.9213197969543148.


Best trial: 13. Best value: 0.92132: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 25/25 [00:43<00:00,  1.75s/it]


[I 2025-11-09 18:41:31,283] Trial 24 finished with value: 0.9109677419354839 and parameters: {'model': 'LogisticRegression', 'C': 2.8914710650568756, 'penalty': 'l2'}. Best is trial 13 with value: 0.9213197969543148.

üèÅ Optimization finished!
Best Model: LogisticRegression
Best Parameters: {'model': 'LogisticRegression', 'C': 8.973975056762294, 'penalty': 'l2'}
Best F1 Score: 0.9213
‚úÖ Best model retrained successfully!
üíæ Best model and vectorizer saved successfully!
