In [3]:
# ===============================
# ‚öôÔ∏è SPAM DETECTION ‚Äî OPTUNA F1-FOCUSED OPTIMIZATION
# ===============================

# üì¶ Imports
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, f1_score
import pickle

# ===============================
# üìÇ 1. Load Cleaned Data
# ===============================
df = pd.read_csv("cleaned_spam_dataset.csv")

# üí° Ensure all text data are valid strings (prevents .lower() float errors)
df["clean_text"] = df["clean_text"].astype(str).fillna("")
X = df["clean_text"]
y = df["spam"]

print("‚úÖ Data Loaded | Shape:", df.shape)
print("üßº Null values:", df.isnull().sum().sum())

# ===============================
# ‚öñÔ∏è 2. Define Custom F1 Scorer
# ===============================
f1_macro_scorer = make_scorer(f1_score, average="macro")

# ===============================
# üéØ 3. Define Objective Function
# ===============================
def objective(trial):
    # Suggest hyperparameters
    model_type = trial.suggest_categorical("model", ["logistic", "svm"])
    max_features = trial.suggest_int("max_features", 5000, 25000)
    ngram = trial.suggest_categorical("ngram_range", [(1,1), (1,2)])
    
    if model_type == "logistic":
        C = trial.suggest_float("C", 0.01, 10.0, log=True)
        model = LogisticRegression(C=C, max_iter=1000, n_jobs=-1)
    else:
        C = trial.suggest_float("C", 0.01, 5.0, log=True)
        model = LinearSVC(C=C, max_iter=2000)
    
    # Define pipeline
    pipeline = Pipeline([
        ("tfidf", TfidfVectorizer(max_features=max_features, ngram_range=ngram)),
        ("clf", model)
    ])
    
    # Stratified CV for balanced evaluation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    f1_scores = cross_val_score(pipeline, X, y, cv=cv, scoring=f1_macro_scorer, n_jobs=-1)
    
    return f1_scores.mean()

# ===============================
# üöÄ 4. Run Study (Optimize F1)
# ===============================
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

print("\n‚úÖ Best parameters:", study.best_params)
print("üèÜ Best F1-macro score:", round(study.best_value, 4))

# ===============================
# üß† 5. Train Final Model
# ===============================
best_params = study.best_params

if best_params["model"] == "logistic":
    final_model = LogisticRegression(C=best_params["C"], max_iter=1000, n_jobs=-1)
else:
    final_model = LinearSVC(C=best_params["C"], max_iter=2000)

final_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        max_features=best_params["max_features"],
        ngram_range=best_params["ngram_range"]
    )),
    ("clf", final_model)
])

# Fit the final model
final_pipeline.fit(X, y)

# ===============================
# üíæ 6. Save Model for Deployment
# ===============================
with open("best_f1_spam_model.pkl", "wb") as f:
    pickle.dump(final_pipeline, f)

print("\nüíæ Model saved as best_f1_spam_model.pkl")
print("‚úÖ Final model trained successfully with best F1 parameters!")


[I 2025-11-11 22:50:10,744] A new study created in memory with name: no-name-6b14aedd-ca12-4d1b-a05e-f0f08596e5aa


‚úÖ Data Loaded | Shape: (10852, 3)
üßº Null values: 0


[I 2025-11-11 22:50:11,566] Trial 0 finished with value: 0.7662838258130804 and parameters: {'model': 'logistic', 'max_features': 18506, 'ngram_range': (1, 1), 'C': 0.16705318037247427}. Best is trial 0 with value: 0.7662838258130804.
[I 2025-11-11 22:50:14,099] Trial 1 finished with value: 0.892398663063541 and parameters: {'model': 'logistic', 'max_features': 13734, 'ngram_range': (1, 2), 'C': 0.3989635475305462}. Best is trial 1 with value: 0.892398663063541.
[I 2025-11-11 22:50:16,807] Trial 2 finished with value: 0.9314791158863537 and parameters: {'model': 'svm', 'max_features': 24447, 'ngram_range': (1, 2), 'C': 0.07206714833165659}. Best is trial 2 with value: 0.9314791158863537.
[I 2025-11-11 22:50:17,844] Trial 3 finished with value: 0.8918138295406397 and parameters: {'model': 'svm', 'max_features': 5861, 'ngram_range': (1, 1), 'C': 0.03317822933820296}. Best is trial 2 with value: 0.9314791158863537.
[I 2025-11-11 22:50:21,154] Trial 4 finished with value: 0.961919319600644


‚úÖ Best parameters: {'model': 'svm', 'max_features': 22256, 'ngram_range': (1, 2), 'C': 2.3451784883543128}
üèÜ Best F1-macro score: 0.9647

üíæ Model saved as best_f1_spam_model.pkl
‚úÖ Final model trained successfully with best F1 parameters!
