#### Load libraries

In [None]:
import pandas as pd
import numpy as np
import optuna

optuna.logging.set_verbosity(optuna.logging.WARNING)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import warnings

warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
from sklearn.metrics import (
    precision_recall_curve,
    classification_report,
)

#### Load dataset and name columns

In [80]:
df = pd.read_csv(
    "dataset/SMSSpamCollection", sep="\t", header=None, names=["target", "text"]
)

#### Dataset

In [81]:
df

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ã¼ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [82]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   target  5572 non-null   str  
 1   text    5572 non-null   str  
dtypes: str(2)
memory usage: 87.2 KB


In [83]:
df.target.value_counts()

target
ham     4825
spam     747
Name: count, dtype: int64

#### Convert string targets to integers

In [51]:
df.target = df.target.map({"ham": 0, "spam": 1})

#### Separate text and target

In [None]:
X = df["text"]  # SMS
y = df["target"]  # Labels: 1 = Spam, 0 = No Spam

#### split train/test

In [53]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

#### Method to perform manual cross-validation with optional calibration

In [54]:
def manual_CV(model, X_train, y_train, calibrate):
    # Initialize StratifiedKFold for cross-validation with 5 splits
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    # Lists to store F1 scores and best thresholds for each fold
    f1_scores = []
    best_thresholds = []

    # Iterate over each train-validation split
    for train_idx, val_idx in skf.split(X_train, y_train):
        # Split the data into training and validation sets
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Check if calibration is required
        if calibrate:
            # Create a calibrated classifier using sigmoid method
            model_fold = CalibratedClassifierCV(
                model,
                method="sigmoid",
                cv=5
            )
        else:
            # Use the original model if no calibration is needed
            model_fold = model

        # Fit the model on the training data
        model_fold.fit(X_tr, y_tr)

        # Check if the model has a predict_proba method
        if hasattr(model_fold, "predict_proba"):
            # Get predicted probabilities for the positive class
            probs = model_fold.predict_proba(X_val)[:, 1]
        else:
            # Use decision function if predict_proba is not available
            probs = model_fold.decision_function(X_val)

        # Calculate precision, recall, and thresholds for the validation set
        precision, recall, thresholds = precision_recall_curve(y_val, probs)

        # Exclude the last precision and recall values for threshold alignment
        precision = precision[:-1]
        recall = recall[:-1]

        # Calculate the F1 score for the precision-recall curve
        f1_curve = 2 * (precision * recall) / (precision + recall + 1e-10)

        # Find the index of the best F1 score
        best_idx = np.argmax(f1_curve)

        # Append the best F1 score and corresponding threshold to the lists
        f1_scores.append(f1_curve[best_idx])
        best_thresholds.append(thresholds[best_idx])

    # Return the mean F1 score and mean of the best thresholds across all folds
    return np.mean(f1_scores), np.mean(best_thresholds)


#### Method to optimize LogisticRegression classifier using Optuna

In [55]:
def objective_logreg(trial):
    # Suggest a categorical value for n-gram range from predefined options
    ngram_range_str = trial.suggest_categorical("ngram_range", ["1_1", "1_2"])
    # Convert the string representation of n-gram range to a tuple of integers
    ngram_range = tuple(map(int, ngram_range_str.split("_")))

    # Create a pipeline consisting of TF-IDF vectorization and logistic regression
    pipeline = Pipeline(
        [
            (
                "tfidf",  # Name of the first step in the pipeline
                TfidfVectorizer(
                    ngram_range=ngram_range,  # Set the n-gram range for vectorization
                    max_features=trial.suggest_int("max_features", 5000, 30000),  # Suggest max features
                    min_df=trial.suggest_int("min_df", 1, 5),  # Suggest minimum document frequency
                ),
            ),
            (
                "clf",  # Name of the second step in the pipeline
                LogisticRegression(
                    C=trial.suggest_float("C", 0.01, 10.0, log=True),  # Suggest regularization strength
                    class_weight="balanced",  # Use balanced class weights
                    max_iter=300,  # Set maximum iterations for convergence
                    solver="liblinear",  # Specify the solver for optimization
                ),
            ),
        ]
    )
    # Perform manual cross-validation and retrieve mean F1 score and threshold
    mean_f1, mean_threshold = manual_CV(pipeline, X_train, y_train, calibrate=False)
    # Store the best threshold found during cross-validation in trial attributes
    trial.set_user_attr("best_threshold", mean_threshold)

    # Return the mean F1 score for the current trial
    return mean_f1


# Create an Optuna study to optimize the logistic regression model
study_logreg = optuna.create_study(
    direction="maximize",  # Aim to maximize the objective function
    study_name="study logreg",  # Name of the study for identification
    sampler=optuna.samplers.TPESampler(seed=42),  # Use TPE sampler with a fixed seed for reproducibility
)
# Optimize the objective function over a specified number of trials
study_logreg.optimize(objective_logreg, n_trials=40, show_progress_bar=True)

# Print the best F1 score achieved during the optimization
print(f"Best LogisticRegression F1: {study_logreg.best_value:.4f}")
best_trial = study_logreg.best_trial  # Retrieve the best trial from the study
best_threshold_logreg = best_trial.user_attrs["best_threshold"]  # Get the best threshold from trial attributes
# Print the best threshold found
print(f"Best threshold : {best_threshold_logreg:.4f}")
print("Best LogisticRegression Params:")
# Iterate through the best parameters and print them
for k, v in study_logreg.best_params.items():
    print(f"   {k}: {v}")

Best trial: 10. Best value: 0.948606: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 40/40 [00:25<00:00,  1.55it/s]

Best LogisticRegression F1: 0.9486
Best threshold : 0.6520
Best LogisticRegression Params:
   ngram_range: 1_1
   max_features: 12612
   min_df: 5
   C: 7.333758903560418





#### Method to optimize LinearSVC classifier using Optuna

In [56]:
def objective_svm(trial):

    ngram_range_str = trial.suggest_categorical("ngram_range", ["1_1", "1_2"])
    
    ngram_range = tuple(map(int, ngram_range_str.split("_")))

    pipeline = Pipeline(
        [
            (
                "tfidf",
                TfidfVectorizer(
                    ngram_range=ngram_range,
                    max_features=trial.suggest_int("max_features", 5000, 30000),
                    min_df=trial.suggest_int("min_df", 1, 5),
                ),
            ),
            (
                "clf",
                LinearSVC(
                    C=trial.suggest_float("C", 0.01, 10.0, log=True),
                    class_weight="balanced",
                ),
            ),
        ]
    )
    
    mean_f1, mean_threshold = manual_CV(pipeline, X_train, y_train, calibrate=True)
    trial.set_user_attr("best_threshold", mean_threshold)

    return mean_f1


study_svm = optuna.create_study(
    direction="maximize",
    study_name="study svm",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_svm.optimize(objective_svm, n_trials=40, show_progress_bar=True)

print(f"Best SVM F1:  {study_svm.best_value:.4f}")
best_trial = study_svm.best_trial
best_threshold_svm = best_trial.user_attrs["best_threshold"]
print(f"Best threshold : {best_threshold_svm:.4f}")
print("Best SVM Params:")
for k, v in study_svm.best_params.items():
    print(f"   {k}: {v}")

Best trial: 26. Best value: 0.954241: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 40/40 [03:08<00:00,  4.71s/it]

Best SVM F1:  0.9542
Best threshold : 0.4113
Best SVM Params:
   ngram_range: 1_2
   max_features: 12281
   min_df: 2
   C: 1.5106535540253345





#### Method to optimize LightGBM classifier using Optuna

In [57]:
def objective_lgb(trial):

    pipeline = Pipeline(
        [
            ("tfidf", TfidfVectorizer(ngram_range=(1, 2), max_features=20000)),
            (
                "clf",
                lgb.LGBMClassifier(
                    n_estimators=trial.suggest_int("n_estimators", 100, 500),
                    num_leaves=trial.suggest_int("num_leaves", 20, 150),
                    learning_rate=trial.suggest_float("learning_rate", 0.01, 0.2),
                    class_weight="balanced",
                    verbose=-1,
                    n_jobs=-1,
                ),
            ),
        ]
    )
    mean_f1, mean_threshold = manual_CV(pipeline, X_train, y_train, calibrate=False)
    trial.set_user_attr("best_threshold", mean_threshold)

    return mean_f1


study_lgb = optuna.create_study(
    direction="maximize",
    study_name="study lgb",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_lgb.optimize(objective_lgb, n_trials=40, show_progress_bar=True)

print(f"Best LGB F1: {study_lgb.best_value:.4f}")
best_trial = study_lgb.best_trial
best_threshold_lgb = best_trial.user_attrs["best_threshold"]
print(f"Best threshold : {best_threshold_lgb:.4f}")
print("Best LGB Params:")
for k, v in study_lgb.best_params.items():
    print(f"   {k}: {v}")

Best trial: 34. Best value: 0.918805: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 40/40 [06:30<00:00,  9.76s/it]

Best LGB F1: 0.9188
Best threshold : 0.2492
Best LGB Params:
   n_estimators: 178
   num_leaves: 28
   learning_rate: 0.08822192250884218





#### Show threshold of all models

In [58]:
print(f"Best threshold LogisticRegression: {best_threshold_logreg:.4f}")
print(f"Best threshold LinearSVC:          {best_threshold_svm:.4f}")
print(f"Best threshold LGBMClassifier:     {best_threshold_lgb:.4f}")

Best threshold LogisticRegression: 0.6520
Best threshold LinearSVC:          0.4113
Best threshold LGBMClassifier:     0.2492


#### Show F1 of all models

In [59]:
print(f"Best LogisticRegression F1: {study_logreg.best_value:.4f}")
print(f"Best LinearSVC F1:          {study_svm.best_value:.4f}")
print(f"Best LGBMClassifier F1:     {study_lgb.best_value:.4f}")

Best LogisticRegression F1: 0.9486
Best LinearSVC F1:          0.9542
Best LGBMClassifier F1:     0.9188


#### ðŸ“Š Comparison

| Aspect                   | LogReg    | SVM       | LightGBM  |
| ------------------------ | --------- | --------- | --------- |
| F1                       | ðŸ¥ˆ 0.9486 | ðŸ¥‡ 0.9542 | ðŸ¥‰ 0.9188 |
| Optimal Threshold        | 0.6520    | 0.4113    | 0.2492    |
| Interpretability         | High      | Medium    | Low       |
| Calibrated probabilities | Yes       | Yes       | Yes       |
| Hardiness                | High      | High      | Medium    |
| Risk of overfitting      | Low       | Low       | Medium    |
| Speed                    | Very high | High      | Medium    |


#### ðŸ¥ˆ Logistic Regression

- F1: **0.9486**, very strong and close to the best.
- High optimal threshold (0.6520) â†’ more conservative, favors precision.
- Naturally calibrated probabilities and high interpretability.
- Stable, simple, and production-friendly.

#### ðŸ¥‡ LinearSVC 

- F1: **0.9542**, best overall performance.
- Moderate threshold (0.4113) â†’ more balanced recall/precision trade-off.
- Margin-based optimization slightly improves classification boundary.
- Slightly more complex than LogReg but still robust.

#### ðŸ¥‰ LGBMClassifier

- F1: **0.9188**, clearly below linear models.
- Very low threshold (0.2492) â†’ needs aggressive recall to compensate.
- Higher model complexity without performance gain.
- Likely unnecessary capacity for this problem.

#### ðŸŽ¯ Conclusion

- The problem appears largely linearly separable.<br>
- **LinearSVC is the winner** in pure performance (highest F1).<br>
- However, Logistic Regression remains a strong alternative if interpretability and simplicity are priorities.

#### LinearSVC optimization history
![study_xgb](dataset/svm.png "LinearSVC Optimization History ")

#### Instantiate the LinearSVC Pipeline and obtain predictions

In [None]:
# Retrieve the best parameters from the SVM study results
best_svm = study_svm.best_params

# Convert the ngram_range string into a tuple of integers
ngram_range = tuple(map(int, best_svm["ngram_range"].split("_")))

# Create a pipeline for the SVM model with TF-IDF vectorization and a linear SVC classifier
pipeline_svm = Pipeline(
    [
        (
            "tfidf",  # Name of the step in the pipeline
            TfidfVectorizer(
                ngram_range=ngram_range,  # Set the n-gram range for the vectorizer
                max_features=best_svm["max_features"],  # Limit the number of features
                min_df=best_svm["min_df"],  # Set the minimum document frequency
            ),
        ),
        ("clf", LinearSVC(C=best_svm["C"], class_weight="balanced")),  # Classifier step with balanced class weights
    ]
)

# Fit the pipeline to the training data
pipeline_svm.fit(X_train, y_train)

# Obtain the decision function scores for the test data
scores = pipeline_svm.decision_function(X_test)

# Generate predictions based on the decision function scores and the best threshold
y_pred_svm = (scores >= best_threshold_svm).astype(int)

# Print the classification report for the Linear SVM model
print("Classification report:")
print(classification_report(y_test, y_pred_svm))


Classification report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       1.00      0.91      0.95       149

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

[[966   0]
 [ 13 136]]


#### Show predictions

In [63]:
predictions_df = pd.DataFrame()
predictions_df['Text'] = X_test
predictions_df['Target'] = y_test
predictions_df['Predicted'] = y_pred_svm
predictions_df

Unnamed: 0,Text,Target,Predicted
3245,Squeeeeeze!! This is christmas hug.. If u lik ...,0,0
944,And also I've sorta blown him off a couple tim...,0,0
1044,Mmm thats better now i got a roast down me! iÂ’...,0,0
2484,Mm have some kanji dont eat anything heavy ok,0,0
812,So there's a ring that comes with the guys cos...,0,0
...,...,...,...
4264,Den only weekdays got special price... Haiz......,0,0
2439,I not busy juz dun wan 2 go so early.. Hee..,0,0
5556,Yes i have. So that's why u texted. Pshew...mi...,0,0
4205,How are you enjoying this semester? Take care ...,0,0


0 = Non-spam<br>
1 = spam