In [57]:
#11. Используйте набор данных SpamAssassin для обучения модели классификации электронной почты как спам или не спам. Сравните результаты с использованием различных алгоритмов.

import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


# Классификаторы
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

#Проектные модули
from HW_2.data_loader import DataLoader
from HW_2.data_processing import DataProcessing

In [149]:
# Загрузка данных
cwd = os.getcwd()
file_path = os.path.join(cwd, r'data\spam.csv')  # Замените на путь к вашему CSV файлу

df = DataLoader.load_from_csv(file_path)

df.head()

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_%3B,char_freq_%28,char_freq_%5B,char_freq_%21,char_freq_%24,char_freq_%23,capital_run_length_average,capital_run_length_longest,capital_run_length_total,class
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [60]:
#Проверка датасета на пустые значения
missing_values = DataProcessing.check_missing_values(df)
print(missing_values)


Общее кол-во пустых значений:  0 

Series([], dtype: int64)


In [61]:
# X и y
X = df.drop("class", axis=1)
y = df["class"]

In [62]:
# Разделение train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [63]:
#Модели и параметры

models = {
    "Gradient Boosting": (
        GradientBoostingClassifier(random_state=42),
        {
            "n_estimators": [100, 200],
            "learning_rate": [0.05, 0.1],
            "max_depth": [3, 5]
        }
    ),

    "CatBoost": (
        CatBoostClassifier(verbose=0, random_seed=42),
        {
            "iterations": [100, 200],
            "learning_rate": [0.05, 0.1],
            "depth": [4, 6]
        }
    ),

    "AdaBoost": (
        AdaBoostClassifier(random_state=42),
        {
            "n_estimators": [50, 100],
            "learning_rate": [0.5, 1.0]
        }
    ),

    "Extra Trees": (
        ExtraTreesClassifier(random_state=42),
        {
            "n_estimators": [100, 200],
            "max_depth": [None, 10],
            "min_samples_split": [2, 5]
        }
    ),

    "QDA": (
        QuadraticDiscriminantAnalysis(),
        {
            "reg_param": [0.0, 0.1, 0.2]
        }
    ),

    "LightGBM": (
        LGBMClassifier(random_state=42),
        {
            "learning_rate": [0.05, 0.1],
            "n_estimators": [100, 200],
            "max_depth": [-1, 10],
            "min_split_gain": [0.0, 0.01],
            "min_child_samples": [20, 40]
        }
    ),

    "KNN": (
        KNeighborsClassifier(),
        {
            "n_neighbors": [3, 5, 7],
            "weights": ["uniform", "distance"]
        }
    ),

    "Decision Tree": (
        DecisionTreeClassifier(random_state=42),
        {
            "max_depth": [None, 10, 20],
            "min_samples_split": [2, 5]
        }
    ),

    "XGBoost": (
        XGBClassifier(eval_metric="logloss", random_state=42),
        {
            "n_estimators": [100, 200],
            "learning_rate": [0.05, 0.1],
            "max_depth": [3, 5]
        }
    ),

    "Dummy": (
        DummyClassifier(strategy="most_frequent"),
        {}  # фиктивный — без параметров
    ),

    "SVM (linear)": (
        SVC(kernel="linear", probability=True, random_state=42),
        {
            "C": [0.01, 0.1, 1.0, 10.0]
        }
    )
}

In [64]:
 #Обучение, подбор параметров и метрики
results = []
#optuna and hyperopt
#balanced data from dataset
for name, (model, params) in models_and_params.items():
    print(f"Обработка модели: {name}\n")
    if params:
        grid = GridSearchCV(model, params, scoring="f1", cv=3, n_jobs=-1)
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_
        best_params = grid.best_params_
    else:
        model.fit(X_train, y_train)
        best_model = model
        best_params = "default or custom"

    y_pred = best_model.predict(X_test)
    report = classification_report(y_test, y_pred, target_names=["Not Spam", "Spam"], output_dict=True, zero_division=0)
    pretty_report = classification_report(y_test, y_pred, target_names=["Not Spam", "Spam"], zero_division=0)
    print(pretty_report)
    print("\n---------------------------------------------------------\n")
    precision_spam = report["Spam"]["precision"]
    precision_not_spam = report["Not Spam"]["precision"]

    f1_spam = report["Spam"]["f1-score"]
    f1_not_spam = report["Not Spam"]["f1-score"]

    recall_spam = report["Spam"]["recall"]
    recall_not_spam = report["Not Spam"]["recall"]

    model_accuracy = report["accuracy"]

    results.append({
        "Model": name,
        "Precision (Spam)": round(precision_spam, 2),
        "Precision (Not Spam)": round(precision_not_spam, 2),
        "F1 Score (Spam)": round(f1_spam, 2),
        "F1 Score (Not Spam)": round(f1_not_spam, 2),
        "Recall (Spam)": round(recall_spam, 2),
        "Recall (Not Spam)": round(recall_not_spam, 2),
        "Accuracy": round(model_accuracy, 2)
    })

Обработка модели: GradientBoosting

              precision    recall  f1-score   support

    Not Spam       0.95      0.96      0.96       558
        Spam       0.94      0.93      0.93       363

    accuracy                           0.95       921
   macro avg       0.95      0.94      0.95       921
weighted avg       0.95      0.95      0.95       921


---------------------------------------------------------

Обработка модели: CatBoost

              precision    recall  f1-score   support

    Not Spam       0.96      0.96      0.96       558
        Spam       0.94      0.93      0.93       363

    accuracy                           0.95       921
   macro avg       0.95      0.94      0.95       921
weighted avg       0.95      0.95      0.95       921


---------------------------------------------------------

Обработка модели: AdaBoost

              precision    recall  f1-score   support

    Not Spam       0.93      0.95      0.94       558
        Spam       0.92  

In [148]:
#Вывод финальной таблицы
results_df = pd.DataFrame(results).sort_values(by="Accuracy", ascending=False).reset_index(drop=True)
display(results_df)

Unnamed: 0,Model,Precision (Spam),Precision (Not Spam),F1 Score (Spam),F1 Score (Not Spam),Recall (Spam),Recall (Not Spam),Accuracy
0,GradientBoosting,0.94,0.95,0.93,0.96,0.93,0.96,0.95
1,CatBoost,0.94,0.96,0.93,0.96,0.93,0.96,0.95
2,ExtraTrees,0.96,0.95,0.94,0.96,0.92,0.97,0.95
3,XGBoost,0.94,0.95,0.93,0.96,0.93,0.96,0.95
4,LightGBM,0.94,0.96,0.94,0.96,0.93,0.96,0.95
5,AdaBoost,0.92,0.93,0.91,0.94,0.9,0.95,0.93
6,LinearSVC,0.92,0.93,0.91,0.94,0.9,0.95,0.93
7,DecisionTree,0.9,0.92,0.88,0.93,0.87,0.94,0.91
8,QDA,0.84,0.91,0.86,0.9,0.87,0.9,0.89
9,KNN,0.74,0.82,0.73,0.83,0.72,0.84,0.79
