In [None]:
import sys
sys.path.append('../src')

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from data_preprocessing import TextPreprocessor
import joblib
from config import SEED, TEST_SIZE
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the raw SMS corpus; the script reads its 'Message' and 'Class' columns.
df = pd.read_csv('../data/raw/Spam_SMS.csv')

# Features are the raw message strings; labels are encoded ham -> 0, spam -> 1.
X = df['Message']
y = df['Class'].map({'ham': 0, 'spam': 1})

# Hold out a test set of size TEST_SIZE, reproducibly seeded with SEED.
# stratify=y keeps the ham/spam proportions the same in the train and test
# parts, so the held-out F1 is not skewed by an unlucky class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y)

def _build_pipeline(classifier):
    """Return a Pipeline: text preprocessing -> TF-IDF (max 5000 features) -> classifier."""
    return Pipeline([
        ('preprocessor', TextPreprocessor()),
        ('vectorizer', TfidfVectorizer(max_features=5000)),
        ('classifier', classifier),
    ])


# Candidate classifiers, keyed by the name used in the results report.
# Every candidate shares the same preprocessing and vectorisation front end;
# each pipeline gets its own fresh preprocessor and vectorizer instances.
_classifiers = {
    'logreg': LogisticRegression(max_iter=1000),
    'randomforest': RandomForestClassifier(n_estimators=100, random_state=SEED),
    'xgboost': XGBClassifier(eval_metric='logloss', random_state=SEED),
    'svm': SVC(kernel='linear', probability=True, random_state=SEED),
    'naive_bayes': MultinomialNB(),
}

pipelines = {
    label: _build_pipeline(estimator)
    for label, estimator in _classifiers.items()
}

# Model selection: mean 5-fold cross-validated F1 for every candidate pipeline.
# Cross-validation runs on the training split only, so the held-out test rows
# play no part in choosing the winner and the later test metrics stay an
# unbiased estimate.
results = {}
for name, pipeline in pipelines.items():
    scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='f1')
    results[name] = scores.mean()
    print(f"{name}: F1 = {results[name]:.4f}")

# The winner is the candidate with the highest mean F1.
best_model_name = max(results, key=results.get)
print(f"\nЛучшая модель: {best_model_name} с F1 = {results[best_model_name]:.4f}")

import os

# Fit the selected pipeline on the training split. best_model is the same
# object as pipelines[best_model_name], so that dict entry becomes fitted too.
best_model = pipelines[best_model_name]
best_model.fit(X_train, y_train)

# Make sure the output directory exists before writing; joblib.dump does not
# create missing parent directories.
os.makedirs('../models', exist_ok=True)
joblib.dump(best_model, '../models/best_model.pkl')
print("Модель сохранена в '../models/best_model.pkl'")

# Score the persisted model on the held-out test split. evaluate_model is a
# project-local helper; it receives the path of the saved model file rather
# than the in-memory estimator.
from evaluate import evaluate_model

test_metrics = evaluate_model('../models/best_model.pkl', X_test, y_test)

# Single print call: header (preceded by a blank line), newline, then metrics.
print("\nМетрики на тестовых данных:", test_metrics, sep="\n")

logreg: F1 = 0.8597
randomforest: F1 = 0.9070
xgboost: F1 = 0.8985
svm: F1 = 0.9331
naive_bayes: F1 = 0.8741

Лучшая модель: svm с F1 = 0.9331
Модель сохранена в '../models/best_model.pkl'

Метрики на тестовых данных:
{'accuracy': 0.9847533632286996, 'f1': 0.9449838187702265, 'roc_auc': np.float64(0.993902756618097)}
