In [14]:
import sys
from pathlib import Path
from data_preprocessing import TextPreprocessor
from config import SEED, TEST_SIZE

df = pd.read_csv('../data/raw/Spam_SMS.csv')

X = df['Message']
y = df['Class'].map({'ham': 0, 'spam': 1})

# 3.разделение на train/test (хотя cross_val_score сделает это автоматически)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED)

# Пайплайны для разных моделей
pipelines = {
    'logreg': Pipeline([
        ('preprocessor', TextPreprocessor()),
        ('vectorizer', TfidfVectorizer(max_features=5000)),
        ('classifier', LogisticRegression(max_iter=1000))
    ]),
    'randomforest': Pipeline([
        ('preprocessor', TextPreprocessor()),
        ('vectorizer', TfidfVectorizer(max_features=5000)),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=SEED))
    ]),
    'xgboost': Pipeline([
        ('preprocessor', TextPreprocessor()),
        ('vectorizer', TfidfVectorizer(max_features=5000)),
        ('classifier', XGBClassifier(eval_metric='logloss', random_state=SEED))
    ])
}

# кросс-валидация и оценка моделей
results = {}
for name, pipeline in pipelines.items():
    scores = cross_val_score(pipeline, X, y, cv=5, scoring='f1')
    results[name] = scores.mean()
    print(f"{name}: F1 = {scores.mean():.4f}")

best_model_name = max(results, key=results.get)
print(f"\nЛучшая модель: {best_model_name} с F1 = {results[best_model_name]:.4f}")

best_model = pipelines[best_model_name]
best_model.fit(X_train, y_train)

joblib.dump(best_model, '../models/best_model.pkl')
print("Модель сохранена в '../models/best_model.pkl'")

logreg: F1 = 0.8597
randomforest: F1 = 0.9070
xgboost: F1 = 0.8985

Лучшая модель: randomforest с F1 = 0.9070
Модель сохранена в '../models/best_model.pkl'
