In [1]:
import re

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [3]:
# Read the training and test CSV files into DataFrames (same order as the paths).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('.../train_spam.csv', '.../test_spam.csv')
)

In [4]:
# Compiled once at definition time instead of on every call.
# Every range below covers symbol/pictograph blocks only. A single span from
# U+24C2 to U+1F251 would also swallow CJK, Hangul, fullwidth Latin and many
# other scripts, turning ordinary text into the 'emoji' token.
_EMOJI_PATTERN = re.compile(
    "["
    "\U000024C2-\U00002BFF"  # BMP symbols: enclosed alphanumerics .. misc symbols & arrows (incl. dingbats)
    "\U0000FE0F"             # variation selector-16 that follows emoji-style symbols
    "\U0001F000-\U0001F251"  # game tiles/cards, enclosed supplements, regional indicators (flags)
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "]+"
)

# '@' followed by one or more word characters, e.g. '@some_user'.
_ACCOUNT_PATTERN = re.compile(r'@\w+')


def preprocess_text(text):
    """Normalise a raw message before vectorisation.

    Two substitutions are applied, in this order:

    1. every run of consecutive emoji/symbol characters is replaced by the
       single token ``' emoji '``;
    2. every account mention (``@`` followed by word characters) is replaced
       by the token ``' accountid '``.

    The tokens are padded with spaces so they never fuse with neighbouring
    words; surrounding whitespace is therefore not collapsed.

    Args:
        text: The message as a ``str``.

    Returns:
        The message with emoji runs and account mentions replaced.
    """
    # A whole run of adjacent emoji collapses into one token.
    text = _EMOJI_PATTERN.sub(' emoji ', text)

    # Replace account mentions with a generic token.
    text = _ACCOUNT_PATTERN.sub(' accountid ', text)

    return text

In [5]:
# Text preprocessing: run preprocess_text over the 'text' column of both
# datasets, overwriting the column with the normalised messages.
train_data['text'] = train_data['text'].map(preprocess_text)
test_data['text'] = test_data['text'].map(preprocess_text)

In [6]:
# Cross-validation: stratified folds, shuffled with a fixed seed so the
# splits are reproducible between runs.
skf = StratifiedKFold(shuffle=True, random_state=42)

# Fold numbers are 1-based to match the printed report.
for fold, (train_index, val_index) in enumerate(
        skf.split(train_data, train_data['text_type']), start=1):

    # split() yields *positional* row indices, so slice with .iloc.
    # Label-based .loc only gives the same rows while train_data still has
    # its default 0..n-1 index and would pick wrong rows (or raise) otherwise.
    X_train = train_data['text'].iloc[train_index]
    X_val = train_data['text'].iloc[val_index]
    y_train = train_data['text_type'].iloc[train_index]
    y_val = train_data['text_type'].iloc[val_index]

    # Pipeline: bag of unigrams + bigrams (English stop words removed)
    # feeding a multinomial Naive Bayes model. A fresh pipeline is built per
    # fold so nothing learned on one fold leaks into the next.
    classifier = Pipeline([
        ('vectorizer', CountVectorizer(ngram_range=(1, 2), stop_words='english')),
        ('nb', MultinomialNB(alpha=0.5))
        ])

    # Fit on the training part of the fold.
    classifier.fit(X_train, y_train)

    # ROC-AUC on the held-out part. Column 1 of predict_proba is the second
    # entry of classifier.classes_ -- presumably the spam label; confirm the
    # label ordering against the data.
    y_pred = classifier.predict_proba(X_val)[:, 1]
    score = roc_auc_score(y_val, y_pred)
    print(f'Fold {fold}. ROC-AUC score: {score:.5f}')

Fold 1. ROC-AUC score: 0.97703
Fold 2. ROC-AUC score: 0.97548
Fold 3. ROC-AUC score: 0.97224
Fold 4. ROC-AUC score: 0.97706
Fold 5. ROC-AUC score: 0.98044


In [7]:
# Score the test set.
# `classifier` left over from the cross-validation loop was fitted on the
# training part of the last fold only, so it never saw that fold's validation
# rows. Take an unfitted copy with the same hyper-parameters and fit it on the
# full training set before scoring.
final_classifier = clone(classifier).fit(train_data['text'], train_data['text_type'])

# Column 1 of predict_proba is the second entry of classes_ -- presumably the
# spam label; confirm the label ordering against the data.
test_data['score'] = final_classifier.predict_proba(test_data['text'])[:, 1]

# Save the predictions (all test columns plus 'score') without the index.
test_data.to_csv('.../submission.csv', index=False)