In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import itertools

from matplotlib import pyplot as plt

%load_ext autoreload
%autoreload 2

Сначала нам надо прочитать данные в какой-то адекватный формат:

In [2]:
def read_msg(file: Path):
    with file.open('r') as f:
        head = f.readline().split()[1:]
        f.readline() # skip empty
        body = f.readline().split()
        
        return np.array(head + body), ('legit' in str(file))

In [3]:
def read_dir(dir_path: Path):
    X = []
    y = []
    for file in dir_path.iterdir():
        if not file.is_file():
            continue
        text, legit = read_msg(file)
        X.append(text)
        y.append(legit)
    
    return np.array(X, dtype=object), np.array(y, dtype=int)

In [4]:
data = [read_dir(Path(f'data/part{i}')) for i in range(1, 11)]

X = np.concatenate([entry[0] for entry in data])
y = np.concatenate([entry[1] for entry in data])

Хочется сепарировать логику выделения n-грамм в отдельный класс-трансформер, так мы и поступим.

In [5]:
import bayes
import ngrammer

Опробуем ngrammer:

In [6]:
ngr = ngrammer.NGrammer(n=2)

test_X = np.array([
    ['a', 'b', 'c'],
    ['ab', 'bc', 'cd', 'de']
], dtype=object)

test_y = np.array([True, False])

ngr.fit_transform(test_X, test_y)

array([list(['ab', 'bc']), list(['abbc', 'bccd', 'cdde'])], dtype=object)

Отлично, взятие n-грамм работает. Теперь давайте соберём пайплайн:

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pipe = Pipeline([
    ('ngr', ngrammer.NGrammer()),
    ('bayes', bayes.BayesClassifier()),
])

param_grid = {
    'ngr__n'        : [1, 2, 3],
    'bayes__alpha'  : [800, 200, 50, 15, 1, .5],
    'bayes__lambdas': [(1, 1), (10, 1), (100, 1)],
}

In [8]:
best_model = GridSearchCV(pipe, param_grid, cv=10, n_jobs=4, scoring='accuracy', verbose=1, refit=True)

best_model.fit(X, y)

Fitting 10 folds for each of 54 candidates, totalling 540 fits


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('ngr', NGrammer()),
                                       ('bayes', BayesClassifier())]),
             n_jobs=4,
             param_grid={'bayes__alpha': [800, 200, 50, 15, 1, 0.5],
                         'bayes__lambdas': [(1, 1), (10, 1), (100, 1)],
                         'ngr__n': [1, 2, 3]},
             scoring='accuracy', verbose=1)

In [9]:
print(f'\n\nBest parameter (CV score={best_model.best_score_:.4})')
print(best_model.best_params_)



Best parameter (CV score=0.5596)
{'bayes__alpha': 800, 'bayes__lambdas': (1, 1), 'ngr__n': 1}


In [10]:
from sklearn.metrics import accuracy_score as acc_score

acc_score(best_model.predict(X), y)

0.5596330275229358

Теперь давайте напишем скорер, который будет крайне бесконечно штрафовать все ложноположительные классификации (aka классификации legit как spam).

In [11]:
from sklearn.metrics import make_scorer

def legit_check_else_acc(y_true, y_pred):
    if -1 in (y_true - y_pred):
        return .0
    return acc_score(y_true, y_pred)

legit_scorer = make_scorer(legit_check_else_acc)

In [12]:
legit_check_param_grid = {
    'ngr__n' : [1], # as best result
    'bayes__alpha': [1, 8, 64, 512],
    'bayes__lambdas': [(10 ** pw, 1) for pw in range(0, 20, 2)]
}

In [13]:
legit_model = GridSearchCV(pipe, legit_check_param_grid, cv=10, n_jobs=4, scoring=legit_scorer, verbose=1, refit=True)

legit_model.fit(X, y)

Fitting 10 folds for each of 40 candidates, totalling 400 fits


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('ngr', NGrammer()),
                                       ('bayes', BayesClassifier())]),
             n_jobs=4,
             param_grid={'bayes__alpha': [1, 8, 64, 512],
                         'bayes__lambdas': [(1, 1), (100, 1), (10000, 1),
                                            (1000000, 1), (100000000, 1),
                                            (10000000000, 1),
                                            (1000000000000, 1),
                                            (100000000000000, 1),
                                            (10000000000000000, 1),
                                            (1000000000000000000, 1)],
                         'ngr__n': [1]},
             scoring=make_scorer(legit_check_else_acc), verbose=1)

In [14]:
print(f'\n\nBest parameter (CV score={legit_model.best_score_:.4})')
print(legit_model.best_params_)



Best parameter (CV score=0.5596)
{'bayes__alpha': 64, 'bayes__lambdas': (1, 1), 'ngr__n': 1}
