## Курсовая работа

Бинарная классификация комментариев на токсичность

### Приступая к работе

In [4]:
import numpy as np
import pandas as pd

In [5]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
test_labels = pd.read_csv('data/test_labels.csv')
train_df.head(3)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0


In [6]:
train_df['severe_toxic'].unique()

array([0, 1])

Один признак текстовый, остальные бинарные. Еще есть __id__, но он мне не нужен.

In [7]:
df = train_df.drop('id', axis=1)
df.head(3)

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0


В качестве целевой переменной будет факт токсичности (признак __toxic__).

In [8]:
X = df.drop('toxic', axis=1)
y = df['toxic']
X.head(3)

Unnamed: 0,comment_text,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0


### Векторизация текста

Текст надо представить в виде чисел, понятных компьютеру (векторов). Для этого я буду использовать TF-IDF векторизатор. В него я добавил регулярное выражение, которое отсеивает все числа, чтобы они не попали в словарь слов.

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re

In [102]:
def clean_text(text):
    delimiters = ' ', '\n', '_'
    regex_pattern = '|'.join(map(re.escape, delimiters))
    return ' '.join(re.split(regex_pattern, re.sub(r'[0-9]+', '', text.lower())))


def clean_series(s):
    for i in range(s.shape[0]):
        s.iloc[i] = clean_text(s.iloc[i].lower())


def clean_df(X):
    for i in range(X.shape[0]):
        X.loc[i, 'comment_text'] = clean_text(X.loc[i, 'comment_text'].lower())

In [10]:
%%time
delimiters = ' ', '\n', '_'
regex_pattern = '|'.join(map(re.escape, delimiters))
for i in range(X.shape[0]):
    X.loc[i, 'comment_text'] = ' '.join(re.split(regex_pattern, re.sub(r'[0-9]+', '', X.loc[i, 'comment_text'].lower())))
X.head(3)

CPU times: user 14min 6s, sys: 8.45 s, total: 14min 15s
Wall time: 30min 33s


Unnamed: 0,comment_text,severe_toxic,obscene,threat,insult,identity_hate
0,explanation why the edits made under my userna...,0,0,0,0,0
1,d'aww! he matches this background colour i'm s...,0,0,0,0,0
2,"hey man, i'm really not trying to edit war. it...",0,0,0,0,0


In [34]:
from sklearn.model_selection import train_test_split

In [36]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=21)

In [37]:
vectorizer = TfidfVectorizer()
corpus_train = X_train['comment_text']
corpus_valid = X_valid['comment_text']
tfidf_features_train = vectorizer.fit_transform(corpus_train)
tfidf_features_valid = vectorizer.transform(corpus_valid)
vectorizer.get_feature_names()[0:10]

['aa',
 'aaa',
 'aaaa',
 'aaaaa',
 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaany',
 'aaaaaaaaaah',
 'aaaaaaaaaahhhhhhhhhhhhhh',
 'aaaaaaaaadm',
 'aaaaaaaaaq',
 'aaaaaaaacfo']

In [38]:
len(vectorizer.get_feature_names())

140231

In [39]:
tfidf_features.shape

(159571, 172817)

In [40]:
X.shape

(159571, 6)

### Обучение модели

In [15]:
# Xv = X.drop('comment_text', axis=1).join(tfidf_features

In [16]:
from sklearn.ensemble import GradientBoostingClassifier

In [41]:
X_train_new = tfidf_features_train
X_valid_new = tfidf_features_valid

In [46]:
model = GradientBoostingClassifier(random_state=21, n_estimators=1000)
model

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=1000,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=21, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [48]:
%%time
model = model.fit(X_train_new, y_train)

CPU times: user 17min 51s, sys: 4.42 s, total: 17min 55s
Wall time: 38min 9s


In [49]:
from sklearn.metrics import f1_score

In [50]:
pred_train = model.predict(X_train_new)
f1_score(pred_train, y_train)

0.8062133391266564

In [51]:
pred_valid = model.predict(X_valid_new)
f1_score(pred_valid, y_valid)

0.7329350451252066

In [52]:
from sklearn.metrics import precision_score, recall_score

In [53]:
precision_score(pred_valid, y_valid), recall_score(pred_valid, y_valid)

(0.6202667814113597, 0.8956197576887233)

### Подбор порога вероятности

In [59]:
from sklearn.metrics import precision_recall_curve

In [60]:
pred_valid_proba = model.predict_proba(X_valid_new)[:, 1]

In [89]:
precision, recall, thresholds = precision_recall_curve(y_valid, pred_valid_proba)
beta = 0.75 # Отдаю предпочтение precision
fscore = ((1 + beta**2) * precision * recall) / (beta**2 * precision + recall)
f1score = (2 * precision * recall) / (precision + recall)
ind = np.argmax(fscore)
print('Best threshold = %f,\nF-Score = %.3f,\nF1-Score = %.3f,\nPrecision = %.3f,\nRecall = %.3f' %
       (thresholds[ind], 
        fscore[ind],
        f1score[ind],
        precision[ind],
        recall[ind]))

Best threshold = 0.382022,
F-Score = 0.787,
F1-Score = 0.759,
Precision = 0.866,
Recall = 0.676


### Пайплайн

In [96]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

In [97]:
class TextCleaner(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.copy()
        clean_series(X)
        return X

In [132]:
pipeline = Pipeline([('text_cleaner', TextCleaner()), 
                     ('vectorizer', TfidfVectorizer()), 
                     ('classifier', GradientBoostingClassifier
                      (random_state=21, n_estimators=1000))])

Сейчас будет очень долго

In [133]:
%%time
pipeline = pipeline.fit(X_train['comment_text'], y_train)

CPU times: user 17min 59s, sys: 5.11 s, total: 18min 4s
Wall time: 1h 19min 55s


In [134]:
pipeline.predict(pd.Series(['Hello bitch fuck you shit!', 'I love you!']))

array([1, 0])

Сохраню его для дальнейшего использования

In [135]:
from joblib import dump, load

In [136]:
dump(pipeline, 'saved/pipeline.joblib')

['saved/pipeline.joblib']

Конец!