# 1. Подготовка

In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.utils import shuffle

In [2]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Анатолий\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Анатолий\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
def lemmatize(text):
    lem = nltk.word_tokenize(text)
    lemmtext = [lemmatizer.lemmatize(i, get_wordnet_pos(i)) for i in nltk.word_tokenize(text)]
    return " ".join(lemmtext)

In [5]:
lemmatizer = WordNetLemmatizer()

In [6]:
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

In [7]:
def clear_text(text):
    txt = re.sub(r'[^a-zA-Z0-9]',' ', text)
    txt = str(txt).lower()
    return ' '.join(txt.split())

In [8]:
def upsample(features, target, repeat):
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]

    features_upsampled = pd.concat([features_zeros] + [features_ones] * repeat)
    target_upsampled = pd.concat([target_zeros] + [target_ones] * repeat)
    
    features_upsampled, target_upsampled = shuffle(
        features_upsampled, target_upsampled, random_state=12345)
    
    return features_upsampled, target_upsampled

In [9]:
data = pd.read_csv('toxic_comments.csv')

In [10]:
data.shape

(159571, 2)

In [11]:
data.head(3)

Unnamed: 0,text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0


In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    159571 non-null  object
 1   toxic   159571 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


Тип данных нормально

In [13]:
data['toxic'].value_counts()

0    143346
1     16225
Name: toxic, dtype: int64

In [14]:
data = data.sample(25000)

In [15]:
data['lemm_text'] = data['text'].apply(lambda x: lemmatize(x))

In [16]:
data['lemm_text'] = data['lemm_text'].apply(lambda x: clear_text(x))

In [17]:
data = data.drop(['text'], axis = 1)

In [18]:
features = data.drop(['toxic'], axis = 1)

In [19]:
target = data['toxic']

In [20]:
features_train, features_valid, target_train, target_valid = train_test_split(
    features, target, test_size=0.4, random_state=12345)

In [21]:
features_valid, features_test, target_valid, target_test = train_test_split(
    features_valid, target_valid, test_size=0.5, random_state=12345)

In [22]:
features_train.shape

(15000, 1)

In [23]:
features_valid.shape

(5000, 1)

In [24]:
features_test.shape

(5000, 1)

In [25]:
target_train.shape

(15000,)

In [26]:
target_valid.shape

(5000,)

In [27]:
target_test.shape

(5000,)

In [28]:
features_train, target_train = upsample(features_train, target_train, 9)

Разбил / Проверил

In [29]:
corpus_train = features_train['lemm_text'].values.astype('U')

In [30]:
corpus_valid = features_valid['lemm_text'].values.astype('U')

In [31]:
corpus_test = features_test['lemm_text'].values.astype('U')

Корпус в юникод

In [32]:
stopwords = set(nltk_stopwords.words('english'))

In [33]:
count_tf_idf = TfidfVectorizer(stop_words=stopwords)

In [34]:
count_tf_idf.fit(corpus_train)

TfidfVectorizer(stop_words={'a', 'about', 'above', 'after', 'again', 'against',
                            'ain', 'all', 'am', 'an', 'and', 'any', 'are',
                            'aren', "aren't", 'as', 'at', 'be', 'because',
                            'been', 'before', 'being', 'below', 'between',
                            'both', 'but', 'by', 'can', 'couldn', "couldn't", ...})

Обучил только на трэйн

In [35]:
tf_idf_train  = count_tf_idf.transform(corpus_train)

In [36]:
tf_idf_valid = count_tf_idf.transform(corpus_valid)

In [37]:
tf_idf_test = count_tf_idf.transform(corpus_test)

Преобразовал.

# Обучение / Модели

#### LogisticRegression

In [38]:
model_LR = LogisticRegression(solver = 'liblinear', random_state = 12345)

In [39]:
model_LR.fit(tf_idf_train, target_train)

LogisticRegression(random_state=12345, solver='liblinear')

In [40]:
predict_LR = model_LR.predict(tf_idf_valid)

In [41]:
f1_LR = f1_score(target_valid, predict_LR)

In [42]:
f1_LR

0.7233661593554161

#### RandomForestClassifier

In [94]:
model_RFC =  RandomForestClassifier(n_estimators = 179,
                                    max_depth = 320,
                                    max_features = 480,
                                    n_jobs = 4, 
                                    random_state = 12345)

In [95]:
model_RFC.fit(tf_idf_train, target_train)

RandomForestClassifier(max_depth=320, max_features=480, n_estimators=179,
                       n_jobs=4, random_state=12345)

In [96]:
predict_RFC = model_RFC.predict(tf_idf_valid)

In [97]:
f1_RFC = f1_score(target_valid, predict_RFC)

In [98]:
f1_RFC

0.7108571428571429

#### DecisionTreeClassifier

In [48]:
model_DTC = DecisionTreeClassifier(max_depth = 96, random_state = 12345)

In [49]:
model_DTC.fit(tf_idf_train, target_train)

DecisionTreeClassifier(max_depth=96, random_state=12345)

In [50]:
predict_DTC = model_DTC.predict(tf_idf_valid)

In [51]:
f1_DTC = f1_score(target_valid, predict_DTC)

In [52]:
f1_DTC

0.62580054894785

#### CatBoostClassifier

In [53]:
model_CBC = CatBoostClassifier(iterations = 91, depth = 6, random_state = 12345)

In [54]:
model_CBC.fit(tf_idf_train, target_train)

Learning rate set to 0.379998
0:	learn: 0.5714271	total: 805ms	remaining: 1m 12s
1:	learn: 0.5351618	total: 1.25s	remaining: 55.7s
2:	learn: 0.5139765	total: 1.69s	remaining: 49.7s
3:	learn: 0.4946701	total: 2.12s	remaining: 46s
4:	learn: 0.4755768	total: 2.53s	remaining: 43.6s
5:	learn: 0.4567330	total: 2.96s	remaining: 41.9s
6:	learn: 0.4438054	total: 3.38s	remaining: 40.5s
7:	learn: 0.4327108	total: 3.8s	remaining: 39.4s
8:	learn: 0.4223608	total: 4.22s	remaining: 38.5s
9:	learn: 0.4148244	total: 4.64s	remaining: 37.6s
10:	learn: 0.4057207	total: 5.06s	remaining: 36.8s
11:	learn: 0.3979270	total: 5.48s	remaining: 36.1s
12:	learn: 0.3881940	total: 5.91s	remaining: 35.5s
13:	learn: 0.3827037	total: 6.32s	remaining: 34.8s
14:	learn: 0.3766799	total: 6.75s	remaining: 34.2s
15:	learn: 0.3699046	total: 7.18s	remaining: 33.7s
16:	learn: 0.3641343	total: 7.61s	remaining: 33.1s
17:	learn: 0.3599505	total: 8.03s	remaining: 32.6s
18:	learn: 0.3547545	total: 8.44s	remaining: 32s
19:	learn: 0.35

<catboost.core.CatBoostClassifier at 0x307c786040>

In [55]:
predict_CBC = model_CBC.predict(tf_idf_valid)

In [56]:
f1_CBC = f1_score(target_valid, predict_CBC)

In [57]:
f1_CBC

0.7211625794732062

#### XGBClassifier

In [58]:
model_XGBC = xgb.XGBClassifier(max_depth = 71, n_estimators = 33, random_state = 12345)

In [59]:
model_XGBC.fit(tf_idf_train, target_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=71,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=33, n_jobs=0, num_parallel_tree=1,
              random_state=12345, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [60]:
predict_XGBC = model_XGBC.predict(tf_idf_valid)

In [61]:
f1_XGBC = f1_score(target_valid, predict_XGBC)

In [62]:
f1_XGBC

0.7208333333333332

# Выводы / Тест

Предсказания на тестовой выборке

In [63]:
predict_LR_test = model_LR.predict(tf_idf_test)

In [64]:
f1_LR_test = f1_score(target_test, predict_LR_test)

In [65]:
f1_LR_test

0.6814671814671814

In [66]:
predict_RFC_test = model_RFC.predict(tf_idf_test)

In [67]:
f1_RFC_test = f1_score(target_test, predict_RFC_test)

In [68]:
f1_RFC_test

0.5505226480836238

In [69]:
predict_DTC_test = model_DTC.predict(tf_idf_test)

In [70]:
f1_DTC_test = f1_score(target_test, predict_DTC_test)

In [71]:
f1_DTC_test

0.5978987583572112

In [72]:
predict_CBC_test = model_CBC.predict(tf_idf_test)

In [73]:
f1_CBC_test = f1_score(target_test, predict_CBC_test)

In [74]:
f1_CBC_test

0.6851674641148325

In [75]:
predict_XGBC_test = model_XGBC.predict(tf_idf_test)

In [76]:
f1_XGBC_test = f1_score(target_test, predict_XGBC_test)

In [77]:
f1_XGBC_test

0.6771300448430494

In [78]:
table = pd.DataFrame(
    {'name': ['LogisticRegression', 'RandomForestClassifier', 'DecisionTreeClassifier', 'CatBoostClassifier', 'XGBClassifier']
                      , 'f1_score': [f1_LR, f1_RFC, f1_DTC, f1_CBC, f1_XGBC]
                      , 'f1_score_test': [f1_LR_test, f1_RFC_test, f1_DTC_test, f1_CBC_test, f1_XGBC_test]})

In [79]:
table

Unnamed: 0,name,f1_score,f1_score_test
0,LogisticRegression,0.723366,0.681467
1,RandomForestClassifier,0.61676,0.550523
2,DecisionTreeClassifier,0.625801,0.597899
3,CatBoostClassifier,0.721163,0.685167
4,XGBClassifier,0.720833,0.67713


name |f1_test_1000|f1_test_2500|f1_test_5000|f1_test_7500|f1_test_10000|f1_test_25000|f1_test_50000|f1_test_75000|f1_test_100000|
--|--|--|--|--|--|--|--|--|--|
LogisticRegression| 0.400000| 0.621622| 0.684783| 0.642336| 0.652742| 0.679630| 0.715939| 0.732978|--|
RandomForestClassifier| 0.125000| 0.344828| 0.545455| 0.532020| 0.597786| 0.662469| 0.678218| 0.674679|--|
DecisionTreeClassifier| 0.315789| 0.511628| 0.594872| 0.652015| 0.629526| 0.659292| 0.693176| 0.697234|--|
CatBoostClassifier| 0.380952|0.492754| 0.565517| 0.571429| 0.587814| 0.647428| 0.702703| 0.700435|--|
XGBClassifier| 0.300000| 0.478873| 0.569444| 0.634361| 0.600000| 0.652553| 0.697979| 0.697526|--|

+ В результате на сформированной таблице видно, что метрика F1 постепенно увеличивается в зависимости от количества данных, подаваемых на модели.