##### Часть 1. Загрузка и подготовка данных:
* [1. Подключение необходимых библиотек и импортирование функций.](#11.1-bullet)
* [2. Загрузка данных.](#11.2-bullet)
* [3. Подготовка данных.](#11.3-bullet)

##### Часть 2. Обучение моделей:
* [4. Обучение модели "Логистическая регрессия".](#11.4-bullet)
* [5. Обучение модели "Случайный лес".](#11.5-bullet)

##### Часть 3. Выводы:
* [6. Выводы.](#11.6-bullet)

## Часть 1. Загрузка и подготовка данных. 

<a id='11.1-bullet'></a> 
### 1. Подключение необходимых библиотек и импортирование функций. 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import time
import re
import nltk

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from pymystem3 import Mystem
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None

# Seed shared by the data splits, the resampling and the random-forest models.
RANDOM_STATE = 123456
# NOTE(review): not referenced in the visible cells; presumably a switch for a
# reduced hyperparameter search. Confirm or remove.
LOW_SEARCH_DEPTH = True

<a id='11.2-bullet'></a> 
### 2. Загрузка данных. 

In [2]:
# Load the raw comments; fall back to the relative path when the primary
# location cannot be read. Only I/O errors trigger the fallback, so other
# failures (e.g. a malformed file or an interrupt) are no longer hidden.
try:
    data = pd.read_csv('/datasets/toxic_comments.csv')
except OSError:
    data = pd.read_csv('../datasets/toxic_comments.csv')

In [3]:
# Preview the first rows of the loaded dataset.
display(data.head())

Unnamed: 0,text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


Выведем общую информацию о данных в таблице *data*

In [4]:
# Print column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    159571 non-null  object
 1   toxic   159571 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


Имеем в базе 2 колонки. Текст и маркер токсичности.

<a id='11.3-bullet'></a> 
### 3. Подготовка данных. 

Пара вспомогательных классов

In [5]:
class ab_ProgressBar():
    """Single-line console progress bar with elapsed and estimated remaining time.

    Progress is stored as a percentage in ``self.progress``. Every unit passed
    to ``increment`` adds ``100 / max`` percent. The bar is redrawn in place by
    ending each print with a carriage return.
    """

    def __init__(self, max=0):
        """Start the timer and draw the empty bar.

        Args:
            max: number of work units that corresponds to 100%. With the
                default of 0 the bar simply stays at 0%. The name shadows
                the builtin only inside this method and is kept for
                backward compatibility.
        """
        self.max_count = max
        self.spinner_state = 0
        self.progress = 0
        self.start_time = time.perf_counter()
        # Guard the division: the default max=0 used to raise ZeroDivisionError.
        self.progress_step = 100 / self.max_count if self.max_count else 0
        self.draw()

    def get_diff_time(self, t):
        """Split a duration in seconds into an ``(hours, minutes, seconds)`` tuple of ints."""
        return int(t // 3600), int((t // 60) % 60), int(t % 60)

    def get_time_string(self):
        """Return the time elapsed since construction as ``HH:MM:SS``."""
        elapsed = time.perf_counter() - self.start_time
        return '{0[0]:02}:{0[1]:02}:{0[2]:02}'.format(self.get_diff_time(elapsed))

    def get_left_time_string(self):
        """Return the estimated remaining time as ``HH:MM:SS``, or ``' - '`` before any progress."""
        if not self.progress:
            # No progress yet, so there is nothing to extrapolate from.
            return ' - '
        elapsed = time.perf_counter() - self.start_time
        # Linear extrapolation: remaining = elapsed * (percent left / percent done).
        remaining = max(0, elapsed * (100 - self.progress) / self.progress)
        return '{0[0]:02}:{0[1]:02}:{0[2]:02}'.format(self.get_diff_time(remaining))

    def get_fill_progress(self):
        """Return the 50-character bar body: one dot per 2% of progress, padded with spaces."""
        # Clamp so that over-incrementing cannot widen the bar.
        count = min(50, max(0, int(self.progress / 2)))
        return '.' * count + ' ' * (50 - count)

    def draw(self):
        """Print the current state on the same console line."""
        print(f'[{self.progress:.1f}%][time: {self.get_time_string()}][{self.get_fill_progress()}][time left: {self.get_left_time_string()}]', end='\r')

    def increment(self, count=1):
        """Advance the bar by *count* work units and redraw it.

        The progress is updated before drawing so that the last call shows
        the final state instead of the previous one.
        """
        self.progress += self.progress_step * count
        self.draw()

    def stop(self):
        """Finish the bar by moving the cursor to the next line."""
        print()

In [6]:
class DataSplitter():
    """Split a DataFrame into train/valid/test parts and rebalance the classes.

    After ``split`` the requested part (or all parts) is available as
    ``self.X`` / ``self.y``; ``balance`` then resamples them in place.
    """

    def __init__(self, random_state=RANDOM_STATE):
        """Store the seed used for splitting, sampling and shuffling."""
        self.random_state = random_state

    def split_target(self, df, target_column, need_drop_target):
        """Return ``{'X': features, 'y': target}`` for *df*.

        When *need_drop_target* is true the target column is removed from the
        features; otherwise ``X`` is *df* itself.
        """
        features = df.drop([target_column], axis=1) if need_drop_target else df
        return {'X': features, 'y': df[target_column]}

    def split(self, df, target_column='toxic',
              part=None, test_size=0.2, valid_size=0.2,
              shuffle=True, need_drop_target=True, train_with_valid=False):
        """Split *df* and store the result in ``self.X`` / ``self.y``.

        Args:
            df: source DataFrame.
            target_column: name of the label column.
            part: ``'train'``, ``'test'`` or ``'valid'`` to store a single
                part; any other value (including the default ``None``)
                stores dicts keyed by part name.
            test_size: share of *df* held out as the test part.
            valid_size: share of the remaining rows held out as validation.
            shuffle: forwarded to ``train_test_split``. It shadows the
                imported ``shuffle`` function inside this method only.
            need_drop_target: drop the target column from the features.
            train_with_valid: use the whole non-test portion (train plus
                validation rows) as the train part.
        """
        train_test = train_test_split(df, test_size=test_size,
                                      shuffle=shuffle, random_state=self.random_state)
        train_valid = train_test_split(train_test[0], test_size=valid_size,
                                       shuffle=shuffle, random_state=self.random_state)
        train_source = train_test[0] if train_with_valid else train_valid[0]
        split_data = {
            'train': self.split_target(train_source, target_column, need_drop_target),
            'test': self.split_target(train_test[1], target_column, need_drop_target),
            'valid': self.split_target(train_valid[1], target_column, need_drop_target),
        }
        try:
            chosen = split_data[part]
        except (KeyError, TypeError):
            # No (or an unknown) part requested: expose every part by name.
            self.X = {name: value['X'] for name, value in split_data.items()}
            self.y = {name: value['y'] for name, value in split_data.items()}
        else:
            self.X, self.y = chosen['X'], chosen['y']

    def balance(self, factor=None, technique='Upsampling'):
        """Rebalance the two classes in ``self.X`` / ``self.y`` and shuffle them.

        Requires a single part to have been selected in ``split``.

        Args:
            factor: resampling ratio. When ``None`` it is the integer ratio of
                the larger class size to the smaller one.
            technique: ``'Upsampling'`` repeats the rows of class 1 *factor*
                times; ``'Downsampling'`` keeps ``1 / factor`` of the rows of
                class 0. Class 1 is always treated as the minority class.

        Raises:
            ValueError: if *technique* is not one of the two supported names.
        """
        zero_mask = self.y == 0
        one_mask = self.y == 1
        X_zeros, X_ones = self.X[zero_mask], self.X[one_mask]
        y_zeros, y_ones = self.y[zero_mask], self.y[one_mask]
        if factor is None:
            n_zeros, n_ones = X_zeros.shape[0], X_ones.shape[0]
            if n_zeros > n_ones:
                factor = n_zeros // n_ones
            else:
                factor = n_ones // n_zeros
        if technique == 'Upsampling':
            new_X = pd.concat([X_zeros] + [X_ones] * factor)
            new_y = pd.concat([y_zeros] + [y_ones] * factor)
        elif technique == 'Downsampling':
            # Features and labels are sampled with the same seed and fraction,
            # which is expected to pick the same rows from both so they stay
            # aligned. The labels previously referenced an undefined name.
            new_X = pd.concat([X_zeros.sample(frac=1/factor, random_state=self.random_state), X_ones])
            new_y = pd.concat([y_zeros.sample(frac=1/factor, random_state=self.random_state), y_ones])
        else:
            raise ValueError(f"Unknown balancing technique: {technique!r}")

        self.X, self.y = shuffle(new_X, new_y, random_state=self.random_state)

Функции для подготовки текста.

Лемматизируем текст и очистим его, оставив только латинские буквы, апострофы и пробелы. Создадим новый столбец *lemm_text*.

In [7]:
def lemmatize(text, progress_bar, m):
    """Lemmatize *text* and strip everything except Latin letters, apostrophes and spaces.

    Args:
        text: raw comment text.
        progress_bar: object whose ``increment()`` is called once per text.
        m: lemmatizer exposing ``lemmatize(text)`` that returns a list of strings.

    Returns:
        The cleaned text with words separated by single spaces.
    """
    joined = ''.join(m.lemmatize(text))
    # Replace every disallowed character with a space, then collapse whitespace runs.
    words = re.sub(r"[^a-zA-Z' ]", ' ', joined).split()
    cleaned = ' '.join(words)
    progress_bar.increment()
    return cleaned

In [8]:
# Reuse the previously saved lemmatized dataset when it can be read; otherwise
# lemmatize every comment while showing progress. Only I/O errors trigger the
# fallback, so unrelated failures are no longer hidden.
# NOTE(review): the freshly computed result is not written back to the cache
# path in this cell.
try:
    data = pd.read_csv('../datasets/toxic_comments_lemm.csv')
except OSError:
    m = Mystem()
    progress_bar = ab_ProgressBar(data.shape[0])
    data['lemm_text'] = data.text.apply(lemmatize, progress_bar=progress_bar, m=m)
    progress_bar.stop()

In [9]:
# Preview the data together with the new lemm_text column.
data.head()

Unnamed: 0,text,toxic,lemm_text
0,Explanation\nWhy the edits made under my usern...,0,Explanation Why the edits made under my userna...
1,D'aww! He matches this background colour I'm s...,0,D'aww He matches this background colour I'm se...
2,"Hey man, I'm really not trying to edit war. It...",0,Hey man I'm really not trying to edit war It's...
3,"""\nMore\nI can't make any real suggestions on ...",0,More I can't make any real suggestions on impr...
4,"You, sir, are my hero. Any chance you remember...",0,You sir are my hero Any chance you remember wh...


## Часть 2. Обучение моделей. 

Подготовим функцию для построения матрицы признаков TF-IDF (стоп-слова исключаются на уровне векторизатора).

In [10]:
def get_tf_idf(df, count_tf_idf, stopwords, need_fit=False):
    """Vectorize the ``lemm_text`` column of *df* with the given vectorizer.

    Args:
        df: DataFrame with a ``lemm_text`` column.
        count_tf_idf: vectorizer exposing ``fit_transform`` and ``transform``.
        stopwords: accepted for call compatibility; not used in the body.
        need_fit: fit the vectorizer on this corpus before transforming.

    Returns:
        Whatever the vectorizer returns for the corpus.
    """
    # Cast to unicode strings so every value reaches the vectorizer as text.
    texts = df.lemm_text.values.astype('U')
    vectorize = count_tf_idf.fit_transform if need_fit else count_tf_idf.transform
    return vectorize(texts)

<a id='11.4-bullet'></a> 
### 4. Обучение модели "Логистическая регрессия". 

In [11]:
# Shared objects reused by the later cells: splitter, stop words, vectorizer.
split_data = DataSplitter()
nltk.download('stopwords')
stopwords = set(nltk_stopwords.words('english'))
# The documented types for stop_words are 'english', a list or None, and
# recent scikit-learn releases reject a set during parameter validation,
# so pass a sorted list.
count_tf_idf = TfidfVectorizer(stop_words=sorted(stopwords))

model = LogisticRegression(max_iter=1000)
# Fit on the whole non-test portion (train + validation rows), rebalanced.
split_data.split(data, part='train', train_with_valid=True)
split_data.balance()
model.fit(get_tf_idf(split_data.X, count_tf_idf, stopwords, need_fit=True), split_data.y)
# Evaluate on the held-out test part with the already fitted vectorizer.
split_data.split(data, part='test')
predicted = model.predict(get_tf_idf(split_data.X, count_tf_idf, stopwords))
print('f1 score на тестовой "Логистическая регрессия"', f1_score(split_data.y, predicted))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Andrey\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


f1 score на тестовой "Логистическая регрессия" 0.7579151112954515


<a id='11.5-bullet'></a> 
### 5. Обучение модели "Случайный лес". 

In [12]:
# Best validation score found so far and the hyperparameters that produced it.
best_params = {
    'f1_score': 0,
    'n_estimators': 0,
    'max_depth': 0,
}
# Hyperparameter grids, named once so the loop and the progress total stay in sync.
n_estimators_grid = range(20, 101, 20)
max_depth_grid = range(10, 31, 10)

# Fit the vectorizer on the rebalanced train part and reuse it for validation.
split_data.split(data, part='train')
split_data.balance()
td_idf_train = get_tf_idf(split_data.X, count_tf_idf, stopwords, need_fit=True)
y_train = split_data.y
split_data.split(data, part='valid')
td_idf_valid = get_tf_idf(split_data.X, count_tf_idf, stopwords)
y_valid = split_data.y
# Each fit is weighted by n_estimators * max_depth, so the total over the grid
# equals the product of the two sums.
progress_bar = ab_ProgressBar(sum(n_estimators_grid) * sum(max_depth_grid))
for n_estimators in n_estimators_grid:
    for max_depth in max_depth_grid:
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=RANDOM_STATE)
        model.fit(td_idf_train, y_train)
        # The validation matrix is prepared once above; the per-iteration
        # re-split of the whole dataset was unused and has been removed.
        predicted = model.predict(td_idf_valid)
        score = f1_score(y_valid, predicted)
        if score > best_params['f1_score']:
            best_params['f1_score'] = score
            best_params['n_estimators'] = n_estimators
            best_params['max_depth'] = max_depth
        progress_bar.increment(n_estimators * max_depth)
progress_bar.stop()
print('Лучшие параметры:', best_params)

# Refit the best configuration on train + validation rows and score it on the test part.
model = RandomForestClassifier(n_estimators=best_params['n_estimators'], max_depth=best_params['max_depth'], random_state=RANDOM_STATE)
split_data.split(data, part='train', train_with_valid=True)
split_data.balance()
model.fit(get_tf_idf(split_data.X, count_tf_idf, stopwords, need_fit=True), split_data.y)
split_data.split(data, part='test')
predicted = model.predict(get_tf_idf(split_data.X, count_tf_idf, stopwords))
print('f1 score на тестовой "Случайный лес"', f1_score(split_data.y, predicted))

[83.3%][time: 00:02:29][.........................................         ][time left: 00:00:29]
Лучшие параметры: {'f1_score': 0.6459028550241009, 'n_estimators': 100, 'max_depth': 30}
f1 score на тестовой "Случайный лес" 0.6657326982347997


<a id='11.6-bullet'></a> 
## Часть 3. Выводы. 

Значение метрики **f1** на тестовой выборке у модели "Логистическая регрессия" получилось **0.758**, что удовлетворительно в рамках данной задачи.

В свою очередь модель случайного леса дала гораздо меньшее значение метрики F1 = **0.666**, кроме того, подбор гиперпараметров сильно затягивает обучение данной модели с таким количеством признаков.

Таким образом, мы смогли обучить модель, которая на тестовой выборке показала удовлетворительный результат.