# Проект для «Викишоп»

## Подготовка

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline
import re

import nltk
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
state = 12345

In [2]:
data = pd.read_csv('/datasets/toxic_comments.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    159571 non-null  object
 1   toxic   159571 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


In [4]:
data.isna().sum()

text     0
toxic    0
dtype: int64

In [5]:
data['toxic'].value_counts()

0    143346
1     16225
Name: toxic, dtype: int64

### Вывод:

- Информация о структуре данных - двумерная(DataFrame);
- таблица имеет 159571 строк и 2 столбца; 
- присутствуют названия столбцов и количество ненулевых значений;
- в таблице присутствуют значения типов int(1 столбeц) и object(1столбец);
- использование памяти.

Пропуски отсутствуют. Посмотрели сколько уникальных значений у столбца с целевыми признаками.

### Лемматизация

In [7]:
#def lemmatize(text):
    #m = Mystem()
    #lemm_list = m.lemmatize(text)
    #lemm_text = "".join(lemm_list)
        
    #return lemm_text

In [8]:
data['text'] = data['text'].map(str)

In [9]:
def lemmatize(text):
    word_list = nltk.word_tokenize(text)
    wnl = WordNetLemmatizer()
    lemm_text = " ".join([wnl.lemmatize(w) for w in word_list])
        
    return lemm_text

Напишем функцию, которая оставит в тексте только латинские символы и пробелы:

In [10]:
def clear_text(text):
    text = text.lower()
    clear_text = re.sub(r'[^а-яА-ЯёЁa-zA-Z]', ' ', text)
    clear_text = clear_text.split()
    clear_text = " ".join(clear_text)
    return clear_text

### Очищенный и лемматизированный текст:

In [11]:
lemmatize(clear_text(data['text'][0]))

'explanation why the edits made under my username hardcore metallica fan were reverted they weren t vandalism just closure on some gas after i voted at new york doll fac and please don t remove the template from the talk page since i m retired now'

In [12]:
nltk.download('stopwords')
stopwords = set(nltk_stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Обучение

In [13]:
features = data.drop('toxic', axis=1)
target = data['toxic']

In [14]:
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size = 0.25, random_state=state)

In [15]:
tf_idf = TfidfVectorizer(ngram_range=(1,3), min_df=3, stop_words=stopwords)

In [16]:
features_train = tf_idf.fit_transform(features_train['text'])
features_test = tf_idf.transform(features_test['text'])

In [17]:
print(features_train.shape)
print(features_test.shape)

(119678, 281863)
(39893, 281863)


### Логистическая регрессия(LogisticRegression)

In [56]:
%%time

model_logreg = LogisticRegression()
parametrs = {
        'C': [0.1, 1, 10, 100],
        'class_weight': ['balanced', None]}
model_logreg_grid = GridSearchCV(model_logreg, parametrs, cv=3)
model_logreg_grid.fit(features_train, target_train)
pred_logreg_test = model_logreg_grid.predict(features_test)
f1_logreg_test = f1_score(target_test, pred_logreg_test)

print('Лучшие гиперпараметры для модели: ', model_logreg_grid.best_params_)
print('F1:',  f1_logreg_test)

Лучшие гиперпараметры для модели:  {'C': 100, 'class_weight': None}
F1: 0.7819994610617085
CPU times: user 7min 43s, sys: 12min 33s, total: 20min 17s
Wall time: 20min 19s


Лучшие гиперпараметры для модели:  {'C': 100, 'class_weight': None}
F1: 0.7819994610617085
CPU times: user 7min 43s, sys: 12min 33s, total: 20min 17s
Wall time: 20min 19s

### LightGBM(LGBMClassifier)

In [None]:
%%time

model_lgbmc = LGBMClassifier()
parametrs = {
        #'boosting_type' : ['gbdt'],
        'n_estimators' : range(10, 100, 10),
        'max_depth' : range(1, 16, 2),
        'random_state' : [state],
        'learning_rate' : [0.15, 0.25],
        #'num_leaves' : [31]
             }
model_lgbmc_grid = GridSearchCV(model_lgbmc, parametrs, cv=3)
model_lgbmc_grid.fit(features_train, target_train)
pred_lgbmc_test = model_lgbmc_grid.predict(features_test)
f1_lgbmc_test = f1_score(target_test, pred_lgbmc_test)

print('Лучшие гиперпараметры для модели: ', model_lgbmc_grid.best_params_)
print('F1:',  f1_lgbmc_test)

### Дерево решений(DecisionTreeClassifier)

In [55]:
%%time

model_tree = DecisionTreeClassifier(random_state=state)
parametrs = {'max_depth': range (1,16, 2)}
model_tree_grid = GridSearchCV(model_tree, parametrs, cv=3)
model_tree_grid.fit(features_train, target_train)
pred_tree_test = model_tree_grid.predict(features_test)
f1_tree_test = f1_score(target_test, pred_tree_test)

print('Глубина дерева для лучшей модели:', model_tree_grid.best_params_)
print('F1:',  f1_tree_test)

Глубина дерева для лучшей модели: {'max_depth': 15}
F1: 0.6268939393939393
CPU times: user 3min 40s, sys: 839 ms, total: 3min 41s
Wall time: 3min 41s


Глубина дерева для лучшей модели: {'max_depth': 15}
F1: 0.6268939393939393
CPU times: user 3min 40s, sys: 839 ms, total: 3min 41s
Wall time: 3min 41s

In [18]:
#%%time

#pipe_tree = Pipeline([
    #('tfidf', TfidfVectorizer(ngram_range=(1,3), max_features=6, stop_words=stopwords)),
    #('m_tree', DecisionTreeClassifier(random_state=12345))])

#param_tree = {'m_tree__max_depth': range (1,16, 2)}

#tree_grid = GridSearchCV(estimator=pipe_tree, param_grid=param_tree, cv=3)
#tree_grid.fit(features_train, target_train)

#print('Глубина дерева для лучшей модели:', tree_grid.best_params_)

### CatBoostClassifier

In [None]:
%%time

model_cat = CatBoostClassifier()
parametrs = {
         'iterations': [200],
         'learning_rate': [0.15],
         'random_state' : [state]}
model_cat_grid = GridSearchCV(model_cat, parametrs, cv=3)
model_cat_grid.fit(features_train, target_train)
pred_cat_test = model_cat_grid.predict(features_test)
f1_cat_test = f1_score(target_test, pred_cat_test)

print('Лучшие гиперпараметры для модели:', model_cat_grid.best_params_)
print('F1:',  f1_cat_test)

### Случайный лес(RandomForestClassifier)

In [18]:
%%time

model_forest = RandomForestClassifier(random_state=state)
parametrs = {'n_estimators': range(10, 100, 10)}
model_forest_grid = GridSearchCV(model_forest, parametrs, cv=3)
model_forest_grid.fit(features_train, target_train)
pred_forest_test = model_forest_grid.predict(features_test)
f1_forest_test = f1_score(target_test, pred_forest_test)

print('Количество деревьев для лучшей модели:', model_forest_grid.best_params_)
print('F1:',  f1_forest_test)

Количество деревьев для лучшей модели: {'n_estimators': 90}
F1: 0.7213797204876597
CPU times: user 2h 28min 2s, sys: 4.24 s, total: 2h 28min 6s
Wall time: 2h 28min 8s


Количество деревьев для лучшей модели: {'n_estimators': 90}
F1: 0.7213797204876597
CPU times: user 2h 28min 2s, sys: 4.24 s, total: 2h 28min 6s
Wall time: 2h 28min 8s

## Выводы

Из трех посчитанных моделей лучший результат показала модель Логистическая регрессия(LogisticRegression) с метрикой качества F1-мера = 0.78. На втором месте оказалась модель Случайный лес(RandomForestClassifier) с метрой качества F1-мера = 0.72, а на треттем месте модель Дерево решений(DecisionTreeClassifier) с метрой качества F1-мера = 0.62.