# Разработка инструмента для поиска токсичных комментариев в интернет-магазине

[Загрузка и подготовка данных](#Загрузка-и-подготовка-данных)

[Обучение моделей](#Обучение-моделей)

[Выводы](#Выводы)

## Загрузка и подготовка данных

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize, word_tokenize
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import LinearSVC

data = pd.read_csv("../input/toxic-comments/toxic_comments.csv")
data.set_axis(['text','toxic_comments'],axis = 'columns',inplace = True) # переименуем столбец с названием "toxic" 
# в столбец с названием "toxic_comments", чтобы после лемматизации не получилось два столбца с названием "toxic"
data.info()
data.describe()
data = data.head(50000)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 2 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   text            159571 non-null  object
 1   toxic_comments  159571 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


In [2]:
lemmatizer = WordNetLemmatizer()

def pos_tagify(text):         # создадим новые признаки по частям речи
                              # процесс разметки слова в тексте как соответствующего определенной части речи 
                              # на основе как его определения, так и контекста
    word_map = {
        "N": wordnet.NOUN, 
        "V": wordnet.VERB,
        "J": wordnet.ADJ,
        "R": wordnet.ADV
    }
    
    tagged_text = nltk.pos_tag(text)
    tagged_text = [
        (word, word_map.get(pos_tag[0])) if pos_tag[0] in word_map.keys()
        else (word, wordnet.NOUN)
        for (word, pos_tag) in tagged_text
    ]
    
    return tagged_text

def preprocessing(rows):     # проводим предобработку данных
    result = []
    for row in rows:
        
        tokens = word_tokenize(row, language="english") # токенизируем 
        tokens = " ".join(tokens)
        
        clear = re.sub(r'[^a-zA-Z]', ' ', tokens) # очищаем токены от ненужных символов
        pos_tag = pos_tagify([i for i in clear.split()]) 
        
        phrase = ''
        for word in pos_tag:
            phrase += ' ' + (lemmatizer.lemmatize(word[0], pos=word[1])) # лемматизируем слова
        
        result.append(phrase)

    return pd.Series(result)

data['lemm_text'] = preprocessing(data['text']) 

#print(data.head(100))

In [3]:
train, test = train_test_split(data, test_size=0.25, random_state=12345)

corpus = train['lemm_text'].values.astype('U')
corpus_test = test['lemm_text'].values.astype('U')

In [4]:
stopwords = set(nltk_stopwords.words('english'))
count_tf_idf = TfidfVectorizer(stop_words=stopwords, max_features=3000) # создадим счётчик, указав в нём стоп-слова, 
# а также ограничим количество фичей для ускорения процесса обработки данных
tf_idf_train = count_tf_idf.fit_transform(corpus)
df_train = pd.DataFrame(tf_idf_train.toarray(), columns=count_tf_idf.get_feature_names())
train = pd.concat([train.reset_index(), df_train.reset_index()], axis=1)
df_train = [] # обнуление ради высвобождения RAM

tf_idf_test = count_tf_idf.transform(corpus_test)
df_test = pd.DataFrame(tf_idf_test.toarray(), columns=count_tf_idf.get_feature_names())
test = pd.concat([test.reset_index(), df_test.reset_index()], axis=1)
df_test = [] # обнуление ради высвобождения RAM

corpus = [] # обнуление ради высвобождения RAM
corpus_test = [] # обнуление ради высвобождения RAM

In [5]:
features_train = train.drop(['text', 'toxic_comments', 'lemm_text', 'level_0', 'index'], axis=1)
target_train = train['toxic_comments']
target_test = test['toxic_comments']
features_test = test.drop(['text', 'lemm_text', 'toxic_comments', 'level_0', 'index'], axis=1)

In [6]:
print(features_train.head(10))
print(target_train.head(10))

   ability  able  abortion  absence  absolute  absolutely  absurd  abuse  \
0      0.0   0.0       0.0      0.0       0.0         0.0     0.0    0.0   
1      0.0   0.0       0.0      0.0       0.0         0.0     0.0    0.0   
2      0.0   0.0       0.0      0.0       0.0         0.0     0.0    0.0   
3      0.0   0.0       0.0      0.0       0.0         0.0     0.0    0.0   
4      0.0   0.0       0.0      0.0       0.0         0.0     0.0    0.0   
5      0.0   0.0       0.0      0.0       0.0         0.0     0.0    0.0   
6      0.0   0.0       0.0      0.0       0.0         0.0     0.0    0.0   
7      0.0   0.0       0.0      0.0       0.0         0.0     0.0    0.0   
8      0.0   0.0       0.0      0.0       0.0         0.0     0.0    0.0   
9      0.0   0.0       0.0      0.0       0.0         0.0     0.0    0.0   

   abusive  academic  ...  yellow  yes  yesterday  yet   yo  york  young  \
0      0.0       0.0  ...     0.0  0.0        0.0  0.0  0.0   0.0    0.0   
1      0.0 

## Обучение моделей

In [7]:
research_result = pd.DataFrame(columns=['Model_name', 'F1-score'])
def train_model(model, name, features_train, target_train, features_test, target_test):
    model.fit(features_train, target_train)
    predictions = model.predict(features_test)
    f1 = f1_score(predictions, target_test) 
    results = pd.DataFrame([[name, f1]], columns=['Model_name', 'F1-score'])
    return results

In [8]:
# Логистическая регрессия
model_regression=LogisticRegression(random_state=12345, solver='liblinear')
model_regression_results = train_model(model_regression, 'Logistic Regression', features_train, target_train, features_test, target_test)
research_result = research_result.append(model_regression_results)
print(research_result)

            Model_name  F1-score
0  Logistic Regression  0.676085


In [9]:
# Дерево решений
model_tree=DecisionTreeClassifier(criterion = "entropy", max_depth = 12,  min_samples_split = 4,
                              min_samples_leaf = 1, random_state=12345)
model_tree_results = train_model(model_tree, 'Decision Tree Classifier', features_train, target_train, features_test, target_test)
research_result = research_result.append(model_tree_results)
print(research_result)

                 Model_name  F1-score
0       Logistic Regression  0.676085
0  Decision Tree Classifier  0.588667


In [10]:
# Cлучайный лес
model_forest = RandomForestClassifier(n_estimators=14, max_depth=134, min_samples_leaf=1, random_state=12345)
model_forest_results = train_model(model_forest, 'Random Forest Classifier', features_train, target_train, features_test, target_test)
research_result = research_result.append(model_forest_results)
print(research_result)

                 Model_name  F1-score
0       Logistic Regression  0.676085
0  Decision Tree Classifier  0.588667
0  Random Forest Classifier  0.728532


In [17]:
# Подбор гиперпараметров
from sklearn.model_selection import GridSearchCV

model_forest = RandomForestClassifier(random_state=12345)

param_grid = {
    'n_estimators': [1, 4, 5],
    'max_depth': [6, 8, 11],
    'min_samples_leaf': [1, 2, 5],
}

from sklearn.metrics import f1_score, make_scorer

f1 = make_scorer(f1_score , average='macro')

CV_rfc = GridSearchCV(estimator=model_forest, param_grid=param_grid, cv= 3, scoring=f1)
CV_rfc.fit(features_train, target_train)

print(CV_rfc.best_params_, CV_rfc.best_score_)

#print(CV_rfc.cv_results_)

{'max_depth': 11, 'min_samples_leaf': 5, 'n_estimators': 1} 0.6696808683644543


In [14]:
# Классификатор LightGBM
model_LGBMR = LGBMClassifier(boosting_type='gbdt', num_leaves=4, max_depth=20, learning_rate=0.3, n_estimators=10, random_state=12345)
model_LGBMR_results = train_model(model_LGBMR, 'LGBM Classifier', features_train, target_train, features_test, target_test)
research_result = research_result.append(model_LGBMR_results)
print(research_result)

                 Model_name  F1-score
0       Logistic Regression  0.676085
0  Decision Tree Classifier  0.588667
0  Random Forest Classifier  0.728532
0           LGBM Classifier  0.469032


In [15]:
# Классификация линейных опорных векторов
model_svc = LinearSVC(C=3, random_state=12345)
model_svc_results = train_model(model_svc, 'Linear SVC', features_train, target_train, features_test, target_test)
research_result = research_result.append(model_svc_results)
print(research_result)

                 Model_name  F1-score
0       Logistic Regression  0.676085
0  Decision Tree Classifier  0.588667
0  Random Forest Classifier  0.728532
0           LGBM Classifier  0.469032
0                Linear SVC  0.738747


## Выводы

In [16]:
print(research_result)

                 Model_name  F1-score
0       Logistic Regression  0.676085
0  Decision Tree Classifier  0.588667
0  Random Forest Classifier  0.728532
0           LGBM Classifier  0.469032
0                Linear SVC  0.738747


Из всех рассмотренных мной моделей искомой метрики смогла достигнуть только модель Linear Support Vector Classification. Логистическая регрессия, Случайный лес и LGBM-модель - их метрика была чуть ниже. Наиболее низкую метрику показало Дерево решений.
* В данном примере для более быстрой работы модели запущены на 1/3 части от объема исходных данных.