## Подготовка

In [1]:
import pandas as pd
import nltk
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier 
from nltk.corpus import wordnet

In [2]:
# Load the labelled comment corpus from its CSV file.
DATASET_PATH = '/datasets/toxic_comments.csv'
data = pd.read_csv(DATASET_PATH)

In [3]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


In [4]:
# Show the full text of the comment stored under index label 0.
data.loc[0, 'text']

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [5]:
# Print the frame summary: column dtypes, non-null counts and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    159571 non-null  object
 1   toxic   159571 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


In [6]:
# Count rows that are exact duplicates of an earlier row.
data.duplicated().sum()

0

In [7]:
# Share of each class in the target column (row count per label / total rows).
data['toxic'].value_counts() / data.shape[0]

0    0.898321
1    0.101679
Name: toxic, dtype: float64

В таблице нет пропусков и дубликатов. Целевым признаком является столбец 'toxic'. В тексте комментариев содержатся знаки препинания, разделители строк, цифры. Необходимо привести слова к начальной форме и буквы к нижнему регистру, а также очистить текст от лишних символов.

In [8]:
# Convert every comment to lower case and overwrite the text column with it.
lowered_text = data['text'].str.lower()
data['text'] = lowered_text
data['text'].head()

0    explanation\nwhy the edits made under my usern...
1    d'aww! he matches this background colour i'm s...
2    hey man, i'm really not trying to edit war. it...
3    "\nmore\ni can't make any real suggestions on ...
4    you, sir, are my hero. any chance you remember...
Name: text, dtype: object

In [9]:
# NLTK resources used by the preprocessing cells.
# 'wordnet' is the corpus WordNetLemmatizer looks words up in; without it the
# lemmatisation cell raises LookupError on a machine where it was never fetched.
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /home/jovyan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
%%time

# лемматизация
lemmatizer = WordNetLemmatizer()

# функция для нахождения правильного POS-тега
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

data['text'] = data['text'].apply(lambda line:
                                   [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in line.split()])

CPU times: user 18min 6s, sys: 1min 44s, total: 19min 50s
Wall time: 19min 54s


In [11]:
# Inspect the first lemmatised comments.
data['text'].head()

0    [explanation, why, the, edits, make, under, my...
1    [d'aww!, he, match, this, background, colour, ...
2    [hey, man,, i'm, really, not, try, to, edit, w...
3    [", more, i, can't, make, any, real, suggestio...
4    [you,, sir,, be, my, hero., any, chance, you, ...
Name: text, dtype: object

In [12]:
# Remove everything except lowercase English letters
def clear_text(text):
    """Return ``text`` reduced to lowercase English words separated by single spaces.

    ``text`` may be a string or a list/tuple of tokens. Token sequences are
    joined with spaces before cleaning: calling ``str()`` on the sequence
    would go through ``repr``, which escapes non-printable characters
    (e.g. ``'\\u200b'``, ``'\\xad'``) and leaks the letters of the escape
    sequence into the result as bogus words.

    Every character outside ``a-z`` becomes a space, then runs of whitespace
    are collapsed and leading/trailing whitespace is dropped.
    """
    if isinstance(text, (list, tuple)):
        text = ' '.join(str(token) for token in text)

    # Keep only English letters; anything else is replaced by a space.
    letters_only = re.sub(r'[^a-z]', ' ', str(text))

    # split()/join collapses repeated spaces and trims the ends.
    return ' '.join(letters_only.split())

In [13]:
# Run the cleaner over every comment and preview the result.
data['text'] = data['text'].map(clear_text)
data.head()

Unnamed: 0,text,toxic
0,explanation why the edits make under my userna...,0
1,d aww he match this background colour i m seem...,0
2,hey man i m really not try to edit war it s ju...,0
3,more i can t make any real suggestion on impro...,0
4,you sir be my hero any chance you remember wha...,0


Была произведена предварительная обработка текста для дальнейшего анализа.

## Обучение

### Создание признаков

In [14]:
# Separate the comment text from the label and hold out 20% of rows for testing.
features = data['text']
target = data['toxic']
split_parts = train_test_split(features, target, test_size=0.2, random_state=12345)
features_train, features_test, target_train, target_test = split_parts

tuple(part.shape for part in (features_train, target_train, features_test, target_test))

((127656,), (127656,), (31915,), (31915,))

In [15]:
# Fetch the English stop-word list and hand it to the TF-IDF vectoriser.
nltk.download('stopwords')
# scikit-learn validates `stop_words` as 'english', a list or None; a set is
# rejected by recent versions, so the words are kept as a list.
stop_words = list(stopwords.words('english'))
count_tf_idf = TfidfVectorizer(stop_words=stop_words, min_df=2)

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
# Fit the vectoriser on the training comments only, then reuse it for the test comments.
tf_idf_train = count_tf_idf.fit_transform(features_train)
tf_idf_test = count_tf_idf.transform(features_test)
tuple(matrix.shape for matrix in (tf_idf_train, tf_idf_test))

((127656, 58834), (31915, 58834))

### Логистическая регрессия

In [17]:
%%time

model_logr = LogisticRegression(solver='liblinear')
params_logr = {'penalty': ['l1', 'l2'], 'max_iter': (100, 500)}
grid_logr = GridSearchCV(model_logr, params_logr, cv=3)
grid_logr.fit(tf_idf_train, target_train)
grid_logr.best_params_

CPU times: user 26.9 s, sys: 21.4 s, total: 48.3 s
Wall time: 48.3 s


{'max_iter': 500, 'penalty': 'l1'}

In [18]:
# Train logistic regression with the hyper-parameters selected by the grid
# search (instead of hand-copied values that can drift from its result) and
# score F1 on the held-out test set.
model_logr = LogisticRegression(random_state=12345, solver='liblinear', **grid_logr.best_params_)
model_logr.fit(tf_idf_train, target_train)
predictions_logr = model_logr.predict(tf_idf_test)
f1_logr = f1_score(target_test, predictions_logr)
f1_logr

0.7707676130389065

In [19]:
# Same logistic regression with class_weight='balanced' added; the remaining
# hyper-parameters come from the grid search so both variants are comparable.
model_logr_cl = LogisticRegression(random_state=12345, solver='liblinear',
                                   class_weight='balanced', **grid_logr.best_params_)
model_logr_cl.fit(tf_idf_train, target_train)
predictions_logr_cl = model_logr_cl.predict(tf_idf_test)
f1_logr_cl = f1_score(target_test, predictions_logr_cl)
f1_logr_cl

0.7454545454545454

### Решающее дерево

In [20]:
%%time

model_dt = DecisionTreeClassifier()
params_dt = {'max_depth': [1, 15]}
grid_dt = GridSearchCV(model_dt, params_dt, cv=3, scoring='f1')
grid_dt.fit(tf_idf_train, target_train)
grid_dt.best_params_

CPU times: user 27.6 s, sys: 167 ms, total: 27.8 s
Wall time: 27.8 s


{'max_depth': 15}

In [21]:
# Fit a depth-15 decision tree on the training matrix and score F1 on the test set.
model_dt = DecisionTreeClassifier(max_depth=15, random_state=12345).fit(tf_idf_train, target_train)
pred = model_dt.predict(tf_idf_test)
f1_dt = f1_score(y_true=target_test, y_pred=pred)
f1_dt

0.6308116627265564

In [22]:
# Depth-15 decision tree again, this time with balanced class weights.
model_dt_cl = DecisionTreeClassifier(
    max_depth=15,
    class_weight='balanced',
    random_state=12345,
).fit(tf_idf_train, target_train)
pred_cl = model_dt_cl.predict(tf_idf_test)
f1_dt_cl = f1_score(y_true=target_test, y_pred=pred_cl)
f1_dt_cl

0.59790675547098

### Случайный лес

In [23]:
%%time

model_rf = RandomForestClassifier()
params_rf = {'n_estimators': [1, 100], 'max_depth': [1, 15]}
grid_rf = GridSearchCV(model_rf, params_rf, cv=3, scoring='f1')
grid_rf.fit(tf_idf_train, target_train)
grid_rf.best_params_

CPU times: user 47.5 s, sys: 514 ms, total: 48.1 s
Wall time: 48.2 s


{'max_depth': 15, 'n_estimators': 1}

In [24]:
# Fit the random forest (1 tree, depth 15) and score F1 on the test set.
model_rf = RandomForestClassifier(
    n_estimators=1,
    max_depth=15,
    random_state=12345,
).fit(tf_idf_train, target_train)
predictions_rf = model_rf.predict(tf_idf_test)
f1_rf = f1_score(y_true=target_test, y_pred=predictions_rf)
f1_rf

0.08997722095671981

In [25]:
# Same random forest configuration with balanced class weights.
model_rf_cl = RandomForestClassifier(
    n_estimators=1,
    max_depth=15,
    class_weight='balanced',
    random_state=12345,
).fit(tf_idf_train, target_train)
predictions_rf_cl = model_rf_cl.predict(tf_idf_test)
f1_rf_cl = f1_score(y_true=target_test, y_pred=predictions_rf_cl)
f1_rf_cl

0.2196115831454829

## Выводы

Была произведена работа по предсказанию токсичности комментариев. На первом этапе слова из текста комментариев были приведены к начальной форме. Сам текст был очищен от лишних символов, а заглавные буквы были заменены на строчные.
На втором этапе были протестированы несколько моделей. Была применена техника взвешивания классов, но для логистической регрессии и решающего дерева она не улучшила метрику (у случайного леса F1 вырос с 0.09 до 0.22, но остался очень низким). Ансамблевая модель Случайный лес показала очень плохую метрику, несмотря на подбор параметров. Модель Решающее дерево показала результат лучше, но не дотянула до проходного порога 0.75. Наилучший результат показала модель Логистическая регрессия: значение метрики F1-мера на тестовой выборке 0.77.

In [26]:
# Summary table of test-set F1: one row per model, one column per variant
# (tuned hyper-parameters vs. tuned + balanced class weights).
index = ['LogisticRegression', 'DecisionTree', 'RandomForest']
columns = ['Подбор_гиперпараметров', 'Взвешивание_классов']
# Named f1_table rather than `list` so the built-in list type is not shadowed
# for the rest of the kernel session.
f1_table = [[f1_logr, f1_logr_cl], [f1_dt, f1_dt_cl], [f1_rf, f1_rf_cl]]

df = pd.DataFrame(data=f1_table, index=index, columns=columns)
df

Unnamed: 0,Подбор_гиперпараметров,Взвешивание_классов
LogisticRegression,0.770768,0.745455
DecisionTree,0.630812,0.597907
RandomForest,0.089977,0.219612
