# 1. Подготовка

In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.utils import shuffle

In [2]:
# Fetch the perceptron POS-tagger model that nltk.pos_tag relies on.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Анатолий\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# nltk.word_tokenize relies on the Punkt tokenizer models and WordNetLemmatizer on
# the WordNet corpus; fetch them as well so the notebook runs on a fresh environment.
# NOTE(review): recent NLTK releases may name these resources differently
# (e.g. 'punkt_tab') — confirm against the installed version.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Анатолий\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
def lemmatize(text):
    """Lemmatize every token of ``text`` with WordNet and re-join with spaces.

    The text is tokenized once and the whole token sequence is POS-tagged in a
    single call, so the tagger sees each word in context and is invoked once
    per text instead of once per word.

    Args:
        text (str): Raw text to lemmatize.

    Returns:
        str: The lemmatized tokens joined by single spaces ("" for empty text).

    Note:
        Uses the module-level ``lemmatizer`` instance, which must exist by the
        time this function is called.
    """
    tokens = nltk.word_tokenize(text)

    # First letter of the tagger's tag -> WordNet part of speech.
    tag_to_pos = {"J": wordnet.ADJ,
                  "N": wordnet.NOUN,
                  "V": wordnet.VERB,
                  "R": wordnet.ADV}

    # Anything that is not an adjective/noun/verb/adverb is lemmatized as a noun.
    lemmas = [lemmatizer.lemmatize(token, tag_to_pos.get(tag[:1].upper(), wordnet.NOUN))
              for token, tag in nltk.pos_tag(tokens)]
    return " ".join(lemmas)

In [5]:
# Shared WordNet lemmatizer instance; lemmatize() looks this name up at call time.
lemmatizer = WordNetLemmatizer()

In [6]:
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag``; the upper-cased first
    letter of the resulting tag selects adjective, noun, verb or adverb.
    Any other tag falls back to noun.
    """
    _, tag = nltk.pos_tag([word])[0]
    initial = tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag map to noun.
    return wordnet.NOUN

In [7]:
def clear_text(text):
    """Normalize text to lowercase ASCII letters and digits separated by single spaces.

    Args:
        text: Value to clean. Non-string values (e.g. NaN from a missing cell)
            are converted with ``str()`` before cleaning.

    Returns:
        str: The lower-cased text with every character outside ``a-z``, ``A-Z``
        and ``0-9`` replaced by a space and whitespace runs collapsed.
    """
    # Coerce first: re.sub raises TypeError on non-string input, so converting
    # only after the substitution could never protect against it.
    txt = re.sub(r'[^a-zA-Z0-9]', ' ', str(text))
    # split()/join collapses whitespace runs and trims both ends.
    return ' '.join(txt.lower().split())

In [8]:
def upsample(features, target, repeat):
    """Oversample the positive class by repeating its rows, then shuffle.

    Args:
        features: Feature rows, indexable by a boolean mask built from ``target``.
        target: Labels; rows equal to 1 are repeated, rows equal to 0 are kept once.
        repeat (int): How many copies of the positive rows to include.

    Returns:
        tuple: ``(features_upsampled, target_upsampled)`` shuffled together
        with a fixed random state of 12345.
    """
    negatives = target == 0
    positives = target == 1

    # One copy of the negatives followed by `repeat` copies of the positives.
    feature_parts = [features[negatives]] + [features[positives]] * repeat
    target_parts = [target[negatives]] + [target[positives]] * repeat

    # Shuffle both objects with the same permutation so rows stay aligned.
    shuffled_features, shuffled_target = shuffle(
        pd.concat(feature_parts), pd.concat(target_parts), random_state=12345)

    return shuffled_features, shuffled_target

In [9]:
# Load the comments dataset from a path relative to the working directory.
data = pd.read_csv('toxic_comments.csv')

In [10]:
# Dataset dimensions as (rows, columns).
data.shape

(159571, 2)

In [11]:
# Peek at the first three rows.
data.head(3)

Unnamed: 0,text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0


In [12]:
# Column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    159571 non-null  object
 1   toxic   159571 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


Типы данных корректны: `text` — строки (object), `toxic` — целые числа (int64); пропусков нет.

In [13]:
# Class balance of the label column.
data['toxic'].value_counts()

0    143346
1     16225
Name: toxic, dtype: int64

In [14]:
#data = data.sample(15000)

In [None]:
# Lemmatize every comment; the function is passed directly, no wrapper lambda needed.
data['lemm_text'] = data['text'].apply(lemmatize)

In [None]:
# Clean the lemmatized text with clear_text, passed directly instead of via a lambda.
data['lemm_text'] = data['lemm_text'].apply(clear_text)

In [None]:
# The raw text column is dropped now that lemm_text has been built.
# NOTE: re-running this cell raises KeyError because 'text' is already gone.
data = data.drop(['text'], axis = 1)

In [None]:
# Model inputs: every column except the label.
features = data.drop(['toxic'], axis = 1)

In [None]:
# Label column (0 / 1).
target = data['toxic']

In [None]:
# Hold out 40% of the rows for validation + test. Stratify on the label: the
# classes are heavily imbalanced, so each part should keep the same toxic share.
features_train, features_valid, target_train, target_valid = train_test_split(
    features, target, test_size=0.4, random_state=12345, stratify=target)

In [None]:
# Split the held-out part in half: validation and test. Stratify on the label so
# both halves keep the same toxic share despite the class imbalance.
features_valid, features_test, target_valid, target_test = train_test_split(
    features_valid, target_valid, test_size=0.5, random_state=12345, stratify=target_valid)

In [None]:
# Sanity-check the sizes of the three splits (features first, then targets).
features_train.shape

In [None]:
features_valid.shape

In [None]:
features_test.shape

In [None]:
target_train.shape

In [None]:
target_valid.shape

In [None]:
target_test.shape

In [None]:
# Repeat the positive rows 9 times, in the training split only.
# NOTE: this cell overwrites its own inputs, so re-running it upsamples the
# already upsampled data again.
features_train, target_train = upsample(features_train, target_train, 9)

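Данные разбиты на обучающую, валидационную и тестовую выборки (60/20/20), размеры выборок проверены. Положительный класс в обучающей выборке увеличен в 9 раз.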

In [None]:
# Pull the lemmatized texts out as NumPy arrays cast to unicode strings.
corpus_train = features_train['lemm_text'].values.astype('U')

In [None]:
corpus_valid = features_valid['lemm_text'].values.astype('U')

In [None]:
corpus_test = features_test['lemm_text'].values.astype('U')

Корпуса текстов приведены к юникодным строкам.

In [None]:
# English stop words from NLTK, collected into a set.
stopwords = set(nltk_stopwords.words('english'))

In [None]:
# scikit-learn validates stop_words as 'english', a list, or None; recent releases
# reject a set, so hand over a (sorted, hence deterministic) list.
count_tf_idf = TfidfVectorizer(stop_words=sorted(stopwords))

In [None]:
# Fit the vectorizer on the training corpus only.
count_tf_idf.fit(corpus_train)

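Векторизатор TF-IDF обучен только на обучающей выборке, чтобы информация из валидационной и тестовой выборок не попадала в признаки.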

In [None]:
# Transform all three corpora with the vectorizer fitted on the training corpus.
tf_idf_train  = count_tf_idf.transform(corpus_train)

In [None]:
tf_idf_valid = count_tf_idf.transform(corpus_valid)

In [None]:
tf_idf_test = count_tf_idf.transform(corpus_test)

Все три корпуса преобразованы в матрицы TF-IDF.

# Обучение / Модели

#### LogisticRegression

In [None]:
# Logistic regression with the liblinear solver and a fixed seed.
model_LR = LogisticRegression(solver = 'liblinear', random_state = 12345)

In [None]:
# Train on the upsampled training split.
model_LR.fit(tf_idf_train, target_train)

In [None]:
# Predict labels for the validation split.
predict_LR = model_LR.predict(tf_idf_valid)

In [None]:
# F1 on the validation split.
f1_LR = f1_score(target_valid, predict_LR)

In [None]:
f1_LR

#### RandomForestClassifier

In [None]:
# Random forest: 220 trees, 4 parallel jobs, fixed seed.
model_RFC =  RandomForestClassifier(n_estimators = 220, n_jobs = 4, random_state = 12345)

In [None]:
# Train on the upsampled training split.
model_RFC.fit(tf_idf_train, target_train)

In [None]:
# Predict labels for the validation split.
predict_RFC = model_RFC.predict(tf_idf_valid)

In [None]:
# F1 on the validation split.
f1_RFC = f1_score(target_valid, predict_RFC)

In [None]:
f1_RFC

#### DecisionTreeClassifier

In [None]:
# Single decision tree capped at depth 96, fixed seed.
model_DTC = DecisionTreeClassifier(max_depth = 96, random_state = 12345)

In [None]:
# Train on the upsampled training split.
model_DTC.fit(tf_idf_train, target_train)

In [None]:
# Predict labels for the validation split.
predict_DTC = model_DTC.predict(tf_idf_valid)

In [None]:
# F1 on the validation split.
f1_DTC = f1_score(target_valid, predict_DTC)

In [None]:
f1_DTC

#### CatBoostClassifier

In [None]:
# CatBoost: 91 boosting iterations, tree depth 6, fixed seed.
# NOTE(review): CatBoost presumably logs every iteration by default; consider
# passing verbose=False to keep the notebook output short — confirm.
model_CBC = CatBoostClassifier(iterations = 91, depth = 6, random_state = 12345)

In [None]:
# Train on the upsampled training split.
model_CBC.fit(tf_idf_train, target_train)

In [None]:
# Predict labels for the validation split.
predict_CBC = model_CBC.predict(tf_idf_valid)

In [None]:
# F1 on the validation split.
f1_CBC = f1_score(target_valid, predict_CBC)

In [None]:
f1_CBC

#### XGBClassifier

In [None]:
# XGBoost: 33 trees, fixed seed.
# NOTE(review): max_depth = 71 is unusually deep for boosted trees — confirm
# this value is intended.
model_XGBC = xgb.XGBClassifier(max_depth = 71, n_estimators = 33, random_state = 12345)

In [None]:
# Train on the upsampled training split.
model_XGBC.fit(tf_idf_train, target_train)

In [None]:
# Predict labels for the validation split.
predict_XGBC = model_XGBC.predict(tf_idf_valid)

In [None]:
# F1 on the validation split.
f1_XGBC = f1_score(target_valid, predict_XGBC)

In [None]:
f1_XGBC

# Выводы / Тест

Предсказания на тестовой выборке

In [None]:
# Logistic regression: predictions and F1 on the held-out test split.
predict_LR_test = model_LR.predict(tf_idf_test)

In [None]:
f1_LR_test = f1_score(target_test, predict_LR_test)

In [None]:
f1_LR_test

In [None]:
# Random forest: predictions and F1 on the held-out test split.
predict_RFC_test = model_RFC.predict(tf_idf_test)

In [None]:
f1_RFC_test = f1_score(target_test, predict_RFC_test)

In [None]:
f1_RFC_test

In [None]:
# Decision tree: predictions and F1 on the held-out test split.
predict_DTC_test = model_DTC.predict(tf_idf_test)

In [None]:
f1_DTC_test = f1_score(target_test, predict_DTC_test)

In [None]:
f1_DTC_test

In [None]:
# CatBoost: predictions and F1 on the held-out test split.
predict_CBC_test = model_CBC.predict(tf_idf_test)

In [None]:
f1_CBC_test = f1_score(target_test, predict_CBC_test)

In [None]:
f1_CBC_test

In [None]:
# XGBoost: predictions and F1 on the held-out test split.
predict_XGBC_test = model_XGBC.predict(tf_idf_test)

In [None]:
f1_XGBC_test = f1_score(target_test, predict_XGBC_test)

In [None]:
f1_XGBC_test

In [None]:
# Summary table: one row per model with its validation F1 and test F1.
table = pd.DataFrame({
    'name': [
        'LogisticRegression',
        'RandomForestClassifier',
        'DecisionTreeClassifier',
        'CatBoostClassifier',
        'XGBClassifier',
    ],
    'f1_score': [f1_LR, f1_RFC, f1_DTC, f1_CBC, f1_XGBC],
    'f1_score_test': [f1_LR_test, f1_RFC_test, f1_DTC_test, f1_CBC_test, f1_XGBC_test],
})

In [None]:
# Display the validation/test F1 comparison.
table