In [1]:
import pandas as pd
import re 
import nltk

from pymystem3 import Mystem
from tqdm import notebook
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize.regexp import RegexpTokenizer
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

In [2]:
df = pd.read_csv(r'toxic_comments.csv')
df.head()

Unnamed: 0,text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    159571 non-null  object
 1   toxic   159571 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


In [4]:
df['text'] = df['text'].values.astype('U')

In [5]:
df.text = df.text.str.lower()
stemmer = SnowballStemmer("english")
tokenizer = RegexpTokenizer(r'\w{2,}')
def preprocessing(text):
    new_words = tokenizer.tokenize(text)
    new_list = []
    for w in new_words:
        w = stemmer.stem(w)
        new_list.append(w)
    new_list = ' '.join(new_list)
    return new_list

In [7]:
corpus = pd.Series([preprocessing(text) for text in notebook.tqdm(df['text'])])

HBox(children=(FloatProgress(value=0.0, max=159571.0), HTML(value='')))




In [8]:
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Александра\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
count_tf_idf = TfidfVectorizer(stop_words=stop_words)
tf_idf = count_tf_idf.fit_transform(corpus)

In [10]:
features = tf_idf
target = df['toxic']

features_train, features_test, target_train, target_test = train_test_split(tf_idf, target, test_size = 0.25, random_state = 12345) 


print('test size', features_test.shape)
print('train size', features_train.shape)

test size (39893, 153357)
train size (119678, 153357)


# Обучение

In [15]:
%%time
for n_estimators in range(5,51,5):
    model =  RandomForestClassifier(n_estimators = n_estimators, random_state = 123)
    model.fit(features_train, target_train)
    print('n_estimators: ', n_estimators,'F1:',f1_score(target_test, model.predict(features_test)))

n_estimators:  5 F1: 0.6550218340611353
n_estimators:  10 F1: 0.6470868391808223
n_estimators:  15 F1: 0.6822588090475459
n_estimators:  20 F1: 0.6605533596837944
n_estimators:  25 F1: 0.6825568797399783
n_estimators:  30 F1: 0.6703262233375157
n_estimators:  35 F1: 0.6879137798306388
n_estimators:  40 F1: 0.6832143965249767
n_estimators:  45 F1: 0.6890704485894866
n_estimators:  50 F1: 0.6783151326053043
Wall time: 1h 24min 58s


In [16]:
%%time
model =  RandomForestClassifier()


model.fit(features_train, target_train)
 
predictions = model.predict(features_test)

print(f1_score(target_test, predictions))
print(metrics.classification_report(target_test, predictions))

0.6927219572241883
              precision    recall  f1-score   support

           0       0.95      1.00      0.97     35806
           1       0.93      0.55      0.69      4087

    accuracy                           0.95     39893
   macro avg       0.94      0.77      0.83     39893
weighted avg       0.95      0.95      0.94     39893

Wall time: 32min 8s


In [17]:
%%time
model = SGDClassifier(class_weight = 'balanced')

model.fit(features_train, target_train)
predictions = model.predict(features_test)

print(f1_score(target_test, predictions))
print(metrics.classification_report(target_test, predictions))

0.748544317446625
              precision    recall  f1-score   support

           0       0.98      0.95      0.97     35806
           1       0.67      0.85      0.75      4087

    accuracy                           0.94     39893
   macro avg       0.83      0.90      0.86     39893
weighted avg       0.95      0.94      0.94     39893

Wall time: 1.14 s


In [18]:
%%time
model = LogisticRegression(class_weight = 'balanced', random_state = 123)

model = model.fit(features_train, target_train)
predictions = model.predict(features_test)

print(f1_score(target_test, predictions))
print(metrics.classification_report(target_test, predictions))

0.75564681724846
              precision    recall  f1-score   support

           0       0.98      0.95      0.97     35806
           1       0.68      0.86      0.76      4087

    accuracy                           0.94     39893
   macro avg       0.83      0.90      0.86     39893
weighted avg       0.95      0.94      0.95     39893

Wall time: 5.55 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [19]:
%%time
for depth in range(5,51,5):
    model = DecisionTreeClassifier(max_depth = depth, random_state = 12345)
    model.fit(features_train, target_train)
    predictions = model.predict(features_test)
    print('depth: ',depth,'F1 score: ',f1_score(target_test, predictions))

depth:  5 F1 score:  0.5472055030094584
depth:  10 F1 score:  0.6249404289118348
depth:  15 F1 score:  0.6534012031466914
depth:  20 F1 score:  0.6762633078422552
depth:  25 F1 score:  0.684070796460177
depth:  30 F1 score:  0.6951183864367143
depth:  35 F1 score:  0.7023826714801445
depth:  40 F1 score:  0.7048899056334
depth:  45 F1 score:  0.7089678510998307
depth:  50 F1 score:  0.7099916504313942
Wall time: 6min 47s


In [21]:
for n in range(5,51,5):
    model = KNeighborsClassifier(n_neighbors = n)
    model.fit(features_train, target_train)
    predictions = model.predict(features_test)
    print(f1_score(target_test, predictions))

0.27751734483405216
0.20727193555410406
0.1845132743362832
0.15044047887960244
0.139122527847238
0.27355110642781877
0.43489780469341416
0.46655549379284783
0.5076233183856501
0.5130403715612719


1. Лучший результат показала модель логистической регрессии - ~0.76
2. Самые плохие результаты у классификатора с методом k-ближайшего соседа, хотя прогоняя код в GoogleColab, результатыы были порядком выше
3. Также хороший результат у SGDClassifier: 0.748544317446625, если округлить до сотых, то результат подходит под условие задачи (0.75)