## Курсовая работа

Бинарная классификация комментариев на токсичность

### Приступая к работе

In [72]:
import numpy as np
import pandas as pd

In [73]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
test_labels = pd.read_csv('data/test_labels.csv')
train_df.head(3)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0


In [74]:
train_df['severe_toxic'].unique()

array([0, 1])

Один признак текстовый, остальные бинарные. Еще есть __id__, но он мне не нужен.

In [75]:
df = train_df.drop('id', axis=1)
df.head(3)

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0


В качестве целевой переменной будет факт токсичности (признак __toxic__).

In [76]:
X = df.drop('toxic', axis=1)
y = df['toxic']
X.head(3)

Unnamed: 0,comment_text,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0


Текст надо представить в виде чисел, понятных компьютеру (векторов). Для этого я буду использовать TF-IDF векторизатор. В него я добавил регулярное выражение, которое отсеивает все числа, чтобы они не попали в словарь слов.

In [77]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re

In [101]:
%%time
delimiters = ' ', '\n', '_'
regex_pattern = '|'.join(map(re.escape, delimiters))
for i in range(X.shape[0]):
    X.loc[i, 'comment_text'] = ' '.join(re.split(regex_pattern, re.sub(r'[0-9]+', '', X.loc[i, 'comment_text'].lower())))
X.head(3)

CPU times: user 14min 11s, sys: 5.06 s, total: 14min 16s
Wall time: 25min 41s


Unnamed: 0,comment_text,severe_toxic,obscene,threat,insult,identity_hate
0,explanation why the edits made under my userna...,0,0,0,0,0
1,d'aww! he matches this background colour i'm s...,0,0,0,0,0
2,"hey man, i'm really not trying to edit war. it...",0,0,0,0,0


In [112]:
vectorizer = TfidfVectorizer()
corpus = X['comment_text']
tfidf_features = vectorizer.fit_transform(corpus)
vectorizer.get_feature_names()[0:10]

['aa',
 'aaa',
 'aaaa',
 'aaaaa',
 'aaaaaaaa',
 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaany',
 'aaaaaaaaaah',
 'aaaaaaaaaahhhhhhhhhhhhhh',
 'aaaaaaaaadm',
 'aaaaaaaaaq']

In [113]:
len(vectorizer.get_feature_names())

172817

In [114]:
tfidf_features.shape

(159571, 172817)

In [115]:
X.shape

(159571, 6)