# Logreg toxicity classification

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import spacy

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [2]:
STOPWORDS = stopwords.words('russian')

In [3]:
train_df = pd.read_csv('../data/toxicity_classification/classification_train.csv')
dev_df = pd.read_csv('../data/toxicity_classification/classification_dev.csv')

In [4]:
train_df

Unnamed: 0,text,label
0,"и,чё,блядь где этот херой был до этого со свои...",toxic
1,"О, а есть деанон этого петуха?",toxic
2,"херну всякую пишут,из-за этого лайка.долбоебизм.",toxic
3,из за таких пидоров мы и страдаем,toxic
4,гондон путинский он а не артист,toxic
...,...,...
13891,"Пусть его уже закроют до конца его дней, он же...",neutral
13892,твоя химия - это просто кошмар,neutral
13893,"меня изнасиловали,а тебе всё равно (((",neutral
13894,Когда напьюсь - на маньяка похож...,neutral


## No stopwords

In [5]:
vectorizer = TfidfVectorizer(stop_words=None)

In [6]:
X_train = vectorizer.fit_transform(train_df['text'])
X_dev = vectorizer.transform(dev_df['text'])

In [7]:
clf = LogisticRegression()
clf.fit(X_train, train_df['label'])
y_pred = clf.predict(X_dev)
print(classification_report(dev_df['label'], y_pred))

              precision    recall  f1-score   support

     neutral       0.77      0.93      0.84       800
       toxic       0.91      0.72      0.81       800

    accuracy                           0.83      1600
   macro avg       0.84      0.83      0.83      1600
weighted avg       0.84      0.83      0.83      1600



In [35]:
words_coefs = {word: coef for word, coef in zip(vectorizer.get_feature_names_out(), clf.coef_.squeeze())}

In [75]:
bad_words = {word: coef for word, coef in words_coefs.items() if coef > 1}
len(bad_words)

208

In [76]:
bad_words

{'99': 2.023861506544052,
 'gt': 1.2330937881406328,
 'ахуеть': 2.2028296849712965,
 'бл': 2.719850339575442,
 'бля': 6.833198857023797,
 'блядей': 1.7439862437662201,
 'бляди': 2.3939066923878736,
 'блядство': 1.1688014321758764,
 'блядь': 6.572862268203039,
 'блять': 9.180646487638752,
 'бляя': 2.3104609141985932,
 'бляяя': 1.6592343832824903,
 'быдло': 1.8334312442367493,
 'гавно': 1.0542049408538703,
 'гандон': 1.6713292244277582,
 'гнида': 1.1669482199468202,
 'говно': 3.502582073245874,
 'дебил': 3.7260984536906308,
 'дебилов': 1.7821341056769906,
 'дебилы': 3.622482105305647,
 'дерьмо': 2.7620272746326417,
 'дибил': 1.4724345044498364,
 'долбаеб': 1.6535546948721649,
 'долбаебы': 1.8035956327531233,
 'долбаёбы': 1.0383508416803733,
 'долбоеб': 2.952916198508164,
 'долбоебов': 1.842404845752098,
 'долбоебы': 1.9302555262604273,
 'долбоёб': 1.6510749704176662,
 'долбоёбы': 1.883772140634431,
 'дохуя': 1.7586177298542949,
 'дура': 3.4634692585368345,
 'дурак': 1.3059065221255044,
 

## NLTK stopwords

In [None]:
vectorizer = TfidfVectorizer(stop_words=STOPWORDS)

In [None]:
X_train = vectorizer.fit_transform(train_df['text'])
X_dev = vectorizer.transform(dev_df['text'])

In [None]:
clf = LogisticRegression()
clf.fit(X_train, train_df['label'])
y_pred = clf.predict(X_dev)
print(classification_report(dev_df['label'], y_pred))

              precision    recall  f1-score   support

     neutral       0.77      0.96      0.86       800
       toxic       0.95      0.71      0.81       800

    accuracy                           0.84      1600
   macro avg       0.86      0.84      0.83      1600
weighted avg       0.86      0.84      0.83      1600



## Spacy lemmatization

In [None]:
nlp = spacy.load('ru_core_news_lg')

In [None]:
def spacy_lemmatize(text):
    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc if (token.text.isalpha() and token.text not in STOPWORDS)]
    return lemmas

In [None]:
vectorizer = TfidfVectorizer(tokenizer=spacy_lemmatize)

In [None]:
%%time

X_train = vectorizer.fit_transform(train_df['text'])
X_dev = vectorizer.transform(dev_df['text'])



CPU times: user 2min 57s, sys: 487 ms, total: 2min 57s
Wall time: 2min 59s


In [None]:
clf = LogisticRegression()
clf.fit(X_train, train_df['label'])
y_pred = clf.predict(X_dev)
print(classification_report(dev_df['label'], y_pred))

              precision    recall  f1-score   support

     neutral       0.79      0.96      0.86       800
       toxic       0.95      0.74      0.83       800

    accuracy                           0.85      1600
   macro avg       0.87      0.85      0.85      1600
weighted avg       0.87      0.85      0.85      1600

