In [21]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import random

In [22]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
import gensim.downloader as api
from sklearn.model_selection import train_test_split

In [23]:
from sklearn.ensemble import RandomForestClassifier

In [24]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm import tqdm
from sklearn.metrics import f1_score, classification_report

In [25]:
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

In [26]:
df = pd.read_csv('comments.csv')
df.drop('Unnamed: 0', axis = 1, inplace = True)

In [27]:
df.isna().sum()

text     0
toxic    0
dtype: int64

In [28]:
df.duplicated().sum()

0

In [29]:
df['toxic'].value_counts() # Дисбаланс классов

toxic
0    143106
1     16186
Name: count, dtype: int64

# Предобработка текста

### Методы предобработки текста
1. Удаление HTML-тегов
2. Удаление знаков препинания
3. Приведение к нижнему регистру
4. Удаление стоп-слов: Стоп-слова — это часто встречающиеся слова (например, "и", "в", "на")
5. Лемматизация или стемминг: Эти методы приводят слова к их базовой форме (лемме или корню)

In [32]:
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [33]:
def preprocess_text(text):
    text = re.sub('<.*?>', '', text)
    text = re.sub(r'[^\w\s]', '', text).lower()
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words]  #if word not in stop_words] пока не будем удалять стоп-слова
    return ' '.join(words)

In [34]:
df['preprocessed_text'] = df['text'].apply(preprocess_text)

In [35]:
df

Unnamed: 0,text,toxic,preprocessed_text
0,Explanation\nWhy the edits made under my usern...,0,explanation why the edits made under my userna...
1,D'aww! He matches this background colour I'm s...,0,daww he match this background colour im seemin...
2,"Hey man, I'm really not trying to edit war. It...",0,hey man im really not trying to edit war it ju...
3,"""\nMore\nI can't make any real suggestions on ...",0,more i cant make any real suggestion on improv...
4,"You, sir, are my hero. Any chance you remember...",0,you sir are my hero any chance you remember wh...
...,...,...,...
159287,""":::::And for the second time of asking, when ...",0,and for the second time of asking when your vi...
159288,You should be ashamed of yourself \n\nThat is ...,0,you should be ashamed of yourself that is a ho...
159289,"Spitzer \n\nUmm, theres no actual article for ...",0,spitzer umm there no actual article for prosti...
159290,And it looks like it was actually you who put ...,0,and it look like it wa actually you who put on...


# Векторизация Word2Vec

In [None]:
glove_model = None
try:
    glove_model = api.load("glove-wiki-gigaword-100")
except api.exceptions.DownloadError as e:
    print(f"Error loading GloVe model: {e}")
    print("Please ensure you have an internet connection and try again.")
    exit()

Датасет весьма большой, поэтому начнем с эмбеддингов слов, если будет плохо, то попробуем TF-IDF

In [None]:
def get_comment_vector(words):
    vectors = [glove_model[word] for word in words if word in glove_model]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(glove_model.vector_size)

In [None]:
df['vector'] = df['preprocessed_text'].apply(get_comment_vector)
X = np.array(df['vector'].tolist())
y = df['toxic']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("\nTraining data vectors shape:", X_train.shape)
print("Testing data vectors shape:", X_test.shape)

# Построение моделей

In [None]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, max_depth=10, 
                                random_state=42,  
                               class_weight='balanced'
                              )
model.fit(X_train, y_train)
y_approx = model.predict(X_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
f1_score(y_test, y_pred)

Пробовал разные варианты моделей классического ML, f1_score слишком маленький, поэтому идем к BERT

# BERT

In [45]:
class ToxicComments():
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts.iloc[item])
        label = self.labels.iloc[item]
        encoding = self.tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=self.max_len,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [47]:
def train_bert_with_f1(model, dataloader, device, num_epochs=3):
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
    for epoch in range(num_epochs):
        total_loss = 0
        all_predicted_labels = []
        all_true_labels = []
        progress_bar = tqdm(dataloader, desc=f'Epoch {epoch+1}/{num_epochs}', unit='batch')
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            _, predicted_labels = torch.max(logits, dim=1)
            # Собираем предсказанные и истинные метки для вычисления F1-score
            all_predicted_labels.extend(predicted_labels.cpu().tolist())
            all_true_labels.extend(labels.cpu().tolist())
            progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})
        avg_loss = total_loss / len(dataloader)
        train_f1 = f1_score(all_true_labels, all_predicted_labels, average='binary')  # Для бинарной классификации
        print(f'\nEpoch {epoch+1} завершена. Средний loss: {avg_loss:.4f}, F1-score на обучающей выборке: {train_f1:.4f}')

    print('\nОбучение завершено!')

In [49]:
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [62]:
X_train_BERT, X_test_BERT, y_train_BERT, y_test_BERT = train_test_split(df['text'], df['toxic'], test_size=0.2)
MAX_LEN = 128
datasets = [X_train_BERT, X_test_BERT, y_train_BERT, y_test_BERT]
for ds in datasets:
    ds.reset_index(drop = True, inplace = True)
train_dataset = ToxicComments(X_train_BERT, y_train_BERT, tokenizer, MAX_LEN)
test_dataset = ToxicComments(X_test_BERT, y_test_BERT, tokenizer, MAX_LEN)
train_dataloader = DataLoader(train_dataset, batch_size=16)
test_dataloader = DataLoader(test_dataset, batch_size=16)

In [64]:
print(f"Размер обучающего датасета: {len(train_dataset)}")
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
print(f"Количество батчей в DataLoader: {len(train_dataloader)}")

Размер обучающего датасета: 127433
Количество батчей в DataLoader: 7965


In [136]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
train_bert_with_f1(model, train_dataloader, device, num_epochs=3)

Epoch 1/3: 100%|█████████████████████████████████████████████████| 7965/7965 [9:41:10<00:00,  4.38s/batch, loss=0.1015]



Epoch 1 завершена. Средний loss: 0.0938, F1-score на обучающей выборке: 0.8140


Epoch 2/3: 100%|█████████████████████████████████████████████████| 7965/7965 [9:33:01<00:00,  4.32s/batch, loss=0.0244]



Epoch 2 завершена. Средний loss: 0.0578, F1-score на обучающей выборке: 0.8824


Epoch 3/3: 100%|█████████████████████████████████████████████████| 7965/7965 [9:38:20<00:00,  4.36s/batch, loss=0.0076]


Epoch 3 завершена. Средний loss: 0.0318, F1-score на обучающей выборке: 0.9411

Обучение завершено!





мой прекрасный торч почему-то не видел gpu и обучал на cpu больше суток...

In [138]:
output_dir = './toxic_comment_model'  # Укажите путь, куда вы хотите сохранить модель
output_tokenizer_dir = './toxic_comment_tokenizer' # Укажите путь для токенизатора

# Сохранение модели
model.save_pretrained(output_dir)
# Сохранение токенизатора
tokenizer.save_pretrained(output_tokenizer_dir)

print(f"Модель сохранена в: {output_dir}")
print(f"Токенизатор сохранен в: {output_tokenizer_dir}")

Модель сохранена в: ./toxic_comment_model
Токенизатор сохранен в: ./toxic_comment_tokenizer


In [68]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [70]:
device

device(type='cuda')

In [58]:
model_path = './toxic_comment_model'
tokenizer_path = './toxic_comment_tokenizer'

try:
    # Загрузка предварительно обученной модели для классификации последовательностей
    model = BertForSequenceClassification.from_pretrained(model_path)
    print(f"Модель успешно загружена из: {model_path}")

    # Загрузка предварительно обученного токенизатора
    tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
    print(f"Токенизатор успешно загружен из: {tokenizer_path}")


except Exception as e:
    print(f"Произошла ошибка при загрузке модели или токенизатора: {e}")

Модель успешно загружена из: ./toxic_comment_model
Токенизатор успешно загружен из: ./toxic_comment_tokenizer


In [78]:
model.eval()
predictions = []
actual_labels = []
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)
        predictions.extend(preds.cpu().tolist())
        actual_labels.extend(labels.cpu().tolist())

In [82]:
print(classification_report(actual_labels, predictions))
print(f"f1: {f1_score(actual_labels, predictions)}")

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     28658
           1       0.96      0.93      0.94      3201

    accuracy                           0.99     31859
   macro avg       0.98      0.96      0.97     31859
weighted avg       0.99      0.99      0.99     31859

f1: 0.9444532866465064


Результаты превзошли все ожидания, слава трансформерам!