In [None]:
import torch
import torch.nn as nn
from torch.nn import Module

import numpy as np
import matplotlib.pyplot as plt
from torchtext.datasets import IMDB

In [None]:
import random
# Single source of truth for every random number generator used in the notebook.
SEED = 0

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Seed every visible GPU, not only the current one (silently ignored without CUDA).
torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic kernels and keep its auto-tuner off, since the
# auto-tuner may pick different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-an

In [None]:
from datasets import load_dataset
# Download (or reuse the cached copy of) the IMDB review corpus with all its splits.
data = load_dataset(
    path="imdb",
    trust_remote_code=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Задача бинарной классификации

In [None]:
# Index the row first and the column second: this reads a single example instead
# of materialising the whole 'text' column just to display one review.
data['train'][1]['text']

'"I Am Curious: Yellow" is a risible and pretentious steaming pile. It doesn\'t matter what one\'s political views are because this film can hardly be taken seriously on any level. As for the claim that frontal male nudity is an automatic NC-17, that isn\'t true. I\'ve seen R-rated films with male nudity. Granted, they only offer some fleeting views, but where are the R-rated films with gaping vulvas and flapping labia? Nowhere, because they don\'t exist. The same goes for those crappy cable shows: schlongs swinging in the breeze but not a clitoris in sight. And those pretentious indie movies like The Brown Bunny, in which we\'re treated to the site of Vincent Gallo\'s throbbing johnson, but not a trace of pink visible on Chloe Sevigny. Before crying (or implying) "double-standard" in matters of nudity, the mentally obtuse should take into account one unavoidably obvious anatomical difference between men and women: there are no genitals on display when actresses appears nude, and the s

In [None]:
# Count how many training examples carry each distinct label value.
unique_labels, counts = np.unique(data['train']['label'], return_counts=True)
for position, label in enumerate(unique_labels):
    count = counts[position]
    print(f"Значение {label}: {count}")

Значение 0: 12500
Значение 1: 12500


## Обработка текста

In [None]:
from transformers import BertTokenizer
from collections import Counter
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

In [None]:
import re
def remove_html_tags(text):
    """Return ``str(text)`` with every ``<...>`` span on a single line deleted.

    The match is non-greedy, so each tag is removed separately and nothing is
    inserted in its place.
    """
    tag_pattern = re.compile('<.*?>')
    as_string = str(text)
    return tag_pattern.sub('', as_string)

Векторизация

In [None]:
def vectorize_sequences(sequences, dimension=10000):
    """Multi-hot encode index sequences.

    Args:
        sequences: sized collection whose items are collections of integer
            column indices, each smaller than ``dimension``.
        dimension: width of every encoded row.

    Returns:
        Float array of shape ``(len(sequences), dimension)`` in which row ``i``
        holds 1 at every index listed in ``sequences[i]`` and 0 elsewhere.
    """
    n_rows = len(sequences)
    encoded = np.zeros((n_rows, dimension))
    row = 0
    for indices in sequences:
        # Fancy indexing sets all listed columns of this row at once.
        encoded[row, indices] = 1
        row += 1
    return encoded

# NOTE(review): `vectorize_sequences` uses every element of its argument as a
# NumPy column index, but the elements passed here appear to be raw review
# strings rather than lists of integer token ids, so these two calls look like
# they would raise an IndexError. TODO confirm, and encode the texts to integer
# ids (each below `dimension`) before vectorizing.
x_train = vectorize_sequences(data['train']['text'])
x_test = vectorize_sequences(data['test']['text'])


# nn.Embedding can be used instead as a ready-made solution

Токенизация с помощью BERT

In [None]:
class TextDataset(Dataset):
    """Map-style dataset of fixed-length token-id tensors with their labels.

    Every text is tokenized once, eagerly, in the constructor, so item access
    is a plain list lookup.

    Args:
        texts: sized, indexable collection of raw strings.
        labels: indexable collection of labels aligned with ``texts``.
        tokenizer: object exposing ``encode_plus`` with the Hugging Face
            tokenizer keyword arguments used in ``tokenize``.
        max_len: number of token ids every text is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=100):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.tokenized_texts = [self.tokenize(text) for text in texts]

    def tokenize(self, text):
        """Encode one text into a 1-D tensor of ``max_len`` token ids."""
        tokens = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # Drop only the leading batch axis. A bare squeeze() would also remove
        # the sequence axis when max_len == 1 and yield a 0-d tensor.
        return tokens['input_ids'].squeeze(0)

    def __len__(self):
        """Number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(input_ids, label)`` for example ``idx``, both as tensors."""
        input_ids = self.tokenized_texts[idx]
        return input_ids, torch.tensor(self.labels[idx])



# Raw review texts and their labels from the training split.
train_split = data['train']
texts = train_split['text']
labels = train_split['label']

# Hold out 20% of the training split for validation (fixed seed for a repeatable split).
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Tokenizer initialisation
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Datasets (texts are tokenized on construction) and their DataLoaders
train_dataset, val_dataset = (
    TextDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in ((train_texts, train_labels), (val_texts, val_labels))
)

# Only the training batches are shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=256, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=256)



In [None]:
# Number of batches per epoch. len() on the loader gives the same count without
# iterating over, and collating, the whole training set.
len(train_loader)

79

## Создание модели

In [None]:
class ClassifierNet(torch.nn.Module):
    """Three-layer fully connected binary classifier with a sigmoid output.

    Layout: Linear -> ReLU -> BatchNorm1d -> Linear -> ReLU -> BatchNorm1d ->
    Linear -> Sigmoid. The second hidden layer is a quarter of the width of
    the first.

    Args:
        n_hidden_neurons: width of the first hidden layer.
        p: dropout probability; accepted for backward compatibility but unused,
            because the network currently contains no dropout layers.
        n_input_features: number of features per input row (previously
            hard-coded to 1200, which remains the default).
    """

    def __init__(self, n_hidden_neurons, p=0.5, n_input_features=1200):
        super().__init__()
        n_hidden_small = n_hidden_neurons // 4

        self.fc1 = torch.nn.Linear(n_input_features, n_hidden_neurons)
        self.ac1 = torch.nn.ReLU()
        self.batch_norm1 = torch.nn.BatchNorm1d(n_hidden_neurons)

        self.fc2 = torch.nn.Linear(n_hidden_neurons, n_hidden_small)
        self.ac2 = torch.nn.ReLU()
        self.batch_norm2 = torch.nn.BatchNorm1d(n_hidden_small)

        self.fc3 = torch.nn.Linear(n_hidden_small, 1)
        self.ac3 = torch.nn.Sigmoid()

    def forward(self, x):
        """Map a ``(batch, n_input_features)`` float tensor to ``(batch, 1)`` values in [0, 1]."""
        x = self.batch_norm1(self.ac1(self.fc1(x)))
        x = self.batch_norm2(self.ac2(self.fc2(x)))
        return self.ac3(self.fc3(x))

# Dense classifier with 1200 hidden units, moved to the GPU when one is available.
net = ClassifierNet(1200)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
net = net.to(device)


# ClassifierNet ends in a Sigmoid over a single unit, so it emits a probability
# per example. BCELoss is the matching criterion; CrossEntropyLoss expects one
# raw score per class and cannot work with a single output column.
loss = torch.nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=1.0e-3)

In [None]:
class LSTM_Model(nn.Module):
    """Embedding -> LSTM -> linear head over the last real token of each sequence.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_size: embedding dimension.
        hidden_size: LSTM hidden state size.
        output_size: number of values produced per sequence.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, output_size, num_layers=1):
        super().__init__()
        # Token id 0 is treated as padding: its embedding is a fixed zero vector.
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a ``(batch, seq_len)`` tensor of token ids to ``(batch, output_size)``.

        The head reads the LSTM output at the last non-padding position of each
        row instead of at the final time step, so trailing padding does not
        change the result. Rows consisting only of padding fall back to
        position 0.
        """
        embedded = self.embedding(x)
        out, _ = self.lstm(embedded)

        positions = torch.arange(x.size(1), device=x.device)
        is_real = (x != self.embedding.padding_idx).long()
        # Largest position index that holds a real token, per row.
        last_real = (is_real * positions).max(dim=1).values
        rows = torch.arange(x.size(0), device=x.device)

        out = out[rows, last_real]
        return self.fc(out)


# Hyperparameters for the recurrent model.
vocab_size = len(tokenizer)  # size of the embedding table
embed_size, hidden_size = 100, 128
output_size = 1  # one output value, because the task is binary classification
num_layers = 1  # number of stacked LSTM layers

model_1 = LSTM_Model(
    vocab_size=vocab_size,
    embed_size=embed_size,
    hidden_size=hidden_size,
    output_size=output_size,
    num_layers=num_layers,
)


class RNN_Model(nn.Module):
    """Embedding -> vanilla RNN -> linear head over the last real token of each sequence.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_size: embedding dimension.
        hidden_size: RNN hidden state size.
        output_size: number of values produced per sequence.
        num_layers: number of stacked RNN layers.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, output_size, num_layers=1):
        super().__init__()
        # Token id 0 is treated as padding: its embedding is a fixed zero vector.
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.rnn = nn.RNN(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a ``(batch, seq_len)`` tensor of token ids to ``(batch, output_size)``.

        The head reads the RNN output at the last non-padding position of each
        row instead of at the final time step, so trailing padding does not
        change the result. Rows consisting only of padding fall back to
        position 0.
        """
        embedded = self.embedding(x)
        out, _ = self.rnn(embedded)

        positions = torch.arange(x.size(1), device=x.device)
        is_real = (x != self.embedding.padding_idx).long()
        # Largest position index that holds a real token, per row.
        last_real = (is_real * positions).max(dim=1).values
        rows = torch.arange(x.size(0), device=x.device)

        out = out[rows, last_real]
        return self.fc(out)


# Vanilla-RNN counterpart built with the same hyperparameters.
model_2 = RNN_Model(
    vocab_size=vocab_size,
    embed_size=embed_size,
    hidden_size=hidden_size,
    output_size=output_size,
    num_layers=num_layers,
)

## Обучение модели

In [None]:
from torch.optim import Adam, RMSprop
from sklearn.metrics import f1_score

def _run_epoch(model, loader, loss_function, device, optimizer=None):
    """Make one pass over ``loader`` and return ``(mean_loss, macro_f1)``.

    Weights are updated only when ``optimizer`` is given; otherwise the model is
    switched to eval mode and gradient tracking is disabled.
    """
    is_training = optimizer is not None
    model.train(is_training)

    total_loss = 0.0
    n_examples = 0
    y_true = []
    y_pred = []

    with torch.set_grad_enabled(is_training):
        for input_ids, labels in loader:
            input_ids = input_ids.to(device)
            # BCEWithLogitsLoss needs float targets shaped like the (batch, 1) output.
            targets = labels.to(device).float().unsqueeze(1)

            if is_training:
                optimizer.zero_grad()
            logits = model(input_ids)
            loss = loss_function(logits, targets)
            if is_training:
                loss.backward()
                optimizer.step()

            # loss.item() is the batch mean; weight it by the batch size so the
            # epoch figure is a true per-example average even if the last batch
            # is smaller.
            batch_size = targets.size(0)
            total_loss += loss.item() * batch_size
            n_examples += batch_size

            # The model emits raw logits: logit > 0 is the same as sigmoid(logit) > 0.5.
            predicted = (logits.detach() > 0).long()
            y_true.extend(targets.view(-1).long().tolist())
            y_pred.extend(predicted.view(-1).tolist())

    mean_loss = total_loss / max(n_examples, 1)
    return mean_loss, f1_score(y_true, y_pred, average='macro')


def train_model(model, train_loader, val_loader, num_epochs=10, learning_rate=0.001):
    """Train a single-logit binary classifier and print metrics after every epoch.

    Uses ``BCEWithLogitsLoss`` with ``RMSprop``. Batches are moved to the device
    the model's parameters live on. For each epoch the per-example mean loss and
    the macro F1 score are printed for both the training and validation data.

    Args:
        model: module mapping a batch of inputs to ``(batch, 1)`` raw logits.
        train_loader: iterable of ``(input_ids, labels)`` training batches.
        val_loader: iterable of ``(input_ids, labels)`` validation batches.
        num_epochs: number of passes over ``train_loader``.
        learning_rate: RMSprop learning rate.

    Returns:
        None. Results are reported through ``print`` only.
    """
    Loss_function = nn.BCEWithLogitsLoss()
    optimizer = RMSprop(model.parameters(), lr=learning_rate)
    device = next(model.parameters()).device

    for epoch in range(num_epochs):
        epoch_loss, f1_macro = _run_epoch(model, train_loader, Loss_function, device, optimizer)
        val_loss, f1_macro_val = _run_epoch(model, val_loader, Loss_function, device)

        print(f'Эпоха {epoch+1}/{num_epochs}, Training Loss: {epoch_loss}, Training f1: {f1_macro}, Validation Loss: {val_loss}, Validation f1: {f1_macro_val}')

# Train the RNN model
train_model(
    model=model_2,
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=10,
    learning_rate=0.001,
)

Эпоха 1/10, Training Loss: 0.002800439453125, Training f1: 0.11654957424395762, Validation Loss: 0.002793583297729492, Validation f1: 0.33511191784672734
Эпоха 2/10, Training Loss: 0.0027264067471027375, Training f1: 0.2362839128269731, Validation Loss: 0.0028086751818656923, Validation f1: 0.3346640053226879
Эпоха 3/10, Training Loss: 0.00272680869102478, Training f1: 0.22125490882271068, Validation Loss: 0.0027661600470542907, Validation f1: 0.22141382657168243
Эпоха 4/10, Training Loss: 0.002673647212982178, Training f1: 0.2585416379450412, Validation Loss: 0.0027522008180618286, Validation f1: 0.262777413845344
Эпоха 5/10, Training Loss: 0.0026157664865255354, Training f1: 0.18394372007774645, Validation Loss: 0.002751212751865387, Validation f1: 0.219496870335439
Эпоха 6/10, Training Loss: 0.002585976234078407, Training f1: 0.1926010699989897, Validation Loss: 0.00278285117149353, Validation f1: 0.1938267302870213
Эпоха 7/10, Training Loss: 0.002598245760798454, Training f1: 0.179