In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader,TensorDataset, random_split
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score
import seaborn as sns
import random
import torchvision
import zipfile
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW


In [2]:
def train(model, optimizer, train_loader, device, criterion, n_epochs):
    """Run a supervised training loop and collect the mean loss of every epoch.

    Every batch from ``train_loader`` is unpacked as
    ``(input_ids, attention_mask, labels)``. The model is called as
    ``model(input_ids, attention_mask=attention_mask)`` and the loss is
    computed from the ``logits`` attribute of the returned object.

    Args:
        model: Module to optimise; it is moved to ``device`` and put in
            training mode.
        optimizer: Optimizer whose ``zero_grad``/``step`` run once per batch.
        train_loader: Iterable of 3-tensor batches. Its length and its
            ``dataset`` attribute are used for the progress messages.
        device: Device the model and every batch are moved to.
        criterion: Callable ``criterion(logits, labels)`` returning a scalar
            loss tensor.
        n_epochs: Number of passes over ``train_loader``.

    Returns:
        list[float]: Average batch loss of each epoch, in epoch order.
    """
    model.to(device)
    model.train()

    epoch_means = []

    for epoch in range(1, n_epochs + 1):
        batch_losses = []  # starts empty for every epoch

        for step, batch in enumerate(train_loader):
            # Move the three batch tensors to the target device.
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

            # Standard step: clear gradients, forward, loss, backward, update.
            optimizer.zero_grad()
            logits = model(input_ids, attention_mask=attention_mask).logits
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            loss_value = loss.item()
            batch_losses.append(loss_value)

            # Progress report on every 100th batch (including the first one).
            if step % 100 == 0:
                seen = step * len(input_ids)
                total = len(train_loader.dataset)
                progress = 100. * step / len(train_loader)
                print(f'Train Epoch: {epoch} [{seen}/{total} '
                      f'({progress:.2f}%)]\tLoss: {loss_value:.6f}')

        # Epoch summary: mean of the per-batch losses.
        mean_loss = sum(batch_losses) / len(batch_losses)
        print(f'Train Epoch: {epoch}\tAverage Loss: {mean_loss:.6f}')
        epoch_means.append(mean_loss)

    return epoch_means


In [3]:
def train_and_evaluate_model(model, optimizer, train_loader, test_loader, device, criterion, n_epochs):
    """Train ``model``, plot its loss curve, then evaluate it on ``test_loader``.

    Training is delegated to ``train``. Evaluation runs the model in eval mode
    without gradient tracking, takes the arg-max over the output logits as the
    predicted class, and shows a confusion matrix plus the overall accuracy.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes a ``logits`` attribute.
        optimizer: Optimizer passed through to ``train``.
        train_loader: Loader of ``(input_ids, attention_mask, labels)`` batches
            used for training.
        test_loader: Loader of ``(input_ids, attention_mask, labels)`` batches
            used for evaluation.
        device: Device on which training and inference run.
        criterion: Loss function passed through to ``train``.
        n_epochs: Number of training epochs.

    Returns:
        None. Results are shown as two figures and one printed accuracy line.
    """
    # Train the model and collect the mean loss of every epoch.
    losses_per_epoch = train(model, optimizer, train_loader, device, criterion, n_epochs)

    # Plot the training loss per epoch.
    plt.figure(figsize=(10, 5))
    plt.plot(range(1, n_epochs + 1), losses_per_epoch)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training Loss')
    plt.show()

    # Evaluate on the held-out set.
    model.eval()
    predicted_labels = []
    true_labels = []

    with torch.no_grad():
        # Batches have the same 3-tensor layout as the training batches.
        for input_ids, attention_mask, labels in test_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            # The model returns an output object; class scores live in `.logits`.
            predicted = outputs.logits.argmax(dim=1)
            predicted_labels.extend(predicted.cpu().tolist())
            true_labels.extend(labels.tolist())

    # Confusion matrix; axis ticks are the class ids that actually occur so
    # they always line up with the rows/columns of the matrix.
    class_ids = sorted(set(true_labels) | set(predicted_labels))
    cm = confusion_matrix(true_labels, predicted_labels, labels=class_ids)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_ids, yticklabels=class_ids)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()

    # Overall accuracy on the test set.
    accuracy = accuracy_score(true_labels, predicted_labels)
    print(f'Accuracy on the test set: {accuracy * 100:.2f}%')

In [4]:
# Load the review dataset and cast the review text column to str in one
# expression, so every entry later handed to the tokenizer is a string.
data = pd.read_csv("/content/drive/MyDrive/datasets/balanced_df.csv").astype({'Review Text': str})

In [5]:
# Class-balance check: number of reviews for each rating value.
data.loc[:, 'Rating'].value_counts()

1    1902
2    1902
3    1902
4    1902
5    1902
Name: Rating, dtype: int64

In [6]:
# Hold out 20% of the reviews for evaluation; the fixed random_state keeps the
# split identical between runs.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Maximum number of tokens per review. Calls to the tokenizer that use
# padding='max_length' and truncation=True pad/truncate to this length.
max_length = 512
# `model_max_length` is the tokenizer argument that sets this limit; a plain
# `max_length` keyword given to from_pretrained does not configure it, so the
# constant above would otherwise have no effect if it were changed.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', model_max_length=max_length)

# Applying the tokenizer to the 'Review Text' column
def preprocess_data(reviews, ratings, min_rating=1):
    """Tokenize review texts and pair them with zero-based class labels.

    Args:
        reviews: Sequence of review strings exposing ``tolist()`` (e.g. a
            pandas Series).
        ratings: Sequence of integer ratings aligned with ``reviews`` and
            exposing ``tolist()``.
        min_rating: Smallest value on the rating scale. It is subtracted from
            every rating so labels start at 0, which is what
            ``nn.CrossEntropyLoss`` and an ``num_labels``-way classification
            head require (class indices in ``[0, num_labels - 1]``).

    Returns:
        TensorDataset whose items are ``(input_ids, attention_mask, label)``.

    Raises:
        ValueError: If any rating is smaller than ``min_rating``.
    """
    # One batched tokenizer call instead of one call per row followed by
    # torch.cat; the result is the same (n_reviews, seq_len) tensors.
    encoded = tokenizer(reviews.tolist(), padding='max_length', truncation=True, return_tensors='pt')

    # Shift ratings (e.g. 1..5) to class indices (0..4).
    labels = torch.tensor(ratings.tolist(), dtype=torch.long) - min_rating
    if labels.numel() > 0 and labels.min().item() < 0:
        raise ValueError(f'Found a rating below min_rating={min_rating}')

    return TensorDataset(encoded['input_ids'], encoded['attention_mask'], labels)

# Tokenize both splits with the same preprocessing routine.
train_dataset, test_dataset = (
    preprocess_data(split['Review Text'], split['Rating'])
    for split in (train_data, test_data)
)

# Training batches are shuffled; evaluation batches keep the dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

In [None]:
# Seed torch so the newly initialised classification head and the shuffling of
# training batches are repeatable between runs.
torch.manual_seed(42)

# Pre-trained BERT encoder with a 5-way classification head (one class per rating).
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)
# Swap in your own optimizer, loss function and number of epochs as needed.
# torch.optim.AdamW is used instead of the deprecated transformers.AdamW;
# eps and weight_decay are set explicitly to keep that implementation's defaults.
optimizer = optim.AdamW(bert_model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
criterion = nn.CrossEntropyLoss()
n_epochs = 3
device = 'cuda' if torch.cuda.is_available() else 'cpu'

train_and_evaluate_model(bert_model, optimizer, train_loader, test_loader, device, criterion, n_epochs)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
