In [1]:
# Install the notebook's third-party dependencies into the kernel's environment.
# NOTE(review): no versions are pinned, and matplotlib, seaborn and nltk are not
# imported by any of the cells shown - consider pinning and trimming this list.
%pip install transformers scikit-learn torch pandas matplotlib seaborn nltk

Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
import pandas as pd

# Loader for an IMDB-style sentiment corpus stored as one text file per review
def load_imdb_data(directory):
    """Read labelled reviews from ``directory`` into a DataFrame.

    ``directory`` must contain a ``pos`` and a ``neg`` sub-folder. Every regular
    file inside them is read as one UTF-8 encoded review.

    Args:
        directory: Path of a dataset split folder holding ``pos`` and ``neg``.

    Returns:
        pandas.DataFrame with two columns: ``text`` (file contents) and
        ``label`` (1 for ``pos``, 0 for ``neg``). Rows come ``pos`` first, then
        ``neg``, each group sorted by file name.

    Raises:
        FileNotFoundError: If the ``pos`` or ``neg`` sub-folder does not exist.
    """
    data = []
    labels = []
    for label in ('pos', 'neg'):  # positive and negative reviews
        folder = os.path.join(directory, label)
        if not os.path.isdir(folder):
            raise FileNotFoundError(f"Expected review folder not found: {folder}")
        target = 1 if label == 'pos' else 0  # 1 - positive, 0 - negative
        # os.listdir() returns entries in arbitrary order; sort them so the
        # resulting DataFrame is identical from run to run and across machines.
        for file in sorted(os.listdir(folder)):
            file_path = os.path.join(folder, file)
            if not os.path.isfile(file_path):
                continue  # ignore stray sub-directories instead of crashing in open()
            with open(file_path, 'r', encoding='utf-8') as f:
                data.append(f.read())
            labels.append(target)
    return pd.DataFrame({'text': data, 'label': labels})

# Dataset location. The default is the original local copy of aclImdb; set the
# IMDB_DATA_DIR environment variable to point the notebook at another copy
# without editing this cell.
imdb_root = os.environ.get(
    "IMDB_DATA_DIR",
    r"C:\Users\Asus\OneDrive - International Information Technology University\Рабочий стол\GAMES\Emo\data\aclImdb",
)
train_data_path = os.path.join(imdb_root, "train")
test_data_path = os.path.join(imdb_root, "test")

# Read both splits into DataFrames
train_df = load_imdb_data(train_data_path)
test_df = load_imdb_data(test_data_path)

# Quick look at what was loaded
print("Train dataset shape:", train_df.shape)
print("Test dataset shape:", test_df.shape)
print(train_df.head())


Train dataset shape: (25000, 2)
Test dataset shape: (25000, 2)
                                                text  label
0  Bromwell High is a cartoon comedy. It ran at t...      1
1  Homelessness (or Houselessness as George Carli...      1
2  Brilliant over-acting by Lesley Ann Warren. Be...      1
3  This is easily the most underrated film inn th...      1
4  This is not the typical Mel Brooks film. It wa...      1


In [10]:
from transformers import DistilBertTokenizer
from torch.utils.data import Dataset, DataLoader
import torch

# Initialise the tokenizer from the "distilbert-base-uncased" checkpoint
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Text tokenization
def tokenize_data(df, tokenizer, max_length=128):
    """Tokenize the ``text`` column of ``df`` and tensorize its ``label`` column.

    Args:
        df: DataFrame providing a ``text`` column and a ``label`` column.
        tokenizer: Callable invoked with the list of texts plus the keyword
            arguments ``truncation=True``, ``padding=True``, ``max_length`` and
            ``return_tensors="pt"``.
        max_length: Value forwarded to the tokenizer as ``max_length``.

    Returns:
        Tuple ``(encodings, labels)``: whatever the tokenizer returned, and a
        ``torch.Tensor`` built from the values of the ``label`` column.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    texts = list(df["text"])
    encodings = tokenizer(texts, **tokenizer_options)
    label_tensor = torch.tensor(df["label"].values)
    return encodings, label_tensor

# Tokenize both splits with the default max_length of tokenize_data
train_encodings, train_labels = tokenize_data(train_df, tokenizer)
test_encodings, test_labels = tokenize_data(test_df, tokenizer)

# PyTorch Dataset
class IMDbDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to an indexable collection and
    ``labels`` is an indexable collection; its length is the dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The dataset has exactly one sample per label
        return len(self.labels)

    def __getitem__(self, idx):
        # Take position ``idx`` from every encoding field, then attach the label
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample["labels"] = self.labels[idx]
        return sample

# Wrap the tokenized train and test splits in IMDbDataset objects
train_dataset = IMDbDataset(train_encodings, train_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)

In [24]:
import sys
# Print the version string of the Python interpreter running this kernel
print(sys.version)

3.10.6 (tags/v3.10.6:9c7b4bd, Aug  1 2022, 21:53:49) [MSC v.1932 64 bit (AMD64)]


In [30]:
# Remove the installed torch, torchvision and torchaudio packages
# (-y answers the confirmation prompt automatically).
%pip uninstall torch torchvision torchaudio -y

Found existing installation: torch 2.0.1
Uninstalling torch-2.0.1:
  Successfully uninstalled torch-2.0.1
Found existing installation: torchvision 0.15.2+cu118
Uninstalling torchvision-0.15.2+cu118:
  Successfully uninstalled torchvision-0.15.2+cu118
Found existing installation: torchaudio 2.5.1
Uninstalling torchaudio-2.5.1:
  Successfully uninstalled torchaudio-2.5.1
Note: you may need to restart the kernel to use updated packages.


You can safely remove it manually.


In [31]:
%pip install "C:\Users\Asus\OneDrive - International Information Technology University\Рабочий стол\GAMES\Emo\torch-2.5.1+cu118-cp310-cp310-win_amd64.whl"
%pip install "C:\Users\Asus\OneDrive - International Information Technology University\Рабочий стол\GAMES\Emo\torchvision-0.15.2+cu118-cp310-cp310-win_amd64.whl"

Processing c:\users\asus\onedrive - international information technology university\рабочий стол\games\emo\torch-2.5.1+cu118-cp310-cp310-win_amd64.whl
torch is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.
Note: you may need to restart the kernel to use updated packages.
Processing c:\users\asus\onedrive - international information technology university\рабочий стол\games\emo\torchvision-0.15.2+cu118-cp310-cp310-win_amd64.whlNote: you may need to restart the kernel to use updated packages.

Collecting torch==2.0.1 (from torchvision==0.15.2+cu118)
  Using cached torch-2.0.1-cp310-cp310-win_amd64.whl.metadata (23 kB)
Using cached torch-2.0.1-cp310-cp310-win_amd64.whl (172.3 MB)
Installing collected packages: torch, torchvision
  Attempting uninstall: torch
    Found existing installation: torch 2.5.1+cu118
    Uninstalling torch-2.5.1+cu118:
      Successfully uninstalled torch-2.5.1+cu118
Successfully installe

In [11]:
from transformers import DistilBertForSequenceClassification
from torch.optim import AdamW
from torch.cuda.amp import GradScaler, autocast

# Device: use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Используемое устройство:", device)

# DistilBERT model with a two-label sequence-classification head
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
model.gradient_checkpointing_enable()  # gradient checkpointing
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)
# Gradient scaling is only meaningful for CUDA mixed precision. Enable it only
# on a CUDA device so that on CPU the scaler is an explicit no-op
# (scaler.step() then just calls optimizer.step()).
scaler = GradScaler(enabled=(device.type == "cuda"))

Используемое устройство: cpu


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()


In [39]:
# Training
epochs = 3

# Build the loader in this cell so it runs on a fresh kernel (no other cell
# defines train_loader). The batch size is a tunable default.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Mixed precision is only used on CUDA. On CPU autocast is disabled, and
# bfloat16 is passed because float16 is not accepted for CPU autocast by every
# torch version.
use_amp = device.type == "cuda"
amp_dtype = torch.float16 if use_amp else torch.bfloat16

for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    for batch in train_loader:
        optimizer.zero_grad()
        inputs = {key: val.to(device) for key, val in batch.items() if key != "labels"}
        labels = batch["labels"].to(device)

        # Forward pass under autocast; the model computes the loss itself
        # because labels are passed in.
        with torch.autocast(device_type=device.type, dtype=amp_dtype, enabled=use_amp):
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss

        # The scaler scales the loss for backward and unscales before the
        # optimizer step; it passes straight through when it is disabled.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1} - Loss: {total_loss / len(train_loader):.4f}")

  with autocast(dtype=torch.float16):  # Убираем device_type


KeyboardInterrupt: 

In [None]:
# Model evaluation
# classification_report is called below but was never imported in the notebook.
from sklearn.metrics import classification_report

# Build the loader in this cell so it runs on a fresh kernel (no other cell
# defines test_loader). No shuffling is needed for evaluation.
test_loader = DataLoader(test_dataset, batch_size=32)

model.eval()
predictions, true_labels = [], []

# Gradients are not needed for inference
with torch.no_grad():
    for batch in test_loader:
        inputs = {key: val.to(device) for key, val in batch.items() if key != "labels"}
        logits = model(**inputs).logits
        # Predicted class = index of the largest logit for each sample
        predictions.extend(torch.argmax(logits, dim=1).cpu().tolist())
        # Labels are only compared on the host, so they never go to the device
        true_labels.extend(batch["labels"].tolist())

# Classification report
print("Classification Report:")
print(classification_report(true_labels, predictions, target_names=["Negative", "Positive"]))