### Гиперпараметры
parameters.py

In [19]:
# Default input-feature size and hidden-state size for the RNN model.
E_DIM = 10
H_DIM = 20

# Keyword arguments unpacked into the training DataLoader.
TRAINLOADER_PARAMS = dict(batch_size=4, num_workers=2, shuffle=True)

# Keyword arguments unpacked into the test DataLoader (no shuffling).
TESTLOADER_PARAMS = dict(batch_size=4, num_workers=2, shuffle=False)

# Default number of training epochs.
EPOCHES = 4

# Location of the CSV file with the dataset.
PATH_TO_DATASET = "dataset/IMDB Dataset.csv"

### Модель
model.py

In [9]:
import torch
import torch.nn as nn


class RNN(nn.Module):
    """Single-layer ``nn.RNN`` followed by a linear projection.

    Args:
        e_dim: size of each input feature vector.
        h_dim: size of the recurrent hidden state; the linear layer maps
            ``h_dim`` features to ``2 * h_dim``.
    """

    def __init__(self, e_dim=E_DIM, h_dim=H_DIM):
        super().__init__()
        self.rnn = nn.RNN(e_dim, h_dim, batch_first=True)
        self.fc1 = nn.Linear(h_dim, h_dim * 2)

    def forward(self, x):
        """Run the RNN and project its outputs plus the final hidden state.

        Args:
            x: float tensor of shape ``(batch, seq_len, e_dim)`` or, unbatched,
                ``(seq_len, e_dim)``.

        Returns:
            Tensor with ``seq_len + 1`` steps along the time axis (the
            per-step outputs followed by the final hidden state), each
            projected to ``2 * h_dim`` features.
        """
        outputs, hidden = self.rnn(x)
        if outputs.dim() == 3:
            # ``batch_first=True`` only affects the input/output tensors; the
            # hidden state is still (num_layers, batch, h_dim). Move it to
            # (batch, num_layers, h_dim) so it lines up with ``outputs`` for
            # any batch size, not just batch == 1.
            hidden = hidden.transpose(0, 1)
        # dim=-2 is the time axis for both batched and unbatched inputs.
        combined = torch.cat((outputs, hidden), dim=-2)
        return self.fc1(combined)


### Интерфейс для работы с датасетом
data.py

#### Старый интерфейс

In [10]:
import torch
from torch.utils.data import Dataset
import torchtext.transforms as transforms
import csv
from transformers import BertTokenizer


class IMDB(Dataset):
    """Map-style dataset over the rows of a CSV file.

    Every row is kept as read by ``csv.reader`` and is expected to have
    exactly two columns: the text and its label.

    Args:
        path: path to the CSV file.
    """

    def __init__(self, path):
        super().__init__()
        # newline="" is what the csv module requires to parse line breaks
        # inside quoted fields correctly; the explicit encoding keeps decoding
        # independent of the platform locale.
        with open(path, newline="", encoding="utf-8") as file:
            self.data = list(csv.reader(file, delimiter=','))

    def __len__(self):
        """Return the number of rows read from the file."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(tensor, label)`` for the row at ``index``.

        The text column is encoded with :meth:`text_to_tensor`; the label is
        returned unchanged, as the string read from the file.

        Raises:
            ValueError: if the row does not have exactly two columns.
        """
        item, label = self.data[index]
        return self.text_to_tensor(item), label

    @staticmethod
    def text_to_tensor(text):
        """Encode ``text`` as a 1-D ``torch.long`` tensor of character indices.

        The index of a character is its rank among the distinct characters of
        ``text`` in sorted order, so the encoding is reproducible across runs
        (plain ``set`` iteration order depends on string hashing).
        The vocabulary is still built per text: the same character may get a
        different index in a different text.
        """
        char_to_index = {char: idx for idx, char in enumerate(sorted(set(text)))}
        text_indices = [char_to_index[char] for char in text]
        return torch.tensor(text_indices, dtype=torch.long)


#### Переделанный интерфейс

In [25]:
import torch
from torch.utils.data import random_split, DataLoader
import csv


class IMDB_full:
    """Load a CSV dataset, split it into train/test parts and build loaders.

    Args:
        path_to_dataset: path to the CSV file.
        split_size: fraction of the rows that goes into the training subset;
            the remaining rows form the test subset.
    """

    def __init__(self, path_to_dataset, split_size=0.5):
        super().__init__()
        self.full_dataset = self.load_csv(path_to_dataset)
        # Honour the caller's value (it used to be overwritten with 0.5).
        self.split_size = split_size
        self.split_dataset()

    def __len__(self):
        """Return the number of rows in the full dataset."""
        return len(self.full_dataset)

    @staticmethod
    def load_csv(path_to_csv, delimiter=','):
        """Read the whole CSV file and return its rows as a list of lists."""
        # newline="" is what the csv module requires to parse line breaks
        # inside quoted fields correctly; the explicit encoding keeps decoding
        # independent of the platform locale.
        with open(path_to_csv, 'r', newline='', encoding='utf-8') as file:
            return list(csv.reader(file, delimiter=delimiter))

    def split_dataset(self):
        """Randomly split the rows into ``trainset`` and ``testset``.

        The training subset gets ``int(len(self) * split_size)`` rows and the
        test subset gets the rest.
        """
        total = len(self.full_dataset)
        trainset_size = int(total * self.split_size)
        self.trainset, self.testset = random_split(
            self.full_dataset, (trainset_size, total - trainset_size)
        )

    def change_split_size(self, new_split_size):
        """Store a new train fraction and re-draw the random split."""
        self.split_size = new_split_size
        self.split_dataset()

    def get_trainloader(self):
        """Return a ``DataLoader`` over the training subset."""
        return DataLoader(self.trainset, **TRAINLOADER_PARAMS)

    def get_testloader(self):
        """Return a ``DataLoader`` over the test subset."""
        return DataLoader(self.testset, **TESTLOADER_PARAMS)

    @staticmethod
    def text_to_tensor(text):
        """Encode ``text`` as a 1-D ``torch.long`` tensor of character indices.

        The index of a character is its rank among the distinct characters of
        ``text`` in sorted order, so the encoding is reproducible across runs
        (plain ``set`` iteration order depends on string hashing).
        The vocabulary is still built per text: the same character may get a
        different index in a different text.
        """
        char_to_index = {char: idx for idx, char in enumerate(sorted(set(text)))}
        text_indices = [char_to_index[char] for char in text]
        return torch.tensor(text_indices, dtype=torch.long)

### Интерфейс для обучения
main.py

In [16]:
import torch.nn as nn
from torch import optim

In [32]:
class Learner:
    """Minimal training loop around a freshly constructed model.

    Args:
        model: zero-argument callable (e.g. an ``nn.Module`` subclass) that
            returns the network to train.
        trainset: iterable yielding ``(inputs, labels)`` batches.
        testset: iterable with evaluation batches; stored but not used by
            this class.
        epoches: number of passes over ``trainset``.
    """

    def __init__(self, model, trainset, testset, epoches=EPOCHES):
        self.net = model()
        self.epoches = epoches
        self.trainset, self.testset = trainset, testset
        # Filled in by set_loss(); kept as None until then so that
        # start_learning() can detect a missing configuration.
        self.criterion = None
        self.optimizer = None

    def set_loss(self, lr=0.01):
        """Create the cross-entropy criterion and an Adam optimizer.

        Args:
            lr: learning rate passed to Adam.
        """
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = optim.Adam(self.net.parameters(), lr=lr)

    def start_learning(self):
        """Train ``self.net`` for ``self.epoches`` epochs over ``self.trainset``.

        If :meth:`set_loss` has not been called yet, it is called with its
        defaults instead of failing on the missing attributes.
        The mean loss is printed after every 2000 mini-batches.
        """
        if self.criterion is None or self.optimizer is None:
            self.set_loss()

        for epoch in range(self.epoches):
            running_loss = 0.0
            for i, (inputs, labels) in enumerate(self.trainset):
                self.optimizer.zero_grad()
                output = self.net(inputs)
                loss = self.criterion(output, labels)
                loss.backward()
                self.optimizer.step()
                running_loss += loss.item()
                if i % 2000 == 1999:  # report every 2000 mini-batches
                    print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
                    running_loss = 0.0
        
        

### Обучение

In [33]:
# Build the dataset wrapper and one loader per split.
dataset = IMDB_full(PATH_TO_DATASET)
trainset = dataset.get_trainloader()
testset = dataset.get_testloader()

# NOTE(review): this run is recorded as ending with
# "AttributeError: 'tuple' object has no attribute 'dim'". Presumably the
# loaders yield batches of raw CSV strings rather than tensors, so the model
# receives a tuple -- confirm and add text/label encoding before training.
learner = Learner(RNN, trainset, testset)
learner.set_loss()
learner.start_learning()

AttributeError: 'tuple' object has no attribute 'dim'