In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.nn import functional as F
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Векторизация


In [2]:
text_data_frame = pd.read_csv('bib_data_union_v3.csv.zip', compression='zip')
# text_data_frame = pd.read_csv('text_small_df.csv')

In [3]:
text_data_frame.head()

Unnamed: 0,tokenized_record,style_name
0,"[ num ] sp upword , sp caplet . sp caplet . sp...",iaea
1,[ capword sp othword sp capword ( year ) ] sp ...,bestpapers
2,[ capword ( year ) ] sp caplet . sp caplet . s...,bestpapers
3,[ capword sp othword sp capword ( year ) ] sp ...,bestpapers
4,[ capword ( year ) ] sp capword sp caplet . sp...,bestpapers


In [5]:
# text_data_frame = text_data_frame.drop(['Unnamed: 0'], axis=1)

In [6]:
corpus = text_data_frame.tokenized_record
vectorizer = TfidfVectorizer(tokenizer=lambda txt: txt.split(), 
                             ngram_range=(1, 2), 
                             token_pattern=r'\S*')
vectorized_data = vectorizer.fit_transform(corpus).toarray()
print("SUCCESS")



SUCCESS


In [7]:
df = pd.DataFrame(data=vectorized_data, columns=vectorizer.get_feature_names_out())
df['style_name'] = text_data_frame.style_name

In [7]:
# from sklearn.model_selection import train_test_split

# big_df, small_df = train_test_split(df, test_size=0.017, random_state=0)

In [8]:
# df = small_df

In [9]:
len(df)

6007454

In [10]:
df.head()

Unnamed: 0,"""",""" (",""" )",""" ,",""" .",""" :",""" ;",""" ]",""" caplet",""" capword",...,— caplet,— capword,— num,— othword,— smallet,— sp,— upword,— year,— —,style_name
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,iaea
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,bestpapers
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,bestpapers
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,bestpapers
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,bestpapers


In [8]:
from sklearn.preprocessing import LabelEncoder
def encode(df):
    columnsToEncode = list(df[['style_name']])
    le = LabelEncoder()
    for feature in columnsToEncode:
        try:
            df[feature] = le.fit_transform(df[feature])
        except:
            print('Error encoding '+feature)
    return df

In [9]:
df = encode(df)

In [10]:
UNIQUE_TOKENS_N = len(list(df)) - 1
UNIQUE_LABELS_N = len(set(df.style_name))

In [11]:
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix

train_df, test_df = train_test_split(df, test_size=0.3, random_state=0)
train_X, train_y, test_X, test_y = train_df.drop(['style_name'], axis=1), train_df.style_name, \
                                   test_df.drop(['style_name'], axis=1), test_df.style_name
train_X, test_X = csr_matrix(train_X.values), csr_matrix(test_X.values)

In [12]:
train_y = train_y.to_numpy()

In [13]:
test_y = test_y.to_numpy()

In [14]:
from torch.utils.data import Dataset

# заимствованный код из библиотеки dlnlputils
class SparseFeaturesDataset(Dataset):
    def __init__(self, features, targets):
        self.features = features
        self.targets = targets

    def __len__(self):
        return self.features.shape[0]

    def __getitem__(self, idx):
        cur_features = torch.from_numpy(self.features[idx].toarray()[0]).float()
        cur_label = torch.from_numpy(np.asarray(self.targets[idx])).long()
        return cur_features, cur_label

In [15]:
import copy
import datetime
import random
import traceback
from torch.utils.data import DataLoader

# заимствованный код из библиотеки dlnlputils
def copy_data_to_device(data, device):
    if torch.is_tensor(data):
        return data.to(device)
    elif isinstance(data, (list, tuple)):
        return [copy_data_to_device(elem, device) for elem in data]
    raise ValueError('Недопустимый тип данных {}'.format(type(data)))
 
def train_eval_loop(model, train_dataset, val_dataset, criterion,
                    lr=1e-4, epoch_n=10, batch_size=32,
                    device=None, early_stopping_patience=10, l2_reg_alpha=0,
                    max_batches_per_epoch_train=10000,
                    max_batches_per_epoch_val=1000,
                    data_loader_ctor=DataLoader,
                    optimizer_ctor=None,
                    lr_scheduler_ctor=None,
                    shuffle_train=True,
                    dataloader_workers_n=0):
    """
    Цикл для обучения модели. После каждой эпохи качество модели оценивается по отложенной выборке.
    :param model: torch.nn.Module - обучаемая модель
    :param train_dataset: torch.utils.data.Dataset - данные для обучения
    :param val_dataset: torch.utils.data.Dataset - данные для оценки качества
    :param criterion: функция потерь для настройки модели
    :param lr: скорость обучения
    :param epoch_n: максимальное количество эпох
    :param batch_size: количество примеров, обрабатываемых моделью за одну итерацию
    :param device: cuda/cpu - устройство, на котором выполнять вычисления
    :param early_stopping_patience: наибольшее количество эпох, в течение которых допускается
        отсутствие улучшения модели, чтобы обучение продолжалось.
    :param l2_reg_alpha: коэффициент L2-регуляризации
    :param max_batches_per_epoch_train: максимальное количество итераций на одну эпоху обучения
    :param max_batches_per_epoch_val: максимальное количество итераций на одну эпоху валидации
    :param data_loader_ctor: функция для создания объекта, преобразующего датасет в батчи
        (по умолчанию torch.utils.data.DataLoader)
    :return: кортеж из двух элементов:
        - среднее значение функции потерь на валидации на лучшей эпохе
        - лучшая модель
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)
    model.to(device)

    if optimizer_ctor is None:
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=l2_reg_alpha)
    else:
        optimizer = optimizer_ctor(model.parameters(), lr=lr)

    if lr_scheduler_ctor is not None:
        lr_scheduler = lr_scheduler_ctor(optimizer)
    else:
        lr_scheduler = None

    train_dataloader = data_loader_ctor(train_dataset, batch_size=batch_size, shuffle=shuffle_train,
                                        num_workers=dataloader_workers_n)
    val_dataloader = data_loader_ctor(val_dataset, batch_size=batch_size, shuffle=False,
                                      num_workers=dataloader_workers_n)

    best_val_loss = float('inf')
    best_epoch_i = 0
    best_model = copy.deepcopy(model)

    for epoch_i in range(epoch_n):
        try:
            epoch_start = datetime.datetime.now()
            print('Эпоха {}'.format(epoch_i))

            model.train()
            mean_train_loss = 0
            train_batches_n = 0
            for batch_i, (batch_x, batch_y) in enumerate(train_dataloader):
                if batch_i > max_batches_per_epoch_train:
                    break

                batch_x = copy_data_to_device(batch_x, device)
                batch_y = copy_data_to_device(batch_y, device)

                pred = model(batch_x)
                loss = criterion(pred, batch_y)

                model.zero_grad()
                loss.backward()

                optimizer.step()

                mean_train_loss += float(loss)
                train_batches_n += 1

            mean_train_loss /= train_batches_n
            print('Эпоха: {} итераций, {:0.2f} сек'.format(train_batches_n,
                                                           (datetime.datetime.now() - epoch_start).total_seconds()))
            print('Среднее значение функции потерь на обучении', mean_train_loss)



            model.eval()
            mean_val_loss = 0
            val_batches_n = 0

            with torch.no_grad():
                for batch_i, (batch_x, batch_y) in enumerate(val_dataloader):
                    if batch_i > max_batches_per_epoch_val:
                        break

                    batch_x = copy_data_to_device(batch_x, device)
                    batch_y = copy_data_to_device(batch_y, device)

                    pred = model(batch_x)
                    loss = criterion(pred, batch_y)

                    mean_val_loss += float(loss)
                    val_batches_n += 1

            mean_val_loss /= val_batches_n
            print('Среднее значение функции потерь на валидации', mean_val_loss)

            if mean_val_loss < best_val_loss:
                best_epoch_i = epoch_i
                best_val_loss = mean_val_loss
                best_model = copy.deepcopy(model)
                print('Новая лучшая модель!')
            elif epoch_i - best_epoch_i > early_stopping_patience:
                print('Модель не улучшилась за последние {} эпох, прекращаем обучение'.format(
                    early_stopping_patience))
                break

            if lr_scheduler is not None:
                lr_scheduler.step(mean_val_loss)

            print()
        except KeyboardInterrupt:
            print('Досрочно остановлено пользователем')
            break
        except Exception as ex:
            print('Ошибка при обучении: {}\n{}'.format(ex, traceback.format_exc()))
            # break

    return best_val_loss, best_model

In [16]:
# заимствованный код из библиотеки dlnlputils
def predict_with_model(model, dataset, device=None, batch_size=32, num_workers=0, return_labels=False):
    """
    :param model: torch.nn.Module - обученная модель
    :param dataset: torch.utils.data.Dataset - данные для применения модели
    :param device: cuda/cpu - устройство, на котором выполнять вычисления
    :param batch_size: количество примеров, обрабатываемых моделью за одну итерацию
    :return: numpy.array размерности len(dataset) x *
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    results_by_batch = []

    device = torch.device(device)
    model.to(device)
    model.eval()

    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
    labels = []
    with torch.no_grad():
        import tqdm
        for batch_x, batch_y in tqdm.tqdm(dataloader, total=len(dataset)/batch_size):
            try:
                batch_x = copy_data_to_device(batch_x, device)
                if return_labels:
                    labels.append(batch_y.numpy())
                batch_pred = model(batch_x)
                results_by_batch.append(batch_pred.detach().cpu().numpy())
            except:
                print('Ошибка')

    if return_labels:
        return np.concatenate(results_by_batch, 0), np.concatenate(labels, 0)
    else:
        return np.concatenate(results_by_batch, 0)

In [17]:
def train_epoch(net, dataloader, lr=0.01, optimizer=None, loss_fn = torch.nn.NLLLoss(), epoch_size=None, report_freq=200):
    optimizer = optimizer or torch.optim.Adam(net.parameters(), lr=lr)
    net.train() 
    total_loss, acc, count, i = 0, 0, 0, 0
    for idx, (labels, features) in enumerate(dataloader):
        print(idx)
        optimizer.zero_grad()
        out = net(features)
        loss = loss_fn(out,labels) #cross_entropy(out,labels)
        loss.backward()
        optimizer.step()
        total_loss += loss
        _, predicted = torch.max(out, 1)
        acc += (predicted == labels).sum()
        count += len(labels)
        i += 1
        if i % report_freq == 0:
            print(f"{count}: acc={acc.item() / count}")
        if epoch_size and count > epoch_size:
            break
    return total_loss.item() / count, acc.item() / count

In [18]:
def bowify(b):
    return (
            torch.LongTensor([t[0]-1 for t in b]),
            torch.stack([to_bow(t[1]) for t in b])
    )

train_dataset = SparseFeaturesDataset(train_X, train_y)
test_dataset = SparseFeaturesDataset(test_X, test_y)
train_loader = DataLoader(train_dataset, batch_size=16, collate_fn=bowify, shuffle=True,num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=16, collate_fn=bowify, shuffle=True,num_workers=0)

# Model 1 (Linear)


In [21]:
model = nn.Linear(UNIQUE_TOKENS_N, UNIQUE_LABELS_N)

scheduler = lambda optim: \
    torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=5, factor=0.5, verbose=True)

best_val_loss, best_model = train_eval_loop(model=model,
                                            train_dataset=train_dataset,
                                            val_dataset=test_dataset,
                                            criterion=F.cross_entropy,
                                            lr=1e-1,
                                            epoch_n=50,
                                            batch_size=32,
                                            l2_reg_alpha=0,
                                            lr_scheduler_ctor=scheduler)

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
best_val_loss

In [21]:
torch.save(best_model.state_dict(), 'pt_linear_model_1')

# Model 2 (Sequential)

In [16]:
from torchnlp.nn import CNNEncoder

In [22]:
class ConvolutionNeuralNetwork(nn.Module):
    def __init__(self, 
                 embedding_dim, 
                 num_filters, 
                 ngram_filter_sizes=(2, 3, 4), 
                 activation=nn.ReLU(),
                 output_dim=None):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.num_filters = num_filters
        self.ngram_filter_sizes = ngram_filter_sizes
        self.activation = activation
        self.output_dim = output_dim
        self.conv_layers = [
            nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=ngram_size)
            for ngram_size in ngram_filter_sizes
        ]
        for i, conv_layer in enumerate(self.conv_layers):
            self.add_module('conv_layer_%d' % i, conv_layer)
        maxpool_output_dim = num_filters * len(ngram_filter_sizes)
        if output_dim:
            self.projection_layer = nn.Linear(maxpool_output_dim, output_dim)
        else:
            self.projection_layer = None
            self.output_dim = maxpool_output_dim
    
    def forward(self, tokens):
        '''
        tokens: FloatTensor[batch_size, num_tokens, input_dim]
        '''
        tokens = torch.transpose(tokens, 1, 2)
        
        filter_outputs = []
        for i in range(len(self.conv_layers)):
            conv_layer = getattr(self, 'conv_layer_%d' % i)
            filter_outputs.append(self.activation(conv_layer(tokens)).max(dim=2)[0])
        maxpool_output = torch.cat(filter_outputs, dim=1) if len(filter_outputs) > 1 else filter_outputs[0]
        
        if self.projection_layer:
            result = self.projection_layer(maxpool_output)
        else:
            result = maxpool_output
        return result

In [19]:
import torch.optim as optim

class TextCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_filters, filter_sizes, num_classes):
        super(TextCNN, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embed_dim)
        self.conv_layers = nn.ModuleList([
            nn.Conv1d(in_channels=embed_dim, out_channels=num_filters, kernel_size=fs) for fs in filter_sizes
        ])
        self.fc = nn.Linear(num_filters * len(filter_sizes), num_classes)

    def forward(self, x):
        x = self.embeddings(x)  # [batch_size, seq_len, embed_dim]
        x = x.permute(0, 2, 1)  # [batch_size, embed_dim, seq_len]
        x_conv = [nn.functional.relu(conv(x)) for conv in self.conv_layers]  # [(batch_size, num_filters, conv_seq_len), ...]
        x_pool = [nn.functional.max_pool1d(conv, conv.size(2)).squeeze(2) for conv in x_conv]  # [(batch_size, num_filters), ...]
        x_fc = torch.cat(x_pool, dim=1)  # [batch_size, num_filters * len(filter_sizes)]
        x_out = self.fc(x_fc)  # [batch_size, num_classes]
        return x_out

# Define hyperparameters
vocab_size = UNIQUE_TOKENS_N
embed_dim = 100
num_filters = 128
filter_sizes = [3, 4, 5]
num_classes = UNIQUE_LABELS_N
learning_rate = 0.001
batch_size = 32
num_epochs = 10

# Define dataset and dataloader
train_dataset = train_dataset
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataset = test_dataset
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Initialize model and optimizer
model = TextCNN(vocab_size, embed_dim, num_filters, filter_sizes, num_classes)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Define loss function
criterion = nn.CrossEntropyLoss()

# Train the model
for epoch in range(num_epochs):
    train_loss = 0.0
    train_acc = 0.0
    model.train()
    for inputs, labels in train_dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * inputs.size(0)
        _, preds = torch.max(outputs, 1)
        train_acc += torch.sum(preds == labels.data)
    train_loss = train_loss / len(train_dataset)
    train_acc = train_acc / len(train_dataset)

    val_loss = 0.0
    val_acc = 0.0
    model.eval()
    with torch.no_grad():
        for inputs, labels in val_dataloader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * inputs.size(0)
            _, preds = torch.max(outputs, 1)
            val_acc += torch.sum(preds == labels.data)
        val_loss = val_loss / len(val_dataset)
        val_acc = val_acc / len(val_dataset)

    print(f"Epoch {epoch+1}/{num_epochs}: Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Loss: {val_loss}")

RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)

In [23]:
conv_model = ConvolutionNeuralNetwork(features_num=UNIQUE_TOKENS_N, layers_n=5, dropout=0.1)

scheduler = lambda optim: \
    torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=5, factor=0.5, verbose=True)

best_val_loss, best_model = train_eval_loop(
                                           conv_model,
                                           train_dataset,
                                           test_dataset,
                                           F.cross_entropy,
                                           lr=1e-1,
                                           epoch_n=50,
                                           batch_size=32,
                                           l2_reg_alpha=0,
                                           lr_scheduler_ctor=scheduler
                                           )

TypeError: ConvolutionNeuralNetwork.__init__() got an unexpected keyword argument 'features_num'

In [None]:
seq_model = nn.Sequential(
           nn.Dropout(0.05),
           nn.Linear(UNIQUE_TOKENS_N, UNIQUE_LABELS_N)
)


scheduler = lambda optim: \
    torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=5, factor=0.5, verbose=True)

best_val_loss, best_model = train_eval_loop(model=seq_model,
                                            train_dataset=train_dataset,
                                            val_dataset=test_dataset,
                                            criterion=F.cross_entropy,
                                            lr=1e-1,
                                            epoch_n=50,
                                            batch_size=64,
                                            l2_reg_alpha=0,
                                            lr_scheduler_ctor=scheduler)

Эпоха 0
Эпоха: 10001 итераций, 61.34 сек
Среднее значение функции потерь на обучении 1.094627337620957
Среднее значение функции потерь на валидации 0.9286486540998254
Новая лучшая модель!

Эпоха 1
Эпоха: 10001 итераций, 61.21 сек
Среднее значение функции потерь на обучении 1.0057485575479765
Среднее значение функции потерь на валидации 0.8762897318536109
Новая лучшая модель!

Эпоха 2
Эпоха: 10001 итераций, 60.45 сек
Среднее значение функции потерь на обучении 1.00187811619305
Среднее значение функции потерь на валидации 0.8735460841155552
Новая лучшая модель!

Эпоха 3
Эпоха: 10001 итераций, 60.67 сек
Среднее значение функции потерь на обучении 1.0027435652411778
Среднее значение функции потерь на валидации 0.8817427166989752

Эпоха 4
Эпоха: 10001 итераций, 61.92 сек
Среднее значение функции потерь на обучении 1.006321900922672
Среднее значение функции потерь на валидации 0.897151441096545

Эпоха 5
Эпоха: 10001 итераций, 66.91 сек
Среднее значение функции потерь на обучении 1.0054317902

In [21]:
torch.save(seq_model.state_dict(), 'linear_dropout_0.1_model')

In [19]:
loaded_model = nn.Linear(UNIQUE_TOKENS_N, UNIQUE_LABELS_N)
loaded_model.load_state_dict(torch.load('pt_linear_model_1'))
loaded_model.eval()

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
pred = predict_with_model(best_model, test_dataset)
loss = F.cross_entropy(torch.from_numpy(pred), 
                       torch.from_numpy(test_y).long())

print('Среднее значение функции потерь на обучении', float(loss))
print('Доля верных ответов', accuracy_score(test_y, pred.argmax(-1)))
print('Precision', precision_score(test_y, pred.argmax(-1), average='macro'))
print('Recall', recall_score(test_y, pred.argmax(-1), average='macro'))
print('F1', f1_score(test_y, pred.argmax(-1), average='macro'))
print()