# Техническая часть

In [1]:
!pip install -q sentencepiece
import sentencepiece
!pip install -q transformers

!pip install -q comet_ml
import comet_ml
!pip install -q pytorch-lightning

!git clone -q https://github.com/DanilDmitriev1999/QA

[K     |████████████████████████████████| 1.2MB 9.0MB/s 
[K     |████████████████████████████████| 2.1MB 8.9MB/s 
[K     |████████████████████████████████| 901kB 50.5MB/s 
[K     |████████████████████████████████| 3.3MB 37.7MB/s 
[K     |████████████████████████████████| 266kB 9.5MB/s 
[K     |████████████████████████████████| 522kB 15.9MB/s 
[K     |████████████████████████████████| 61kB 6.8MB/s 
[K     |████████████████████████████████| 61kB 8.3MB/s 
[?25h  Building wheel for configobj (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 849kB 9.3MB/s 
[K     |████████████████████████████████| 112kB 29.3MB/s 
[K     |████████████████████████████████| 276kB 12.8MB/s 
[K     |████████████████████████████████| 829kB 24.9MB/s 
[K     |████████████████████████████████| 184kB 50.9MB/s 
[K     |████████████████████████████████| 1.3MB 53.4MB/s 
[K     |████████████████████████████████| 296kB 56.2MB/s 
[K     |████████████████████████████████| 143kB 53.2M

In [2]:
import warnings
warnings.filterwarnings('ignore')
import comet_ml

import numpy as np
import collections
import functools
import json
import random
import os
import math
import re

from io import open
from tqdm import tqdm
from pprint import pprint
from typing import List
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

from transformers import AdamW, AutoTokenizer, AutoModel

import pytorch_lightning as pl
from pytorch_lightning.loggers import CometLogger
from pytorch_lightning import Trainer, seed_everything

from QA.DataModule.dataset import *
from QA.DataModule.reader import *

from QA.model.BERT import *
from QA.utils.trainer import *

seed_everything(294)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cuda':
    from torch.cuda import LongTensor
else:
    from torch import LongTensor
print(device)

Global seed set to 294


cuda


# Данные

In [None]:
train_file_path = '/content/QA/data/sber_squad/train-v1.1.json'
dev_file_path = '/content/QA/data/sber_squad/dev-v1.1.json'
train = ReadData(train_file_path)
train_data = train.data
dev = ReadData(dev_file_path)
dev_data = dev.data

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-multilingual-cased')

In [4]:
def collate_fn(examples):
    return tokenizer.pad(examples, return_tensors='pt')

train_dataset = QADataset(train_data, tokenizer)
train_iter = DataLoader(dataset=QADataset(train_data, tokenizer),
                        batch_size=4, collate_fn=collate_fn)
dev_iter = DataLoader(dataset=QADataset(dev_data, tokenizer),
                        batch_size=8, collate_fn=collate_fn)

In [5]:
next(iter(train_iter))

{'input_ids': tensor([[  101,   511, 38300,  ..., 31399,   136,   102],
        [  101, 11480, 17914,  ...,     0,     0,     0],
        [  101,   526, 44169,  ...,     0,     0,     0],
        [  101, 12624, 11657,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'start_positions': tensor([160,  34,  64,  98]), 'end_positions': tensor([199,  36,  66, 102])}

# Train

In [6]:
comet_logger = CometLogger(
    api_key="HWfJT3eyByVJWe4nEbi1pGosA",
    workspace="danildmitriev1999",
    project_name="qa",
    experiment_name="test",
)

CometLogger will be initialized in online mode


In [None]:
N_EPOCHS = 3
CLIP = 1

QAModel = QA2Linear('bert-base-multilingual-cased').to(device)
criterion = nn.CrossEntropyLoss().to(device)

model_trainer = ModelTrainer(QAModel, criterion).to(device)

trainer = Trainer(max_epochs=N_EPOCHS,
                gpus=1,
                gradient_clip_val=CLIP,
                progress_bar_refresh_rate=1,
                log_every_n_steps=3,
                logger=[comet_logger],
                )

In [None]:
trainer.fit(model_trainer, train_iter, dev_iter)

# Тест

In [None]:
def predict(dt, n, model):
    text = torch.tensor([val_dataset[n]['input_ids']]).to(device)
    mask = torch.tensor([val_dataset[n]['attention_mask']]).to(device)

    with torch.no_grad():
        lg_start, lg_end = model(text, mask)

    start_pred = torch.argmax(lg_start, dim=1).squeeze(-1).cpu().detach().numpy()[0]
    end_pred = torch.argmax(lg_end, dim=1).squeeze(-1).cpu().detach().numpy()[0]

    print(f"gold position: {val_dataset[n]['start_positions'], val_dataset[n]['end_positions']}")
    print(f'position predict: {start_pred, end_pred}')

    print(f"gold: {tokenizer.decode(val_dataset[n]['input_ids'][val_dataset[n]['start_positions']: val_dataset[n]['end_positions']+1])}")
    print(f"predict: {tokenizer.decode(val_dataset[n]['input_ids'][start_pred: end_pred+1])}")

In [None]:
val_dataset = QADataset(dev_data, tokenizer)
trained_model = model_trainer.model
trained_model = trained_model.to(device)

In [None]:
dev_data[5]

In [None]:
predict(dev_data, 5, trained_model)