# Настройка среды

In [1]:
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py > /dev/null
!pip install torch
!pip install torchvision
!python pytorch-xla-env-setup.py --version 20200529 --apt-packages libomp5 libopenblas-dev > /dev/null
!pip install transformers > /dev/null
!pip install pandarallel > /dev/null

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  4264  100  4264    0     0  12615      0 --:--:-- --:--:-- --:--:-- 12615
Copying gs://tpu-pytorch/wheels/torch-nightly+20200529-cp37-cp37m-linux_x86_64.whl...

Operation completed over 1 objects/91.7 MiB.                                     
Copying gs://tpu-pytorch/wheels/torch_xla-nightly+20200529-cp37-cp37m-linux_x86_64.whl...

Operation completed over 1 objects/122.1 MiB.                                    
Copying gs://tpu-pytorch/wheels/torchvision-nightly+20200529-cp37-cp37m-linux_x86_64.whl...

Operation completed over 1 objects/2.3 MiB.                                      
[31mERROR: fastai 1.0.61 requires torchvision, which is not installed.[0m
[31mERROR: kornia 0.3.1 has requirement torch==1.5.0, but you'll have torch 1.6.0a0+b08a4aa which is incompatible.[0m
[31mERROR: allennlp 0.9.0 has requi

In [2]:
import numpy as np
import pandas as pd
import os
os.environ['XLA_USE_BF16'] = "1"

import time
import random
from datetime import datetime
from tqdm import tqdm
tqdm.pandas()

from glob import glob
for path in glob(f'../input/*'):
    print(path)

import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
from torch.utils.data.sampler import SequentialSampler

from transformers import XLMRobertaModel, XLMRobertaTokenizer, XLMRobertaConfig
from transformers import AdamW, get_linear_schedule_with_warmup, get_constant_schedule

import re
import nltk
nltk.download('punkt')
from nltk import sent_tokenize

from pandarallel import pandarallel
pandarallel.initialize(nb_workers=4, progress_bar=True)

import warnings
warnings.filterwarnings("ignore")

import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as pl
import torch_xla.distributed.xla_multiprocessing as xmp

../input/jigsaw-multilingual-toxic-comment-classification
../input/multitpu-inference


  from pandas import Panel


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


# Настройка random seed

In [3]:
def init_seed(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

# Предобработка текстов

In [4]:
def exclude_duplicate_sentences(text, lang='en'):
    sentences = []
    for sentence in sent_tokenize(text, 'english'):
        sentence = sentence.strip()
        if sentence not in sentences:
            sentences.append(sentence)
    return ' '.join(sentences)

def clean_text(text, lang='en'):
    text = str(text)
    text = re.sub(r'[0-9"]', '', text)
    text = re.sub(r'#[\S]+\b', '', text)
    text = re.sub(r'@[\S]+\b', '', text)
    text = re.sub(r'https?\S+', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = exclude_duplicate_sentences(text, lang)
    return text.strip()

# Класс для работы с датасетом

In [5]:
class DatasetRetriever(Dataset):

    def __init__(self, df):
        self.ids = df['id'].values
        self.comment_texts = df['comment_text'].values
        self.tokenizer = XLMRobertaTokenizer.from_pretrained('../input/multitpu-inference')

    def get_tokens(self, text):
        encoded = self.tokenizer.encode_plus(text, add_special_tokens=True, max_length=300, pad_to_max_length=True)
        return encoded['input_ids'], encoded['attention_mask']

    def __len__(self):
        return self.ids.shape[0]

    def __getitem__(self, idx):
        text = self.comment_texts[idx]
        tokens, attention_mask = self.get_tokens(text)
        tokens, attention_mask = torch.tensor(tokens), torch.tensor(attention_mask)
        return self.ids[idx], tokens, attention_mask

In [6]:
%%time

df_test = pd.read_csv(f'../input/jigsaw-multilingual-toxic-comment-classification/test.csv')
df_test['comment_text'] = df_test.parallel_apply(lambda x: clean_text(x['content'], x['lang']), axis=1)
df_test = df_test.drop(columns=['content'])

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=15953), Label(value='0 / 15953')))…

CPU times: user 1.31 s, sys: 359 ms, total: 1.67 s
Wall time: 12.1 s


In [7]:
init_seed(8)
test_dataset = DatasetRetriever(df_test)

# Модель

In [8]:
class ToxicSimpleNNModel(nn.Module):

    def __init__(self, backbone):
        super(ToxicSimpleNNModel, self).__init__()
        self.backbone = backbone
        self.dropout = nn.Dropout(0.3)
        self.linear = nn.Linear(in_features=self.backbone.pooler.dense.out_features*2, out_features=2,)

    def forward(self, input_ids, attention_masks):
        bs, seq_length = input_ids.shape
        seq_x, _ = self.backbone(input_ids=input_ids, attention_mask=attention_masks)
        apool = torch.mean(seq_x, 1)
        mpool, _ = torch.max(seq_x, 1)
        x = torch.cat((apool, mpool), 1)
        x = self.dropout(x)
        return self.linear(x)

In [9]:
backbone = XLMRobertaModel(XLMRobertaConfig.from_pretrained('../input/multitpu-inference'))

In [10]:
class MultiTPUPredictor:
    
    def __init__(self, model, device):
        if not os.path.exists('node_submissions'):
            os.makedirs('node_submissions')

        self.model = model
        self.device = device

        xm.master_print(f'Model prepared. Device is {self.device}')

    def run_inference(self, test_loader, verbose=True, verbose_step=50):
        self.model.eval()
        result = {'id': [], 'toxic': []}
        t = time.time()
        for step, (ids, inputs, attention_masks) in enumerate(test_loader):
            if verbose:
                if step % 10 == 0:
                    xm.master_print(f'Prediction Step {step}, time: {(time.time() - t):.5f}')

            with torch.no_grad():
                inputs = inputs.to(self.device, dtype=torch.long) 
                attention_masks = attention_masks.to(self.device, dtype=torch.long)
                outputs = self.model(inputs, attention_masks)
                toxics = nn.functional.softmax(outputs, dim=1).data.cpu().numpy()[:,1]

            result['id'].extend(ids.numpy())
            result['toxic'].extend(toxics)

        result = pd.DataFrame(result)
        node_count = len(glob('node_submissions/*.csv'))
        result.to_csv(f'node_submissions/submission_{node_count}_{datetime.utcnow().microsecond}.csv', index=False)

In [11]:
net = ToxicSimpleNNModel(backbone=backbone)
checkpoint = torch.load('../input/multitpu-inference/checkpoint-xlm-roberta.bin', map_location=torch.device('cpu'))
net.load_state_dict(checkpoint);
checkpoint = None
del checkpoint

In [12]:
def fit(rank, flags):
    device = xm.xla_device()
    model = net.to(device)

    test_sampler = torch.utils.data.distributed.DistributedSampler(
        test_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=False
    )
    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=16,
        sampler=test_sampler,
        pin_memory=False,
        drop_last=False,
        num_workers=1
    )

    fitter = MultiTPUPredictor(model=model, device=device)
    fitter.run_inference(test_loader)

# Training

In [13]:
%%time

FLAGS={}
xmp.spawn(fit, args=(FLAGS,), nprocs=8, start_method='fork')

Model prepared. Device is xla:1
Prediction Step 0, time: 0.17896
Prediction Step 10, time: 15.51774
Prediction Step 20, time: 17.58273
Prediction Step 30, time: 19.80451
Prediction Step 40, time: 22.02203
Prediction Step 50, time: 24.02253
Prediction Step 60, time: 26.21621
Prediction Step 70, time: 28.36274
Prediction Step 80, time: 30.47557
Prediction Step 90, time: 32.41090
Prediction Step 100, time: 34.56655
Prediction Step 110, time: 36.78136
Prediction Step 120, time: 39.08056
Prediction Step 130, time: 41.11180
Prediction Step 140, time: 43.08858
Prediction Step 150, time: 45.26450
Prediction Step 160, time: 47.26791
Prediction Step 170, time: 49.31654
Prediction Step 180, time: 51.39643
Prediction Step 190, time: 53.67593
Prediction Step 200, time: 55.96479
Prediction Step 210, time: 58.25276
Prediction Step 220, time: 60.38066
Prediction Step 230, time: 62.18966
Prediction Step 240, time: 64.15626
Prediction Step 250, time: 66.27520
Prediction Step 260, time: 68.27470
Predicti

In [14]:
submission = pd.concat([pd.read_csv(path) for path in glob('node_submissions/*.csv')]).groupby('id').mean()
ensemble = pd.read_csv('../input/multitpu-inference/submission-ensemble.csv', index_col='id')

In [15]:
def scale_min_max_submission(submission):
    min_, max_ = submission['toxic'].min(), submission['toxic'].max()
    submission['toxic'] = (submission['toxic'] - min_) / (max_ - min_)
    return submission

In [16]:
submission['toxic'] = (scale_min_max_submission(submission)['toxic'] + scale_min_max_submission(ensemble)['toxic']) / 2
submission.to_csv('submission.csv')