In [None]:
import os
# One-time Colab bootstrap: everything in this branch is skipped when the data
# archive is already present in the current working directory.
# NOTE(review): the Drive mount is skipped together with the download, yet the
# training cell later writes checkpoints under drive/MyDrive — confirm Drive is
# mounted when the archive already exists.
if 'ozom671games.zip' not in os.listdir():
    from google.colab import drive
    # Mount Google Drive so the archive can be copied out of it.
    drive.mount('/content/drive')
    # NOTE(review): packages are installed without version pins.
    !pip install transformers
    !pip install sentencepiece
    !pip install bitsandbytes
    # Copy the archive from Drive into the working directory and unpack it there.
    !cp drive/MyDrive/ozon/ozom671games.zip ozom671games.zip
    !unzip ozom671games.zip

In [None]:
import pandas as pd
import json
from tqdm.notebook import tqdm
import numpy as np
from scipy.spatial.distance import cosine, euclidean
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import torch

In [None]:
import random
def seed_everything(seed):
    """Seed every random number generator used by this notebook.

    Exports ``PYTHONHASHSEED``, seeds Python's ``random``, NumPy and PyTorch
    (CPU, the current CUDA device and all CUDA devices), and puts cuDNN into
    deterministic mode with benchmarking turned off.

    Args:
        seed: integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # All of these take the seed as their only argument, so apply them in one pass.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Trade cuDNN autotuning for run-to-run reproducibility.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

seed_everything(228)

In [None]:
# Training split: `train` holds the per-variant attributes (later cells read its
# name / variantid / categories / color_parsed columns) and `target` holds the
# variant pairs together with their `target` label.
train = pd.read_parquet('train_data.parquet')
target = pd.read_parquet('train_pairs.parquet')

In [None]:
# json.loads(data['categories_1'][0])['3']

In [None]:
# Number of labelled training pairs. This must be captured before the test
# pairs are appended to `target`; later cells use it to slice the labelled rows.
ltr = len(target)

In [None]:
# Test split: variant attributes plus the pairs to score (file name indicates
# the pairs come without a target column).
test = pd.read_parquet('test_data.parquet')
test_target = pd.read_parquet('test_pairs_wo_target.parquet')

In [None]:
# Stack train pairs first, then test pairs, under a fresh 0..n-1 index, so the
# first `ltr` rows are exactly the labelled ones.
# NOTE(review): this rebinds `target`, so re-running the cell appends the test
# pairs a second time.
target = pd.concat([target, test_target]).reset_index(drop = True)

In [None]:
# Single variant table: all train rows plus the test rows whose variantid does
# not already appear in train (avoids duplicated variants in the merges below).
data = pd.concat([train, test[~test.variantid.isin(train['variantid'].unique())] ])

In [None]:
# Attach the attributes of both variants to every pair: the first left-merge
# joins on variantid1, the second on variantid2; columns present on both sides
# of the second merge receive the `_1` / `_2` suffixes (name_1, name_2, ...).
# NOTE(review): `data` is rebound from the variant table to the pair table, so
# re-running this cell alone would operate on the already-merged frame.
data = target.merge(data[['name', 'variantid', 'categories', 'color_parsed']], right_on = 'variantid', left_on = 'variantid1', how = 'left').merge(
    data[['name', 'variantid', 'categories', 'color_parsed']], right_on = 'variantid', left_on = 'variantid2', how = 'left', suffixes=('_1', '_2'))

In [None]:
# [len(x) for x in train['color_parsed'] if x is not None]

In [None]:
from transformers import AdamW, AutoConfig, AutoModel, AutoTokenizer, get_cosine_schedule_with_warmup
import numpy as np
import torch
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

In [None]:
# Value stored under key '3' of the first variant's JSON category record.
data['cat_3'] = [json.loads(raw)['3'] for raw in data['categories_1']]
# Stratification key: the pair label (as text) joined with that category value.
data['tmp'] = data['target'].astype('str') + '_' + data['cat_3']

def standart_split(data, target, n_splits = 5):
    """Build stratified K-fold index pairs over the labelled rows.

    Stratification uses the ``tmp`` column; only the first ``ltr`` rows are
    split, where ``ltr`` is read from the notebook's global namespace.

    Args:
        data: pair frame with a ``tmp`` column and a 0..n-1 index.
        target: unused by this implementation.
        n_splits: number of folds.

    Returns:
        List of ``(train_index, test_index)`` tuples, one per fold.

    NOTE(review): a later cell defines another ``standart_split`` with a
    different signature and reassigns ``split_list``.
    """
    labelled_rows = data.loc[:ltr-1, :]
    strata = data['tmp'][:ltr]
    splitter = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = 228)
    return [
        (train_index, test_index)
        for train_index, test_index in splitter.split(labelled_rows, strata)
    ]

split_list = standart_split(data, 'target')



In [None]:
import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F

from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

In [None]:
class TrainDataset(Dataset):
    """Dataset of product-name pairs, each tokenized as one two-segment sequence.

    Args:
        df: frame providing the ``name_1``, ``name_2``, ``color_parsed_1``,
            ``color_parsed_2`` and ``target`` columns.
        tokenizer: object exposing ``encode_plus(text, text_pair, ...)``.
        max_len: ``max_length`` passed to the tokenizer (truncation is on).
        shuffle: segment-order policy —
            0 (or any other value): keep ``(name_1, name_2)``;
            1: always swap to ``(name_2, name_1)``;
            2: swap when ``random.random() > 0.5``.
    """

    def __init__(self, df, tokenizer, max_len, shuffle = 0):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.texts_1 = df['name_1'].values
        self.texts_2 = df['name_2'].values
        # Stored but not consumed by __getitem__.
        self.cats_1 = df['color_parsed_1'].values
        self.cats_2 = df['color_parsed_2'].values
        self.target = df['target'].values
        self.shuffle = shuffle

    def __len__(self):
        return len(self.texts_1)

    def __getitem__(self, item):
        """Return ``(encoding, label)`` for row ``item``; the encoding is left unpadded."""
        text_1, text_2 = self.texts_1[item], self.texts_2[item]

        # random.random() is only drawn in mode 2, thanks to short-circuiting.
        if self.shuffle == 1 or (self.shuffle == 2 and random.random() > 0.5):
            text_1, text_2 = text_2, text_1

        # Use the tokenizer handed to this dataset, not whatever global
        # `tokenizer` happens to exist in the notebook namespace.
        token_inputs = self.tokenizer.encode_plus(
            text_1,
            text_2,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="do_not_pad",  # padding is deferred to the collate function
            truncation=True,
        )
        label = self.target[item]
        return token_inputs, label

In [None]:
# random.random()

In [None]:
class CustomModel(nn.Module):
    """Transformer encoder with one linear head evaluated under several dropout rates.

    The first-token hidden state of the encoder is passed through up to five
    dropout layers with different rates; the same ``fc`` layer scores each
    dropped-out copy, and ``forward`` returns one logit tensor per rate.

    Args:
        model: model name or path understood by ``AutoConfig`` / ``AutoModel``.
        fc_dropout: dropout probabilities for the head; only the first five are used.
        nn_dp: value written to the config's ``hidden_dropout_prob``.
        lns: value written to the config's ``layer_norm_eps``.
        config_path: optional path to a config object saved with ``torch.save``;
            when ``None`` the config is fetched with ``AutoConfig.from_pretrained``.
        pretrained: load pretrained encoder weights when true, otherwise build
            the architecture from the config with freshly initialised weights.
    """

    def __init__(self, model, fc_dropout = [0.3], nn_dp = 0.1, lns = 1e-07, config_path=None, pretrained=False):
        super().__init__()
        if config_path is None:
            self.config = AutoConfig.from_pretrained(model)
        else:
            # NOTE(review): torch.load unpickles the file — only load configs
            # from trusted locations.
            self.config = torch.load(config_path)

        self.config.update(
            {
                'hidden_dropout_prob': nn_dp,
                "output_hidden_states": True,
                'layer_norm_eps': lns,
                "num_labels": 2,
            }
        )

        if pretrained:
            self.model = AutoModel.from_pretrained(model, config=self.config)
        else:
            # AutoModel is a factory and cannot be called directly;
            # from_config builds the architecture without loading weights.
            self.model = AutoModel.from_config(self.config)

        # Number of head outputs returned by forward(); missing rates default to 0.
        self.num_dropout = len(fc_dropout)
        self.fc_dropout0 = nn.Dropout(fc_dropout[0])
        self.fc_dropout1 = nn.Dropout(fc_dropout[1] if len(fc_dropout) > 1 else 0)
        self.fc_dropout2 = nn.Dropout(fc_dropout[2] if len(fc_dropout) > 2 else 0)
        self.fc_dropout3 = nn.Dropout(fc_dropout[3] if len(fc_dropout) > 3 else 0)
        self.fc_dropout4 = nn.Dropout(fc_dropout[4] if len(fc_dropout) > 4 else 0)

        self.fc = nn.Linear(self.config.hidden_size, 1)

    def _init_weights(self, module):
        """Re-initialise ``module`` using the config's ``initializer_range``.

        Linear and Embedding weights are drawn from a zero-mean normal; Linear
        biases and the Embedding padding row are zeroed; LayerNorm is reset to
        weight 1 / bias 0. Not called from ``__init__``.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Return the encoder's first-token hidden state for each sequence."""
        outputs = self.model(**inputs)
        # Indexing [:, 0, :] already drops the sequence axis; no squeeze is
        # needed (a squeeze(1) would only ever fire for hidden_size == 1 and
        # would then break the linear head).
        return outputs[0][:, 0, :]

    def forward(self, inputs):
        """Return a list with one ``fc`` output per configured dropout rate (at most five)."""
        feature = self.feature(inputs)
        dropouts = (
            self.fc_dropout0,
            self.fc_dropout1,
            self.fc_dropout2,
            self.fc_dropout3,
            self.fc_dropout4,
        )
        # Only evaluate the heads that are actually returned.
        return [self.fc(dropout(feature)) for dropout in dropouts[:self.num_dropout]]

In [None]:
from transformers.models.deberta_v2 import DebertaV2TokenizerFast
from tqdm.notebook import tqdm

In [None]:
def make_predict(model, valid_dataloader, criterion, epoch, valid_target):
    """Score a validation loader and return per-sample sigmoid probabilities.

    Puts ``model`` into eval mode, moves each batch to CUDA, runs the forward
    pass under autocast and averages the per-head outputs before applying the
    sigmoid. The running mean loss is shown on the progress bar only.

    Args:
        model: callable returning a list of logit tensors for a batch dict.
        valid_dataloader: yields ``(inputs_dict, labels)`` batches.
        criterion: loss applied to column 0 of every head output.
        epoch: shown on the progress bar.
        valid_target: unused by this function.

    Returns:
        1-D NumPy array: column 0 of the concatenated batch probabilities.
    """
    model.eval()
    progress = tqdm(enumerate(valid_dataloader), total = len(valid_dataloader))
    running_loss = 0
    batch_probs = []

    with torch.no_grad():
        for step, (inputs, labels) in progress:
            # Move the batch to the GPU, updating the dict in place.
            for key in list(inputs.keys()):
                inputs[key] = inputs[key].cuda()
            labels = labels.cuda()

            with torch.cuda.amp.autocast():
                head_logits = model(inputs)
                head_losses = [criterion(logits[:, 0], labels) for logits in head_logits]
                loss = sum(head_losses) / len(head_losses)

            # Average the heads in logit space, then squash once.
            mean_logits = sum(head_logits) / len(head_logits)

            running_loss += loss.cpu().detach().numpy()
            progress.set_postfix(loss=running_loss / (step + 1), stage="validation", epoch = epoch)
            batch_probs.append(mean_logits.sigmoid().to('cpu').numpy())

    return np.concatenate(batch_probs)[:, 0]

In [None]:
class Collate:
    """Collate function that pads a batch to its longest sequence.

    Each sample is ``(encoding, label)`` where ``encoding`` maps
    ``'input_ids'`` and ``'attention_mask'`` to Python lists. Sequences are
    right-padded: attention masks with 0, input ids with the tokenizer's
    ``pad_token_id`` (falling back to 0 when the tokenizer does not define one).

    Only ``attention_mask`` and ``input_ids`` are forwarded; any other keys in
    the encodings (e.g. ``token_type_ids``, if the tokenizer emits them) are
    dropped.

    Args:
        tokenizer: tokenizer whose ``pad_token_id`` is used for padding.
        is_train: stored on the instance; not read by ``__call__``.
    """

    def __init__(self, tokenizer, is_train = True):
        self.tokenizer = tokenizer
        self.is_train = is_train

    def __call__(self, batch):
        """Return ``(inputs_dict, labels)`` with long-typed inputs and float labels."""
        inputs = [sample[0] for sample in batch]
        labels = [sample[1] for sample in batch]

        # Longest sequence in this batch decides the padded width.
        batch_max = max(len(enc['input_ids']) for enc in inputs)

        # Pad ids with the tokenizer's own pad id rather than a hard-coded 0,
        # which is a real vocabulary token for some tokenizers.
        pad_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_id is None:
            pad_id = 0

        inputs_dict = dict()
        inputs_dict["attention_mask"] = torch.tensor(
            [enc['attention_mask'] + [0] * (batch_max - len(enc['attention_mask'])) for enc in inputs],
            dtype=torch.long,
        )
        inputs_dict["input_ids"] = torch.tensor(
            [enc['input_ids'] + [pad_id] * (batch_max - len(enc['input_ids'])) for enc in inputs],
            dtype=torch.long,
        )

        labels = torch.tensor(labels, dtype=torch.float)

        return inputs_dict, labels

In [None]:
def standart_split(data, target, ltr, n_splits = 5):
    """Build group K-fold index pairs over the first ``ltr`` rows.

    Rows are grouped by ``variantid1``, so every pair sharing a first variant
    lands in the same fold.

    Args:
        data: pair frame with ``target`` and ``variantid1`` columns and a 0..n-1 index.
        target: unused by this implementation.
        ltr: number of leading (labelled) rows to split.
        n_splits: number of folds.

    Returns:
        List of ``(train_index, test_index)`` tuples, one per fold.
    """
    labelled_rows = data.loc[:ltr-1, :]
    splitter = GroupKFold(n_splits = n_splits)
    folds = splitter.split(labelled_rows, data['target'][:ltr], data['variantid1'][:ltr])
    return [(train_index, test_index) for train_index, test_index in folds]

# Replaces any `split_list` produced by an earlier cell.
split_list = standart_split(data, 'target', ltr)

In [None]:
# ---------------------------------------------------------------------------
# Hyper-parameters and per-fold fine-tuning loop.
# For every fold a fresh model is fine-tuned for `epochs` epochs on the fold's
# training rows and its final weights are written to Google Drive.
# ---------------------------------------------------------------------------
max_len = 420        # tokenizer max_length for training samples
max_val_len = 400    # not read in this cell

batch_size = 16
epochs = 3
lr = 2e-5
fp16 = True          # NOTE(review): not read below; GradScaler/autocast are always on
clip_grad_norm = 5   # max gradient norm; clipping is skipped when <= 0

model_name = "DeepPavlov/rubert-base-cased"
# One head output per rate (see CustomModel).
fc_dropout= [0.1, 0.2, 0.3, 0.4, 0.5]
# fc_dropout= [0.1, 0.2, 0.3]
tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer.pad_token = tokenizer.eos_token
weight_decay = 0.01
eps = 1e-6
betas = (0.9, 0.99)
# NOTE(review): this string is never consulted; `scheduler` is rebound to a
# LinearLR instance inside the fold loop.
scheduler = 'cosine' # ['linear', 'cosine']
accumulation_steps = 1   # only enters the step-count formula; no accumulation is performed below
batch_scheduler = True   # not read in this cell


# DataLoader settings; training batches are reshuffled, validation order is kept.
params_train = {'batch_size': batch_size, 'shuffle': True, 'drop_last': False, 'num_workers': 4}
params_valid = {'batch_size': batch_size, 'shuffle': False, 'drop_last': False, 'num_workers': 4}

for fold in [0, 1, 2, 3, 4]:
    ckp = f'model_n1{fold}'  # not read in this cell
    criterion = nn.BCEWithLogitsLoss()

    # split_list[fold] is (train_index, test_index) over the labelled rows.
    train_df = data.loc[split_list[fold][0]].reset_index(drop=True)
    valid_df = data.loc[split_list[fold][1]].reset_index(drop=True)  # not used in this cell
    model = CustomModel(model_name, fc_dropout, pretrained = True).cuda()

    model.train()
    # model_dict = torch.load("content/roberta_base_chk/checkpoint-66668/pytorch_model.bin", map_location='cuda')
    # new_model_dict = {k.replace('bert', 'model'):v for k,v in model_dict.items()}
    # model.load_state_dict(new_model_dict, strict=False)

    scaler = torch.cuda.amp.GradScaler(enabled = True)
    collate_fn = Collate(tokenizer)
    # Epoch 0 uses the original segment order (shuffle mode 0).
    train_dataloader = DataLoader( TrainDataset(train_df, tokenizer, max_len), collate_fn  = collate_fn, **params_train)

    param_optimizer = list(model.named_parameters())  # not read below
    # Parameter groups: encoder weights with decay, encoder biases/LayerNorm
    # without decay, and everything whose name lacks "model" (the `fc` head)
    # without decay.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
          'lr': lr, 'weight_decay': weight_decay},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
          'lr': lr, 'weight_decay': 0.0},
        {'params': [p for n, p in model.named_parameters() if "model" not in n],
          'lr': lr, 'weight_decay': 0.0}
    ]


    # NOTE(review): `AdamW` is imported from both transformers and torch.optim
    # in earlier cells; whichever import ran last is the one used here.
    optimizer = AdamW(optimizer_parameters, lr=lr, eps=eps, betas=betas)

    # Approximate number of optimizer steps over all epochs.
    num_train_steps = int(len(train_df) / batch_size * epochs) // accumulation_steps


    # Positional arguments presumably map to start_factor=1, end_factor=0.01,
    # total_iters=num_train_steps, last_epoch=-1 — confirm against the
    # installed PyTorch version.
    scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, 1, 0.01, num_train_steps, -1)

    best_score = -1  # not updated in this cell
    len_dataloader = len(train_dataloader)
    num_check = 7
    # Batch indices at evenly spaced points in an epoch; only printed here.
    check_epoch_list = [int(x / num_check * len_dataloader) for x in range(num_check)]
    print(check_epoch_list)
    for epoch in range(epochs):
        # Segment-order augmentation: epoch 1 always swaps the two names,
        # epoch 2 swaps them at random (TrainDataset shuffle modes 1 and 2).
        if epoch == 1:
            train_dataloader = DataLoader( TrainDataset(train_df, tokenizer, max_len, 1), collate_fn  = collate_fn, **params_train)
        elif epoch == 2:
            train_dataloader = DataLoader( TrainDataset(train_df, tokenizer, max_len, 2), collate_fn  = collate_fn, **params_train)
        average_loss = 0
        tk0 = tqdm(enumerate(train_dataloader), total = len_dataloader)
        for batch_number,  (inputs, labels)  in tk0:
            # Move the batch to the GPU.
            for k, v in inputs.items():
                inputs[k] = v.cuda()
            labels = labels.cuda()

            # Mixed-precision forward; the loss is the mean over all dropout heads.
            with torch.cuda.amp.autocast():
                y_preds_list  = model(inputs)
                loss_list = [criterion(pred[:, 0], labels) for pred in y_preds_list]
                loss = sum(loss_list) / len(loss_list)

            optimizer.zero_grad()
            scaler.scale(loss).backward()
            # if epoch > 1:
            #     awp.attack_backward(inputs,labels, criterion) 
            if clip_grad_norm > 0:
                # Gradients must be unscaled before clipping by norm.
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            # The learning-rate schedule advances once per batch.
            scheduler.step()

            average_loss += loss.cpu().detach().numpy()
            tk0.set_postfix(loss=average_loss / (batch_number + 1), stage="train", epoch = epoch)


    # NOTE(review): despite the `_best` suffix this is simply the state after
    # the last epoch; no validation-based selection happens in this cell.
    torch.save(model.state_dict(), f'drive/MyDrive/model_ozon_3_{fold}_best.pt')

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased-conversational were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[0, 2189, 4379, 6568, 8758, 10947, 13137]


  0%|          | 0/15327 [00:00<?, ?it/s]

  0%|          | 0/15327 [00:00<?, ?it/s]

  0%|          | 0/15327 [00:00<?, ?it/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased-conversational were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[0, 2189, 4379, 6568, 8758, 10947, 13137]


  0%|          | 0/15327 [00:00<?, ?it/s]

  0%|          | 0/15327 [00:00<?, ?it/s]

  0%|          | 0/15327 [00:00<?, ?it/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased-conversational were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[0, 2189, 4379, 6568, 8758, 10947, 13137]


  0%|          | 0/15327 [00:00<?, ?it/s]

  0%|          | 0/15327 [00:00<?, ?it/s]

  0%|          | 0/15327 [00:00<?, ?it/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased-conversational were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[0, 2189, 4379, 6568, 8758, 10947, 13137]


  0%|          | 0/15327 [00:00<?, ?it/s]



  0%|          | 0/15327 [00:00<?, ?it/s]

  0%|          | 0/15327 [00:00<?, ?it/s]