# Solution

In [12]:
# ====================================================
# Directory settings
# ====================================================
import os
import sys
# Location of the competition CSVs (test.csv and sample_submission.csv are read from here).
INPUT_DIR = '../input/us-patent-phrase-to-phrase-matching/'
# Directory that receives files written by this notebook (the log file is created here).
OUTPUT_DIR = './'
# exist_ok=True makes the creation idempotent and removes the check-then-create
# race of the `if not os.path.exists(...): os.makedirs(...)` pattern.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [13]:
import os
import gc
import math
import time
import random
from sklearn import preprocessing
from sklearn import model_selection
from dataclasses import dataclass
from typing import Optional
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
import scipy.stats as sp

# Swap the preinstalled transformers/tokenizers for the wheels shipped in the
# attached dataset. `--no-index --find-links` restricts pip to that local
# directory, so no network access is required.
os.system('pip uninstall -y transformers')
os.system('pip uninstall -y tokenizers')
os.system('python -m pip install --no-index --find-links=../input/pppm-pip-wheels-dataset transformers')
os.system('python -m pip install --no-index --find-links=../input/pppm-pip-wheels-dataset tokenizers')
# NOTE(review): the recorded cell output installs 4.16.2 / 0.11.6 but these
# prints still report 4.18.0 / 0.12.1 — confirm which versions the kernel
# actually imports after the reinstall.
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")

from transformers import AdamW, AutoConfig, AutoModel, AutoTokenizer, get_cosine_schedule_with_warmup
# Second, identical version report (duplicate of the two prints above).
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
# AutoModelForTokenClassification is needed by the `Th` model defined below.
from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForTokenClassification
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
# IPython magic: sets the environment variable for this kernel process.
%env TOKENIZERS_PARALLELISM=true


import warnings 
# Silences every Python warning for the rest of the notebook.
warnings.filterwarnings('ignore')

# Use the GPU when one is visible, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Found existing installation: transformers 4.18.0
Uninstalling transformers-4.18.0:
  Successfully uninstalled transformers-4.18.0




Found existing installation: tokenizers 0.12.1
Uninstalling tokenizers-0.12.1:
  Successfully uninstalled tokenizers-0.12.1




Looking in links: ../input/pppm-pip-wheels-dataset
Processing /kaggle/input/pppm-pip-wheels-dataset/transformers-4.16.2-py3-none-any.whl
Processing /kaggle/input/pppm-pip-wheels-dataset/tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl
Installing collected packages: tokenizers, transformers
Successfully installed tokenizers-0.11.6 transformers-4.16.2




Looking in links: ../input/pppm-pip-wheels-dataset




tokenizers.__version__: 0.12.1
transformers.__version__: 4.18.0
tokenizers.__version__: 0.12.1
transformers.__version__: 4.18.0
env: TOKENIZERS_PARALLELISM=true


In [14]:
# ====================================================
# Utils
# ====================================================
def get_score(y_true, y_pred):
    """Return the Pearson correlation coefficient between targets and predictions.

    Args:
        y_true: 1-D array-like of reference values.
        y_pred: 1-D array-like of predicted values, same length as ``y_true``.

    Returns:
        The correlation statistic (first element of ``pearsonr``'s result).
    """
    # `sp` is already bound to `scipy.stats` by the import cell, so call
    # `pearsonr` on it directly instead of going through the deprecated
    # `scipy.stats.stats` alias that `sp.stats.pearsonr` resolved to.
    score = sp.pearsonr(y_true, y_pred)[0]
    return score


def get_logger(filename=OUTPUT_DIR+'train'):
    """Return the module logger writing plain messages to the console and a file.

    The function is idempotent: calling it again (for example by re-running
    the cell) does not attach a second console handler or a second handler
    for the same log file, so messages are not emitted multiple times.

    Args:
        filename: Log file path without the ``.log`` suffix.

    Returns:
        The ``logging.Logger`` named after this module, at INFO level.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # FileHandler stores its target as an absolute path in `baseFilename`.
    log_path = os.path.abspath(f"{filename}.log")
    # FileHandler subclasses StreamHandler, hence the exact type check.
    has_console = any(type(h) is StreamHandler for h in logger.handlers)
    has_file = any(
        isinstance(h, FileHandler) and h.baseFilename == log_path
        for h in logger.handlers
    )

    if not has_console:
        handler1 = StreamHandler()
        handler1.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler1)
    if not has_file:
        handler2 = FileHandler(filename=f"{filename}.log")
        handler2.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Exported for any child process that reads the hash seed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)

    random.seed(seed)
    np.random.seed(seed)

    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(seed=42)

In [15]:
# ====================================================
# Data Loading
# ====================================================
# Read the two competition files in the same order as before: the rows to
# score first, then the submission template.
test, submission = (
    pd.read_csv(f"{INPUT_DIR}{csv_name}")
    for csv_name in ('test.csv', 'sample_submission.csv')
)
print(f"test.shape: {test.shape}")
print(f"submission.shape: {submission.shape}")

test.shape: (36, 4)
submission.shape: (36, 2)


In [16]:
# ====================================================
# CPC Data
# ====================================================
# Load the saved lookup that expands each `context` code into descriptive text.
# NOTE(review): torch.load unpickles the file, so it must come from a trusted source.
cpc_texts = torch.load('../input/pppm-debertav3large-baseline/cpc_texts.pth')
# Presumably a plain dict, in which case codes missing from it become NaN here
# and would propagate into `text` — TODO confirm every test context is covered.
test['context_text'] = test['context'].map(cpc_texts)

In [17]:
# Single model input string: anchor, target and context text joined by a literal '[SEP]' marker.
test['text'] = test['anchor'] + '[SEP]' + test['target'] + '[SEP]'  + test['context_text']

In [18]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    """Tokenize one text into fixed-length ``torch.long`` tensors.

    Args:
        cfg: Config object exposing ``tokenizer`` (a callable tokenizer) and
            ``max_len`` (the target sequence length).
        text: The string to encode.

    Returns:
        The tokenizer's output mapping with every value converted to a
        ``torch.long`` tensor.
    """
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           # Without truncation an over-long text produces more
                           # than max_len tokens, and samples of different
                           # length cannot be stacked into one batch.
                           truncation=True,
                           return_offsets_mapping=False)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TestDataset(Dataset):
    """Map-style dataset that tokenizes the ``text`` column on demand.

    Args:
        cfg: Config object forwarded to ``prepare_input``.
        df: DataFrame with a ``text`` column.
    """

    def __init__(self, cfg, df):
        # Keep only the raw strings; tokenization is deferred to __getitem__.
        self.texts = df['text'].values
        self.cfg = cfg

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        return prepare_input(self.cfg, self.texts[item])

In [19]:
# ====================================================
# inference
# ====================================================
def inference_fn(test_loader, model, device):
    """Run ``model`` over every batch and return sigmoid-activated outputs.

    Args:
        test_loader: Iterable of batches; each batch is a mapping of tensors.
        model: Module called with the batch mapping as its single argument.
        device: Device the model and every batch tensor are moved to.

    Returns:
        NumPy array with the per-batch outputs concatenated along axis 0.
    """
    model.eval()
    model.to(device)

    batch_outputs = []
    for batch in tqdm(test_loader, total=len(test_loader)):
        for key in list(batch.keys()):
            batch[key] = batch[key].to(device)
        with torch.no_grad():
            raw = model(batch)
        # Squash the raw outputs into (0, 1) before leaving the device.
        batch_outputs.append(raw.sigmoid().to('cpu').numpy())
    return np.concatenate(batch_outputs)

In [20]:
# ====================================================
# inference(MSE)
# ====================================================
def inference_fn_mse(test_loader, model, device):
    """Run ``model`` over every batch and return its raw outputs.

    Identical to ``inference_fn`` except that no sigmoid is applied.

    Args:
        test_loader: Iterable of batches; each batch is a mapping of tensors.
        model: Module called with the batch mapping as its single argument.
        device: Device the model and every batch tensor are moved to.

    Returns:
        NumPy array with the per-batch outputs concatenated along axis 0.
    """
    model.eval()
    model.to(device)

    batch_outputs = []
    for batch in tqdm(test_loader, total=len(test_loader)):
        for key in list(batch.keys()):
            batch[key] = batch[key].to(device)
        with torch.no_grad():
            raw = model(batch)
        batch_outputs.append(raw.to('cpu').numpy())
    return np.concatenate(batch_outputs)

In [21]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    """Backbone encoder with attention pooling and a linear output layer.

    Args:
        cfg: Config object; ``fc_dropout`` and ``target_size`` are always
            read, ``model`` only when the config or weights come from the hub
            or a model directory.
        config_path: Optional path to a saved config object. When ``None`` the
            config is fetched with ``AutoConfig.from_pretrained(cfg.model)``.
        pretrained: Load backbone weights from ``cfg.model`` when true,
            otherwise build a randomly initialised backbone from the config.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            # NOTE(review): torch.load unpickles the file; only load configs
            # from a trusted source.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, self.cfg.target_size)
        self._init_weights(self.fc)
        self.attention = nn.Sequential(
            nn.Linear(self.config.hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )
        # `_init_weights` only handles leaf layers, so calling it on the
        # Sequential container itself was a silent no-op. `apply` visits each
        # child layer. Weights loaded later via load_state_dict are unaffected.
        self.attention.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise a Linear, Embedding or LayerNorm layer in place; ignore other types."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Return the attention-weighted sum of the backbone's first output over dim 1."""
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        # Softmax runs over dim 1, so the weights of each sample sum to one
        # across positions (padding positions are not masked out).
        weights = self.attention(last_hidden_states)
        feature = torch.sum(weights * last_hidden_states, dim=1)
        return feature

    def forward(self, inputs):
        """Return ``fc(dropout(feature(inputs)))``."""
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output

In [22]:
class TransformerHead(nn.Module):
    """Transformer encoder followed by a linear layer that maps each vector to one scalar.

    Args:
        in_features: Width of the incoming vectors (encoder ``d_model``).
        max_length: Stored as ``out_features`` for the consumer of this head.
        num_layers: Number of stacked encoder layers.
        nhead: Number of attention heads per encoder layer.
        num_targets: Accepted for interface compatibility; not used.
    """

    def __init__(self, in_features, max_length, num_layers=1, nhead=8, num_targets=1):
        super().__init__()

        # NOTE(review): batch_first is left at its default, so the encoder
        # treats dim 0 of its input as the sequence axis. Callers in this
        # notebook appear to pass (batch, seq, hidden); do not change this
        # without retraining, since the checkpoints were produced with it.
        encoder_layer = nn.TransformerEncoderLayer(d_model=in_features, nhead=nhead)
        self.transformer = nn.TransformerEncoder(encoder_layer=encoder_layer,
                                                 num_layers=num_layers)
        self.row_fc = nn.Linear(in_features, 1)
        self.out_features = max_length

    def forward(self, x):
        encoded = self.transformer(x)
        # Collapse the trailing singleton left by the 1-unit projection.
        return self.row_fc(encoded).squeeze(-1)


# ====================================================
# Model
# ====================================================
class Th(nn.Module):
    """Backbone encoder scored by a ``TransformerHead`` and a linear layer over positions.

    Args:
        cfg: Config object; ``model``, ``max_len``, ``fc_dropout`` and
            ``target_size`` are read.
        config_path: Optional path to a saved config object. When ``None`` the
            config is fetched with ``AutoConfig.from_pretrained(cfg.model)``.
        pretrained: Load backbone weights from ``cfg.model`` when true,
            otherwise build a randomly initialised backbone from the config.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        # NOTE(review): torch.load unpickles the file; trusted sources only.
        self.config = (
            AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            if config_path is None
            else torch.load(config_path)
        )
        self.model = (
            AutoModel.from_pretrained(cfg.model, config=self.config)
            if pretrained
            else AutoModel.from_config(self.config)
        )

        # Always loaded from cfg.model, independent of `pretrained`. Only its
        # classifier width is read in this class, but it remains a registered
        # submodule of the model.
        self.feature_extractor = AutoModelForTokenClassification.from_pretrained(cfg.model)
        head_width = self.feature_extractor.classifier.in_features
        self.attention = TransformerHead(in_features=head_width, max_length=cfg.max_len, num_layers=1, nhead=8, num_targets=1)
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.attention.out_features, self.cfg.target_size)
        for head in (self.fc, self.attention):
            self._init_weights(head)

    def _init_weights(self, module):
        """Initialise a Linear, Embedding or LayerNorm layer in place; ignore other types."""
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            return
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    def feature(self, inputs):
        """Return the head's output for the backbone's first output tensor."""
        hidden = self.model(**inputs)[0]
        return self.attention(hidden)

    def forward(self, inputs):
        """Return ``fc(dropout(feature(inputs)))``."""
        scores = self.feature(inputs)
        return self.fc(self.fc_dropout(scores))

In [23]:
# ====================================================
# Model(MSE)
# ====================================================
class MSEModel(nn.Module):
    """Backbone encoder with mask-aware mean pooling and a linear output layer.

    Args:
        cfg: Config object; ``fc_dropout`` and ``target_size`` are always
            read, ``model`` only when the config or weights come from
            ``cfg.model``.
        config_path: Optional path to a saved config object.
        pretrained: Load backbone weights from ``cfg.model`` when true,
            otherwise build a randomly initialised backbone from the config.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        # NOTE(review): torch.load unpickles the file; trusted sources only.
        self.config = (
            AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            if config_path is None
            else torch.load(config_path)
        )
        self.model = (
            AutoModel.from_pretrained(cfg.model, config=self.config)
            if pretrained
            else AutoModel.from_config(self.config)
        )
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, self.cfg.target_size)
        self._init_weights(self.fc)
        # Not used by feature()/forward() in this class, but still a
        # registered submodule of the model.
        self.attention = nn.Sequential(
            nn.Linear(self.config.hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )
        self._init_weights(self.attention)

    def _init_weights(self, module):
        """Initialise a Linear, Embedding or LayerNorm layer in place; ignore other types."""
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            return
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    def feature(self, inputs):
        """Return the mean of the backbone's first output over non-padding positions."""
        hidden = self.model(**inputs)[0]
        # Give the mask a trailing axis so it broadcasts over the hidden axis.
        mask = inputs['attention_mask'].unsqueeze(2)
        # Zero out the padding positions.
        masked = hidden * mask
        # Divide by the per-sample count of non-padding positions.
        return torch.sum(masked, 1) / torch.sum(mask, dim=1)

    def forward(self, inputs):
        """Return ``fc(dropout(feature(inputs)))``."""
        pooled = self.feature(inputs)
        return self.fc(self.fc_dropout(pooled))

In [24]:
# ====================================================
# Model
# ====================================================
class ELECTRAModel(nn.Module):
    """Backbone encoder with masked mean pooling, LayerNorm and a linear output layer.

    Args:
        cfg: Config object; ``fc_dropout`` and ``target_size`` are always
            read, ``model`` only when the config or weights come from
            ``cfg.model``.
        config_path: Optional path to a saved config object.
        pretrained: Load backbone weights from ``cfg.model`` when true,
            otherwise build a randomly initialised backbone from the config.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        # NOTE(review): torch.load unpickles the file; trusted sources only.
        self.config = (
            AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            if config_path is None
            else torch.load(config_path)
        )
        self.model = (
            AutoModel.from_pretrained(cfg.model, config=self.config)
            if pretrained
            else AutoModel.from_config(self.config)
        )
        hidden_size = self.config.hidden_size
        # fc_dropout, attention and linear are not used by forward(); only
        # feature() touches attention.
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(hidden_size, self.cfg.target_size)
        self._init_weights(self.fc)
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )
        self.layer_norm1 = nn.LayerNorm(hidden_size)
        self._init_weights(self.attention)
        self.linear = nn.Linear(hidden_size, 1)

    def _init_weights(self, module):
        """Initialise a Linear, Embedding or LayerNorm layer in place; ignore other types."""
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            return
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    def feature(self, inputs):
        """Return the attention-weighted sum of the backbone's first output over dim 1."""
        hidden = self.model(**inputs)[0]
        position_weights = self.attention(hidden)
        return torch.sum(position_weights * hidden, dim=1)

    def forward(self, inputs):
        """Return ``fc(layer_norm1(masked mean of the backbone's first output))``."""
        hidden = self.model(**inputs)[0]
        mask = inputs["attention_mask"].unsqueeze(-1).expand(hidden.size()).float()
        token_sum = torch.sum(hidden * mask, 1)
        # Clamp so an all-padding row cannot divide by zero.
        token_count = torch.clamp(mask.sum(1), min=1e-9)
        pooled = self.layer_norm1(token_sum / token_count)
        return self.fc(pooled)

In [25]:
# ====================================================
# Model
# ====================================================
class LUKEModel(nn.Module):
    """LUKE backbone with plain mean pooling and a linear output layer.

    Args:
        cfg: Config object; ``fc_dropout`` and ``target_size`` are always
            read, ``model`` only when the config or weights come from
            ``cfg.model``.
        config_path: Optional path to a saved config object.
        pretrained: Load backbone weights from ``cfg.model`` when true,
            otherwise build a randomly initialised backbone from the config.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        # The notebook's import cell does not bring these names into scope,
        # so instantiating the class raised NameError; import them locally.
        from transformers import LukeConfig, LukeModel
        self.cfg = cfg
        if config_path is None:
            self.config = LukeConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            # NOTE(review): torch.load unpickles the file; trusted sources only.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = LukeModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = LukeModel(self.config)
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)

    def feature(self, inputs):
        """Return the unmasked mean of the backbone's first output over dim 1."""
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = torch.mean(last_hidden_states, 1)
        return feature

    def forward(self, inputs):
        """Return ``fc(dropout(feature(inputs)))``."""
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output

In [26]:
# ====================================================
# Model
# ====================================================
class BARTModel(nn.Module):
    """BART backbone with plain mean pooling and a linear output layer.

    Args:
        cfg: Config object; ``fc_dropout`` and ``target_size`` are always
            read, ``model`` only when the config or weights come from
            ``cfg.model``.
        config_path: Optional path to a saved config object.
        pretrained: Load backbone weights from ``cfg.model`` when true,
            otherwise build a randomly initialised backbone from the config.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        # The notebook's import cell does not bring these names into scope,
        # so instantiating the class raised NameError; import them locally.
        from transformers import BartConfig, BartModel
        self.cfg = cfg
        if config_path is None:
            self.config = BartConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            # NOTE(review): torch.load unpickles the file; trusted sources only.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = BartModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = BartModel(self.config)
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)

    def feature(self, inputs):
        """Return the unmasked mean of the backbone's first output over dim 1."""
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = torch.mean(last_hidden_states, 1)
        return feature

    def forward(self, inputs):
        """Return ``fc(dropout(feature(inputs)))``."""
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output

# 1. DeBERTa-v3-small, 4 folds, transformer head

In [27]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for model 1 (deberta-v3-small with the transformer head)."""

    # --- data loading ---
    num_workers = 4
    # --- artefact locations ---
    config_path = '../input/uspppmdebertasmalltransformer-head08440/USPPPM-deberta-small-transformer_head-0.8440/outputconfig.pth'
    model = "../input/deberta-v3-small/deberta-v3-small"
    # NOTE(review): the tokenizer comes from the deberta-large transformer-head
    # dataset, not the small one — presumably the vocabulary is shared; confirm.
    tokenizer_path = '../input/uspppmdebertaltransformer-head08624/outputtokenizer'
    batch_size = 32
    # --- model head ---
    fc_dropout = 0.2
    target_size = 1
    # Fixed token length every input is padded to.
    max_len = 133
    seed = 42
    # --- folds to ensemble ---
    n_fold = 4
    trn_fold = list(range(n_fold))

In [28]:
# ====================================================
# tokenizer
# ====================================================
# Attach the tokenizer to CFG so prepare_input can reach it as `cfg.tokenizer`.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_path)

In [29]:
# Loader for model 1. shuffle=False keeps predictions aligned with the rows of `test`.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    model = Th(CFG, config_path=CFG.config_path, pretrained=False)
    # map_location='cpu' lets a checkpoint that holds CUDA tensors load on the
    # CPU fallback and keeps the temporary copy off the GPU; inference_fn
    # moves the model to `device` afterwards.
    state = torch.load(f"../input/uspppmdebertasmalltransformer-head08440/USPPPM-deberta-small-transformer_head-0.8440/output-tmp-Allen-Pycharm-input-deberta-v3-small_fold{fold}_best.pth",
                       map_location='cpu')
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Release this fold's weights before the next model is built.
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()
# Ensemble the folds by simple averaging.
pred1 = np.mean(predictions, axis=0)

Some weights of the model checkpoint at ../input/deberta-v3-small/deberta-v3-small were not used when initializing DebertaV2ForTokenClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.weight', 'mask_predictions.classifer.bias', 'lm_predictions.lm_head.dense.bias', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifer.weight']
- This IS expected if you are initializing DebertaV2ForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForTokenClassification from the checkpoint of a model that you expect to be exactly identical (in

# 3. BERT for Patents, 4 folds

In [30]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for model 3 (bert-for-patents, attention-pooling head)."""

    # --- data loading ---
    num_workers = 4
    # --- artefact locations ---
    config_path = '../input/uspppmbertforpatents/bert-for-patents/outputconfig.pth'
    tokenizer_path = '../input/uspppmbertforpatents/bert-for-patents/outputtokenizer/'
    # `model` is intentionally absent: the cell that consumes this CFG builds
    # the backbone from `config_path` with pretrained=False.
    batch_size = 32
    # --- model head ---
    fc_dropout = 0.2
    target_size = 1
    # Fixed token length every input is padded to.
    max_len = 117
    seed = 42
    # --- folds to ensemble ---
    n_fold = 4
    trn_fold = list(range(n_fold))

In [31]:
# Attach the tokenizer to CFG so prepare_input can reach it as `cfg.tokenizer`.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_path)

In [32]:
# Loader for model 3. shuffle=False keeps predictions aligned with the rows of `test`.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    # map_location='cpu' lets a checkpoint that holds CUDA tensors load on the
    # CPU fallback and keeps the temporary copy off the GPU; inference_fn
    # moves the model to `device` afterwards.
    state = torch.load(f"../input/uspppmbertforpatents/bert-for-patents/output-tmp-pycharm_project_926-input-bert-for-patents_fold{fold}_best.pth",
                       map_location='cpu')
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Release this fold's weights before the next model is built.
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()
# Ensemble the folds by simple averaging.
pred3 = np.mean(predictions, axis=0)

100%|██████████| 2/2 [00:00<00:00,  2.18it/s]
100%|██████████| 2/2 [00:00<00:00,  2.13it/s]
100%|██████████| 2/2 [00:00<00:00,  2.21it/s]
100%|██████████| 2/2 [00:00<00:00,  2.17it/s]


# 4. DeBERTa-v3-large, 4 folds

In [33]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for model 4 (deberta-v3-large, attention-pooling head)."""

    # --- data loading ---
    num_workers = 4
    # --- artefact locations ---
    config_path = '../input/uspppmdebertav3large4foldscv08612/uspppm-deberta-large-4folds-0.8612/outputconfig.pth'
    # `model` is intentionally absent: the cell that consumes this CFG builds
    # the backbone from `config_path` with pretrained=False.
    tokenizer_path = '../input/uspppmdebertaltransformer-head08624/outputtokenizer'
    batch_size = 32
    # --- model head ---
    fc_dropout = 0.2
    target_size = 1
    # Fixed token length every input is padded to.
    max_len = 133
    seed = 42
    # --- folds to ensemble ---
    n_fold = 4
    trn_fold = list(range(n_fold))

In [34]:
# Attach the tokenizer to CFG so prepare_input can reach it as `cfg.tokenizer`.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_path)

In [35]:
# Loader for model 4. shuffle=False keeps predictions aligned with the rows of `test`.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    # map_location='cpu' lets a checkpoint that holds CUDA tensors load on the
    # CPU fallback and keeps the temporary copy off the GPU; inference_fn
    # moves the model to `device` afterwards.
    state = torch.load(f'../input/uspppmdebertav3large4foldscv08612/uspppm-deberta-large-4folds-0.8612/output-tmp-Allen-Pycharm-input-deberta-v3-large_fold{fold}_best.pth',
                       map_location='cpu')
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Release this fold's weights before the next model is built.
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()
# Ensemble the folds by simple averaging.
pred4 = np.mean(predictions, axis=0)

100%|██████████| 2/2 [00:01<00:00,  1.67it/s]
100%|██████████| 2/2 [00:01<00:00,  1.79it/s]
100%|██████████| 2/2 [00:01<00:00,  1.79it/s]
100%|██████████| 2/2 [00:01<00:00,  1.74it/s]


# 6. DeBERTa-v3-large, transformer head

In [36]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for model 6 (deberta-v3-large with the transformer head)."""

    # --- data loading ---
    num_workers = 4
    # --- artefact locations ---
    config_path = '../input/uspppmdebertaltransformer-head08624/outputconfig.pth'
    model = '../input/deberta-v3-large/deberta-v3-large'
    tokenizer_path = '../input/uspppmdebertaltransformer-head08624/outputtokenizer'
    batch_size = 32
    # --- model head ---
    fc_dropout = 0.2
    target_size = 1
    # Fixed token length every input is padded to.
    max_len = 133
    seed = 42
    # --- folds to ensemble ---
    n_fold = 4
    trn_fold = list(range(n_fold))

In [37]:
# Attach the tokenizer to CFG so prepare_input can reach it as `cfg.tokenizer`.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_path)

In [38]:
# Loader for model 6. shuffle=False keeps predictions aligned with the rows of `test`.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    model = Th(CFG, config_path=CFG.config_path, pretrained=False)
    # map_location='cpu' lets a checkpoint that holds CUDA tensors load on the
    # CPU fallback and keeps the temporary copy off the GPU; inference_fn
    # moves the model to `device` afterwards.
    state = torch.load(f'../input/uspppmdebertaltransformer-head08624/USPPPM-deberta-large-transformer_head-0.8624/output-tmp-Allen-Pycharm-input-deberta-v3-large_fold{fold}_best.pth',
                       map_location='cpu')
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Release this fold's weights before the next model is built.
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()
# Ensemble the folds by simple averaging.
pred6 = np.mean(predictions, axis=0)

Some weights of the model checkpoint at ../input/deberta-v3-large/deberta-v3-large were not used when initializing DebertaV2ForTokenClassification: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'mask_predictions.classifer.bias', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifer.weight']
- This IS expected if you are initializing DebertaV2ForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForTokenClassification from the checkpoint of a model that you expect to be exactly identical (in

# 7. DeBERTa-v3-base, transformer head (0.8503)

In [39]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for model 7 (deberta-v3-base with the transformer head)."""

    # --- data loading ---
    num_workers = 4
    # --- artefact locations ---
    config_path = '../input/uspppmdebertabasetransformer-head08503/USPPPM-deberta-base-transformer_head-0.8503/outputconfig.pth'
    model = '../input/deberta-v3-base/deberta-v3-base'
    # NOTE(review): the tokenizer comes from the deberta-large transformer-head
    # dataset, not the base one — presumably the vocabulary is shared; confirm.
    tokenizer_path = '../input/uspppmdebertaltransformer-head08624/outputtokenizer'
    batch_size = 32
    # --- model head ---
    fc_dropout = 0.2
    target_size = 1
    # Fixed token length every input is padded to.
    max_len = 133
    seed = 42
    # --- folds to ensemble ---
    n_fold = 4
    trn_fold = list(range(n_fold))

In [40]:
# Attach the tokenizer to CFG so prepare_input can reach it as `cfg.tokenizer`.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_path)

In [41]:
# Loader for model 7. shuffle=False keeps predictions aligned with the rows of `test`.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    model = Th(CFG, config_path=CFG.config_path, pretrained=False)
    # map_location='cpu' lets a checkpoint that holds CUDA tensors load on the
    # CPU fallback and keeps the temporary copy off the GPU; inference_fn
    # moves the model to `device` afterwards.
    state = torch.load(f'../input/uspppmdebertabasetransformer-head08503/USPPPM-deberta-base-transformer_head-0.8503/output-tmp-Allen-Pycharm-input-deberta-v3-base_fold{fold}_best.pth',
                       map_location='cpu')
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Release this fold's weights before the next model is built.
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()
# Ensemble the folds by simple averaging.
pred7 = np.mean(predictions, axis=0)

Some weights of the model checkpoint at ../input/deberta-v3-base/deberta-v3-base were not used when initializing DebertaV2ForTokenClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.weight', 'mask_predictions.classifer.bias', 'lm_predictions.lm_head.dense.bias', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifer.weight']
- This IS expected if you are initializing DebertaV2ForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForTokenClassification from the checkpoint of a model that you expect to be exactly identical (init

# 8. BERT for Patents, transformer head (0.8493)

In [42]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for model 8 (bert-for-patents with the transformer head)."""

    # --- data loading ---
    num_workers = 4
    # --- artefact locations ---
    # `path` holds the same value as `config_path`; the cell that consumes
    # this CFG only reads `config_path`.
    path = "../input/uspppmberttransformer-head08493/USPPPM-bert-for-patents-transformer_head--0.8493/outputconfig.pth"
    config_path = '../input/uspppmberttransformer-head08493/USPPPM-bert-for-patents-transformer_head--0.8493/outputconfig.pth'
    model = '../input/bert-for-patents'
    tokenizer_path = '../input/uspppmberttransformer-head08493/USPPPM-bert-for-patents-transformer_head--0.8493/outputtokenizer'
    batch_size = 32
    # --- model head ---
    fc_dropout = 0.2
    target_size = 1
    # Fixed token length every input is padded to.
    max_len = 117
    seed = 42
    # --- folds to ensemble ---
    n_fold = 4
    trn_fold = list(range(n_fold))

In [43]:
# Attach the tokenizer to CFG so prepare_input can reach it as `cfg.tokenizer`.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_path)

In [44]:
# Loader for model 8. shuffle=False keeps predictions aligned with the rows of `test`.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    model = Th(CFG, config_path=CFG.config_path, pretrained=False)
    # map_location='cpu' lets a checkpoint that holds CUDA tensors load on the
    # CPU fallback and keeps the temporary copy off the GPU; inference_fn
    # moves the model to `device` afterwards.
    state = torch.load(f'../input/uspppmberttransformer-head08493/USPPPM-bert-for-patents-transformer_head--0.8493/output-tmp-Allen-Pycharm-input-bert-for-patents_fold{fold}_best.pth',
                       map_location='cpu')
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Release this fold's weights before the next model is built.
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()
# Ensemble the folds by simple averaging.
pred8 = np.mean(predictions, axis=0)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at ../input/bert-for-patents and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 2/2 [00:01<00:00,  1.81it/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at ../input/bert-for-patents and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 2/2 [00:01<00:00,  1.76it/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at ../input/bert-for-patents and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|█████████

# 10. RoBERTa-large

In [45]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for the RoBERTa-large ensemble member (4 fold checkpoints)."""

    # Saved artifacts of the trained model
    config_path = '../input/uspppmrobertalarge08406/roberta/config.pth'
    tokenizer_path = '../input/uspppmrobertalarge08406/roberta/tokenizer'

    # Passed to the DataLoader in the inference cell
    num_workers = 4
    batch_size = 32

    # Read by the dataset / model classes defined elsewhere in the notebook
    max_len = 175
    fc_dropout = 0.2
    target_size = 1

    # Fold bookkeeping; trn_fold lists the checkpoints that get averaged
    seed = 42
    n_fold = 4
    trn_fold = [0, 1, 2, 3]

In [46]:
# Attach the tokenizer stored under CFG.tokenizer_path to the config class;
# the same CFG object is handed to TestDataset(CFG, test) in the next cell.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_path)

In [47]:
# RoBERTa-large member: run every fold checkpoint over the test set and
# average the per-fold predictions into pred10.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,  # keep dataset order across folds so the fold average is row-aligned
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    # map_location='cpu' (as the Electra cell already does): deserialize on the CPU
    # whatever device the checkpoint was saved from, so loading also works without
    # CUDA and no spare copy of the weights sits on the GPU during inference.
    # load_state_dict copies the values into the model's own parameters.
    state = torch.load(f'../input/uspppmrobertalarge08406/roberta/-tmp-Allen-input-roberta-large_fold{fold}_best.pth',
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Free this fold's objects before the next checkpoint is loaded.
    del model, state, prediction; gc.collect()
    torch.cuda.empty_cache()
pred10 = np.mean(predictions, axis=0)

100%|██████████| 2/2 [00:01<00:00,  1.86it/s]
100%|██████████| 2/2 [00:01<00:00,  1.77it/s]
100%|██████████| 2/2 [00:01<00:00,  1.81it/s]
100%|██████████| 2/2 [00:01<00:00,  1.80it/s]


# 11. mnli-deberta-large

In [48]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for the deberta-large-mnli ensemble member (4 fold checkpoints)."""

    # Saved artifacts of the trained model
    config_path = '../input/uspppmmnlidebertalarge08504/USPPPM-mnli-deberta-large/config.pth'
    tokenizer_path = '../input/uspppmmnlidebertalarge08504/USPPPM-mnli-deberta-large/tokenizer'

    # Passed to the DataLoader in the inference cell
    num_workers = 4
    batch_size = 32

    # Read by the dataset / model classes defined elsewhere in the notebook
    max_len = 175
    fc_dropout = 0.2
    target_size = 1

    # Fold bookkeeping; trn_fold lists the checkpoints that get averaged
    seed = 42
    n_fold = 4
    trn_fold = [0, 1, 2, 3]

In [49]:
# Attach the tokenizer stored under CFG.tokenizer_path to the config class;
# the same CFG object is handed to TestDataset(CFG, test) in the next cell.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_path)

In [50]:
# deberta-large-mnli member: run every fold checkpoint over the test set and
# average the per-fold predictions into pred11.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,  # keep dataset order across folds so the fold average is row-aligned
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    # map_location='cpu' (as the Electra cell already does): deserialize on the CPU
    # whatever device the checkpoint was saved from, so loading also works without
    # CUDA and no spare copy of the weights sits on the GPU during inference.
    # load_state_dict copies the values into the model's own parameters.
    state = torch.load(f'../input/uspppmmnlidebertalarge08504/USPPPM-mnli-deberta-large/-root-autodl-tmp-input-deberta-large-mnli_fold{fold}_best.pth',
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Free this fold's objects before the next checkpoint is loaded.
    del model, state, prediction; gc.collect()
    torch.cuda.empty_cache()
pred11 = np.mean(predictions, axis=0)

100%|██████████| 2/2 [00:01<00:00,  1.42it/s]
100%|██████████| 2/2 [00:01<00:00,  1.56it/s]
100%|██████████| 2/2 [00:01<00:00,  1.43it/s]
100%|██████████| 2/2 [00:01<00:00,  1.55it/s]


# 12. roberta-base

In [51]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for the roberta-base ensemble member (4 fold checkpoints)."""

    # Saved artifacts of the trained model
    config_path = '../input/uspppmrobertabase08063/USPPPM-roberta-base/config.pth'
    tokenizer_path = '../input/uspppmrobertabase08063/USPPPM-roberta-base/tokenizer'

    # Passed to the DataLoader in the inference cell
    num_workers = 4
    batch_size = 32

    # Read by the dataset / model classes defined elsewhere in the notebook
    max_len = 175
    fc_dropout = 0.2
    target_size = 1

    # Fold bookkeeping; trn_fold lists the checkpoints that get averaged
    seed = 42
    n_fold = 4
    trn_fold = [0, 1, 2, 3]

In [52]:
# Attach the tokenizer stored under CFG.tokenizer_path to the config class;
# the same CFG object is handed to TestDataset(CFG, test) in the next cell.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_path)

In [53]:
# roberta-base member: run every fold checkpoint over the test set and
# average the per-fold predictions into pred12.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,  # keep dataset order across folds so the fold average is row-aligned
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    # map_location='cpu' (as the Electra cell already does): deserialize on the CPU
    # whatever device the checkpoint was saved from, so loading also works without
    # CUDA and no spare copy of the weights sits on the GPU during inference.
    # load_state_dict copies the values into the model's own parameters.
    state = torch.load(f'../input/uspppmrobertabase08063/USPPPM-roberta-base/-root-autodl-tmp-input-roberta-base_fold{fold}_best.pth',
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Free this fold's objects before the next checkpoint is loaded.
    del model, state, prediction; gc.collect()
    torch.cuda.empty_cache()
pred12 = np.mean(predictions, axis=0)

100%|██████████| 2/2 [00:00<00:00,  3.02it/s]
100%|██████████| 2/2 [00:00<00:00,  2.92it/s]
100%|██████████| 2/2 [00:00<00:00,  2.40it/s]
100%|██████████| 2/2 [00:00<00:00,  2.92it/s]


# 13. deberta-mnli-th

In [54]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for the deberta-large-mnli transformer-head member (4 fold checkpoints)."""

    # Saved artifacts of the trained model
    config_path = '../input/uspppmmnlith08477/USPPPM-mnli-th-0.8477/config.pth'
    tokenizer_path = '../input/uspppmmnlith08477/USPPPM-mnli-th-0.8477/tokenizer'
    # Local copy of the base model; unlike most other configs here, `model` is set
    model = "../input/microsoft-deberta-large-mnli"

    # Passed to the DataLoader in the inference cell
    num_workers = 4
    batch_size = 32

    # Read by the dataset / model classes defined elsewhere in the notebook
    max_len = 175
    fc_dropout = 0.2
    target_size = 1

    # Fold bookkeeping; trn_fold lists the checkpoints that get averaged
    seed = 42
    n_fold = 4
    trn_fold = [0, 1, 2, 3]

In [55]:
# Attach the tokenizer stored under CFG.tokenizer_path to the config class;
# the same CFG object is handed to TestDataset(CFG, test) in the next cell.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_path)

In [56]:
# deberta-large-mnli transformer-head member: run every fold checkpoint over
# the test set and average the per-fold predictions into pred13.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,  # keep dataset order across folds so the fold average is row-aligned
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    model = Th(CFG, config_path=CFG.config_path, pretrained=False)
    # map_location='cpu' (as the Electra cell already does): deserialize on the CPU
    # whatever device the checkpoint was saved from, so loading also works without
    # CUDA and no spare copy of the weights sits on the GPU during inference.
    # load_state_dict copies the values into the model's own parameters.
    state = torch.load(f'../input/uspppmmnlith08477/USPPPM-mnli-th-0.8477/-root-autodl-tmp-input-deberta-large-mnli_fold{fold}_best.pth',
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Free this fold's objects before the next checkpoint is loaded.
    del model, state, prediction; gc.collect()
    torch.cuda.empty_cache()
pred13 = np.mean(predictions, axis=0)

Some weights of DebertaForTokenClassification were not initialized from the model checkpoint at ../input/microsoft-deberta-large-mnli and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 2/2 [00:01<00:00,  1.27it/s]
Some weights of DebertaForTokenClassification were not initialized from the model checkpoint at ../input/microsoft-deberta-large-mnli and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 2/2 [00:01<00:00,  1.29it/s]
Some weights of DebertaForTokenClassification were not initialized from the model checkpoint at ../input/microsoft-deberta-large-mnli and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it 

# 16. MSE Deberta Large

In [57]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for the MSE-trained deberta-v3-large member (4 fold checkpoints)."""

    # Saved artifacts of the trained model
    config_path = '../input/uspppmmsedebertav3large/MSE-deberta-large-0.8457/config.pth'
    tokenizer_path = '../input/uspppmmsedebertav3large/MSE-deberta-large-0.8457/tokenizer'

    # Passed to the DataLoader in the inference cell
    num_workers = 4
    batch_size = 32

    # Read by the dataset / model classes defined elsewhere in the notebook
    max_len = 133
    fc_dropout = 0.2
    target_size = 1

    # Fold bookkeeping; trn_fold lists the checkpoints that get averaged
    seed = 42
    n_fold = 4
    trn_fold = [0, 1, 2, 3]

In [58]:
# Attach the tokenizer stored under CFG.tokenizer_path to the config class;
# the same CFG object is handed to TestDataset(CFG, test) in the next cell.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_path)

In [59]:
# MSE deberta-v3-large member: run every fold checkpoint over the test set
# (via the MSE-specific model / inference function) and average into pred16.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,  # keep dataset order across folds so the fold average is row-aligned
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    model = MSEModel(CFG, config_path=CFG.config_path, pretrained=False)
    # map_location='cpu' (as the Electra cell already does): deserialize on the CPU
    # whatever device the checkpoint was saved from, so loading also works without
    # CUDA and no spare copy of the weights sits on the GPU during inference.
    # load_state_dict copies the values into the model's own parameters.
    state = torch.load(f'../input/uspppmmsedebertav3large/MSE-deberta-large-0.8457/-tmp-Allen-input-deberta-v3-large_fold{fold}_best.pth',
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn_mse(test_loader, model, device)
    predictions.append(prediction)
    # Free this fold's objects before the next checkpoint is loaded.
    del model, state, prediction; gc.collect()
    torch.cuda.empty_cache()
pred16 = np.mean(predictions, axis=0)

100%|██████████| 2/2 [00:01<00:00,  1.84it/s]
100%|██████████| 2/2 [00:01<00:00,  1.76it/s]
100%|██████████| 2/2 [00:01<00:00,  1.53it/s]
100%|██████████| 2/2 [00:01<00:00,  1.71it/s]


# 18. patentSBERTa

In [60]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for the PatentSBERTa ensemble member (4 fold checkpoints)."""

    # Saved artifacts of the trained model.
    # NOTE(review): the tokenizer path points at the dataset root while the config
    # lives under patentSBERTa/ — confirm this asymmetry is intentional.
    config_path = '../input/uspppmpatentsberta08313/patentSBERTa/config.pth'
    tokenizer_path = '../input/uspppmpatentsberta08313/tokenizer'

    # Passed to the DataLoader in the inference cell
    num_workers = 4
    batch_size = 32

    # Read by the dataset / model classes defined elsewhere in the notebook
    max_len = 125
    fc_dropout = 0.2
    target_size = 1

    # Fold bookkeeping; trn_fold lists the checkpoints that get averaged
    seed = 42
    n_fold = 4
    trn_fold = [0, 1, 2, 3]

In [61]:
# Attach the tokenizer stored under CFG.tokenizer_path to the config class;
# the same CFG object is handed to TestDataset(CFG, test) in the next cell.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_path)

In [62]:
# PatentSBERTa member: run every fold checkpoint over the test set and
# average the per-fold predictions into pred18.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,  # keep dataset order across folds so the fold average is row-aligned
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    # map_location='cpu' (as the Electra cell already does): deserialize on the CPU
    # whatever device the checkpoint was saved from, so loading also works without
    # CUDA and no spare copy of the weights sits on the GPU during inference.
    # load_state_dict copies the values into the model's own parameters.
    state = torch.load(f'../input/uspppmpatentsberta08313/patentSBERTa/-root-autodl-tmp-input-PatentSBERTa_fold{fold}_best.pth',
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Free this fold's objects before the next checkpoint is loaded.
    del model, state, prediction; gc.collect()
    torch.cuda.empty_cache()
pred18 = np.mean(predictions, axis=0)

100%|██████████| 2/2 [00:00<00:00,  2.25it/s]
100%|██████████| 2/2 [00:00<00:00,  3.24it/s]
100%|██████████| 2/2 [00:00<00:00,  3.19it/s]
100%|██████████| 2/2 [00:00<00:00,  3.32it/s]


# 19. Electra

In [63]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for the electra-large ensemble member (4 fold checkpoints)."""

    # Saved artifacts of the trained model
    config_path = '../input/uspppmelectralarge/electra-large-0.8452/config.pth'
    tokenizer_path = '../input/uspppmelectralarge/electra-large-0.8452/tokenizer'

    # Passed to the DataLoader in the inference cell
    num_workers = 4
    batch_size = 32

    # Read by the dataset / model classes defined elsewhere in the notebook
    max_len = 125
    fc_dropout = 0.2
    target_size = 1

    # Fold bookkeeping; trn_fold lists the checkpoints that get averaged
    seed = 42
    n_fold = 4
    trn_fold = [0, 1, 2, 3]

In [64]:
# Attach the tokenizer stored under CFG.tokenizer_path to the config class;
# the same CFG object is handed to TestDataset(CFG, test) in the next cell.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_path)

In [65]:
# electra-large member: score the test set once per fold checkpoint, then
# average the per-fold outputs into pred19.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    shuffle=False,
    num_workers=CFG.num_workers,
    pin_memory=True,
    drop_last=False,
)
predictions = []
for fold in CFG.trn_fold:
    net = ELECTRAModel(CFG, config_path=CFG.config_path, pretrained=False)
    # The checkpoint is deserialized on the CPU; only its 'model' entry is used.
    checkpoint = torch.load(
        f'../input/uspppmelectralarge/electra-large-0.8452/-root-autodl-tmp-input-electra-large_fold{fold}_best.pth',
        map_location=torch.device('cpu'),
    )
    net.load_state_dict(checkpoint['model'])
    fold_pred = inference_fn(test_loader, net, device)
    predictions.append(fold_pred)
    # Drop this fold's objects and release cached GPU memory before the next one.
    del net, checkpoint, fold_pred
    gc.collect()
    torch.cuda.empty_cache()
pred19 = np.mean(predictions, axis=0)

100%|██████████| 2/2 [00:00<00:00,  2.10it/s]
100%|██████████| 2/2 [00:01<00:00,  1.85it/s]
100%|██████████| 2/2 [00:00<00:00,  2.08it/s]
100%|██████████| 2/2 [00:00<00:00,  2.04it/s]


# 20. ernie

In [66]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for the ernie-large ensemble member (4 fold checkpoints)."""

    # Saved artifacts of the trained model
    config_path = '../input/uspppmernielarge/ernie-large-ml=125/config.pth'
    tokenizer_path = '../input/uspppmernielarge/ernie-large-ml=125/tokenizer'

    # Passed to the DataLoader in the inference cell
    num_workers = 4
    batch_size = 32

    # Read by the dataset / model classes defined elsewhere in the notebook
    max_len = 125
    fc_dropout = 0.2
    target_size = 1

    # Fold bookkeeping; trn_fold lists the checkpoints that get averaged
    seed = 42
    n_fold = 4
    trn_fold = [0, 1, 2, 3]

In [67]:
# Attach the tokenizer stored under CFG.tokenizer_path to the config class;
# the same CFG object is handed to TestDataset(CFG, test) in the next cell.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_path)

In [68]:
# ernie-large member: run every fold checkpoint over the test set and
# average the per-fold predictions into pred20.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,  # keep dataset order across folds so the fold average is row-aligned
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    # NOTE(review): the ERNIE weights are loaded into the ELECTRAModel wrapper —
    # presumably a generic backbone wrapper; confirm against its definition.
    model = ELECTRAModel(CFG, config_path=CFG.config_path, pretrained=False)
    # map_location='cpu' (as the Electra cell already does): deserialize on the CPU
    # whatever device the checkpoint was saved from, so loading also works without
    # CUDA and no spare copy of the weights sits on the GPU during inference.
    # load_state_dict copies the values into the model's own parameters.
    state = torch.load(f'../input/uspppmernielarge/ernie-large-ml=125/-root-autodl-tmp-input-ernie-large_fold{fold}_best.pth',
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Free this fold's objects before the next checkpoint is loaded.
    del model, state, prediction; gc.collect()
    torch.cuda.empty_cache()
pred20 = np.mean(predictions, axis=0)

100%|██████████| 2/2 [00:00<00:00,  2.10it/s]
100%|██████████| 2/2 [00:00<00:00,  2.09it/s]
100%|██████████| 2/2 [00:00<00:00,  2.08it/s]
100%|██████████| 2/2 [00:00<00:00,  2.02it/s]


# 21. funnel

In [69]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for the funnel ensemble member (4 fold checkpoints)."""

    # Saved artifacts of the trained model
    config_path = '../input/uspppmfunnel/funnel-ml=125/config.pth'
    tokenizer_path = '../input/uspppmfunnel/funnel-ml=125/tokenizer'

    # Passed to the DataLoader in the inference cell
    num_workers = 4
    batch_size = 32

    # Read by the dataset / model classes defined elsewhere in the notebook
    max_len = 125
    fc_dropout = 0.2
    target_size = 1

    # Fold bookkeeping; trn_fold lists the checkpoints that get averaged
    seed = 42
    n_fold = 4
    trn_fold = [0, 1, 2, 3]

In [70]:
# Attach the tokenizer stored under CFG.tokenizer_path to the config class;
# the same CFG object is handed to TestDataset(CFG, test) in the next cell.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_path)

In [71]:
# funnel member: run every fold checkpoint over the test set and
# average the per-fold predictions into pred21.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,  # keep dataset order across folds so the fold average is row-aligned
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    # NOTE(review): the funnel weights are loaded into the ELECTRAModel wrapper —
    # presumably a generic backbone wrapper; confirm against its definition.
    model = ELECTRAModel(CFG, config_path=CFG.config_path, pretrained=False)
    # map_location='cpu' (as the Electra cell already does): deserialize on the CPU
    # whatever device the checkpoint was saved from, so loading also works without
    # CUDA and no spare copy of the weights sits on the GPU during inference.
    # load_state_dict copies the values into the model's own parameters.
    state = torch.load(f'../input/uspppmfunnel/funnel-ml=125/-root-autodl-tmp-input-funnel_fold{fold}_best.pth',
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Free this fold's objects before the next checkpoint is loaded.
    del model, state, prediction; gc.collect()
    torch.cuda.empty_cache()
pred21 = np.mean(predictions, axis=0)

100%|██████████| 2/2 [00:00<00:00,  2.17it/s]
100%|██████████| 2/2 [00:00<00:00,  2.15it/s]
100%|██████████| 2/2 [00:00<00:00,  2.30it/s]
100%|██████████| 2/2 [00:00<00:00,  2.18it/s]


# 22. luke

In [72]:
from transformers import LukeTokenizer, LukeModel, LukeConfig

# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for the luke-large ensemble member (4 fold checkpoints)."""

    # Saved artifacts of the trained model
    config_path = '../input/uspppmluke/luke-large-ml=175/config.pth'
    tokenizer_path = '../input/uspppmluke/luke-large-ml=175/tokenizer'

    # Passed to the DataLoader in the inference cell
    num_workers = 4
    batch_size = 32

    # Read by the dataset / model classes defined elsewhere in the notebook
    max_len = 175
    fc_dropout = 0.2
    target_size = 1

    # Fold bookkeeping; trn_fold lists the checkpoints that get averaged
    seed = 42
    n_fold = 4
    trn_fold = [0, 1, 2, 3]

In [73]:
# LUKE uses its dedicated tokenizer class rather than AutoTokenizer; attach it to
# the config class that is handed to TestDataset(CFG, test) in the next cell.
CFG.tokenizer = LukeTokenizer.from_pretrained(CFG.tokenizer_path)

In [74]:
# luke-large member: run every fold checkpoint over the test set and
# average the per-fold predictions into pred22.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,  # keep dataset order across folds so the fold average is row-aligned
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    model = LUKEModel(CFG, config_path=CFG.config_path, pretrained=False)
    # map_location='cpu' (as the Electra cell already does): deserialize on the CPU
    # whatever device the checkpoint was saved from, so loading also works without
    # CUDA and no spare copy of the weights sits on the GPU during inference.
    # load_state_dict copies the values into the model's own parameters.
    state = torch.load(f'../input/uspppmluke/luke-large-ml=175/studio-ousia-luke-large_fold{fold}_best.pth',
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Free this fold's objects before the next checkpoint is loaded.
    del model, state, prediction; gc.collect()
    torch.cuda.empty_cache()
pred22 = np.mean(predictions, axis=0)

100%|██████████| 2/2 [00:01<00:00,  1.50it/s]
100%|██████████| 2/2 [00:01<00:00,  1.71it/s]
100%|██████████| 2/2 [00:01<00:00,  1.68it/s]
100%|██████████| 2/2 [00:01<00:00,  1.69it/s]


# 23. bart large

In [75]:
from transformers import BartTokenizer, BartModel, BartConfig

# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for the bart-large ensemble member (4 fold checkpoints)."""

    # Saved artifacts of the trained model
    config_path = '../input/uspppmbartlarge/bart-ml=174/config.pth'
    tokenizer_path = '../input/uspppmbartlarge/bart-ml=174/tokenizer'

    # Passed to the DataLoader in the inference cell
    num_workers = 4
    batch_size = 32

    # Read by the dataset / model classes defined elsewhere in the notebook
    max_len = 174
    fc_dropout = 0.2
    target_size = 1

    # Fold bookkeeping; trn_fold lists the checkpoints that get averaged
    seed = 42
    n_fold = 4
    trn_fold = [0, 1, 2, 3]

In [76]:
# BART uses its dedicated tokenizer class rather than AutoTokenizer; attach it to
# the config class that is handed to TestDataset(CFG, test) in the next cell.
CFG.tokenizer = BartTokenizer.from_pretrained(CFG.tokenizer_path)

In [77]:
# bart-large member: run every fold checkpoint over the test set and
# average the per-fold predictions into pred23.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,  # keep dataset order across folds so the fold average is row-aligned
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    model = BARTModel(CFG, config_path=CFG.config_path, pretrained=False)
    # map_location='cpu' (as the Electra cell already does): deserialize on the CPU
    # whatever device the checkpoint was saved from, so loading also works without
    # CUDA and no spare copy of the weights sits on the GPU during inference.
    # load_state_dict copies the values into the model's own parameters.
    state = torch.load(f'../input/uspppmbartlarge/bart-ml=174/-root-autodl-tmp-input-bart-large_fold{fold}_best.pth',
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Free this fold's objects before the next checkpoint is loaded.
    del model, state, prediction; gc.collect()
    torch.cuda.empty_cache()
pred23 = np.mean(predictions, axis=0)

100%|██████████| 2/2 [00:01<00:00,  1.57it/s]
100%|██████████| 2/2 [00:01<00:00,  1.56it/s]
100%|██████████| 2/2 [00:01<00:00,  1.51it/s]
100%|██████████| 2/2 [00:01<00:00,  1.58it/s]


# Pred2 Ensemble

In [78]:
# Blend weights, one per ensemble member: wN multiplies the min-max scaled predN
# in the next cell. Commented-out entries are members left out of the blend
# (their predN is not used in the final sum).
w1 = 0.03605261 # deberta-small-transformer-head
#w2 = 0.01577283 # deberta-base
w3 = 0.18962274 # bert for patents
w4 = 0.11517471 # deberta-large
#w5 = 0.01215114 # deberta-xsmall transformer_head
w6 = 0.13769397 # deberta-large-transformer_head
w7 = 0.01575894 # deberta-base-transformer_head
w8 = 0.08336175 # bert for patents transformer_head
#w9 = 0.01323456 # roberta-large-th
w10 = 0.02831863 # roberta-large
w11 = 0.05526383 # mnli-deberta-large
w12 = 0.00992199 # roberta-base
w13 = 0.02017179 # mnli-th
#w14 = -0.03159355 # mse-deberta-small
#w15 = 0.02173322 # mse-deberta-base
w16 = 0.03500124 # mse-deberta-large
#w17 = 0.00915298 # mse-bert
w18 = 0.0057002 # patentSBERTa
w19 = 0.06964565 # electra
w20 = 0.03321846 # ernie
w21 = 0.07020563 # funnel
w22 = 0.02133232 # luke
w23 = 0.01152879 # bart large
# The bare string below is the cell's last expression, so it is echoed as the
# cell output; presumably the validation score reached with these weights —
# TODO confirm.
'''
0.8838650207947062
'''

'\n0.8838650207947062\n'

In [79]:
from sklearn.preprocessing import MinMaxScaler

MMscaler = MinMaxScaler()


def _rescale(raw_pred):
    """Min-max scale one model's prediction vector; re-fits the shared MMscaler on each call."""
    as_column = raw_pred.reshape(-1, 1)
    return MMscaler.fit_transform(as_column).reshape(-1)


# Each member is scaled independently before blending. Members 2, 5, 9, 14, 15
# and 17 are not part of the blend and therefore are not scaled here.
pred1_mm = _rescale(pred1)
pred3_mm = _rescale(pred3)
pred4_mm = _rescale(pred4)
pred6_mm = _rescale(pred6)
pred7_mm = _rescale(pred7)
pred8_mm = _rescale(pred8)
pred10_mm = _rescale(pred10)
pred11_mm = _rescale(pred11)
pred12_mm = _rescale(pred12)
pred13_mm = _rescale(pred13)
pred16_mm = _rescale(pred16)
pred18_mm = _rescale(pred18)
pred19_mm = _rescale(pred19)
pred20_mm = _rescale(pred20)
pred21_mm = _rescale(pred21)
pred22_mm = _rescale(pred22)
pred23_mm = _rescale(pred23)


# Weighted sum of the scaled members; terms are added left to right in model order.
final_predictions = (
    pred1_mm * w1
    + pred3_mm * w3
    + pred4_mm * w4
    + pred6_mm * w6
    + pred7_mm * w7
    + pred8_mm * w8
    + pred10_mm * w10
    + pred11_mm * w11
    + pred12_mm * w12
    + pred13_mm * w13
    + pred16_mm * w16
    + pred18_mm * w18
    + pred19_mm * w19
    + pred20_mm * w20
    + pred21_mm * w21
    + pred22_mm * w22
    + pred23_mm * w23
)

# Submission

In [81]:
# Load the competition's sample submission as the template for the output file.
# PATH is the same competition data directory as INPUT_DIR from the first cell.
PATH = '../input/us-patent-phrase-to-phrase-matching'
sub = pd.read_csv(os.path.join(PATH, 'sample_submission.csv'))

In [82]:
# Fill the template with the blended scores and write the submission file.
# The assignment is positional: it assumes sample_submission.csv lists rows in the
# same order as `test` — TODO confirm (no id-based alignment is done here).
sub['score'] = final_predictions
sub.to_csv('submission.csv', index=False)

In [83]:
# Preview the first rows of the submission as a quick sanity check.
sub.head()

Unnamed: 0,id,score
0,4112d61851461f60,0.42914
1,09e418c93a776564,0.630015
2,36baf228038e314b,0.426603
3,1f37ead645e7f0c8,0.241545
4,71a5b6ad068d531f,0.018452
