### If this notebook is helpful, please upvote [the original version](https://www.kaggle.com/code/leehann/inference-bert-for-uspatents)! (score: 0.8392)

### Version 2-8: 

```
# === add np.median ===

add_preds = []
for x in zip(*upd_predictions):
    add_preds.append(np.median(x, axis=0))
    
upd_predictions.append(add_preds)

# === add np.mean ===

[...]
```

### I am trying to improve my score: 0.8393

### Version 9: 

```
def _upd_score_between(data, thresholds, value):
    mask_th = data.between(*thresholds, inclusive='both')
    data[mask_th] = value


def upd_score(data, th_dict=None):
    [...]
    if th0:
        if isinstance(th0, float):
            th0 = (result.min(), th0)
        
        if isinstance(th0, tuple):
            _upd_score_between(result, th0, 0)
    
    if th25 and isinstance(th25, tuple):
        _upd_score_between(result, th25, 0.25)

    [...]
```

#### Calibrate scores (use thresholds)

```
thresholds_dict = {       # between(min_x, max_x, inclusive='both')
    '0': 0.02,            # (min_x, max_x) or X -> (data.min(), X)
    '.25': (0.24, 0.26),  # (min_x, max_x)
    '.50': (0.49, 0.51),  # (min_x, max_x)
    '.75': (0.74, 0.76),  # (min_x, max_x)
    '1': 0.98             # (min_x, max_x) or X -> (X, data.max())
}

submission['score'] = upd_score(submission['score'], thresholds_dict)

```

# 1. Imports, Settings, Definitions & Data Loading

In [1]:
import os
import pandas as pd
import numpy as np

from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, AutoModel

from sklearn.preprocessing import MinMaxScaler

In [2]:
class CFG_DEB_SIMPLE:
    """Static settings read by the data-loading and inference cells."""

    # --- locations ---
    model_path = '../input/deberta-v3-large/deberta-v3-large'      # tokenizer / model config source
    input_path = '../input/us-patent-phrase-to-phrase-matching/'   # prefix prepended to "test.csv"

    # --- tokenisation ---
    max_input_length = 130   # passed to the tokenizer as max_length

    # --- inference loop ---
    num_fold = 4             # number of fold checkpoints iterated over
    batch_size = 24          # DataLoader batch size
    num_workers = 2          # DataLoader worker count

In [3]:
class TestDataset(Dataset):
    """Map-style dataset that tokenizes one row of ``df['text']`` per item.

    Args:
        df: frame holding a ``'text'`` column; values are cast to ``str``.
        tokenizer: callable accepting ``(text, max_length=, padding=, truncation=)``
            and returning a mapping with ``'input_ids'``, ``'token_type_ids'``
            and ``'attention_mask'``.
        max_input_length: length every sample is padded / truncated to.
    """

    # Order in which the tokenizer outputs are returned by __getitem__.
    _FIELDS = ('input_ids', 'token_type_ids', 'attention_mask')

    def __init__(self, df, tokenizer, max_input_length):
        self.max_input_length = max_input_length
        self.tokenizer = tokenizer
        self.text = df['text'].values.astype(str)

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        """Return ``(input_ids, token_type_ids, attention_mask)`` as long tensors."""
        encoded = self.tokenizer(
            self.text[item],
            max_length=self.max_input_length,
            padding='max_length',
            truncation=True,
        )
        return tuple(
            torch.as_tensor(encoded[field], dtype=torch.long)
            for field in self._FIELDS
        )
    
    
class Custom_Bert_Simple(nn.Module):
    """Single-output sequence-classification wrapper around a config-built base model.

    The base network is created with ``from_config`` from the config found at
    ``model_path`` (``num_labels`` forced to 1); the caller is expected to fill
    in the weights afterwards via ``load_state_dict``.

    Args:
        model_path: location handed to ``AutoConfig.from_pretrained``.
    """

    def __init__(self, model_path):
        super().__init__()

        cfg = AutoConfig.from_pretrained(model_path)
        cfg.num_labels = 1
        self.base = AutoModelForSequenceClassification.from_config(config=cfg)

        # These two layers are not used by forward().
        # NOTE(review): presumably kept so the module's state-dict keys match the
        # saved checkpoints — confirm before removing.
        self.dropout = nn.Dropout(p=0)
        self.cls = nn.Linear(cfg.hidden_size, 1)

    def forward(self, input_ids, attention_mask, token_type_ids, labels=None):
        """Run the base model and return the first element of its output.

        ``labels`` is accepted for signature compatibility but is not used.
        """
        outputs = self.base(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        return outputs[0]

In [4]:
def valid_fn(valid_loader, model, device):
    """Run ``model`` over ``valid_loader`` without gradients and stack the outputs.

    Args:
        valid_loader: iterable of batches, each unpacking to
            ``(input_ids, token_type_ids, attention_mask)`` tensors.
        model: callable invoked as ``model(input_ids, attention_mask, token_type_ids)``;
            switched to eval mode first.
        device: target passed to ``Tensor.to`` for every batch tensor.

    Returns:
        ``numpy.ndarray`` — per-batch outputs concatenated along axis 0.
    """
    model.eval()
    collected = []

    with torch.no_grad():
        for batch in valid_loader:
            # Batch order is (ids, type ids, mask); the model takes (ids, mask, type ids).
            input_ids, token_type_ids, attention_mask = (t.to(device) for t in batch)
            batch_out = model(input_ids, attention_mask, token_type_ids)
            collected.append(batch_out.to('cpu').numpy())

    return np.concatenate(collected)


# Shared scaler instance; upd_outputs() re-fits it on its input on every call (fit_transform).
min_max_scaler = MinMaxScaler()

def upd_outputs(data, is_trim=True, is_minmax=True, is_reshape=True):
    """Post-process one array of raw model outputs.

    The enabled steps run in this fixed order:

    1. ``is_trim``    - clamp every value into ``[0, 1]``.
    2. ``is_minmax``  - rescale with the module-level ``min_max_scaler``, which is
       re-fitted on ``data`` itself (``fit_transform``), so the result depends
       only on this call's input.
    3. ``is_reshape`` - flatten to 1-D via ``reshape(-1)``.

    Args:
        data: numeric array of predictions. NOTE(review): the scaler step is
            called with the ``(n_rows, 1)`` arrays produced by ``valid_fn`` —
            other shapes are untested here.
        is_trim: clamp to ``[0, 1]`` when truthy.
        is_minmax: min-max rescale when truthy.
        is_reshape: flatten when truthy.

    Returns:
        The processed array; the input array is not modified.
    """
    if is_trim:
        # Single clamp instead of two np.where passes; same values.
        data = np.clip(data, 0, 1)

    if is_minmax:
        data = min_max_scaler.fit_transform(data)

    if is_reshape:
        data = data.reshape(-1)

    return data


def _upd_score_between(data, thresholds, value):
    """\o/"""
    mask_th = data.between(*thresholds, inclusive='both')
    data[mask_th] = value


def upd_score(data, th_dict=None):
    """Snap scores that fall inside configured bands onto fixed label values.

    Bands are applied one after another, in the order ``'0'``, ``'.25'``,
    ``'.50'``, ``'.75'``, ``'1'``, on a copy of ``data``; the values written
    are 0, 0.25, 0.5, 0.75 and 1 respectively. Both band ends are inclusive.

    Args:
        data: ``pandas.Series`` of scores. Any other type is returned unchanged.
        th_dict: mapping from the keys above to a band. A band is a
            ``(lower, upper)`` tuple or list. For ``'0'`` and ``'1'`` a single
            number ``X`` is also accepted and expands to ``(current min, X)``
            and ``(X, current max)`` respectively, where min / max are taken
            from the working copy at the moment that band is applied.
            Missing keys, ``None`` and unsupported value types are skipped.
            When ``th_dict`` is empty or ``None`` the built-in defaults are
            used; when it is not a dict, ``data`` is returned unchanged.

    Returns:
        A new ``pandas.Series`` with the snapped values (or ``data`` itself in
        the pass-through cases described above).
    """
    # Best-effort contract: non-Series input is handed back as-is.
    if not isinstance(data, pd.Series):
        return data

    if not th_dict:
        th_dict = {
            '0': 0.02,
            '.25': (0.24, 0.26),
            '.50': (0.49, 0.51),
            '.75': (0.74, 0.76),
            '1': 0.98
        }

    if not isinstance(th_dict, dict):
        return data

    result = data.copy()

    def _is_number(th):
        # Accept ints and numpy scalars too (a plain ``isinstance(th, float)``
        # silently dropped e.g. ``'1': 1``); bool is excluded on purpose.
        return (isinstance(th, (int, float, np.integer, np.floating))
                and not isinstance(th, bool))

    # (key, value written, scalar -> band expansion or None when scalars are not allowed)
    # The lambdas read result lazily so min / max reflect earlier bands, as before.
    bands = (
        ('0', 0, lambda th: (result.min(), th)),
        ('.25', 0.25, None),
        ('.50', 0.50, None),
        ('.75', 0.75, None),
        ('1', 1, lambda th: (th, result.max())),
    )

    for key, label, expand in bands:
        th = th_dict.get(key)
        # ``is None`` rather than truthiness, so a threshold of 0 / 0.0 is honoured.
        if th is None:
            continue

        if expand is not None and _is_number(th):
            th = expand(th)

        if isinstance(th, (tuple, list)) and len(th) > 0:
            _upd_score_between(result, th, label)

    return result

In [5]:
# Load the competition test set and the CPC title table.
test_df = pd.read_csv(f"{CFG_DEB_SIMPLE.input_path}test.csv")
titles = pd.read_csv('../input/cpc-codes/titles.csv')
# NOTE(review): merge defaults to an inner join, so a test row whose `context` has no
# matching `code` in titles.csv would be dropped here — confirm every context is covered.
test_df = test_df.merge(titles, left_on='context', right_on='code')

# Lookup applied to `context` below; presumably CPC code -> description text (see output).
# NOTE(review): torch.load unpickles the file — only load trusted artefacts.
cpc_texts = torch.load("../input/folds-dump-the-two-paths-fix/cpc_texts.pth")

test_df['context_text'] = test_df['context'].map(cpc_texts)
# Model input: anchor, target and context description joined by '[SEP]'.
test_df['text'] = test_df['anchor'] + '[SEP]' + test_df['target'] + '[SEP]'  + test_df['context_text']
# Lower-casing also turns the separators into '[sep]' (visible in the output below).
# NOTE(review): presumably this mirrors the training-time preprocessing — confirm before changing.
test_df['text'] = test_df['text'].apply(str.lower)

test_df.head()

Unnamed: 0,id,anchor,target,context,code,title,section,class,subclass,group,main_group,context_text,text
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02,G02,OPTICS,G,2.0,,,,PHYSICS. OPTICS,opc drum[sep]inorganic photoconductor drum[sep...
1,5203a36c501f1b7c,generate in layer,generate by layer,G02,G02,OPTICS,G,2.0,,,,PHYSICS. OPTICS,generate in layer[sep]generate by layer[sep]ph...
2,7aa5908a77a7ec24,el display,illumination,G02,G02,OPTICS,G,2.0,,,,PHYSICS. OPTICS,el display[sep]illumination[sep]physics. optics
3,09e418c93a776564,adjust gas flow,altering gas flow,F23,F23,COMBUSTION APPARATUS; COMBUSTION PROCESSES,F,23.0,,,,MECHANICAL ENGINEERING; LIGHTING; HEATING; WEA...,adjust gas flow[sep]altering gas flow[sep]mech...
4,36baf228038e314b,lower trunnion,lower locating,B60,B60,VEHICLES IN GENERAL,B,60.0,,,,PERFORMING OPERATIONS; TRANSPORTING. VEHICLES ...,lower trunnion[sep]lower locating[sep]performi...


# 2. Extract & Update Predictions

In [6]:
# Tokenizer is loaded from the same path Custom_Bert_Simple reads its config from.
tokenizer_deberta_v3 = AutoTokenizer.from_pretrained(CFG_DEB_SIMPLE.model_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# Run every fold checkpoint over the same test loader; one prediction array per fold.
predictions = []

# Pick the device once: GPU when present, otherwise CPU (instead of failing on .to('cuda')).
device = 'cuda' if torch.cuda.is_available() else 'cpu'

te_dataset = TestDataset(test_df, tokenizer_deberta_v3, CFG_DEB_SIMPLE.max_input_length)
te_dataloader = DataLoader(te_dataset,
                           batch_size=CFG_DEB_SIMPLE.batch_size,
                           shuffle=False,  # keep predictions aligned with test_df rows
                           num_workers=CFG_DEB_SIMPLE.num_workers,
                           pin_memory=(device == 'cuda'),  # pinning only helps GPU transfers
                           drop_last=False)

deberta_simple_path = "../input/us-patent-deberta-simple/microsoft_deberta-v3-large"

for fold in tqdm(range(CFG_DEB_SIMPLE.num_fold)):
    fold_path = f"{deberta_simple_path}_best{fold}.pth"

    model = Custom_Bert_Simple(CFG_DEB_SIMPLE.model_path)
    # map_location='cpu': deserialize onto host memory regardless of the device the
    # checkpoint was saved from; the weights reach `device` via model.to() below.
    # NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
    checkpoint = torch.load(fold_path, map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    del checkpoint  # the state dict has been copied into the model
    model.to(device)

    prediction = valid_fn(te_dataloader, model, device)

    predictions.append(prediction)

100%|██████████| 4/4 [01:56<00:00, 29.02s/it]


In [8]:
# Sanity check: number of fold arrays, rows per fold, and the first raw output of fold 0.
print("folds:", len(predictions))
print("rows: ", len(predictions[0]))
print("score:", predictions[0][0])

folds: 4
rows:  36
score: [0.5036294]


In [9]:
# Number of leading rows shown by the preview cells that slice with [:n_predictions].
n_predictions = 14

In [10]:
# Raw (not yet post-processed) outputs of the first fold, first n_predictions rows.
predictions[0][:n_predictions]

array([[ 0.5036294 ],
       [ 0.832283  ],
       [ 0.35078326],
       [ 0.4791839 ],
       [ 0.49305084],
       [ 0.5206834 ],
       [ 0.2301636 ],
       [ 0.4067986 ],
       [ 0.27013338],
       [ 0.5016197 ],
       [ 0.516191  ],
       [-0.02492346],
       [ 0.1932471 ],
       [ 0.93988395]], dtype=float32)

In [11]:
# print(*upd_outputs(predictions[0], is_trim=False)[:n_predictions])
# print(*upd_outputs(predictions[0], is_minmax=False)[:n_predictions])

In [12]:
# Post-process each fold. Trimming is switched off here (is_trim=False), so only
# min_max_scaler.fit_transform(x) >> x.reshape(-1) are applied.
upd_predictions = [upd_outputs(x, is_trim=False) for x in predictions]

In [13]:
# First fold after post-processing, first n_predictions rows.
print(*upd_predictions[0][:n_predictions])

0.5510465 0.8892669 0.3937512 0.52588946 0.54016006 0.56859696 0.26962045 0.4513971 0.3107537 0.5489783 0.5639738 0.0071080867 0.23162934 0.99999994


# 3. Additional & Final Predictions

In [14]:
# Shallow snapshot of the per-fold list, taken before the median / mean rows are
# appended to upd_predictions; used in section 5 (Visualization).
origin_predictions = upd_predictions.copy()

In [15]:
# === add np.median ===
# Row-wise median over every prediction vector collected so far, appended as an
# extra entry. Re-running this cell appends again (it is not idempotent).
add_preds = [np.median(row, axis=0) for row in zip(*upd_predictions)]
upd_predictions.append(add_preds)

In [16]:
# === add np.mean ===
# Row-wise mean over everything currently in upd_predictions, appended as an
# extra entry. Re-running this cell appends again (it is not idempotent).
add_preds = [np.mean(row, axis=0) for row in zip(*upd_predictions)]
upd_predictions.append(add_preds)

In [17]:
# Final blend: plain average over every entry of upd_predictions
# (the per-fold vectors plus the appended median and mean rows).
final_predictions = np.mean(upd_predictions, axis=0)

In [18]:
# Blended scores, first n_predictions rows.
print(*final_predictions[:n_predictions])

0.31270644 0.8849662 0.2845092 0.59676236 0.5130505 0.60881376 0.28154138 0.09131225 0.2716439 0.4976574 0.46910086 0.0035888723 0.2536781 0.9989567


# 4. Create & Calibrate Submissions

In [19]:
# Pair each test id with its blended score. This relies on final_predictions being in
# test_df row order (the test DataLoader is built with shuffle=False).
submission = pd.DataFrame({
    'id': test_df['id'],
    'score': final_predictions,
})

submission.head(14)

Unnamed: 0,id,score
0,4112d61851461f60,0.312706
1,5203a36c501f1b7c,0.884966
2,7aa5908a77a7ec24,0.284509
3,09e418c93a776564,0.596762
4,36baf228038e314b,0.51305
5,b892011ab2e2cabc,0.608814
6,1f37ead645e7f0c8,0.281541
7,71a5b6ad068d531f,0.091312
8,16ae4b99d3601e60,0.271644
9,474c874d0c07bd21,0.497657


In [20]:
# Calibration bands handed to upd_score: scores inside a band are replaced by the
# band's label value (0, 0.25, 0.5, 0.75 or 1); band ends are inclusive.
thresholds_dict = {
    '0': 0.02,            # scalar X -> band (score.min(), X)
    '.25': (0.24, 0.26),  # (lower, upper)
    '.50': (0.49, 0.51),  # (lower, upper)
    '.75': (0.74, 0.76),  # (lower, upper)
    '1': 0.98             # scalar X -> band (X, score.max())
}

# Overwrites the score column with the calibrated copy returned by upd_score.
submission['score'] = upd_score(submission['score'], thresholds_dict)

submission.head(14)

Unnamed: 0,id,score
0,4112d61851461f60,0.312706
1,5203a36c501f1b7c,0.884966
2,7aa5908a77a7ec24,0.284509
3,09e418c93a776564,0.596762
4,36baf228038e314b,0.51305
5,b892011ab2e2cabc,0.608814
6,1f37ead645e7f0c8,0.281541
7,71a5b6ad068d531f,0.091312
8,16ae4b99d3601e60,0.271644
9,474c874d0c07bd21,0.5


In [21]:
# Persist the calibrated scores; index=False keeps the file to the id and score columns.
submission.to_csv('submission.csv', index=False)

# 5. Visualization of origin_predictions

In [22]:
import seaborn as sns
# Colormap passed to the background_gradient calls below.
cm = sns.light_palette('green', as_cmap=True)
# CSS passed to highlight_max below.
props_param = "color:white; font-weight:bold; background-color:green;"
# Show 10 decimals so small per-fold differences are visible.
pd.set_option('display.precision', 10)

In [23]:
# One column per fold, one row per test example (first 15 rows), built from the
# snapshot taken before the median / mean rows were appended.
df = pd.DataFrame(origin_predictions).T.head(15)
df = df.rename_axis(columns='folds', index='rows')

# axis=1: the gradient is computed within each row, i.e. folds are compared per example.
df.style.background_gradient(cmap=cm, axis=1)

folds,0,1,2,3
rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.5510464907,0.2486642748,0.2390424758,0.2669645548
1,0.8892669082,1.0,0.9663894773,0.641346097
2,0.393751204,0.2450732291,0.235289678,0.2839302421
3,0.5258894563,0.7600160241,0.4775429666,0.6382790804
4,0.5401600599,0.4998472333,0.5037006736,0.5131292939
5,0.5685969591,0.6299690008,0.6370874643,0.595620513
6,0.2696204484,0.4241000116,0.2226627618,0.237675786
7,0.4513970912,0.0021326696,0.0006830087,0.000854753
8,0.3107537031,0.2583637238,0.2570839226,0.2685575187
9,0.5489783287,0.5046707392,0.4754208326,0.4691711664


In [24]:
# Per row (axis=1), highlight the fold values at or above that row's 0.75 quantile.
df.style.highlight_quantile(q_left=0.75, axis=1, color='green')

folds,0,1,2,3
rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.5510464907,0.2486642748,0.2390424758,0.2669645548
1,0.8892669082,1.0,0.9663894773,0.641346097
2,0.393751204,0.2450732291,0.235289678,0.2839302421
3,0.5258894563,0.7600160241,0.4775429666,0.6382790804
4,0.5401600599,0.4998472333,0.5037006736,0.5131292939
5,0.5685969591,0.6299690008,0.6370874643,0.595620513
6,0.2696204484,0.4241000116,0.2226627618,0.237675786
7,0.4513970912,0.0021326696,0.0006830087,0.000854753
8,0.3107537031,0.2583637238,0.2570839226,0.2685575187
9,0.5489783287,0.5046707392,0.4754208326,0.4691711664


In [25]:
# Append the row mean across folds as a 'mean' column, then mark the largest value in
# each row (the 'mean' column takes part in that comparison).
df.assign(mean=lambda x: x.mean(axis=1)) \
    .style.highlight_max(axis=1, props=props_param)

folds,0,1,2,3,mean
rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.5510464907,0.2486642748,0.2390424758,0.2669645548,0.3264294565
1,0.8892669082,1.0,0.9663894773,0.641346097,0.8742506504
2,0.393751204,0.2450732291,0.235289678,0.2839302421,0.2895110846
3,0.5258894563,0.7600160241,0.4775429666,0.6382790804,0.6004319191
4,0.5401600599,0.4998472333,0.5037006736,0.5131292939,0.5142093301
5,0.5685969591,0.6299690008,0.6370874643,0.595620513,0.6078184843
6,0.2696204484,0.4241000116,0.2226627618,0.237675786,0.2885147333
7,0.4513970912,0.0021326696,0.0006830087,0.000854753,0.1137668863
8,0.3107537031,0.2583637238,0.2570839226,0.2685575187,0.2736897171
9,0.5489783287,0.5046707392,0.4754208326,0.4691711664,0.4995602667


In [26]:
# Deviation of each fold from the row mean, shaded per row.
df.sub(df.mean(axis=1), axis=0) \
    .style.background_gradient(cmap=cm, axis=1)

folds,0,1,2,3
rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.2246170342,-0.0777651817,-0.0873869807,-0.0594649017
1,0.0150162578,0.1257493496,0.0921388268,-0.2329045534
2,0.1042401195,-0.0444378555,-0.0542214066,-0.0055808425
3,-0.0745424628,0.159584105,-0.1228889525,0.0378471613
4,0.0259507298,-0.0143620968,-0.0105086565,-0.0010800362
5,-0.0392215252,0.0221505165,0.02926898,-0.0121979713
6,-0.018894285,0.1355852783,-0.0658519715,-0.0508389473
7,0.3376302123,-0.1116342172,-0.1130838767,-0.1129121333
8,0.0370639861,-0.0153259933,-0.0166057944,-0.0051321983
9,0.049418062,0.0051104724,-0.0241394341,-0.0303891003
