In [1]:
import os
import pandas as pd
import numpy as np
import shutil
import time
import gc
import random
import math
import torch
from torch.utils.data import DataLoader, Dataset
import transformers
from transformers import TrainingArguments, Trainer, DataCollatorForWholeWordMask
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, AutoModel
from torch import nn
from torch.optim import Adam, SGD, AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
# Environment switch meant to keep Weights & Biases logging off for the Trainer used below.
# The library output further down reports this variable as deprecated in favour of `report_to`.
os.environ["WANDB_DISABLED"] = "true"

In [2]:
# Labelled training pairs (id / anchor / target / context / score).
train_csv_path = '../input/us-patent-phrase-to-phrase-matching/train.csv'
df = pd.read_csv(train_csv_path)

In [3]:
# Lookup table that maps a CPC code to its title.
titles_csv_path = '../input/cpc-codes/titles.csv'
df_context = pd.read_csv(titles_csv_path)

In [4]:
# Attach the CPC title that matches each row's context code (left join keeps every
# training row), then narrow the frame to the columns used downstream.
train_columns = ['id', 'anchor', 'target', 'context', 'title', 'score']
df = df.merge(
    right=df_context,
    how='left',
    left_on='context',
    right_on='code',
)[train_columns]
df

Unnamed: 0,id,anchor,target,context,title,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,0.50
1,7b9652b17b68b7a4,abatement,act of abating,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,0.50
4,54c1e3b9184cb5b6,abatement,forest region,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,0.00
...,...,...,...,...,...,...
36468,8e1386cbefd7f245,wood article,wooden article,B44,DECORATIVE ARTS,1.00
36469,42d9e032d1cd3242,wood article,wooden box,B44,DECORATIVE ARTS,0.50
36470,208654ccb9e14fa3,wood article,wooden handle,B44,DECORATIVE ARTS,0.50
36471,756ec035e694722b,wood article,wooden material,B44,DECORATIVE ARTS,0.75


In [5]:
# Unlabelled rows to score for the submission.
test_csv_path = '../input/us-patent-phrase-to-phrase-matching/test.csv'
eval_df = pd.read_csv(test_csv_path)

In [6]:
# Same title lookup as for the training frame; there is no 'score' column here.
eval_columns = ['id', 'anchor', 'target', 'context', 'title']
eval_df = eval_df.merge(
    right=df_context,
    how='left',
    left_on='context',
    right_on='code',
)[eval_columns]
eval_df

Unnamed: 0,id,anchor,target,context,title
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02,OPTICS
1,09e418c93a776564,adjust gas flow,altering gas flow,F23,COMBUSTION APPARATUS; COMBUSTION PROCESSES
2,36baf228038e314b,lower trunnion,lower locating,B60,VEHICLES IN GENERAL
3,1f37ead645e7f0c8,cap component,upper portion,D06,TREATMENT OF TEXTILES OR THE LIKE; LAUNDERING;...
4,71a5b6ad068d531f,neural stimulation,artificial neural network,H04,ELECTRIC COMMUNICATION TECHNIQUE
5,474c874d0c07bd21,dry corn,dry corn starch,C12,BIOCHEMISTRY; BEER; SPIRITS; WINE; VINEGAR; MI...
6,442c114ed5c4e3c9,tunneling capacitor,capacitor housing,G11,INFORMATION STORAGE
7,b8ae62ea5e1d8bdb,angular contact bearing,contact therapy radiation,B23,MACHINE TOOLS; METAL-WORKING NOT OTHERWISE PRO...
8,faaddaf8fcba8a3f,produce liquid hydrocarbons,produce a treated stream,C10,"PETROLEUM, GAS OR COKE INDUSTRIES; TECHNICAL G..."
9,ae0262c02566d2ce,diesel fuel tank,diesel fuel tanks,F02,COMBUSTION ENGINES; HOT-GAS OR COMBUSTION-PROD...


In [7]:
# Path of the pretrained checkpoint; both the tokenizer and the model are loaded from it.
model_nm = '../input/debertav3small'

In [8]:
from transformers import AutoTokenizer
# NOTE: the variable name is misspelled ("tonkenizer"). Later cells and both dataset classes
# refer to it by this exact spelling, so it must be renamed everywhere at once or not at all.
tonkenizer = AutoTokenizer.from_pretrained(model_nm)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
# Model text = target phrase + tokenizer separator + lower-cased CPC title.
# 'title' comes from a left merge, so a context code with no match leaves NaN there;
# `apply(str.lower)` raises TypeError on such a float. Missing titles are replaced by an
# empty string first, which yields "<target>[SEP]" for those rows instead of a crash.
df['input'] = df['target'] + tonkenizer.sep_token + df['title'].fillna('').str.lower()

In [10]:
# Show the training frame to check the newly added 'input' column.
df

Unnamed: 0,id,anchor,target,context,title,score,input
0,37d61fd2272659b1,abatement,abatement of pollution,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,0.50,abatement of pollution[SEP]furniture; domestic...
1,7b9652b17b68b7a4,abatement,act of abating,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,0.75,act of abating[SEP]furniture; domestic article...
2,36d72442aefd8232,abatement,active catalyst,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,0.25,active catalyst[SEP]furniture; domestic articl...
3,5296b0c19e1ce60e,abatement,eliminating process,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,0.50,eliminating process[SEP]furniture; domestic ar...
4,54c1e3b9184cb5b6,abatement,forest region,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,0.00,forest region[SEP]furniture; domestic articles...
...,...,...,...,...,...,...,...
36468,8e1386cbefd7f245,wood article,wooden article,B44,DECORATIVE ARTS,1.00,wooden article[SEP]decorative arts
36469,42d9e032d1cd3242,wood article,wooden box,B44,DECORATIVE ARTS,0.50,wooden box[SEP]decorative arts
36470,208654ccb9e14fa3,wood article,wooden handle,B44,DECORATIVE ARTS,0.50,wooden handle[SEP]decorative arts
36471,756ec035e694722b,wood article,wooden material,B44,DECORATIVE ARTS,0.75,wooden material[SEP]decorative arts


In [11]:
# Build the model text for the test rows exactly as for training:
# target phrase + tokenizer separator + lower-cased CPC title.
# The left merge can leave NaN titles, on which `apply(str.lower)` raises TypeError,
# so missing titles are replaced by an empty string before lower-casing.
eval_df['input'] = eval_df['target'] + tonkenizer.sep_token + eval_df['title'].fillna('').str.lower()

In [12]:
# Show the test frame to check the newly added 'input' column.
eval_df

Unnamed: 0,id,anchor,target,context,title,input
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02,OPTICS,inorganic photoconductor drum[SEP]optics
1,09e418c93a776564,adjust gas flow,altering gas flow,F23,COMBUSTION APPARATUS; COMBUSTION PROCESSES,altering gas flow[SEP]combustion apparatus; co...
2,36baf228038e314b,lower trunnion,lower locating,B60,VEHICLES IN GENERAL,lower locating[SEP]vehicles in general
3,1f37ead645e7f0c8,cap component,upper portion,D06,TREATMENT OF TEXTILES OR THE LIKE; LAUNDERING;...,upper portion[SEP]treatment of textiles or the...
4,71a5b6ad068d531f,neural stimulation,artificial neural network,H04,ELECTRIC COMMUNICATION TECHNIQUE,artificial neural network[SEP]electric communi...
5,474c874d0c07bd21,dry corn,dry corn starch,C12,BIOCHEMISTRY; BEER; SPIRITS; WINE; VINEGAR; MI...,dry corn starch[SEP]biochemistry; beer; spirit...
6,442c114ed5c4e3c9,tunneling capacitor,capacitor housing,G11,INFORMATION STORAGE,capacitor housing[SEP]information storage
7,b8ae62ea5e1d8bdb,angular contact bearing,contact therapy radiation,B23,MACHINE TOOLS; METAL-WORKING NOT OTHERWISE PRO...,contact therapy radiation[SEP]machine tools; m...
8,faaddaf8fcba8a3f,produce liquid hydrocarbons,produce a treated stream,C10,"PETROLEUM, GAS OR COKE INDUSTRIES; TECHNICAL G...","produce a treated stream[SEP]petroleum, gas or..."
9,ae0262c02566d2ce,diesel fuel tank,diesel fuel tanks,F02,COMBUSTION ENGINES; HOT-GAS OR COMBUSTION-PROD...,diesel fuel tanks[SEP]combustion engines; hot-...


In [13]:
from sklearn.model_selection import train_test_split

# Random hold-out split: 20% of the rows go to validation, with a fixed seed.
split_options = {'test_size': 0.2, 'random_state': 42}
train_df, val_df = train_test_split(df, **split_options)

In [14]:
import torch
from torch.utils.data import Dataset, DataLoader

class TrainDataset(Dataset):
    """Dataset of tokenized (input, anchor) text pairs with a float label.

    The 'input', 'anchor' and 'score' columns of ``df`` are copied out at construction.
    Each ``__getitem__`` call tokenizes one row with the module-level ``tonkenizer``,
    padded/truncated to 100 tokens, and returns the encoding plus a 'label' tensor.
    """

    def __init__(self, df):
        text_column = df['input'].values
        anchor_column = df['anchor'].values
        # Text columns are coerced to str arrays; scores are kept as they are.
        self.input = text_column.astype(str)
        self.anchor = anchor_column.astype(str)
        self.label = df['score'].values

    def __len__(self):
        return len(self.input)

    def __getitem__(self, item):
        encoded = tonkenizer(
            self.input[item],
            self.anchor[item],
            max_length=100,
            padding='max_length',
            truncation=True,
        )
        target = torch.as_tensor(self.label[item], dtype=torch.float)
        return {**encoded, 'label': target}
class evalDataset(Dataset):
    """Label-free counterpart of the training dataset, used for prediction.

    Copies the 'input' and 'anchor' columns of ``df`` at construction. Each
    ``__getitem__`` call tokenizes one row with the module-level ``tonkenizer``,
    padded/truncated to 100 tokens, and returns the encoding as a plain dict.
    """

    def __init__(self, df):
        text_column = df['input'].values
        anchor_column = df['anchor'].values
        # Both text columns are coerced to str arrays.
        self.input = text_column.astype(str)
        self.anchor = anchor_column.astype(str)

    def __len__(self):
        return len(self.input)

    def __getitem__(self, item):
        encoded = tonkenizer(
            self.input[item],
            self.anchor[item],
            max_length=100,
            padding='max_length',
            truncation=True,
        )
        return {**encoded}

In [15]:
def compute_metrics(eval_pred):
    """Return the Pearson correlation between predictions and labels.

    ``eval_pred`` unpacks into ``(predictions, labels)``. Predictions are reshaped to a
    1-D vector with one entry per row before being correlated with the labels.
    The result is a dict with the single key 'pearson'.
    """
    raw_preds, targets = eval_pred
    flat_preds = raw_preds.reshape(len(raw_preds))
    # The off-diagonal entry of the 2x2 correlation matrix is the coefficient itself.
    correlation_matrix = np.corrcoef(flat_preds, targets)
    return {'pearson': correlation_matrix[0][1]}

In [16]:
from transformers import AutoModelForSequenceClassification, set_seed

# Seed python/numpy/torch before the model is built. Any weights that are not restored
# from the checkpoint (presumably the new num_labels=1 head) are drawn from the RNG
# at this point, and the Trainer only applies its own seed later, when it is constructed,
# so without this call those weights differ from run to run.
set_seed(42)

# num_labels=1 gives a single-output head, used here to regress the similarity score.
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)

Some weights of the model checkpoint at ../input/debertav3small were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a B

In [17]:
# Metric key produced by compute_metrics; used to pick the best checkpoint.
metric_name = 'pearson'
batch_size = 128
args = TrainingArguments(
    'model_test',                            # output directory for checkpoints
    evaluation_strategy='epoch',             # evaluate once per epoch ...
    save_strategy='epoch',                   # ... and save on the same schedule
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size * 2,
    num_train_epochs=4,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    save_total_limit=1,
    # Disable logging integrations explicitly instead of relying on the deprecated
    # WANDB_DISABLED environment variable, as the library's own warning recommends.
    report_to='none',
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [18]:
# Both splits carry scores, so both use the labelled dataset class.
train_dataset, val_dataset = TrainDataset(train_df), TrainDataset(val_df)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tonkenizer,
    compute_metrics=compute_metrics,
)

In [19]:
# Run fine-tuning with the arguments configured above; the returned TrainOutput is displayed.
trainer.train()

***** Running training *****
  Num examples = 29178
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 912


Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.03011,0.780291
2,No log,0.025432,0.804081
3,0.035400,0.02491,0.81555
4,0.035400,0.025334,0.819155


***** Running Evaluation *****
  Num examples = 7295
  Batch size = 256
Saving model checkpoint to model_test/checkpoint-228
Configuration saved in model_test/checkpoint-228/config.json
Model weights saved in model_test/checkpoint-228/pytorch_model.bin
tokenizer config file saved in model_test/checkpoint-228/tokenizer_config.json
Special tokens file saved in model_test/checkpoint-228/special_tokens_map.json
added tokens file saved in model_test/checkpoint-228/added_tokens.json
***** Running Evaluation *****
  Num examples = 7295
  Batch size = 256
Saving model checkpoint to model_test/checkpoint-456
Configuration saved in model_test/checkpoint-456/config.json
Model weights saved in model_test/checkpoint-456/pytorch_model.bin
tokenizer config file saved in model_test/checkpoint-456/tokenizer_config.json
Special tokens file saved in model_test/checkpoint-456/special_tokens_map.json
added tokens file saved in model_test/checkpoint-456/added_tokens.json
Deleting older checkpoint [model_tes

TrainOutput(global_step=912, training_loss=0.02968331178029378, metrics={'train_runtime': 813.9331, 'train_samples_per_second': 143.393, 'train_steps_per_second': 1.12, 'total_flos': 3019689459288000.0, 'train_loss': 0.02968331178029378, 'epoch': 4.0})

In [20]:
# Despite the "va_" prefix, this wraps eval_df, i.e. the rows loaded from test.csv,
# not the validation split created by train_test_split.
va_dataset = evalDataset(eval_df)

In [21]:
# Score the test rows; the raw predictions array is cast to Python-float precision.
prediction_output = trainer.predict(va_dataset)
outputs = prediction_output.predictions.astype(float)
outputs

***** Running Prediction *****
  Num examples = 36
  Batch size = 256


array([[ 0.59887791],
       [ 0.82254803],
       [ 0.43658477],
       [ 0.32469204],
       [ 0.19401582],
       [ 0.63048869],
       [ 0.45947769],
       [-0.04661069],
       [ 0.33588639],
       [ 1.03724444],
       [ 0.18336947],
       [ 0.26041457],
       [ 0.72024208],
       [ 0.80885589],
       [ 0.8689518 ],
       [ 0.41447252],
       [ 0.26497272],
       [-0.0535671 ],
       [ 0.56936848],
       [ 0.30718952],
       [ 0.44618943],
       [ 0.25668541],
       [ 0.17961937],
       [ 0.26442224],
       [ 0.55967021],
       [-0.09723711],
       [-0.02346445],
       [-0.05432898],
       [-0.0902738 ],
       [ 0.72493285],
       [ 0.09252907],
       [ 0.0362477 ],
       [ 0.7316134 ],
       [ 0.43430769],
       [ 0.41810319],
       [ 0.2526339 ]])

In [22]:
# Limit predictions to the [0, 1] interval; the raw output above contains values outside it.
outputs = outputs.clip(0, 1)

In [23]:
import datasets

# One row per test id, paired with its score flattened to a 1-D vector.
submission_columns = {
    'id': eval_df['id'],
    'score': outputs.flatten(),
}
submission = datasets.Dataset.from_dict(submission_columns)

submission.to_csv('submission.csv', index=False)

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

1207