In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import time
from scipy import stats
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from tqdm import tqdm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

!pip install transformers
!pip install sentencepiece
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
from transformers.integrations import TensorBoardCallback

# Raise both pandas display limits (column width and row count) to 250 so
# long text cells and larger frames render in full.
for _display_option in ('display.max_colwidth', 'display.max_rows'):
    pd.set_option(_display_option, 250)

[0m

In [2]:
# ---------------------------------------------------------------------------
# Run configuration. Every value below is forwarded to the tokenizer, the
# datasets, TrainingArguments or the Trainer callbacks in later cells.
# ---------------------------------------------------------------------------

# Checkpoint to fine-tune and where artefacts are written.
model_name = 'microsoft/deberta-v3-base'
output_dir = "./deberta_new_output"
logging_dir = './logs'
overwrite_output_dir = True

# Fixed token length: inputs are padded/truncated to this many tokens.
max_len = 128

# Training length. Both are passed to TrainingArguments; the Trainer reports
# that max_steps overrides num_train_epochs when both are given.
num_train_epochs = 20
max_steps = 500

# Evaluation cadence and model selection.
evaluation_strategy = "steps"
eval_steps = 250
metric_for_best_model = 'eval_corr_scipy'
early_stop_patience = 3
load_best_model_at_end = True

# Batching / data loading.
per_device_train_batch_size = 32
per_device_eval_batch_size = 32
gradient_accumulation_steps = 1
dataloader_num_workers = 2

# Checkpointing.
save_steps = 250  # must be a round multiple of eval_steps
save_total_limit = 2

# Logging ('debug' is the more verbose alternative for log_level).
log_level = 'info'
logging_steps = 250
logging_first_step = True
report_to = "none"

# Optimiser and learning-rate schedule.
# NOTE(review): warmup_steps covers 400 of the 500 max_steps - confirm this
# is intended.
learning_rate = 5e-5
weight_decay = 0.01
adam_beta1 = 0.9
adam_beta2 = 0.999
adam_epsilon = 1e-8
lr_scheduler_type = 'cosine'
warmup_steps = 400

In [3]:
# Read the train and test CSVs from the competition input directory.
df_train, df_test = (
    pd.read_csv(f'../input/us-patent-phrase-to-phrase-matching/{split_name}.csv')
    for split_name in ('train', 'test')
)

In [4]:
# Load the tokenizer that belongs to `model_name` and keep a copy on disk
# under ./tokenizer/.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained('./tokenizer/')
# Load the pretrained checkpoint with a sequence-classification head that has
# a single output (num_labels=1). The "weights ... were not used" message in
# the cell output lists lm_predictions/mask_predictions weights from the
# checkpoint that this model class does not use.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/354M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a

In [5]:
# Look-up table from a context code to its descriptive title, built from the
# 'code' and 'title' columns of the titles CSV.
context_mapping_df = pd.read_csv('/kaggle/input/patentmatching-titles/titles.csv')
context_mapping = dict(zip(context_mapping_df['code'], context_mapping_df['title']))

# Section names keyed by the first character of a context code.
context_title_mapping = {
    "A": "Human Necessities",
    "B": "Operations and Transport",
    "C": "Chemistry and Metallurgy",
    "D": "Textiles",
    "E": "Fixed Constructions",
    "F": "Mechanical Engineering",
    "G": "Physics",
    "H": "Electricity",
    "Y": "Emerging Cross-Sectional Technologies",
}

# Lower-cased descriptions for every training row: the full title of the
# context code, and the name of the section given by its first character.
df_train['context_text'] = df_train['context'].map(
    lambda ctx_code: context_mapping[ctx_code].lower()
)
df_train['context_title'] = df_train['context'].map(
    lambda ctx_code: context_title_mapping[ctx_code[0]].lower()
)

# Model input string: anchor, target and context description joined by the
# literal '[SEP]' marker.
df_train['text'] = (
    df_train['anchor']
    + '[SEP]'
    + df_train['target']
    + '[SEP]'
    + df_train['context_text']
)

In [6]:
# Integer bucket for each score value:
# 0.0 -> 0, 0.25 -> 1, 0.5 -> 2, 0.75 -> 3, 1.0 -> 4.
label_mapping = {bucket / 4: bucket for bucket in range(5)}
# A score outside the five known values raises KeyError.
df_train['label'] = df_train['score'].map(lambda score: label_mapping[score])

In [7]:
# Hold out 15% of the rows for validation, stratified on the integer score
# bucket so both splits keep the same score distribution.
# random_state pins the shuffle: without it every Restart & Run All produces a
# different split, so validation metrics and the selected best checkpoint are
# not comparable between runs.
X_train, X_valid = train_test_split(
    df_train,
    test_size=0.15,
    stratify=df_train['label'],
    random_state=42,
)

In [8]:
class TrainPatentDataset(Dataset):
    """Dataset that tokenizes one text per item and attaches its score.

    Args:
        df: frame with a 'text' column (tokenizer input) and a 'score'
            column (target value).
        tokenizer: callable invoked as
            ``tokenizer(text, padding='max_length', max_length=max_len,
            truncation=True, return_tensors="pt")`` that returns a mapping
            of tensors.
        max_len: value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Plain Python lists so items can be indexed by integer position.
        self.texts = df['text'].values.tolist()
        self.labels = df['score'].values.tolist()

    def __len__(self):
        """Number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer output for item ``idx`` plus a 'labels' entry."""
        encoded = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            max_length=self.max_len,
            truncation=True,
            return_tensors="pt",
        )
        # Squeeze dim 0 of every returned tensor (the tokenizer is called on
        # a single text).
        for field in list(encoded.keys()):
            encoded[field] = encoded[field].squeeze(0)
        # The score is stored as a float tensor.
        encoded['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return encoded

In [9]:
# Wrap the training and validation frames in TrainPatentDataset, sharing the
# same tokenizer and max_len.
train_dataset, val_dataset = (
    TrainPatentDataset(split_df, tokenizer, max_len)
    for split_df in (X_train, X_valid)
)

In [10]:
def compute_metrics(p):
    """Compute the Pearson correlation between predictions and labels.

    The correlation is calculated twice, with SciPy and with NumPy, and both
    values are reported.

    Args:
        p: a pair that unpacks into ``(predictions, labels)``. Each element is
            array-like holding one value per example; any shape with that
            many elements (e.g. ``(n,)`` or ``(n, 1)``) is accepted.

    Returns:
        dict with keys ``"corr_scipy"`` and ``"corr_np"`` (Python floats).
    """
    preds, labels = p

    # Flatten both sides to 1-D. A single-output head can return predictions
    # shaped (n, 1); np.corrcoef cannot combine that with (n,) labels and
    # raises, so the shapes are normalised first.
    preds = np.asarray(preds).reshape(-1)
    labels = np.asarray(labels).reshape(-1)

    corr_scipy = stats.pearsonr(labels, preds)[0]
    corr_np = np.corrcoef(preds, labels)[0][1]

    return {
        "corr_scipy": float(corr_scipy), "corr_np": float(corr_np)
    }

In [11]:
# Bundle the configuration values into TrainingArguments. No seed is passed
# here, so the library default applies.
args = TrainingArguments(
    # Output locations and logging.
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    logging_dir=logging_dir,
    log_level=log_level,
    logging_steps=logging_steps,
    logging_first_step=logging_first_step,
    report_to=report_to,

    # Training length; the Trainer reports that max_steps overrides
    # num_train_epochs when both are given.
    num_train_epochs=num_train_epochs,
    max_steps=max_steps,

    # Evaluation, checkpointing and best-model selection.
    evaluation_strategy=evaluation_strategy,
    eval_steps=eval_steps,
    save_steps=save_steps,  # must be a round multiple of eval_steps
    save_total_limit=save_total_limit,
    metric_for_best_model=metric_for_best_model,
    load_best_model_at_end=load_best_model_at_end,

    # Batching and data loading.
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    dataloader_num_workers=dataloader_num_workers,

    # Optimiser and learning-rate schedule.
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    adam_beta1=adam_beta1,
    adam_beta2=adam_beta2,
    adam_epsilon=adam_epsilon,
    lr_scheduler_type=lr_scheduler_type,
    warmup_steps=warmup_steps,
)

In [12]:
# Assemble the Trainer: model, arguments, the two datasets, the correlation
# metrics, and two callbacks (early stopping with the configured patience,
# plus an explicitly attached TensorBoard callback). The default optimizer
# and scheduler are used; none are passed in.
trainer = Trainer(
    args=args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=early_stop_patience),
        TensorBoardCallback(),
    ],
)

max_steps is given, it will override any value given in num_train_epochs


In [13]:
# Run fine-tuning. The call's return value (a TrainOutput, see the cell
# output) is displayed because it is the last expression in the cell.
trainer.train()

***** Running training *****
  Num examples = 31002
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 500


Step,Training Loss,Validation Loss,Corr Scipy,Corr Np
250,0.057,0.031925,0.754166,0.754166
500,0.0291,0.024248,0.810037,0.810037


***** Running Evaluation *****
  Num examples = 5471
  Batch size = 64
Saving model checkpoint to ./deberta_new_output/checkpoint-250
Configuration saved in ./deberta_new_output/checkpoint-250/config.json
Model weights saved in ./deberta_new_output/checkpoint-250/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 5471
  Batch size = 64
Saving model checkpoint to ./deberta_new_output/checkpoint-500
Configuration saved in ./deberta_new_output/checkpoint-500/config.json
Model weights saved in ./deberta_new_output/checkpoint-500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./deberta_new_output/checkpoint-500 (score: 0.8100367300643492).


TrainOutput(global_step=500, training_loss=0.04322228679060936, metrics={'train_runtime': 566.8097, 'train_samples_per_second': 56.456, 'train_steps_per_second': 0.882, 'total_flos': 2102407715261952.0, 'train_loss': 0.04322228679060936, 'epoch': 1.03})