In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import time
from scipy import stats
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from tqdm import tqdm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

!pip install transformers
!pip install sentencepiece
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
from transformers.integrations import TensorBoardCallback

# Widen pandas' display limits so long phrase/context strings and larger
# frames are shown in full when inspected in the notebook.
pd.options.display.max_colwidth = 250
pd.options.display.max_rows = 250

In [None]:
# ---- Model and output locations -------------------------------------------
model_name = "microsoft/deberta-v3-base"
output_dir = "./deberta_new_output"
logging_dir = "./logs"
overwrite_output_dir = True
max_len = 128  # token length every example is padded/truncated to

# ---- Training length and evaluation schedule ------------------------------
# NOTE(review): per the TrainingArguments docs a positive max_steps overrides
# num_train_epochs -- confirm that a 500-step run is what is intended.
num_train_epochs = 20
max_steps = 500
evaluation_strategy = "steps"
eval_steps = 250
metric_for_best_model = "eval_corr_scipy"
# NOTE(review): with max_steps=500 and eval_steps=250 there appear to be at
# most two evaluations, so a patience of 3 would never trigger -- confirm.
early_stop_patience = 3

# ---- Batching and data loading --------------------------------------------
per_device_train_batch_size = 32
per_device_eval_batch_size = 32
gradient_accumulation_steps = 1
dataloader_num_workers = 2

# ---- Checkpointing and logging --------------------------------------------
save_steps = 250  # must be a round multiple of eval_steps
load_best_model_at_end = True
save_total_limit = 2
log_level = "info"  # use 'debug' for more verbose output
logging_steps = 250
logging_first_step = True

report_to = "none"

# ---- Optimizer and learning-rate schedule ---------------------------------
learning_rate = 5e-5
weight_decay = 0.01
adam_beta1 = 0.9
adam_beta2 = 0.999
adam_epsilon = 1e-8
lr_scheduler_type = "cosine"
# NOTE(review): warmup spans 400 of the 500 configured steps -- confirm.
warmup_steps = 400

In [None]:
# Read the competition's train and test CSV files into DataFrames.
df_train = pd.read_csv(
    "../input/us-patent-phrase-to-phrase-matching/train.csv"
)
df_test = pd.read_csv(
    "../input/us-patent-phrase-to-phrase-matching/test.csv"
)

In [None]:
# Load the tokenizer for the configured checkpoint and keep a local copy of
# its files under ./tokenizer/.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained("./tokenizer/")

# Sequence-classification model with a single output (num_labels=1).
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
)

In [None]:
# Map every context code to its title, as listed in the titles dataset.
context_mapping_df = pd.read_csv('/kaggle/input/patentmatching-titles/titles.csv')
context_mapping = dict(zip(context_mapping_df['code'], context_mapping_df['title']))

# Section names keyed by the first character of a context code.
context_title_mapping = {
    "A": "Human Necessities",
    "B": "Operations and Transport",
    "C": "Chemistry and Metallurgy",
    "D": "Textiles",
    "E": "Fixed Constructions",
    "F": "Mechanical Engineering",
    "G": "Physics",
    "H": "Electricity",
    "Y": "Emerging Cross-Sectional Technologies",
}

# Lower-cased title of the full context code, and of its one-letter section.
# An unknown code raises KeyError here rather than silently yielding NaN.
df_train['context_text'] = df_train['context'].apply(lambda x: context_mapping[x].lower())
df_train['context_title'] = df_train['context'].apply(lambda x: context_title_mapping[x[0]].lower())

# Join anchor, target and context with the tokenizer's own separator token so
# the text stays correct if `model_name` is switched to a checkpoint whose
# separator differs. For tokenizers whose separator is '[SEP]' the resulting
# text is the same as with the literal; the literal is also the fallback when
# the tokenizer defines no separator.
sep_token = tokenizer.sep_token or '[SEP]'
df_train['text'] = (
    df_train['anchor'] + sep_token + df_train['target'] + sep_token + df_train['context_text']
)

In [None]:
# Turn the five possible similarity scores into integer class ids; these are
# used to stratify the train/validation split. A score outside the table
# raises KeyError.
label_mapping = {
    0.0: 0,
    0.25: 1,
    0.5: 2,
    0.75: 3,
    1.0: 4,
}
df_train['label'] = df_train['score'].map(label_mapping.__getitem__)

In [None]:
# Hold out 15% of the rows for validation, stratified on the discretised
# score. random_state is fixed so that Restart & Run All reproduces the same
# split (and therefore comparable validation metrics) on every run.
X_train, X_valid = train_test_split(
    df_train,
    test_size=0.15,
    stratify=df_train['label'],
    random_state=42,
)

In [None]:
class TrainPatentDataset(Dataset):
    """Dataset that tokenizes one row's text on access and attaches its score.

    Parameters
    ----------
    df : DataFrame with a 'text' column (model input) and a 'score' column
        (regression target).
    tokenizer : callable invoked as ``tokenizer(text, padding=..., max_length=...,
        truncation=..., return_tensors="pt")`` that returns a mapping of tensors
        with a leading batch dimension.
    max_len : length every example is padded/truncated to.
    """

    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Plain Python lists keep per-item access cheap and index-agnostic.
        self.texts = df['text'].tolist()
        self.labels = df['score'].tolist()

    def __len__(self):
        """Number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized example at ``idx`` with a float 'labels' entry."""
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors="pt",
        )
        # The tokenizer returns tensors with a leading batch dimension of 1;
        # drop it so the collator can stack individual examples.
        for field in list(encoding.keys()):
            encoding[field] = encoding[field].squeeze(0)
        encoding['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return encoding

In [None]:
# Wrap the train and validation frames in tokenizing datasets.
train_dataset = TrainPatentDataset(df=X_train, tokenizer=tokenizer, max_len=max_len)
val_dataset = TrainPatentDataset(df=X_valid, tokenizer=tokenizer, max_len=max_len)

In [None]:
def compute_metrics(p):
    """Compute the Pearson correlation between predictions and labels.

    Parameters
    ----------
    p : pair-like ``(predictions, labels)``
        Anything that unpacks into two array-likes. Predictions may be shaped
        ``(N,)`` or ``(N, 1)``.

    Returns
    -------
    dict
        ``{"corr_scipy": float, "corr_np": float}`` -- the same coefficient
        computed with ``scipy.stats.pearsonr`` and ``numpy.corrcoef``.
    """
    preds, labels = p

    # The model is built with num_labels=1, so predictions are expected to
    # arrive with a trailing singleton axis, i.e. (N, 1), while labels are
    # (N,). Passing those shapes straight to np.corrcoef fails on a dimension
    # mismatch, and stats.pearsonr expects 1-D input, so flatten both first.
    preds = np.asarray(preds, dtype=np.float64).reshape(-1)
    labels = np.asarray(labels, dtype=np.float64).reshape(-1)

    # Plain Python floats keep the metrics dict trivially serialisable.
    corr_scipy = float(stats.pearsonr(labels, preds)[0])
    corr_np = float(np.corrcoef(preds, labels)[0, 1])

    return {"corr_scipy": corr_scipy, "corr_np": corr_np}

In [None]:
# Bundle the values from the configuration cell into TrainingArguments.
# No seed is passed, so the library default applies.
args = TrainingArguments(
    # Output locations
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    logging_dir=logging_dir,

    # Training length and evaluation schedule
    num_train_epochs=num_train_epochs,
    max_steps=max_steps,
    evaluation_strategy=evaluation_strategy,
    eval_steps=eval_steps,
    metric_for_best_model=metric_for_best_model,

    # Batching and data loading
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    dataloader_num_workers=dataloader_num_workers,

    # Checkpointing (save_steps must be a round multiple of eval_steps)
    save_steps=save_steps,
    save_total_limit=save_total_limit,
    load_best_model_at_end=load_best_model_at_end,

    # Logging and reporting
    log_level=log_level,
    logging_steps=logging_steps,
    logging_first_step=logging_first_step,
    report_to=report_to,

    # Optimizer and learning-rate schedule
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    adam_beta1=adam_beta1,
    adam_beta2=adam_beta2,
    adam_epsilon=adam_epsilon,
    lr_scheduler_type=lr_scheduler_type,
    warmup_steps=warmup_steps,
)

In [None]:
# Assemble the Trainer from the model, arguments, datasets and metric
# function, with early stopping and TensorBoard logging callbacks attached.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=early_stop_patience),
        TensorBoardCallback(),
    ],
)

In [None]:
# Start fine-tuning with the configured Trainer.
trainer.train()