In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import time
from scipy import stats
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from tqdm import tqdm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

!pip install transformers
!pip install sentencepiece
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
from transformers.integrations import TensorBoardCallback

pd.set_option('display.max_colwidth', 250)
pd.set_option('display.max_rows', 250)

[0m

In [2]:
model_name = 'microsoft/deberta-v3-base'
output_dir="./deberta_new_output"
logging_dir= './logs'
overwrite_output_dir=True
max_len=128

num_train_epochs=20
max_steps=-1
evaluation_strategy="steps"
eval_steps=250
metric_for_best_model='eval_corr_scipy'
early_stop_patience=3

per_device_train_batch_size=32
per_device_eval_batch_size=32
gradient_accumulation_steps=1
dataloader_num_workers=2

save_steps=250 # must be a round multiple of eval_steps
load_best_model_at_end=True
save_total_limit=2
log_level='info'
# log_level='debug'
logging_steps=250
logging_first_step=True

report_to="none"

learning_rate=0.00005
weight_decay=0.01
adam_beta1=0.9
adam_beta2=0.999
adam_epsilon=1e-08
lr_scheduler_type='cosine'
warmup_steps=400

In [3]:
df_train = pd.read_csv('../input/us-patent-phrase-to-phrase-matching/train.csv')
df_test = pd.read_csv('../input/us-patent-phrase-to-phrase-matching/test.csv')

In [4]:
tokenizer = AutoTokenizer.from_pretrained('/kaggle/input/deberta-trainer-1/tokenizer')

In [5]:
context_mapping_df = pd.read_csv('/kaggle/input/patentmatching-titles/titles.csv')
context_mapping = {}
for code, context in zip(context_mapping_df['code'], context_mapping_df['title']):
    context_mapping[code] = context

context_title_mapping = {"A" : "Human Necessities", 
      "B" : "Operations and Transport",
      "C" : "Chemistry and Metallurgy",
      "D" : "Textiles",
      "E" : "Fixed Constructions",
      "F" : "Mechanical Engineering",
      "G" : "Physics",
      "H" : "Electricity",
      "Y" : "Emerging Cross-Sectional Technologies"}

df_test['context_text'] = df_test['context'].apply(lambda x: context_mapping[x].lower())
df_test['context_title'] = df_test['context'].apply(lambda x: context_title_mapping[x[0]].lower())

df_test['text'] = df_test['anchor'] + '[SEP]' + df_test['target'] + '[SEP]' + df_test['context_text']

In [6]:
class TestPatentDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.texts = df['text'].values.tolist()
        self.max_len = max_len
        self.tokenizer = tokenizer
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        inputs_dict = self.tokenizer(self.texts[idx], padding='max_length', max_length = self.max_len, truncation=True, return_tensors="pt")
        for k, v in inputs_dict.items():
            inputs_dict[k] = v.squeeze(0)
#         inputs_dict['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return inputs_dict

In [7]:
test_dataset = TestPatentDataset(df_test, tokenizer, max_len)

In [8]:
checkpoint_id = 3000

In [9]:
model_path = f"/kaggle/input/deberta-trainer-1/deberta_new_output/checkpoint-{checkpoint_id}"
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=1) 

args = TrainingArguments(
    output_dir='tmp',
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    report_to='none'
)
test_trainer = Trainer(
    model=model,
    args=args
)

In [10]:
raw_pred, _, _ = test_trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 36
  Batch size = 64


In [11]:
submission_df = df_test[['id']]
submission_df['score'] = raw_pred
submission_df.to_csv('./submission.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
