In [9]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of each file.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(root_dir, name) for name in file_names):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [10]:
# Read train.csv from the us-patent-phrase-to-phrase-matching input dataset
# and preview its first rows.
TRAIN_CSV = "/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv"
train = pd.read_csv(TRAIN_CSV)
train.head()

In [11]:
# Read titles.csv from the cpc-codes input dataset and preview its first rows.
CPC_TITLES_CSV = '../input/cpc-codes/titles.csv'
cpc = pd.read_csv(CPC_TITLES_CSV)
cpc.head()

In [12]:
# Rename the CPC "code" column to "context" so it matches the join key in train,
# then left-join the title onto every training row.
# Gotcha: running this cell a second time merges again, and pandas then emits
# suffixed title_x / title_y columns instead of a single "title".
cpc = cpc.rename(columns={"code": "context"})
train = train.merge(cpc[["context", "title"]], how="left", on="context")
train.head()

In [13]:
# Build the "sen1" text by joining the anchor phrase and the CPC title with one
# space, then drop the columns that were folded into it.
# NOTE(review): astype('str') turns a missing title (NaN) into the text 'nan' —
# confirm every context code has a matching title.
anchor_text = train['anchor'].astype('str')
title_text = train['title'].astype('str')
train['sen1'] = anchor_text + ' ' + title_text
train = train.drop(columns=['anchor', 'context', 'title'])

In [14]:
# Preview the frame again now that "sen1" has replaced anchor/context/title.
train.head(n=5)

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows; the fixed random_state keeps the split repeatable.
feature_frame = train[['target', 'sen1']]
score_series = train['score']
x_train, x_test, y_train, y_test = train_test_split(
    feature_frame,
    score_series,
    test_size=0.3,
    random_state=1234,
)
# Print the shapes of the feature splits first, then the label splits.
for train_part, test_part in ((x_train, x_test), (y_train, y_test)):
    print(train_part.shape, test_part.shape)

In [16]:
import torch
import torch.nn as nn
import transformers
from torch.nn.utils.clip_grad import clip_grad_norm
import torch_xla
import torch_xla.core.xla_model as xm

In [17]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Release cached CUDA allocations left over from earlier runs in this kernel.
torch.cuda.empty_cache()

In [18]:
class PatentModel(nn.Module):
    """Pretrained encoder followed by a single-output regression head.

    The head applies dropout, a linear projection to one value and a sigmoid
    to the hidden state of the first token of each sequence.

    Args:
        bert_path: Name or directory passed to
            ``transformers.AutoModel.from_pretrained``.
        dropout: Dropout probability applied before the linear layer.
    """

    def __init__(self, bert_path, dropout=0.3):
        super().__init__()
        self.bert_path = bert_path
        self.bert = transformers.AutoModel.from_pretrained(self.bert_path)
        # Size the head from the loaded encoder instead of hard-coding 768, so
        # checkpoints with a different hidden width work without edits.
        hidden_size = self.bert.config.hidden_size
        self.fc_layer = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(hidden_size, 1),
            nn.Sigmoid(),
        )

    def forward(self, ids, mask, token_type_ids):
        """Return the head output for a batch of encoded pairs.

        Args:
            ids: Token ids, forwarded to the encoder as ``input_ids``.
            mask: Attention mask, forwarded as ``attention_mask``.
            token_type_ids: Segment ids, forwarded as ``token_type_ids``.

        Returns:
            Tensor with one sigmoid-squashed value per sequence (last dim 1).
        """
        out = self.bert(input_ids=ids, attention_mask=mask, token_type_ids=token_type_ids)
        # Position 0 of the last hidden state is used as the sequence summary.
        cls_embeddings = out.last_hidden_state[:, 0]
        return self.fc_layer(cls_embeddings)

In [19]:
class PatentDataset:
    """Indexable dataset that tokenizes one (text1, text2) pair per item.

    Args:
        text1: Sequence of first texts, indexed by integer position.
        text2: Sequence of second texts, parallel to ``text1``.
        label: Sequence of numeric targets, parallel to ``text1``.
        tokenizer: Callable invoked as ``tokenizer(text_a, text_b, ...)`` that
            returns a mapping with ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` lists.
        max_len: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, text1, text2, label, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.text1 = text1
        self.text2 = text2
        self.label = label

    def __len__(self):
        """Number of pairs, taken from the length of ``text1``."""
        return len(self.text1)

    def __getitem__(self, idx):
        """Encode pair ``idx`` and return its tensors in a dict.

        Returns:
            Dict with long tensors ``ids``, ``mask`` and ``token_type_ids`` and
            a float tensor ``targets``.
        """
        first = str(self.text1[idx])
        second = str(self.text2[idx])
        target = self.label[idx]

        encoded = self.tokenizer(
            first,
            second,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_attention_mask=True,
        )
        raw_ids = encoded['input_ids']
        raw_types = encoded["token_type_ids"]
        raw_mask = encoded['attention_mask']

        # Top up with zeros in case the tokenizer returned fewer than max_len
        # entries; a non-positive shortfall yields an empty filler.
        filler = [0] * (self.max_len - len(raw_ids))

        def _as_long(values):
            return torch.tensor(values + filler, dtype=torch.long)

        return {
            "ids": _as_long(raw_ids),
            "mask": _as_long(raw_mask),
            "token_type_ids": _as_long(raw_types),
            "targets": torch.tensor(target, dtype=torch.float),
        }

In [20]:
# Hyper-parameters: encoded sequence length, mini-batch size, number of epochs.
max_len, train_batch_size, epochs = 128, 8, 4

# Directory of the pretrained checkpoint; the tokenizer is loaded from it here
# and the model is built from the same path later on.
deberta_path = '../input/huggingface-deberta-variants/deberta-base/deberta-base'
tokenizer = transformers.AutoTokenizer.from_pretrained(deberta_path)

In [21]:
# Unpack the training split into plain lists so the dataset can index items
# by integer position.
train_text1 = [*x_train['target'].values]
train_text2 = [*x_train['sen1'].values]
train_label = [*y_train.values]

In [22]:
# Wrap the training lists in a PatentDataset and serve them in shuffled mini-batches.
train_dataset = PatentDataset(
    text1=train_text1,
    text2=train_text2,
    label=train_label,
    tokenizer=tokenizer,
    max_len=max_len,
)
train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=train_batch_size,
)

In [23]:
# Validation split: plain lists so the dataset can index items by position.
val_text1 = list(x_test['target'].values)
val_text2 = list(x_test['sen1'].values)
val_label = list(y_test.values)

valid_dataset = PatentDataset(
    text1=val_text1,
    text2=val_text2,
    label=val_label,
    tokenizer=tokenizer,
    max_len=max_len,
)
# No shuffling for validation: the reported metrics are averaged per batch, so a
# fixed batch composition keeps them repeatable from run to run.
valid_data_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=train_batch_size,
    shuffle=False,
)

In [24]:
def train(model, optimizer, scheduler, loss_fun, epochs, train_loader, val_loader, device, clip_val=2):
    """Run the optimisation loop and return the trained model.

    Args:
        model: Module called as ``model(ids, mask, token_type_ids)``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        loss_fun: Callable ``loss_fun(predictions, targets)`` returning a scalar tensor.
        epochs: Number of passes over ``train_loader``.
        train_loader: Iterable of dict batches with the keys ``ids``, ``mask``,
            ``token_type_ids`` and ``targets``.
        val_loader: Accepted for interface compatibility; not used here.
        device: Device every batch tensor is moved to.
        clip_val: Maximum gradient norm used for clipping.

    Returns:
        The same ``model`` instance, left in training mode.
    """
    model.train()
    for epoch in range(epochs):
        epoch_losses = []
        for batch in train_loader:
            batch_inputs = batch["ids"].to(device)
            batch_masks = batch["mask"].to(device)
            # Segment ids must sit on the same device as the other inputs.
            batch_token_type_ids = batch["token_type_ids"].to(device)
            batch_labels = batch["targets"].to(device)

            model.zero_grad()
            outputs = model(batch_inputs, batch_masks, batch_token_type_ids)
            # Flatten both sides so (batch, 1) predictions line up with (batch,)
            # targets, including a final batch of size one.
            loss = loss_fun(outputs.reshape(-1), batch_labels.reshape(-1))
            # Keep only the Python float; storing the tensor would keep each
            # step's autograd graph alive for the whole epoch.
            epoch_losses.append(loss.item())
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), clip_val)
            optimizer.step()
            scheduler.step()
        # An empty loader has no loss to average; report NaN rather than crash.
        loss2 = sum(epoch_losses) / len(epoch_losses) if epoch_losses else float("nan")
        print(f'Epoch : {epoch} ,Train loss : {loss2}')

    return model

In [25]:
def r2_score(outputs, labels):
    """Coefficient of determination (R^2) between predictions and targets.

    Both tensors are flattened first, so a (batch, 1) prediction and a
    (batch,) label are compared element by element instead of being
    broadcast against each other into a (batch, batch) grid.

    Args:
        outputs: Predicted values; any shape with as many elements as ``labels``.
        labels: Ground-truth values.

    Returns:
        Scalar tensor ``1 - SS_res / SS_tot``. The division is not guarded, so
        the result is not finite when every label is identical.
    """
    flat_outputs = outputs.reshape(-1)
    flat_labels = labels.reshape(-1)
    labels_mean = torch.mean(flat_labels)
    ss_tot = torch.sum((flat_labels - labels_mean) ** 2)
    ss_res = torch.sum((flat_labels - flat_outputs) ** 2)
    r2 = 1 - ss_res / ss_tot
    return r2

In [26]:
def evaluate(model, loss_function, test_dataloader, device):
    """Score the model on every batch of a loader without tracking gradients.

    Args:
        model: Module called as ``model(ids, mask, token_type_ids)``.
        loss_function: Callable ``loss_function(predictions, targets)``
            returning a scalar tensor.
        test_dataloader: Iterable of dict batches with the keys ``ids``,
            ``mask``, ``token_type_ids`` and ``targets``.
        device: Device every batch tensor is moved to.

    Returns:
        Tuple ``(test_loss, test_r2)`` of per-batch Python floats, in loader
        order. The model is left in eval mode.
    """
    model.eval()
    test_loss, test_r2 = [], []
    for batch in test_dataloader:
        batch_inputs = batch['ids'].to(device)
        batch_masks = batch['mask'].to(device)
        batch_token_type_ids = batch['token_type_ids'].to(device)
        batch_labels = batch['targets'].to(device)
        with torch.no_grad():
            outputs = model(batch_inputs, batch_masks, batch_token_type_ids)
            # Flatten before scoring: a (batch, 1) prediction against a (batch,)
            # label would otherwise broadcast to (batch, batch) and silently
            # distort both the loss and the R^2.
            flat_outputs = outputs.reshape(-1)
            flat_labels = batch_labels.reshape(-1)
            loss = loss_function(flat_outputs, flat_labels)
            r2 = r2_score(flat_outputs, flat_labels)
        test_loss.append(loss.item())
        test_r2.append(r2.item())
    return test_loss, test_r2

In [None]:
# One scheduler step is taken per batch, so the schedule spans every batch of every epoch.
num_train_steps = len(train_data_loader) * epochs
model = PatentModel(deberta_path).to(device)

# torch.optim.AdamW replaces the deprecated transformers.AdamW. weight_decay=0.0
# matches the default of the transformers version (the torch default is 0.01),
# so the update rule is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, eps=1e-8, weight_decay=0.0)
# Linear decay of the learning rate to zero over the run, with no warm-up phase.
scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_train_steps
)
loss_function = nn.MSELoss()

model = train(model, optimizer, scheduler, loss_function, epochs, train_data_loader, valid_data_loader, device)
loss1, r2_ = evaluate(model, loss_function, valid_data_loader, device)

# evaluate() returns one value per batch; report the plain mean over batches.
loss = sum(loss1) / len(loss1)
r2 = sum(r2_) / len(r2_)
print(f"eval mean result : loss {loss}, r2 {r2}")