In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import time
from scipy import stats
from tqdm import tqdm
from multiprocessing import cpu_count

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForSequenceClassification
from transformers import get_cosine_schedule_with_warmup

In [None]:
# Pretrained checkpoints for the two encoders, in the order they are used
# everywhere below (tokenizer1/config1/model1 and tokenizer2/config2/model2).
model_name1, model_name2 = (
    'microsoft/deberta-v3-base',
    'anferico/bert-for-patents',
)

In [None]:
def _load_and_export(factory, checkpoint, out_dir, **load_kwargs):
    """Load a pretrained artefact via ``factory.from_pretrained`` and save a copy to ``out_dir``.

    Returns the loaded object so it can be bound to a notebook-level name.
    """
    artefact = factory.from_pretrained(checkpoint, **load_kwargs)
    artefact.save_pretrained(out_dir)
    return artefact


# Tokenizers (fast implementations requested) for both checkpoints.
tokenizer1 = _load_and_export(AutoTokenizer, model_name1, './tokenizer1/', use_fast=True)
tokenizer2 = _load_and_export(AutoTokenizer, model_name2, './tokenizer2/', use_fast=True)

# Model configurations for both checkpoints, exported next to the tokenizers.
config1 = _load_and_export(AutoConfig, model_name1, './config1/')
config2 = _load_and_export(AutoConfig, model_name2, './config2/')

In [None]:
# Training table; later cells read its 'anchor', 'target', 'context' and 'score' columns.
train_csv_path = '/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv'
df_train = pd.read_csv(train_csv_path)

In [None]:
# Lookup table: full context code -> descriptive title, built from the titles CSV.
context_mapping_df = pd.read_csv('/kaggle/input/patentmatching-titles/titles.csv')
context_mapping = dict(zip(context_mapping_df['code'], context_mapping_df['title']))

# Lookup table: first character of the context code -> top-level section name.
context_title_mapping = {
    "A": "Human Necessities",
    "B": "Operations and Transport",
    "C": "Chemistry and Metallurgy",
    "D": "Textiles",
    "E": "Fixed Constructions",
    "F": "Mechanical Engineering",
    "G": "Physics",
    "H": "Electricity",
    "Y": "Emerging Cross-Sectional Technologies",
}


def _context_description(code):
    """Lower-cased title of a full context code (KeyError if the code is unknown)."""
    return context_mapping[code].lower()


def _section_description(code):
    """Lower-cased section name selected by the first character of the context code."""
    return context_title_mapping[code[0]].lower()


df_train['context_text'] = df_train['context'].apply(_context_description)
# NOTE: 'context_title' is computed here but is not part of the 'text' column built below.
df_train['context_title'] = df_train['context'].apply(_section_description)

# Model input: anchor, target and context description joined by the literal '[SEP]' marker.
separator = '[SEP]'
df_train['text'] = (
    df_train['anchor'] + separator + df_train['target'] + separator + df_train['context_text']
)

In [None]:
# Each allowed score value gets an integer class index (0.0 -> 0, 0.25 -> 1, ..., 1.0 -> 4).
label_mapping = {
    score: index for index, score in enumerate((0.0, 0.25, 0.5, 0.75, 1.0))
}


def _score_to_label(score):
    """Class index of a score; raises KeyError for a score outside the five known values."""
    return label_mapping[score]


df_train['label'] = df_train['score'].apply(_score_to_label)

In [None]:
# Seed for the stratified split. Without it every "Restart & Run All" produces a
# different validation set, so validation metrics are not comparable across runs.
SPLIT_SEED = 42

# Optional debugging aid: uncomment to train on a 15% stratified subsample only.
# df_train, _ = train_test_split(df_train, test_size=0.85, stratify=df_train['label'], random_state=SPLIT_SEED)

# 85% / 15% train / validation split, stratified on the integer class label.
X_train, X_valid = train_test_split(
    df_train,
    test_size=0.15,
    stratify=df_train['label'],
    random_state=SPLIT_SEED,
)

In [None]:
class TrainPatentDataset(Dataset):
    """Dataset yielding one text encoded by two tokenizers plus its score.

    Args:
        df: table with a 'text' column (model input) and a 'score' column (target).
        tokenizer1: callable tokenizer used for the first encoding.
        tokenizer2: callable tokenizer used for the second encoding.
        max_len: length every encoding is padded / truncated to.
    """

    def __init__(self, df, tokenizer1, tokenizer2, max_len):
        self.tokenizer1 = tokenizer1
        self.tokenizer2 = tokenizer2
        self.max_len = max_len
        self.texts = df['text'].values.tolist()
        self.labels = df['score'].values.tolist()

    def __len__(self):
        return len(self.texts)

    def _encode(self, tokenizer, text):
        """Tokenize ``text`` and strip the leading batch axis from every returned tensor."""
        encoded = tokenizer(
            text,
            padding='max_length',
            max_length=self.max_len,
            truncation=True,
            return_tensors="pt",
        )
        # The tokenizer returns tensors with a leading axis of size 1; drop it in place
        # so that the DataLoader can add the real batch axis when collating.
        for field in list(encoded.keys()):
            encoded[field] = encoded[field].squeeze(0)
        return encoded

    def __getitem__(self, idx):
        """Return ``(encoding from tokenizer1, encoding from tokenizer2, score tensor)``."""
        sample_text = self.texts[idx]
        first_encoding = self._encode(self.tokenizer1, sample_text)
        second_encoding = self._encode(self.tokenizer2, sample_text)
        target = torch.tensor(self.labels[idx])
        return first_encoding, second_encoding, target

In [None]:
class CustomModel(nn.Module):
    """Two-encoder regressor with learned attention pooling on each encoder.

    Each encoder's ``last_hidden_state`` is passed through dropout, scored by a
    small attention MLP (softmax over dimension 1), and reduced to one vector
    by the weighted sum over that dimension. The two pooled vectors are
    concatenated and mapped to a single output value per example.

    The attention and output layer widths are taken from each encoder's
    ``config.hidden_size`` instead of being hard-coded, so the class works
    with checkpoints of any hidden size. For the checkpoints used in this
    notebook the resulting layer shapes (768 and 1024) and the parameter
    names in the state dict are unchanged.

    Args:
        model_name1: checkpoint name/path for the first encoder.
        model_name2: checkpoint name/path for the second encoder.
    """

    def __init__(self, model_name1, model_name2):
        super().__init__()
        self.model1 = AutoModel.from_pretrained(model_name1)
        hidden_size1 = self.model1.config.hidden_size
        self.attention1 = self._build_attention(hidden_size1)
        self.dropout11 = nn.Dropout()

        self.model2 = AutoModel.from_pretrained(model_name2)
        hidden_size2 = self.model2.config.hidden_size
        self.attention2 = self._build_attention(hidden_size2)
        self.dropout12 = nn.Dropout()

        self.dropout2 = nn.Dropout()
        self.fc = nn.Linear(hidden_size1 + hidden_size2, 1)

    @staticmethod
    def _build_attention(hidden_size, attention_size=512):
        """Build the scoring MLP that yields one softmax-normalised weight per position."""
        return nn.Sequential(
            nn.Linear(hidden_size, attention_size),
            nn.Tanh(),
            nn.Linear(attention_size, 1),
            # Normalise over dimension 1 (assumed to be the token axis of
            # a (batch, tokens, hidden) tensor).
            nn.Softmax(dim=1),
        )

    @staticmethod
    def _attention_pool(encoder, dropout, attention, inputs):
        """Encode ``inputs`` and collapse dimension 1 with attention weights."""
        hidden_states = dropout(encoder(**inputs).last_hidden_state)
        weights = attention(hidden_states)
        return torch.sum(weights * hidden_states, dim=1)

    def forward(self, inputs1, inputs2):
        """Return one raw (un-activated) output per example.

        Args:
            inputs1: mapping of keyword arguments for the first encoder.
            inputs2: mapping of keyword arguments for the second encoder.

        Returns:
            Tensor whose last dimension has size 1.
        """
        pooled1 = self._attention_pool(self.model1, self.dropout11, self.attention1, inputs1)
        pooled2 = self._attention_pool(self.model2, self.dropout12, self.attention2, inputs2)

        features = torch.cat((pooled1, pooled2), -1)
        return self.fc(self.dropout2(features))

In [None]:
def save_model(model, epoch_label, model_save_name):
    """Write the model's state dict to ``<model_save_name>_<epoch_label>.pth``.

    Args:
        model: module whose ``state_dict()`` is serialised.
        epoch_label: value appended to the file name after an underscore.
        model_save_name: file name prefix (may include a directory).
    """
    weights = model.state_dict()
    destination = '{}_{}.pth'.format(model_save_name, epoch_label)
    torch.save(weights, destination)

In [None]:
# Data-loading settings shared by the training and validation pipelines.
max_len = 128               # length every tokenized example is padded / truncated to
batch_size = 8              # examples per DataLoader batch
num_workers = cpu_count()   # one loader worker per CPU reported by multiprocessing

In [None]:
# Both splits share tokenizers and sequence length; only the source frame differs.
train_dataset = TrainPatentDataset(X_train, tokenizer1, tokenizer2, max_len)
valid_dataset = TrainPatentDataset(X_valid, tokenizer1, tokenizer2, max_len)

# Settings common to both loaders.
loader_kwargs = dict(batch_size=batch_size, num_workers=num_workers)

# Training batches are reshuffled; validation keeps the row order of X_valid so
# predictions can be compared position-by-position with its labels.
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
valid_loader = DataLoader(valid_dataset, shuffle=False, **loader_kwargs)

In [None]:
# --- Run identity and hardware ---------------------------------------------
model_save_name = 'uspppm_deberta'   # prefix of the checkpoint files
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# --- Optimisation hyper-parameters -------------------------------------------
num_epochs = 25
learning_rate = 5e-4
num_warmup_steps = 2          # counted in scheduler steps (see num_training_steps below)
early_stop_patience = 4
gradient_accumulation = 4

# --- Model, loss, optimiser, schedule ----------------------------------------
model = CustomModel(model_name1, model_name2).to(device)

# Loss applied to the raw model outputs against the 'score' targets.
criterion = nn.BCEWithLogitsLoss()

opt = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)
# The schedule length equals the number of epochs, i.e. it is meant to be
# advanced once per epoch.
# NOTE(review): with warm-up counted in epochs the first epoch presumably runs
# at learning rate 0 - confirm against get_cosine_schedule_with_warmup.
scheduler = get_cosine_schedule_with_warmup(
    opt,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_epochs,
)

In [None]:
def _to_device(batch, target_device):
    """Return a plain dict with every tensor of ``batch`` moved to ``target_device``."""
    return {key: tensor.to(target_device) for key, tensor in batch.items()}


# Per-epoch metric histories.
train_loss_history = []
val_loss_history = []
val_pearson_history = []
# Class indices are an affine function of the score (see label_mapping), and
# Pearson correlation is invariant to that, so they can stand in for the scores.
valid_labels = X_valid['label'].to_numpy()

best_pearson = -1
best_epoch = 0       # 0 means "no checkpoint written yet"
step_counter = 0     # number of optimizer updates applied so far
num_train_batches = len(train_loader)

for epoch in range(1, num_epochs + 1):
    print(f'Epoch {epoch}')

    # ---------------------------- training ----------------------------------
    train_loss = []
    model.train()
    opt.zero_grad()
    for batch_idx, (inputs1, inputs2, labels) in enumerate(tqdm(train_loader), start=1):
        inputs1 = _to_device(inputs1, device)
        inputs2 = _to_device(inputs2, device)
        labels = labels.to(device)
        y_preds = model(inputs1, inputs2).squeeze(-1)

        loss = criterion(y_preds, labels)
        train_loss.append(loss.item())

        # Scale so that the accumulated gradient is the mean over the group.
        (loss / gradient_accumulation).backward()

        # Update once per accumulation group. The previous code compared a
        # counter that was never incremented, so the optimizer never stepped.
        # The last (possibly shorter) group of the epoch is flushed as well so
        # no gradient leaks into the next epoch.
        if batch_idx % gradient_accumulation == 0 or batch_idx == num_train_batches:
            opt.step()
            opt.zero_grad()
            step_counter += 1

    train_loss = np.mean(train_loss)
    train_loss_history.append(train_loss)

    # ---------------------------- validation --------------------------------
    model.eval()
    val_loss = []
    val_preds = []
    with torch.no_grad():
        for inputs1, inputs2, labels in tqdm(valid_loader):
            inputs1 = _to_device(inputs1, device)
            inputs2 = _to_device(inputs2, device)
            labels = labels.to(device)

            y_preds = model(inputs1, inputs2).squeeze(-1)
            loss = criterion(y_preds, labels)
            val_loss.append(loss.item())
            val_preds.append(y_preds.sigmoid().cpu().detach().numpy())
    val_loss = np.mean(val_loss)
    val_loss_history.append(val_loss)

    predictions = np.concatenate(val_preds)
    pear_cor = stats.pearsonr(valid_labels, predictions)[0]
    val_pearson_history.append(pear_cor)
    print(f'Train loss: {train_loss:.6f}')
    print(f'Valid loss: {val_loss:.6f}, Pearson: {pear_cor:.6f}')

    # The schedule is defined in epochs, so it advances once per epoch.
    scheduler.step()

    # ------------------- checkpointing / early stopping ---------------------
    # A NaN correlation compares False here and therefore counts as "no improvement".
    if pear_cor > best_pearson:
        previous_checkpoint = f'{model_save_name}_{best_epoch}.pth' if best_epoch else None
        best_pearson = pear_cor
        best_epoch = epoch
        # Write the new best first, then drop the old one, so a failed save
        # never leaves the run without any checkpoint.
        save_model(model, best_epoch, model_save_name)
        if previous_checkpoint is not None and os.path.exists(previous_checkpoint):
            os.remove(previous_checkpoint)
    elif epoch - best_epoch >= early_stop_patience:
        break