In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import time
from scipy import stats
from tqdm import tqdm
from multiprocessing import cpu_count

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForSequenceClassification
from transformers import get_cosine_schedule_with_warmup

In [2]:
model_name = 'microsoft/deberta-v3-base'

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.save_pretrained('./tokenizer/')
config = AutoConfig.from_pretrained(model_name)
config.save_pretrained('./config/')

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
df_train = pd.read_csv('/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv')

In [5]:
context_mapping_df = pd.read_csv('/kaggle/input/patentmatching-titles/titles.csv')
context_mapping = {}
for code, context in zip(context_mapping_df['code'], context_mapping_df['title']):
    context_mapping[code] = context

context_title_mapping = {"A" : "Human Necessities", 
      "B" : "Operations and Transport",
      "C" : "Chemistry and Metallurgy",
      "D" : "Textiles",
      "E" : "Fixed Constructions",
      "F" : "Mechanical Engineering",
      "G" : "Physics",
      "H" : "Electricity",
      "Y" : "Emerging Cross-Sectional Technologies"}

df_train['context_text'] = df_train['context'].apply(lambda x: context_mapping[x].lower())
df_train['context_title'] = df_train['context'].apply(lambda x: context_title_mapping[x[0]].lower())

df_train['text'] = df_train['anchor'] + '[SEP]' + df_train['target'] + '[SEP]' + df_train['context_text']

In [6]:
label_mapping = {0.0: 0, 0.25: 1, 0.5: 2, 0.75: 3, 1.0: 4}
df_train['label'] = df_train['score'].apply(lambda x: label_mapping[x])

In [7]:
# df_train, _ = train_test_split(df_train, test_size=0.85, stratify=df_train['label'])
X_train, X_valid = train_test_split(df_train, test_size=0.15, stratify=df_train['label'])

In [8]:
class TrainPatentDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.texts = df['text'].values.tolist()
        self.labels = df['score'].values.tolist()
        self.max_len = max_len
        self.tokenizer = tokenizer
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        inputs = self.tokenizer(self.texts[idx], padding='max_length', max_length = self.max_len, truncation=True, return_tensors="pt")
        for k, v in inputs.items():
            inputs[k] = v.squeeze(0)
        labels = torch.tensor(self.labels[idx])
        return inputs, labels

In [9]:
class CustomModel(nn.Module):
    def __init__(self, model_name):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.attention = nn.Sequential(
            nn.Linear(768, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )
        self.dropout1 = nn.Dropout()
        self.dropout2 = nn.Dropout()
        self.fc = nn.Linear(768, 1)
        
    def forward(self, inputs):
        outputs = self.model(**inputs).last_hidden_state
        outputs = self.dropout1(outputs)
        weights = self.attention(outputs)
        outputs = torch.sum(weights * outputs, dim=1)
        outputs = self.fc(self.dropout2(outputs))
        
        return outputs

In [10]:
def save_model(model, epoch_label, model_save_name):
    save_path = f'{model_save_name}_{epoch_label}.pth'
    torch.save(model.state_dict(), save_path)

In [11]:
max_len=128
batch_size = 32
num_workers=cpu_count()

In [12]:
train_dataset = TrainPatentDataset(X_train, tokenizer, max_len)
valid_dataset = TrainPatentDataset(X_valid, tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

In [13]:
model_save_name = 'uspppm_deberta'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_epochs=25
total_steps=int(num_epochs*X_train.shape[0]/batch_size)+1

learning_rate=0.0005
num_warmup_steps=int(0.1*total_steps)+1
early_stop_patience=4

model = CustomModel(model_name)
model = model.to(device)

criterion = nn.BCEWithLogitsLoss()

opt = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = get_cosine_schedule_with_warmup(opt, num_warmup_steps=num_warmup_steps, num_training_steps=total_steps)

Downloading:   0%|          | 0.00/354M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
train_loss_history = []
val_loss_history = []
val_pearson_history = []
valid_labels = X_valid['label']

best_pearson = -1
best_epoch = 1
for epoch in range(1, num_epochs+1):
    print(f'Epoch {epoch}')

    train_loss = []
    model.train()
    for inputs, labels in tqdm(train_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        y_preds = model(inputs).squeeze(-1)
#         print(y_preds.shape, labels.shape)
#         print(labels)
#         print(y_preds)

        opt.zero_grad()
        loss = criterion(y_preds, labels)
        train_loss.append(loss.item())
#         print(loss.item())

        loss.backward()
        opt.step()
        scheduler.step()
    
#     print(train_loss)
    train_loss = np.mean(train_loss)
    train_loss_history.append(train_loss)

    model.eval()
    val_loss = []
    val_preds = []
    for inputs, labels in tqdm(valid_loader):
        with torch.no_grad():
            for k, v in inputs.items():
                inputs[k] = v.to(device)
            labels = labels.to(device)
            
            y_preds = model(inputs).squeeze(-1)
            loss = criterion(y_preds, labels)
            val_loss.append(loss.item())
            val_preds.append(y_preds.sigmoid().cpu().detach().numpy())
    val_loss = np.mean(val_loss)
    val_loss_history.append(val_loss)
    
    predictions = np.concatenate(val_preds)
    pear_cor = stats.pearsonr(valid_labels, predictions)[0]
    val_pearson_history.append(pear_cor)
    print(f'Train loss: {train_loss:.6f}')
    print(f'Valid loss: {val_loss:.6f}, Pearson: {pear_cor:.6f}')
    
    
    if pear_cor > best_pearson:
        if epoch != 1:
            os.remove(f'{model_save_name}_{best_epoch}.pth')
        best_pearson = pear_cor
        best_epoch = epoch
        save_model(model, best_epoch, model_save_name)
    elif epoch - best_epoch == early_stop_patience:
        break

Epoch 1


100%|██████████| 969/969 [08:05<00:00,  1.99it/s]
100%|██████████| 171/171 [00:26<00:00,  6.36it/s]


Train loss: 0.599434
Valid loss: 0.576665, Pearson: 0.736296
Epoch 2


100%|██████████| 969/969 [08:04<00:00,  2.00it/s]
100%|██████████| 171/171 [00:26<00:00,  6.34it/s]


Train loss: 0.624252
Valid loss: 0.655105, Pearson: 0.015037
Epoch 3


100%|██████████| 969/969 [08:04<00:00,  2.00it/s]
100%|██████████| 171/171 [00:27<00:00,  6.29it/s]


Train loss: 0.655825
Valid loss: 0.654566, Pearson: 0.014480
Epoch 4


100%|██████████| 969/969 [08:05<00:00,  2.00it/s]
100%|██████████| 171/171 [00:26<00:00,  6.35it/s]


Train loss: 0.655574
Valid loss: 0.654588, Pearson: 0.015544
Epoch 5


100%|██████████| 969/969 [08:04<00:00,  2.00it/s]
100%|██████████| 171/171 [00:26<00:00,  6.36it/s]

Train loss: 0.655620
Valid loss: 0.655832, Pearson: 0.015372



