In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import time
from scipy import stats
from tqdm import tqdm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

!pip install transformers
!pip install sentencepiece
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForSequenceClassification

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Name of the pretrained checkpoint; it is used for the tokenizer here and for the
# backbone loaded inside CustomModel. The commented-out names are alternative checkpoints.
model_name = 'microsoft/deberta-v3-small'
# model_name = 'microsoft/deberta-v3-base'
# model_name = 'bert-base-uncased'
# Load the tokenizer matching the selected checkpoint, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Read the training and test tables from CSV files stored on Google Drive.
# NOTE(review): these absolute paths assume Drive is already mounted at /content/drive;
# no mount step is visible in this notebook — confirm before running on a fresh kernel.
df_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/hse_mldm/train.csv')
df_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/hse_mldm/test.csv')

In [None]:
# Display the raw training table (columns: id, anchor, target, context, score).
df_train

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.50
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.50
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.00
...,...,...,...,...,...
36468,8e1386cbefd7f245,wood article,wooden article,B44,1.00
36469,42d9e032d1cd3242,wood article,wooden box,B44,0.50
36470,208654ccb9e14fa3,wood article,wooden handle,B44,0.50
36471,756ec035e694722b,wood article,wooden material,B44,0.75


In [None]:
# Lookup table: full context code (e.g. "A47") -> descriptive title, read from titles.csv.
context_mapping_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/hse_mldm/titles.csv')
context_mapping = dict(zip(context_mapping_df['code'], context_mapping_df['title']))

# Lookup table: first letter of the context code -> top-level section title.
context_title_mapping = {
    "A": "Human Necessities",
    "B": "Operations and Transport",
    "C": "Chemistry and Metallurgy",
    "D": "Textiles",
    "E": "Fixed Constructions",
    "F": "Mechanical Engineering",
    "G": "Physics",
    "H": "Electricity",
    "Y": "Emerging Cross-Sectional Technologies",
}

# Build the derived columns for each split from THAT split's own `context` column.
# (Previously the test columns were computed from df_train['context'], which silently
# attached training-set contexts to the test rows via index alignment.)
for split_df in (df_train, df_test):
    # Lower-cased description of the full context code.
    split_df['context_text'] = split_df['context'].apply(lambda x: context_mapping[x].lower())
    # Lower-cased title of the section given by the code's first letter.
    split_df['context_title'] = split_df['context'].apply(lambda x: context_title_mapping[x[0]].lower())
    # Model input: anchor, target and context description joined by a literal '[SEP]'.
    split_df['text'] = split_df['anchor'] + '[SEP]' + split_df['target'] + '[SEP]' + split_df['context_text']

In [None]:
# Display the training table again to inspect the derived context/text columns.
df_train

In [None]:
# Each of the five score values becomes a class index 0..4, so the task can be trained
# as classification. A score missing from the table raises KeyError.
label_mapping = {0.0: 0, 0.25: 1, 0.5: 2, 0.75: 3, 1.0: 4}
df_train['label'] = df_train['score'].apply(lambda score: label_mapping[score])

In [None]:
def _count_tokens(texts):
    """Return the number of tokens (special tokens excluded) for each string in `texts`."""
    return [len(tokenizer(entry, add_special_tokens=False)['input_ids']) for entry in texts]


# Per-row token counts for the three text fields of the training set.
context_text_lengths = _count_tokens(df_train['context_text'])
anchor_lengths = _count_tokens(df_train['anchor'])
target_lengths = _count_tokens(df_train['target'])

max_context_text_length = max(context_text_lengths)
max_anchor_length = max(anchor_lengths)
max_target_length = max(target_lengths)

# Upper bound on the sequence length: the longest value of each field plus 4 extra slots.
# NOTE(review): the +4 presumably leaves room for special tokens ([CLS] and the [SEP]s) —
# confirm. Only training rows are measured; longer test rows would be truncated.
max_len = max_context_text_length + max_anchor_length + max_target_length + 4
print(f'max_len={max_len}')

max_len=99


In [None]:
# Hold out 15% of the labelled rows for validation, stratified so every label class keeps
# the same proportion in both parts. The fixed random_state makes the split (and therefore
# every validation metric reported below) reproducible across kernel restarts.
X_train, X_valid = train_test_split(
    df_train,
    test_size=0.15,
    stratify=df_train['label'],
    random_state=42,
)

In [None]:
class TrainPatentDataset(Dataset):
    """Labelled dataset that tokenizes one row of text on demand.

    Args:
        df: DataFrame with a 'text' column (model input string) and a 'label' column
            (integer class index).
        tokenizer: callable used to encode each text; it is called with
            ``padding='max_length'``, ``max_length``, ``truncation=True`` and
            ``return_tensors="pt"``.
        max_len: fixed length every encoded sequence is padded/truncated to.
    """

    def __init__(self, df, tokenizer, max_len):
        self.texts = df['text'].values.tolist()
        self.labels = df['label'].values.tolist()
        # Keep the tokenizer that was passed in; previously the argument was ignored and
        # __getitem__ silently relied on a module-level `tokenizer` variable.
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(inputs, label)`` for row `idx`.

        `inputs` is the tokenizer output with the leading batch dimension of size 1
        removed from every tensor; `label` is a scalar ``torch.long`` tensor.
        """
        inputs = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            max_length=self.max_len,
            truncation=True,
            return_tensors="pt",
        )
        # return_tensors="pt" yields tensors with a leading batch axis of 1; drop it so
        # the DataLoader can stack samples into a batch itself.
        for key in list(inputs.keys()):
            inputs[key] = inputs[key].squeeze(0)
        labels = torch.tensor(self.labels[idx], dtype=torch.long)
        return inputs, labels

In [None]:
class TestPatentDataset(Dataset):
    """Unlabelled dataset that tokenizes one row of text on demand.

    Args:
        df: DataFrame with a 'text' column holding the model input strings.
        tokenizer: callable used to encode each text; it is called with
            ``padding='max_length'``, ``max_length``, ``truncation=True`` and
            ``return_tensors="pt"``.
        max_len: fixed length every encoded sequence is padded/truncated to.
    """

    def __init__(self, df, tokenizer, max_len):
        self.texts = df['text'].values.tolist()
        # Both attributes were missing before: __getitem__ read self.max_len (raising
        # AttributeError on first access) and used a module-level `tokenizer` instead of
        # the one passed in.
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer output for row `idx` with the batch axis removed."""
        inputs = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            max_length=self.max_len,
            truncation=True,
            return_tensors="pt",
        )
        # Drop the leading batch axis of size 1 so the DataLoader can batch the samples.
        for key in list(inputs.keys()):
            inputs[key] = inputs[key].squeeze(0)
        return inputs

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Training hyperparameters shared by the loaders and the training loop.
num_epochs = 5
batch_size = 16

In [None]:
# Wrap each split in its dataset class; all three share the tokenizer and max_len.
train_dataset = TrainPatentDataset(df=X_train, tokenizer=tokenizer, max_len=max_len)
valid_dataset = TrainPatentDataset(df=X_valid, tokenizer=tokenizer, max_len=max_len)
test_dataset = TestPatentDataset(df=df_test, tokenizer=tokenizer, max_len=max_len)

# Only the training loader shuffles. Validation order must stay fixed because the
# training loop compares predictions positionally against X_valid['label'].
loader_options = {'batch_size': batch_size, 'num_workers': 2}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
valid_loader = DataLoader(valid_dataset, shuffle=False, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

In [None]:
class CustomModel(nn.Module):
    """Pretrained transformer backbone with attention pooling and a 5-way linear head.

    Args:
        model_name: checkpoint name or path handed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        # Read the width from the backbone instead of hard-coding 768, so checkpoints
        # with a different hidden size work as well.
        hidden_size = self.model.config.hidden_size
        self.fc_dropout = nn.Dropout()
        self.fc = nn.Linear(hidden_size, 5)
        # Scores one scalar per token. The softmax now lives in forward() so padding can
        # be masked first; the Linear layers keep indices 0 and 2, so state_dict keys
        # ("attention.0.*", "attention.2.*") are unchanged and old checkpoints still load.
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
        )

    def forward(self, inputs):
        """Compute class logits for a batch.

        Args:
            inputs: mapping of tokenizer outputs that is unpacked into the backbone
                (``self.model(**inputs)``). If it contains 'attention_mask', positions
                where the mask is 0 are excluded from the pooling.

        Returns:
            Tensor of logits with one row per sample and 5 columns.
        """
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        scores = self.attention(last_hidden_states)

        attention_mask = inputs['attention_mask'] if 'attention_mask' in inputs else None
        if attention_mask is not None:
            # Every sequence is padded to a fixed length; without this mask the softmax
            # would hand part of the pooling weight to padding tokens. Using the dtype's
            # minimum (not -inf) avoids NaNs should a row be fully masked.
            is_padding = attention_mask.unsqueeze(-1) == 0
            scores = scores.masked_fill(is_padding, torch.finfo(scores.dtype).min)

        # Normalize over the sequence axis and take the weighted sum of token states.
        weights = torch.softmax(scores, dim=1)
        feature = torch.sum(weights * last_hidden_states, dim=1)
        output = self.fc(self.fc_dropout(feature))
        return output

In [None]:
# Build the classifier from the selected checkpoint and place it on the training device.
model = CustomModel(model_name).to(device)

Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Show which device was selected.
device

device(type='cuda')

In [None]:
# criterion = nn.BCEWithLogitsLoss()
# Multi-class loss over the 5 label classes; it receives the model's raw logits and
# integer class indices in the training loop.
criterion = nn.CrossEntropyLoss()

In [None]:
# Optimize every parameter of the model (backbone and head) with SGD plus momentum.
params_to_update = model.parameters()
opt = torch.optim.SGD(params_to_update, lr=0.001, momentum=0.9)

In [None]:
# Train for `num_epochs` epochs; after each epoch evaluate on the validation split and
# record the losses and the Pearson correlation between predicted and true class indices.
start = time.time()
train_loss_history = []
val_loss_history = []
val_pearson_history = []
# valid_loader does not shuffle, so concatenated predictions line up with this column.
valid_labels = X_valid['label']

# Best validation Pearson seen so far and the epoch that produced it. (The name
# `best_acc` is kept for compatibility; previously neither variable was ever updated.)
best_acc = float('-inf')
best_model_epoch = 0
for epoch in range(num_epochs):
    print(f'Epoch {epoch}/{num_epochs}')

    # ---- training pass ----
    train_loss = []
    model.train()
    for inputs, labels in tqdm(train_loader):
        inputs = {k: v.to(device) for k, v in inputs.items()}
        labels = labels.to(device)

        opt.zero_grad()
        y_preds = model(inputs)
        loss = criterion(y_preds, labels)
        train_loss.append(loss.item())

        loss.backward()
        opt.step()

    train_loss = np.mean(train_loss)
    train_loss_history.append(train_loss)
    print(f'Train loss: {train_loss:.6f}')

    # ---- validation pass (no gradients needed) ----
    model.eval()
    val_loss = []
    val_preds = []
    with torch.no_grad():
        for inputs, labels in tqdm(valid_loader):
            inputs = {k: v.to(device) for k, v in inputs.items()}
            labels = labels.to(device)
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
            val_loss.append(loss.item())
            # Predicted class index = position of the largest logit.
            val_preds.append(y_preds.argmax(dim=-1).cpu().numpy())
    val_loss = np.mean(val_loss)
    val_loss_history.append(val_loss)
    predictions = np.concatenate(val_preds)
    pear_cor = stats.pearsonr(valid_labels, predictions)[0]
    val_pearson_history.append(pear_cor)
    print(f'Valid loss: {val_loss:.6f}, Pearson: {pear_cor}')

    # Remember which epoch achieved the best validation correlation.
    if pear_cor > best_acc:
        best_acc = pear_cor
        best_model_epoch = epoch

print(f'Best epoch: {best_model_epoch}, Pearson: {best_acc}, elapsed: {time.time() - start:.0f}s')

Epoch 0/5


100%|██████████| 1938/1938 [06:09<00:00,  5.24it/s]


Train loss: 1.012258


100%|██████████| 342/342 [00:21<00:00, 16.27it/s]


[0 1 2 ... 3 1 2]
Valid loss: 0.825436, Pearson: 0.7718519278222177
Epoch 1/5


100%|██████████| 1938/1938 [06:11<00:00,  5.22it/s]


Train loss: 0.795099


100%|██████████| 342/342 [00:23<00:00, 14.84it/s]


[0 1 2 ... 3 1 2]
Valid loss: 0.712673, Pearson: 0.8002987334179096
Epoch 2/5


100%|██████████| 1938/1938 [06:11<00:00,  5.22it/s]


Train loss: 0.681554


100%|██████████| 342/342 [00:21<00:00, 16.16it/s]


[0 1 2 ... 3 1 2]
Valid loss: 0.727994, Pearson: 0.8070091865989626
Epoch 3/5


  1%|          | 24/1938 [00:04<06:22,  5.01it/s]


KeyboardInterrupt: ignored

In [None]:
def save_network(network, epoch_label, device):
    """Save the network's weights to ``patent_model_{epoch_label}.pth`` in the working directory.

    The weights are moved to the CPU before saving so the file holds CPU tensors;
    afterwards the network is moved to `device`.

    Args:
        network: the ``nn.Module`` whose ``state_dict`` is saved.
        epoch_label: value interpolated into the file name.
        device: device (``torch.device`` or string) the network should live on afterwards.
    """
    save_path = f'patent_model_{epoch_label}.pth'
    torch.save(network.cpu().state_dict(), save_path)
    # Always move back. The old `device == 'cuda'` test never succeeded for a
    # torch.device object (it does not compare equal to a string), so the model was
    # left on the CPU after saving.
    network.to(device)

In [None]:
# Save the current model weights locally; the label 2 is hard-coded.
save_network(model, 2, device)

In [None]:
def save_network_drive(network, epoch_label, device):
    """Save the network's weights to ``patent_model_{epoch_label}.pth`` in the Drive project folder.

    The weights are moved to the CPU before saving so the file holds CPU tensors;
    afterwards the network is moved to `device`.

    Args:
        network: the ``nn.Module`` whose ``state_dict`` is saved.
        epoch_label: value interpolated into the file name.
        device: device (``torch.device`` or string) the network should live on afterwards.
    """
    save_path = f'/content/drive/MyDrive/Colab Notebooks/hse_mldm/patent_model_{epoch_label}.pth'
    torch.save(network.cpu().state_dict(), save_path)
    # Always move back. The old `device == 'cuda'` test never succeeded for a
    # torch.device object (it does not compare equal to a string), so the model was
    # left on the CPU after saving.
    network.to(device)

In [None]:
# Save the current model weights to the Drive project folder; the label 2 is hard-coded.
save_network_drive(model, 2, device)