In [1]:
# Made using https://luv-bansal.medium.com/fine-tuning-bert-for-text-classification-in-pytorch-503d97342db2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

## Load Data

In [3]:
data_path = '../data/us-patent-phrase-to-phrase-matching/'
train_file = 'train.csv'
test_file = 'test.csv'

df_train = pd.read_csv(data_path + train_file)
df_train.head(2)

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75


In [4]:
df_test = pd.read_csv(data_path + test_file)
df_test.head(2)

Unnamed: 0,id,anchor,target,context
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02
1,09e418c93a776564,adjust gas flow,altering gas flow,F23


In [5]:
cpc_codes = pd.read_csv('../data/cooperative-patent-classification-codes-meaning/titles.csv').rename(columns={"code" : "context"})
cpc_codes.head(2)

Unnamed: 0,context,title,section,class,subclass,group,main_group
0,A,HUMAN NECESSITIES,A,,,,
1,A01,AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI...,A,1.0,,,


In [6]:
cpc_codes = cpc_codes.rename(columns = {"code" : "context"})
df_train = pd.merge(df_train, cpc_codes[["context","title"]], on ="context", how = "left")
df_test = pd.merge(df_test, cpc_codes[["context","title"]], on ="context", how = "left")
df_train.head(2)

Unnamed: 0,id,anchor,target,context,score,title
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...


### Preprocessing

In [7]:
from transformers import (PreTrainedModel, RobertaModel, RobertaTokenizerFast, RobertaConfig,
                          get_constant_schedule_with_warmup, AdamW, RobertaTokenizer, BertTokenizerFast)
from torch.utils.data import DataLoader, Dataset

In [8]:
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

In [9]:
MAX_ANCHOR_LEN = 40
MAX_TARGET_LEN = 50
MAX_TITLE_LEN = 175

df_train['input'] = df_train.apply(lambda x: (x.anchor + ' | ' + x.title, x.target), axis=1)  # Not sure '|' is a good idea
df_test['input'] = df_test.apply(lambda x: (x.anchor + ' | ' + x.title, x.target), axis=1)  # Not sure '|' is a good idea
df_train.input.iloc[:2]

0    (abatement | FURNITURE; DOMESTIC ARTICLES OR A...
1    (abatement | FURNITURE; DOMESTIC ARTICLES OR A...
Name: input, dtype: object

In [10]:
#df_train['input_ids'] = tokenizer(df_train.input.to_list())['input_ids']
#df_train['input_attention_mask'] = tokenizer(df_train.input.to_list())['attention_mask']
df_train['out'] = pd.get_dummies(df_train.score, prefix='score').agg(list, axis=1)


In [11]:
from sklearn.model_selection import train_test_split

df_train, df_val = train_test_split(df_train, test_size=.05, shuffle=True, random_state=41)
df_train.shape, df_val.shape

((34649, 8), (1824, 8))

In [12]:
class PatentDataset(Dataset):
    def __init__(self, tokenizer: RobertaTokenizerFast, dataset, max_anchor_len, max_target_len, max_title_len, export=False):
        """

        :param tokenizer:
        :param dataset:
        :param export: This mode is designed for computing final results on a dataset that does not contain the target variable
        """
        super(PatentDataset, self).__init__()
        self.export = export
        self.tokenizer: RobertaTokenizerFast = tokenizer
        self.df = dataset
        self.max_length = max_anchor_len + max_target_len + max_title_len  # FIXME

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        seq = self.df.input.iloc[index]

        inputs = self.tokenizer.encode_plus(
            seq,
            add_special_tokens=True,
            return_attention_mask=True,
            max_length=self.max_length,
            padding='max_length'
        )
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]

        out = dict(
            ids=torch.tensor(ids, dtype=torch.long),
            mask=torch.tensor(mask, dtype=torch.long),
        )
        if not self.export:
            out['target'] = torch.tensor(self.df.out.iloc[index], dtype=torch.float)

        return out


train_dataset = PatentDataset(tokenizer=tokenizer, dataset=df_train, max_anchor_len=MAX_ANCHOR_LEN, max_target_len=MAX_TARGET_LEN, max_title_len=MAX_TITLE_LEN)
val_dataset = PatentDataset(tokenizer=tokenizer, dataset=df_val, max_anchor_len=MAX_ANCHOR_LEN, max_target_len=MAX_TARGET_LEN, max_title_len=MAX_TITLE_LEN)
test_dataset = PatentDataset(tokenizer=tokenizer, dataset=df_test, max_anchor_len=MAX_ANCHOR_LEN, max_target_len=MAX_TARGET_LEN, max_title_len=MAX_TITLE_LEN, export=True)

In [13]:
train_dataloader = DataLoader(dataset=train_dataset, batch_size=64, num_workers=12)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=64, num_workers=12)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=64, num_workers=12, shuffle=False)

### Model

In [14]:
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.core.lightning import LightningModule
import torch
from torch import nn
from transformers import (PreTrainedModel, RobertaModel, RobertaTokenizerFast, RobertaConfig,
                          get_constant_schedule_with_warmup, AdamW, RobertaTokenizer, BertTokenizerFast)

from transformers import BertModel
import pytorch_lightning as pl
import torch.nn.functional as F

seed_everything(42)


Global seed set to 42


42

In [15]:
use_cuda = True
device = torch.device("cuda" if torch.cuda.is_available() and use_cuda else "cpu")
print('Using device', device)

Using device cuda


In [16]:
from pytorch_lightning.utilities.types import STEP_OUTPUT


class PatentRoBERTa(pl.LightningModule):
    # TODO add dropout
    def __init__(self, num_classes):
        super(PatentRoBERTa, self).__init__()
        self.pretrained_model = RobertaModel.from_pretrained("roberta-base")
        self.hidden = nn.Linear(self.pretrained_model.config.hidden_size, self.pretrained_model.config.hidden_size)
        self.classifier = nn.Linear(self.pretrained_model.config.hidden_size, num_classes)

    def forward(self, ids, mask):
        _, o2 = self.pretrained_model(ids, attention_mask=mask, return_dict=False)
        out = torch.relu(self.hidden(o2))
        out = torch.relu(self.classifier(out))
        return out

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
        return [optimizer], [lr_scheduler]

    def training_step(self, batch, batch_idx) -> STEP_OUTPUT:
        loss = self._common_step(batch, batch_idx, 'train')
        return loss

    def validation_step(self, batch, batch_idx):
        self._common_step(batch, batch_idx, 'val')

    def _common_step(self, batch, batch_idx, stage: str):
        ids, label, mask = self._prepare_batch(batch)
        output = self(ids=ids, mask=mask)
        label = label.type_as(output)
        loss = F.cross_entropy(output, label)
        acc = (torch.argmax(output, dim=-1) == torch.argmax(label, dim=-1)).float().mean()
        self.log(f"{stage}_loss", loss, on_step=True)
        self.log(f"{stage}_acc", acc, on_step=True)
        return loss

    def predict_step(self, batch, batch_idx: int, dataloader_idx: int = 0):
        ids, _, mask = self._prepare_batch(batch, include_target=False)
        output = self(ids=ids, mask=mask)
        return torch.argmax(output, dim=-1)

    def test_step(self, batch, batch_idx):
        self._common_step(batch, batch_idx, 'test')

    def _prepare_batch(self, batch, include_target=True):
        ids = batch['ids']
        mask = batch['mask']
        if not include_target:
            return ids, None, mask
        label = batch['target']
        return ids, label, mask


In [17]:
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import LearningRateMonitor
import logging
from logging import WARNING
logging.basicConfig(level=WARNING)

early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=2, verbose=False, mode="min")
lr_logger = LearningRateMonitor()
logger = TensorBoardLogger("lightning_logs")

trainer = pl.Trainer(
    accelerator='gpu',
    gradient_clip_val=0.1,
    # clipping gradients is a hyperparameter and important to prevent divergence
    # of the gradient for recurrent neural networks
    auto_lr_find=True,
    callbacks=[lr_logger, early_stop_callback],
    logger=logger,
    weights_summary="top",
    max_epochs=2
)

hparams = dict(
    num_classes=df_train.score.unique().size
)

checkpoint = None# "lightning_logs/lightning_logs/version_6/checkpoints/epoch=4-step=5700.ckpt"
if checkpoint is not None:
    model = PatentRoBERTa.load_from_checkpoint(checkpoint, **hparams)
    print(f'Checkpoint {checkpoint} loaded')
else:
    model = PatentRoBERTa(**hparams)



GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
deeper_layer_to_train = 11
requires_grad = False

for param in model.pretrained_model.named_parameters():
    if 'encoder.layer.11' in param[0]:
        requires_grad = True
    param[1].requires_grad = requires_grad
    print(f'layer {param[0]} is {"NOT " if requires_grad is False else ""}trained.')

layer embeddings.word_embeddings.weight is NOT trained.
layer embeddings.position_embeddings.weight is NOT trained.
layer embeddings.token_type_embeddings.weight is NOT trained.
layer embeddings.LayerNorm.weight is NOT trained.
layer embeddings.LayerNorm.bias is NOT trained.
layer encoder.layer.0.attention.self.query.weight is NOT trained.
layer encoder.layer.0.attention.self.query.bias is NOT trained.
layer encoder.layer.0.attention.self.key.weight is NOT trained.
layer encoder.layer.0.attention.self.key.bias is NOT trained.
layer encoder.layer.0.attention.self.value.weight is NOT trained.
layer encoder.layer.0.attention.self.value.bias is NOT trained.
layer encoder.layer.0.attention.output.dense.weight is NOT trained.
layer encoder.layer.0.attention.output.dense.bias is NOT trained.
layer encoder.layer.0.attention.output.LayerNorm.weight is NOT trained.
layer encoder.layer.0.attention.output.LayerNorm.bias is NOT trained.
layer encoder.layer.0.intermediate.dense.weight is NOT trained

In [19]:
# Disable training of BERT model
# for param in model.pretrained_model.parameters():
#     param.requires_grad = False
"""
requires_grad = False for all layers : val_acc_epoch = 0.3388157784938812
"""

trainer.validate(model, val_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      val_acc_epoch          0.211074560880661
     val_loss_epoch         1.6103986501693726
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'val_loss_epoch': 1.6103986501693726, 'val_acc_epoch': 0.211074560880661}]

In [20]:
trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name             | Type         | Params
--------------------------------------------------
0 | pretrained_model | RobertaModel | 124 M 
1 | hidden           | Linear       | 590 K 
2 | classifier       | Linear       | 3.8 K 
--------------------------------------------------
8.3 M     Trainable params
116 M     Non-trainable params
125 M     Total params
500.960   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [21]:
trainer.validate(model, val_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      val_acc_epoch         0.20888157188892365
     val_loss_epoch          1.609437346458435
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'val_loss_epoch': 1.609437346458435, 'val_acc_epoch': 0.20888157188892365}]

In [22]:
results = trainer.predict(model, test_dataloader)

df_test['y_pred'] = np.concatenate(results)
df_test

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 542it [00:00, ?it/s]

Unnamed: 0,id,anchor,target,context,title,input,y_pred
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02,OPTICS,"(opc drum | OPTICS, inorganic photoconductor d...",0
1,09e418c93a776564,adjust gas flow,altering gas flow,F23,COMBUSTION APPARATUS; COMBUSTION PROCESSES,(adjust gas flow | COMBUSTION APPARATUS; COMBU...,0
2,36baf228038e314b,lower trunnion,lower locating,B60,VEHICLES IN GENERAL,"(lower trunnion | VEHICLES IN GENERAL, lower l...",0
3,1f37ead645e7f0c8,cap component,upper portion,D06,TREATMENT OF TEXTILES OR THE LIKE; LAUNDERING;...,(cap component | TREATMENT OF TEXTILES OR THE ...,0
4,71a5b6ad068d531f,neural stimulation,artificial neural network,H04,ELECTRIC COMMUNICATION TECHNIQUE,(neural stimulation | ELECTRIC COMMUNICATION T...,0
5,474c874d0c07bd21,dry corn,dry corn starch,C12,BIOCHEMISTRY; BEER; SPIRITS; WINE; VINEGAR; MI...,(dry corn | BIOCHEMISTRY; BEER; SPIRITS; WINE;...,0
6,442c114ed5c4e3c9,tunneling capacitor,capacitor housing,G11,INFORMATION STORAGE,"(tunneling capacitor | INFORMATION STORAGE, ca...",0
7,b8ae62ea5e1d8bdb,angular contact bearing,contact therapy radiation,B23,MACHINE TOOLS; METAL-WORKING NOT OTHERWISE PRO...,(angular contact bearing | MACHINE TOOLS; META...,0
8,faaddaf8fcba8a3f,produce liquid hydrocarbons,produce a treated stream,C10,"PETROLEUM, GAS OR COKE INDUSTRIES; TECHNICAL G...","(produce liquid hydrocarbons | PETROLEUM, GAS ...",0
9,ae0262c02566d2ce,diesel fuel tank,diesel fuel tanks,F02,COMBUSTION ENGINES; HOT-GAS OR COMBUSTION-PROD...,(diesel fuel tank | COMBUSTION ENGINES; HOT-GA...,0
