In [1]:
# Made using https://luv-bansal.medium.com/fine-tuning-bert-for-text-classification-in-pytorch-503d97342db2

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

## Load Data

In [2]:
data_path = '../data/us-patent-phrase-to-phrase-matching/'
train_file = 'train.csv'
test_file = 'test.csv'

df_train = pd.read_csv(data_path + train_file)
df_train.head(2)

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75


In [3]:
df_test = pd.read_csv(data_path + test_file)
df_test.head(2)

Unnamed: 0,id,anchor,target,context
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02
1,09e418c93a776564,adjust gas flow,altering gas flow,F23


In [4]:
cpc_codes = pd.read_csv('../data/cooperative-patent-classification-codes-meaning/titles.csv').rename(columns={"code" : "context"})
cpc_codes.head(2)

Unnamed: 0,context,title,section,class,subclass,group,main_group
0,A,HUMAN NECESSITIES,A,,,,
1,A01,AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI...,A,1.0,,,


In [5]:
cpc_codes = cpc_codes.rename(columns = {"code" : "context"})
df_train = pd.merge(df_train, cpc_codes[["context","title"]], on ="context", how = "left")
df_test = pd.merge(df_test, cpc_codes[["context","title"]], on ="context", how = "left")
df_train.head(2)

Unnamed: 0,id,anchor,target,context,score,title
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...


### Preprocessing

In [6]:
from transformers import (PreTrainedModel, RobertaModel, RobertaTokenizerFast, RobertaConfig,
                          get_constant_schedule_with_warmup, AdamW, RobertaTokenizer, BertTokenizerFast)
from torch.utils.data import DataLoader, Dataset


In [7]:
#tokenizer = BertTokenizerFast.from_pretrained('roberta-base')
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [8]:

MAX_ANCHOR_LEN = 40
MAX_TARGET_LEN = 50
MAX_TITLE_LEN = 175

df_train['input'] = df_train.apply(lambda x: (x.anchor + ' | ' + x.target, x.title), axis=1)  # Not sure '|' is a good idea
df_test['input'] = df_test.apply(lambda x: (x.anchor + ' | ' + x.target, x.title), axis=1)  # Not sure '|' is a good idea
df_train.input.iloc[:2]

0    (abatement | abatement of pollution, FURNITURE...
1    (abatement | act of abating, FURNITURE; DOMEST...
Name: input, dtype: object

In [9]:
#df_train['input_ids'] = tokenizer(df_train.input.to_list())['input_ids']
#df_train['input_attention_mask'] = tokenizer(df_train.input.to_list())['attention_mask']
df_train['out'] = pd.get_dummies(df_train.score, prefix='score').agg(list, axis=1)


In [10]:
from sklearn.model_selection import train_test_split

df_train, df_val = train_test_split(df_train, test_size=.05, shuffle=True, random_state=41)
df_train.shape, df_val.shape

((34649, 8), (1824, 8))

In [19]:
class PatentDataset(Dataset):
    def __init__(self, tokenizer: BertTokenizerFast, dataset, max_anchor_len, max_target_len, max_title_len, export=False):
        """

        :param tokenizer:
        :param dataset:
        :param export: This mode is designed for computing final results on a dataset that does not contain the target variable
        """
        super(PatentDataset, self).__init__()
        self.export = export
        self.tokenizer: BertTokenizerFast = tokenizer
        self.df = dataset
        self.max_length = max_anchor_len + max_target_len + max_title_len  # FIXME

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        seq = self.df.input.iloc[index]

        inputs = self.tokenizer.encode_plus(
            seq,
            add_special_tokens=True,
            return_attention_mask=True,
            max_length=self.max_length,
            padding='max_length'
        )
        ids = inputs["input_ids"]
        token_type_ids = inputs["token_type_ids"]
        mask = inputs["attention_mask"]

        out = dict(
            ids=torch.tensor(ids, dtype=torch.long),
            mask=torch.tensor(mask, dtype=torch.long),
            token_type_ids=torch.tensor(token_type_ids, dtype=torch.long),
        )
        if not self.export:
            out['target'] = torch.tensor(self.df.out.iloc[index], dtype=torch.float)

        return out

train_dataset = PatentDataset(tokenizer=tokenizer, dataset=df_train, max_anchor_len=MAX_ANCHOR_LEN, max_target_len=MAX_TARGET_LEN, max_title_len=MAX_TITLE_LEN)
val_dataset = PatentDataset(tokenizer=tokenizer, dataset=df_val, max_anchor_len=MAX_ANCHOR_LEN, max_target_len=MAX_TARGET_LEN, max_title_len=MAX_TITLE_LEN)
test_dataset = PatentDataset(tokenizer=tokenizer, dataset=df_test, max_anchor_len=MAX_ANCHOR_LEN, max_target_len=MAX_TARGET_LEN, max_title_len=MAX_TITLE_LEN, export=True)

In [20]:
train_dataloader = DataLoader(dataset=train_dataset, batch_size=64, num_workers=12)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=64, num_workers=12)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=64, num_workers=12, shuffle=False)

### Model

In [21]:
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.core.lightning import LightningModule
import torch
from torch import nn
from transformers import (PreTrainedModel, RobertaModel, RobertaTokenizerFast, RobertaConfig,
                          get_constant_schedule_with_warmup, AdamW, RobertaTokenizer, BertTokenizerFast)

from transformers import BertModel
import pytorch_lightning as pl
import torch.nn.functional as F

seed_everything(42)


Global seed set to 42


42

In [22]:
use_cuda = True
device = torch.device("cuda" if torch.cuda.is_available() and use_cuda else "cpu")
print('Using device', device)

Using device cuda


In [23]:
from pytorch_lightning.utilities.types import STEP_OUTPUT


class BERT(pl.LightningModule):
    def __init__(self, num_classes):
        super(BERT, self).__init__()
        self.pretrained_model = BertModel.from_pretrained("bert-base-uncased")
        self.out = nn.Linear(768, num_classes)

    def forward(self, ids, mask, token_type_ids):
        _, o2 = self.pretrained_model(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        out = torch.sigmoid(self.out(o2))
        return out

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-4)
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
        return [optimizer], [lr_scheduler]

    def training_step(self, batch, batch_idx) -> STEP_OUTPUT:
        loss = self._common_step(batch, batch_idx, 'train')
        return loss

    def validation_step(self, batch, batch_idx):
        self._common_step(batch, batch_idx, 'val')

    def _common_step(self, batch, batch_idx, stage: str):
        ids, label, mask, token_type_ids = self._prepare_batch(batch)
        output = self(ids=ids, mask=mask, token_type_ids=token_type_ids)
        label = label.type_as(output)
        loss = F.cross_entropy(output, label)
        acc = (torch.argmax(output, dim=-1) == torch.argmax(label, dim=-1)).float().mean()
        self.log(f"{stage}_loss", loss, on_step=True)
        self.log(f"{stage}_acc", acc, on_step=True)
        return loss

    def predict_step(self, batch, batch_idx: int, dataloader_idx: int = 0):
        ids, _, mask, token_type_ids = self._prepare_batch(batch, include_target=False)
        output = self(ids=ids, mask=mask, token_type_ids=token_type_ids)
        return torch.argmax(output, dim=-1)

    def test_step(self, batch, batch_idx):
        self._common_step(batch, batch_idx, 'test')

    def _prepare_batch(self, batch, include_target=True):
        ids = batch['ids']
        token_type_ids = batch['token_type_ids']
        mask = batch['mask']
        if not include_target:
            return ids, None, mask, token_type_ids
        label = batch['target']
        # label = label.unsqueeze(1)
        return ids, label, mask, token_type_ids


In [24]:
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import LearningRateMonitor
import logging
from logging import WARNING
logging.basicConfig(level=WARNING)

early_stop_callback = EarlyStopping(monitor="train_loss", min_delta=1e-4, patience=2, verbose=False, mode="min")
lr_logger = LearningRateMonitor()
logger = TensorBoardLogger("lightning_logs")

trainer = pl.Trainer(
    accelerator='gpu',
    gradient_clip_val=0.1,
    # clipping gradients is a hyperparameter and important to prevent divergence
    # of the gradient for recurrent neural networks
    auto_lr_find=True,
    callbacks=[lr_logger, early_stop_callback],
    logger=logger,
    weights_summary="top",
)

hparams = dict(
    num_classes=df_train.score.unique().size
)

checkpoint = "lightning_logs/lightning_logs/version_0_bert_fine_tuned/checkpoints/epoch=4-step=2710.ckpt"
if checkpoint is not None:
    model = BERT.load_from_checkpoint(checkpoint, **hparams)
    print(f'Checkpoint {checkpoint} loaded')
else:
    model = BERT(**hparams)



  rank_zero_warn(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification 

Checkpoint lightning_logs/lightning_logs/version_0_bert_fine_tuned/checkpoints/epoch=4-step=2710.ckpt loaded


In [18]:
# Disable training of BERT model
for param in model.pretrained_model.parameters():
    param.requires_grad = False

trainer.validate(model, val_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      val_acc_epoch         0.08662280440330505
     val_loss_epoch         1.6055322885513306
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'val_loss_epoch': 1.6055322885513306, 'val_acc_epoch': 0.08662280440330505}]

In [19]:
trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type      | Params
-----------------------------------------
0 | bert_model | BertModel | 109 M 
1 | out        | Linear    | 3.8 K 
-----------------------------------------
3.8 K     Trainable params
109 M     Non-trainable params
109 M     Total params
437.944   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [17]:
trainer.validate(model, val_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      val_acc_epoch         0.3229166567325592
     val_loss_epoch         1.4486135244369507
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'val_loss_epoch': 1.4486135244369507, 'val_acc_epoch': 0.3229166567325592}]

In [29]:
results = trainer.predict(model, test_dataloader)

df_test['y_pred'] = np.concatenate(results)
corresp_map = [.0, .25, .5, .75, 1.0]
df_test.y_pred = df_test.y_pred.apply(lambda x: corresp_map[x])
df_test

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

Unnamed: 0,id,anchor,target,context,title,input,y_pred
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02,OPTICS,"(opc drum | inorganic photoconductor drum, OPT...",0.25
1,09e418c93a776564,adjust gas flow,altering gas flow,F23,COMBUSTION APPARATUS; COMBUSTION PROCESSES,"(adjust gas flow | altering gas flow, COMBUSTI...",0.25
2,36baf228038e314b,lower trunnion,lower locating,B60,VEHICLES IN GENERAL,"(lower trunnion | lower locating, VEHICLES IN ...",0.5
3,1f37ead645e7f0c8,cap component,upper portion,D06,TREATMENT OF TEXTILES OR THE LIKE; LAUNDERING;...,"(cap component | upper portion, TREATMENT OF T...",0.5
4,71a5b6ad068d531f,neural stimulation,artificial neural network,H04,ELECTRIC COMMUNICATION TECHNIQUE,(neural stimulation | artificial neural networ...,0.25
5,474c874d0c07bd21,dry corn,dry corn starch,C12,BIOCHEMISTRY; BEER; SPIRITS; WINE; VINEGAR; MI...,"(dry corn | dry corn starch, BIOCHEMISTRY; BEE...",0.25
6,442c114ed5c4e3c9,tunneling capacitor,capacitor housing,G11,INFORMATION STORAGE,"(tunneling capacitor | capacitor housing, INFO...",0.5
7,b8ae62ea5e1d8bdb,angular contact bearing,contact therapy radiation,B23,MACHINE TOOLS; METAL-WORKING NOT OTHERWISE PRO...,(angular contact bearing | contact therapy rad...,0.25
8,faaddaf8fcba8a3f,produce liquid hydrocarbons,produce a treated stream,C10,"PETROLEUM, GAS OR COKE INDUSTRIES; TECHNICAL G...",(produce liquid hydrocarbons | produce a treat...,0.25
9,ae0262c02566d2ce,diesel fuel tank,diesel fuel tanks,F02,COMBUSTION ENGINES; HOT-GAS OR COMBUSTION-PROD...,"(diesel fuel tank | diesel fuel tanks, COMBUST...",0.25
