In [1]:
# Made using https://luv-bansal.medium.com/fine-tuning-bert-for-text-classification-in-pytorch-503d97342db2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

## Load Data

In [3]:
data_path = '../data/us-patent-phrase-to-phrase-matching/'
train_file = 'train.csv'
test_file = 'test.csv'

df_train = pd.read_csv(data_path + train_file)
df_train.head(2)

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75


In [4]:
df_test = pd.read_csv(data_path + test_file)
df_test.head(2)

Unnamed: 0,id,anchor,target,context
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02
1,09e418c93a776564,adjust gas flow,altering gas flow,F23


In [5]:
cpc_codes = pd.read_csv('../data/cooperative-patent-classification-codes-meaning/titles.csv').rename(columns={"code" : "context"})
cpc_codes.head(2)

Unnamed: 0,context,title,section,class,subclass,group,main_group
0,A,HUMAN NECESSITIES,A,,,,
1,A01,AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI...,A,1.0,,,


In [6]:
cpc_codes = cpc_codes.rename(columns = {"code" : "context"})
df_train = pd.merge(df_train, cpc_codes[["context","title"]], on ="context", how = "left")
df_test = pd.merge(df_test, cpc_codes[["context","title"]], on ="context", how = "left")
df_train.head(2)

Unnamed: 0,id,anchor,target,context,score,title
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...


### Preprocessing

In [7]:
from transformers import (PreTrainedModel, RobertaModel, RobertaTokenizerFast, RobertaConfig,
                          get_constant_schedule_with_warmup, AdamW, RobertaTokenizer, BertTokenizerFast)
from torch.utils.data import DataLoader, Dataset


In [8]:
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

In [23]:
MAX_ANCHOR_LEN = 40
MAX_TARGET_LEN = 50
MAX_TITLE_LEN = 175

df_train['input'] = df_train.apply(lambda x: (x.anchor + ' | ' + x.title, x.target), axis=1)  # Not sure '|' is a good idea
df_test['input'] = df_test.apply(lambda x: (x.anchor + ' | ' + x.title, x.target), axis=1)  # Not sure '|' is a good idea
df_train.input.iloc[:2]

12713              (fence post | BUILDING, area enclosure)
16054    (illumination condition | PHOTOGRAPHY; CINEMAT...
Name: input, dtype: object

In [22]:
df_train

Unnamed: 0,id,anchor,target,context,score,title,input,out
12713,81d5ddf31347fb55,fence post,area enclosure,E04,0.25,BUILDING,"(fence post | area enclosure, BUILDING)","[0, 1, 0, 0, 0]"
16054,4d9522d41d2cdbfe,illumination condition,illumination conditions,G03,1.00,PHOTOGRAPHY; CINEMATOGRAPHY; ANALOGOUS TECHNIQ...,(illumination condition | illumination conditi...,"[0, 0, 0, 0, 1]"
15223,6a401e4913d33c41,hardware blocks,hardware design block,H04,0.50,ELECTRIC COMMUNICATION TECHNIQUE,"(hardware blocks | hardware design block, ELEC...","[0, 0, 1, 0, 0]"
35919,fb2142cd8f627770,wave generation,energy initiating,F42,0.50,AMMUNITION; BLASTING,"(wave generation | energy initiating, AMMUNITI...","[0, 0, 1, 0, 0]"
6295,b5fa55e436930db6,communicate through cavity,circulate through cavity,F01,0.50,MACHINES OR ENGINES IN GENERAL; ENGINE PLANTS ...,(communicate through cavity | circulate throug...,"[0, 0, 1, 0, 0]"
...,...,...,...,...,...,...,...,...
27064,d2a11a888cc1df50,receiver shaft,receiver brush,H04,0.50,ELECTRIC COMMUNICATION TECHNIQUE,"(receiver shaft | receiver brush, ELECTRIC COM...","[0, 0, 1, 0, 0]"
31597,076434b050b81842,sprayed,ejected,B41,0.50,PRINTING; LINING MACHINES; TYPEWRITERS; STAMPS,"(sprayed | ejected, PRINTING; LINING MACHINES;...","[0, 0, 1, 0, 0]"
20450,54823433d8f7c95b,moisture absorption rate,heart rate,D06,0.00,TREATMENT OF TEXTILES OR THE LIKE; LAUNDERING;...,"(moisture absorption rate | heart rate, TREATM...","[1, 0, 0, 0, 0]"
931,6cfe9eebf83c3070,agitate means,stirring means,B01,0.75,PHYSICAL OR CHEMICAL PROCESSES OR APPARATUS IN...,"(agitate means | stirring means, PHYSICAL OR C...","[0, 0, 0, 1, 0]"


In [10]:
#df_train['input_ids'] = tokenizer(df_train.input.to_list())['input_ids']
#df_train['input_attention_mask'] = tokenizer(df_train.input.to_list())['attention_mask']
df_train['out'] = pd.get_dummies(df_train.score, prefix='score').agg(list, axis=1)


In [11]:
from sklearn.model_selection import train_test_split

df_train, df_val = train_test_split(df_train, test_size=.05, shuffle=True, random_state=41)
df_train.shape, df_val.shape

((34649, 8), (1824, 8))

In [12]:
class PatentDataset(Dataset):
    def __init__(self, tokenizer: RobertaTokenizerFast, dataset, max_anchor_len, max_target_len, max_title_len, export=False):
        """

        :param tokenizer:
        :param dataset:
        :param export: This mode is designed for computing final results on a dataset that does not contain the target variable
        """
        super(PatentDataset, self).__init__()
        self.export = export
        self.tokenizer: RobertaTokenizerFast = tokenizer
        self.df = dataset
        self.max_length = max_anchor_len + max_target_len + max_title_len  # FIXME

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        seq = self.df.input.iloc[index]

        inputs = self.tokenizer.encode_plus(
            seq,
            add_special_tokens=True,
            return_attention_mask=True,
            max_length=self.max_length,
            padding='max_length'
        )
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]

        out = dict(
            ids=torch.tensor(ids, dtype=torch.long),
            mask=torch.tensor(mask, dtype=torch.long),
        )
        if not self.export:
            out['target'] = torch.tensor(self.df.out.iloc[index], dtype=torch.float)

        return out


train_dataset = PatentDataset(tokenizer=tokenizer, dataset=df_train, max_anchor_len=MAX_ANCHOR_LEN, max_target_len=MAX_TARGET_LEN, max_title_len=MAX_TITLE_LEN)
val_dataset = PatentDataset(tokenizer=tokenizer, dataset=df_val, max_anchor_len=MAX_ANCHOR_LEN, max_target_len=MAX_TARGET_LEN, max_title_len=MAX_TITLE_LEN)
test_dataset = PatentDataset(tokenizer=tokenizer, dataset=df_test, max_anchor_len=MAX_ANCHOR_LEN, max_target_len=MAX_TARGET_LEN, max_title_len=MAX_TITLE_LEN, export=True)

In [13]:
train_dataloader = DataLoader(dataset=train_dataset, batch_size=64, num_workers=12)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=64, num_workers=12)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=64, num_workers=12, shuffle=False)

### Model

In [14]:
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.core.lightning import LightningModule
import torch
from torch import nn
from transformers import (PreTrainedModel, RobertaModel, RobertaTokenizerFast, RobertaConfig,
                          get_constant_schedule_with_warmup, AdamW, RobertaTokenizer, BertTokenizerFast)

from transformers import BertModel
import pytorch_lightning as pl
import torch.nn.functional as F

seed_everything(42)


Global seed set to 42


42

In [31]:
print(df_train.titleb*.unique()[:10])

['BUILDING'
 'PHOTOGRAPHY; CINEMATOGRAPHY; ANALOGOUS TECHNIQUES USING WAVES OTHER THAN OPTICAL WAVES; ELECTROGRAPHY; HOLOGRAPHY'
 'ELECTRIC COMMUNICATION TECHNIQUE' 'AMMUNITION; BLASTING'
 'MACHINES OR ENGINES IN GENERAL; ENGINE PLANTS IN GENERAL; STEAM ENGINES'
 'BIOCHEMISTRY; BEER; SPIRITS; WINE; VINEGAR; MICROBIOLOGY; ENZYMOLOGY; MUTATION OR GENETIC ENGINEERING'
 'WORKING OR PRESERVING WOOD OR SIMILAR MATERIAL; NAILING OR STAPLING MACHINES IN GENERAL'
 'MEASURING; TESTING'
 'CONVEYING; PACKING; STORING; HANDLING THIN OR FILAMENTARY MATERIAL'
 'ELECTRIC TECHNIQUES NOT OTHERWISE PROVIDED FOR']


In [34]:
df_train['first_letter'] = df_train.context.str[0]

In [37]:
df_train.groupby('first_letter').title.agg(list)[0]

['MEDICAL OR VETERINARY SCIENCE; HYGIENE',
 'BUTCHERING; MEAT TREATMENT; PROCESSING POULTRY OR FISH',
 'MEDICAL OR VETERINARY SCIENCE; HYGIENE',
 'HABERDASHERY; JEWELLERY',
 'MEDICAL OR VETERINARY SCIENCE; HYGIENE',
 'MEDICAL OR VETERINARY SCIENCE; HYGIENE',
 'MEDICAL OR VETERINARY SCIENCE; HYGIENE',
 'FOODS OR FOODSTUFFS; TREATMENT THEREOF, NOT COVERED BY OTHER CLASSES',
 'MEDICAL OR VETERINARY SCIENCE; HYGIENE',
 'AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTING; TRAPPING; FISHING',
 'FOODS OR FOODSTUFFS; TREATMENT THEREOF, NOT COVERED BY OTHER CLASSES',
 'AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTING; TRAPPING; FISHING',
 'HABERDASHERY; JEWELLERY',
 'AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTING; TRAPPING; FISHING',
 'WEARING APPAREL',
 'AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTING; TRAPPING; FISHING',
 'MEDICAL OR VETERINARY SCIENCE; HYGIENE',
 'FOOTWEAR',
 'SPORTS; GAMES; AMUSEMENTS',
 'FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; COFFEE MILLS; SPICE MILLS; SUCTION CLEANE

In [39]:
df_train['first_letter'].unique().size

8

In [32]:
df_train.context.unique().str

array(['E04', 'G03', 'H04', 'F42', 'F01', 'C12', 'B27', 'G01', 'B65',
       'H05', 'B01', 'B29', 'G06', 'H02', 'H03', 'G02', 'C09', 'F03',
       'E02', 'G04', 'F02', 'B60', 'F16', 'B07', 'B23', 'C11', 'D01',
       'C22', 'A61', 'B66', 'C01', 'C07', 'B81', 'A22', 'B05', 'D21',
       'C23', 'G07', 'H01', 'G11', 'C10', 'D05', 'D04', 'F22', 'A44',
       'B64', 'F24', 'D06', 'C04', 'E06', 'C08', 'B41', 'A23', 'C03',
       'A01', 'G05', 'E21', 'F41', 'G08', 'F28', 'F27', 'B24', 'A41',
       'A43', 'E05', 'B67', 'E03', 'A63', 'B61', 'A47', 'B63', 'B22',
       'D03', 'B28', 'A21', 'G09', 'G21', 'B62', 'B44', 'F15', 'B08',
       'B32', 'B21', 'C13', 'F04', 'A45', 'B25', 'E01', 'C02', 'A46',
       'C06', 'B02', 'F17', 'B03', 'F23', 'C25', 'C21', 'G10', 'F21',
       'F25', 'C14', 'A62', 'A24', 'G16', 'B31', 'F26'], dtype=object)

In [15]:
use_cuda = True
device = torch.device("cuda" if torch.cuda.is_available() and use_cuda else "cpu")
print('Using device', device)

Using device cuda


In [16]:
from torch.nn import ModuleList
from pytorch_lightning.utilities.types import STEP_OUTPUT


class PatentRoBERTa(pl.LightningModule):
    # TODO add dropout
    def __init__(self):
        super(PatentRoBERTa, self).__init__()
        self.pretrained_model = RobertaModel.from_pretrained("roberta-base")
        self.hidden = nn.Linear(self.pretrained_model.config.hidden_size, self.pretrained_model.config.hidden_size)
        self.classifier = nn.Linear(self.pretrained_model.config.hidden_size, 1)
        # Disable training of BERT model
        for param in self.pretrained_model.parameters():
            param.requires_grad = False

    def forward(self, ids, attention_mask):
        _, o2 = self.pretrained_model(ids, attention_mask=attention_mask, return_dict=False)
        out = torch.relu(self.hidden(o2))
        out = torch.relu(self.classifier(out))
        return out


class PatentRoBERTaEnsemble(pl.LightningModule):
    # TODO add dropout
    def __init__(self, num_classes):
        super(PatentRoBERTaEnsemble, self).__init__()
        self.pretrained_models = ModuleList([PatentRoBERTa() for _ in range(num_classes)])

    def forward(self, ids, mask):
        out = torch.concat([pretrained_model(ids, attention_mask=mask) for pretrained_model in self.pretrained_models], dim=1)
        new_out = new=model(out)
        [target_t, , ]
        return new_out
    # [0.2, 0.3, 0.5, 0.2] [0, 1, 0, 0, 0]

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-4)
        # optimizer = torch.optim.AdamW(self.parameters(), lr=self.config['lr'], weight_decay=self.config['weight_decay'])
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
        return [optimizer], [lr_scheduler]

    def training_step(self, batch, batch_idx) -> STEP_OUTPUT:
        loss = self._common_step(batch, batch_idx, 'train')
        return loss

    def validation_step(self, batch, batch_idx):
        self._common_step(batch, batch_idx, 'val')

    def _common_step(self, batch, batch_idx, stage: str):
        ids, label, mask = self._prepare_batch(batch)
        output = self(ids=ids, mask=mask)
        label = label.type_as(output)
        loss = F.cross_entropy(output, label)
        acc = (torch.argmax(output, dim=-1) == torch.argmax(label, dim=-1)).float().mean()
        self.log(f"{stage}_loss", loss, on_step=True)
        self.log(f"{stage}_acc", acc, on_step=True)
        return loss

    def predict_step(self, batch, batch_idx: int, dataloader_idx: int = 0):
        ids, _, mask = self._prepare_batch(batch, include_target=False)
        output = self(ids=ids, mask=mask)
        return torch.argmax(output, dim=-1)

    def test_step(self, batch, batch_idx):
        self._common_step(batch, batch_idx, 'test')

    def _prepare_batch(self, batch, include_target=True):
        ids = batch['ids']
        mask = batch['mask']
        if not include_target:
            return ids, None, mask
        label = batch['target']
        return ids, label, mask


In [20]:
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import LearningRateMonitor
import logging
from logging import WARNING
logging.basicConfig(level=WARNING)

early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=2, verbose=False, mode="min")
lr_logger = LearningRateMonitor()
logger = TensorBoardLogger("lightning_logs")

trainer = pl.Trainer(
    accelerator='gpu',
    gradient_clip_val=0.1,
    # clipping gradients is a hyperparameter and important to prevent divergence
    # of the gradient for recurrent neural networks
    auto_lr_find=True,
    callbacks=[lr_logger, early_stop_callback],
    logger=logger,
    weights_summary="top",
)

hparams = dict(
    num_classes=df_train.score.unique().size
)

checkpoint = "./lightning_logs/lightning_logs/ensemble_model"
if checkpoint is not None:
    model = PatentRoBERTaEnsemble.load_from_checkpoint(checkpoint, **hparams)
    print(f'Checkpoint {checkpoint} loaded')
else:
    model = PatentRoBERTaEnsemble(**hparams)

  rank_zero_warn(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head

Checkpoint ./lightning_logs/lightning_logs/ensemble_model loaded


In [18]:
trainer.validate(model, val_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      val_acc_epoch        0.030701754614710808
     val_loss_epoch         1.6351041793823242
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'val_loss_epoch': 1.6351041793823242, 'val_acc_epoch': 0.030701754614710808}]

In [19]:
trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name              | Type       | Params
-------------------------------------------------
0 | pretrained_models | ModuleList | 626 M 
-------------------------------------------------
3.0 M     Trainable params
623 M     Non-trainable params
626 M     Total params
2,504.740 Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [23]:
trainer.save_checkpoint('./lightning_logs/lightning_logs/ensemble_model')

In [21]:
trainer.validate(model, val_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      val_acc_epoch         0.3388157784938812
     val_loss_epoch         1.5589337348937988
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'val_loss_epoch': 1.5589337348937988, 'val_acc_epoch': 0.3388157784938812}]

In [21]:
results = trainer.predict(model, test_dataloader)

df_test['y_pred'] = np.concatenate(results)
df_test

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 358it [00:00, ?it/s]

TypeError: dispatcher for __array_function__ did not return an iterable