In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

## Load Data

In [3]:
data_path = '../data/us-patent-phrase-to-phrase-matching/'
train_file = 'train.csv'
test_file = 'test.csv'

df_train = pd.read_csv(data_path + train_file)
df_train.head(2)

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75


In [4]:
df_test = pd.read_csv(data_path + test_file)
df_test.head(2)

Unnamed: 0,id,anchor,target,context
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02
1,09e418c93a776564,adjust gas flow,altering gas flow,F23


In [5]:
cpc_codes = pd.read_csv('../data/cooperative-patent-classification-codes-meaning/titles.csv').rename(columns={"code" : "context"})
cpc_codes.head(2)

Unnamed: 0,context,title,section,class,subclass,group,main_group
0,A,HUMAN NECESSITIES,A,,,,
1,A01,AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI...,A,1.0,,,


In [6]:
cpc_codes = cpc_codes.rename(columns = {"code" : "context"})
df_train = pd.merge(df_train, cpc_codes[["context","title"]], on ="context", how = "left")
df_test = pd.merge(df_test, cpc_codes[["context","title"]], on ="context", how = "left")
df_train.head(2)

Unnamed: 0,id,anchor,target,context,score,title
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...


### Preprocessing

In [7]:
from transformers import (PreTrainedModel, RobertaModel, RobertaTokenizerFast, RobertaConfig,
                          get_constant_schedule_with_warmup, AdamW, RobertaTokenizer, BertTokenizerFast)
from torch.utils.data import DataLoader, Dataset


In [12]:
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

In [25]:
MAX_ANCHOR_LEN = 40
MAX_TARGET_LEN = 50
MAX_TITLE_LEN = 175

df_train['input'] = df_train.apply(lambda x: (x.anchor, x.target + f' {tokenizer.cls_token} ' + x.title.lower()), axis=1)
df_test['input'] = df_test.apply(lambda x: (x.anchor, x.target + f' {tokenizer.cls_token} ' + x.title.lower()), axis=1)
df_train.input.iloc[:2]

0    (abatement, abatement of pollution <s> furnitu...
1    (abatement, act of abating <s> furniture; dome...
Name: input, dtype: object

In [24]:
df_train['input_ids'] = tokenizer(df_train.input.to_list())['input_ids']
#df_train['input_attention_mask'] = tokenizer(df_train.input.to_list())['attention_mask']
df_train['out'] = pd.get_dummies(df_train.score, prefix='score').agg(list, axis=1)
df_train.head(2)

Unnamed: 0,id,anchor,target,context,score,title,input,input_ids,out
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,"(abatement, abatement of pollution <s> furnitu...","[0, 873, 415, 6285, 2, 2, 873, 415, 6285, 9, 6...","[0, 0, 1, 0, 0]"
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,"(abatement, act of abating <s> furniture; dome...","[0, 873, 415, 6285, 2, 2, 7257, 9, 4091, 1295,...","[0, 0, 0, 1, 0]"


In [11]:
from sklearn.model_selection import train_test_split

df_train, df_val = train_test_split(df_train, test_size=.05, shuffle=True, random_state=41)
df_train.shape, df_val.shape

((34649, 9), (1824, 9))

In [12]:
class PatentDataset(Dataset):
    def __init__(self, tokenizer: RobertaTokenizerFast, dataset, max_anchor_len, max_target_len, max_title_len, export=False):
        """

        :param tokenizer:
        :param dataset:
        :param export: This mode is designed for computing final results on a dataset that does not contain the target variable
        """
        super(PatentDataset, self).__init__()
        self.export = export
        self.tokenizer: RobertaTokenizerFast = tokenizer
        self.df = dataset
        self.max_length = max_anchor_len + max_target_len + max_title_len  # FIXME

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        seq = self.df.input.iloc[index]

        inputs = self.tokenizer.encode_plus(
            seq,
            add_special_tokens=True,
            return_attention_mask=True,
            max_length=self.max_length,
            padding='max_length'
        )
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]

        out = dict(
            ids=torch.tensor(ids, dtype=torch.long),
            mask=torch.tensor(mask, dtype=torch.long),
        )
        if not self.export:
            out['target'] = torch.tensor(self.df.out.iloc[index], dtype=torch.float)

        return out


train_dataset = PatentDataset(tokenizer=tokenizer, dataset=df_train, max_anchor_len=MAX_ANCHOR_LEN, max_target_len=MAX_TARGET_LEN, max_title_len=MAX_TITLE_LEN)
val_dataset = PatentDataset(tokenizer=tokenizer, dataset=df_val, max_anchor_len=MAX_ANCHOR_LEN, max_target_len=MAX_TARGET_LEN, max_title_len=MAX_TITLE_LEN)
test_dataset = PatentDataset(tokenizer=tokenizer, dataset=df_test, max_anchor_len=MAX_ANCHOR_LEN, max_target_len=MAX_TARGET_LEN, max_title_len=MAX_TITLE_LEN, export=True)

In [13]:
train_dataloader = DataLoader(dataset=train_dataset, batch_size=64, num_workers=12)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=64, num_workers=12)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=64, num_workers=12, shuffle=False)

### Model

In [14]:
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.core.lightning import LightningModule
import torch
from torch import nn
from transformers import (PreTrainedModel, RobertaModel, RobertaTokenizerFast, RobertaConfig,
                          get_constant_schedule_with_warmup, AdamW, RobertaTokenizer, BertTokenizerFast)

from transformers import BertModel
import pytorch_lightning as pl
import torch.nn.functional as F

seed_everything(42)


Global seed set to 42


42

In [16]:
import logging
from torch.nn import ModuleList
from pytorch_lightning.utilities.types import STEP_OUTPUT


def configure_trainable_layers(model, deepest_layer_to_train: int, state: bool):
    requires_grad = False

    for param in model.named_parameters():
        if f'encoder.layer.{deepest_layer_to_train}' in param[0]:
            requires_grad = state
        param[1].requires_grad = requires_grad
        # print(f'Layer {param[0]} is {"NOT " if requires_grad is False else ""}trained.')


# Trained in step 1 - pre-training
class PatentRoBERTa(pl.LightningModule):
    # TODO add dropout
    def __init__(self, pre_training: bool, deepest_layer_to_train: int):
        super(PatentRoBERTa, self).__init__()
        self.pretrained_model = RobertaModel.from_pretrained("roberta-base")
        self.classifier = nn.Linear(self.pretrained_model.config.hidden_size, 1)
        self.pre_training = pre_training
        self.deepest_layer_to_train = deepest_layer_to_train
        configure_trainable_layers(self.pretrained_model, deepest_layer_to_train, self.pre_training)

    def forward(self, ids, attention_mask):
        out = self.pretrained_model(ids, attention_mask=attention_mask, return_dict=True)
        if not self.pre_training:
            return out['last_hidden_state']  # batch_size, 265, 768
        out = torch.relu(self.classifier(out['pooler_output']))  # batch size, 1
        return out

    def pretrain(self, state: bool):
        self.pre_training = state
        self.classifier.requires_grad = state
        configure_trainable_layers(model, self.deepest_layer_to_train, state)

# Trained in step 2
class ClassificationTransformer(pl.LightningModule):
    def __init__(self, num_classes, dim=512):
        super(ClassificationTransformer, self).__init__()
        encoder_layer = nn.TransformerEncoderLayer(d_model=dim, nhead=4)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=1)
        self.classifier = nn.Linear(dim, num_classes)

    def forward(self, x):
        out = self.transformer_encoder(x)
        out = torch.mean(out, dim=1)  # batch_size, 1325, 768 -> batch_size, 1, 768
        return torch.relu(self.classifier(out))  # batch_size, 1, 768  -> batch_size, 5


class PatentRoBERTaEnsemble(pl.LightningModule):
    # TODO add dropout
    def __init__(self, num_classes, pre_training, deepest_layer_to_train):
        super(PatentRoBERTaEnsemble, self).__init__()
        self.pretrained_models = ModuleList([PatentRoBERTa(pre_training, deepest_layer_to_train) for _ in range(num_classes)])
        self.pre_training: bool = pre_training
        self.transformer_classifier = ClassificationTransformer(num_classes, dim=768)

    def forward(self, ids, mask):
        """
        After concat,
        if pre-training: then size is (batch_size 5)
        else: (batch_size, 1325, 768)
        :param ids:
        :param mask:
        :return:
        """
        out = torch.concat([pretrained_model(ids, attention_mask=mask) for pretrained_model in self.pretrained_models], dim=1)
        if self.pre_training:
            return out  # [0.2, 0.5, 0.01, 0, 0]
        out = self.transformer_classifier(out)
        return out

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-4)
        # optimizer = torch.optim.AdamW(self.parameters(), lr=self.config['lr'], weight_decay=self.config['weight_decay'])
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
        return [optimizer], [lr_scheduler]

    def training_step(self, batch, batch_idx) -> STEP_OUTPUT:
        loss = self._common_step(batch, batch_idx, 'train')
        return loss

    def validation_step(self, batch, batch_idx):
        self._common_step(batch, batch_idx, 'val')

    def _common_step(self, batch, batch_idx, stage: str):
        ids, label, mask = self._prepare_batch(batch)
        output = self(ids=ids, mask=mask)
        label = label.type_as(output)
        loss = F.cross_entropy(output, label)
        acc = (torch.argmax(output, dim=-1) == torch.argmax(label, dim=-1)).float().mean()
        self.log(f"{stage}_loss", loss, on_step=True)
        self.log(f"{stage}_acc", acc, on_step=True)
        return loss

    def predict_step(self, batch, batch_idx: int, dataloader_idx: int = 0):
        if self.pre_training:
            logging.warning('Doing prediction on pre-training mode is not recommended.')
        ids, _, mask = self._prepare_batch(batch, include_target=False)
        output = self(ids=ids, mask=mask)
        return torch.argmax(output, dim=-1)

    def test_step(self, batch, batch_idx):
        self._common_step(batch, batch_idx, 'test')

    def _prepare_batch(self, batch, include_target=True):
        ids = batch['ids']
        mask = batch['mask']
        if not include_target:
            return ids, None, mask
        label = batch['target']
        return ids, label, mask

    def pretrain(self, state: bool):
        self.pre_training = state
        for module in self.pretrained_models:
            module.pretrain(state)


In [20]:
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import LearningRateMonitor
import logging
from logging import WARNING
logging.basicConfig(level=WARNING)

early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=2, verbose=False, mode="min")
lr_logger = LearningRateMonitor()
logger = TensorBoardLogger("lightning_logs")

trainer = pl.Trainer(
    accelerator='gpu',
    gradient_clip_val=0.1,
    auto_lr_find=True,
    callbacks=[lr_logger, early_stop_callback],
    logger=logger,
    weights_summary="top",
    max_epochs=1  # FIXME
)

hparams = dict(
    num_classes=df_train.score.unique().size,
    pre_training=True,
    deepest_layer_to_train=10
)

checkpoint = None
if checkpoint is not None:
    model = PatentRoBERTaEnsemble.load_from_checkpoint(checkpoint, **hparams)
    print(f'Checkpoint {checkpoint} loaded')
else:
    model = PatentRoBERTaEnsemble(**hparams)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.decod

Checkpoint ./lightning_logs/lightning_logs/ensemble_model loaded


In [18]:
model.pretrain(True)
trainer.validate(model, val_dataloader)


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      val_acc_epoch         0.2083333283662796
     val_loss_epoch         1.6109473705291748
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'val_loss_epoch': 1.6109473705291748, 'val_acc_epoch': 0.2083333283662796}]

In [19]:
trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)
trainer.validate(model, val_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name                   | Type                      | Params
---------------------------------------------------------------------
0 | pretrained_models      | ModuleList                | 623 M 
1 | transformer_classifier | ClassificationTransformer | 5.5 M 
---------------------------------------------------------------------
518 M     Trainable params
109 M     Non-trainable params
628 M     Total params
2,514.999 Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 206.00 MiB (GPU 0; 5.80 GiB total capacity; 4.40 GiB already allocated; 146.69 MiB free; 4.61 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
torch.cuda.empty_cache()

In [21]:
model.pretrain(False)
trainer.validate(model, val_dataloader)


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation: 0it [00:00, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 50.00 MiB (GPU 0; 5.80 GiB total capacity; 4.52 GiB already allocated; 48.69 MiB free; 4.70 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)
trainer.validate(model, val_dataloader)


In [None]:
trainer.save_checkpoint('./lightning_logs/lightning_logs/ensemble_model')

In [None]:
results = trainer.predict(model, test_dataloader)

df_test['y_pred'] = np.concatenate(results)
df_test

In [None]:
corresp_map = [.0, .25, .5, .75, 1.0]
df_test['score'] = df_test.y_pred.apply(lambda x: corresp_map[x])
df_test

In [None]:
submission = df_test.loc[:, ['id', 'score']]
submission.to_csv('submission.csv')

In [None]:
submission