In [2]:
import platform

# Operating-system name as reported by `platform.system()`; compared against
# 'Darwin' further down to choose between the MPS device and fp16.
environment = platform.system()
environment

'Darwin'

## Kaggle Setup

In [3]:
import os

# Empty string (falsy) unless the KAGGLE_KERNEL_RUN_TYPE environment variable
# is set; the rest of the notebook treats a non-empty value as "running on
# Kaggle".
is_kaggle = os.getenv('KAGGLE_KERNEL_RUN_TYPE', '')

In [4]:
from pathlib import Path

# Absolute path to `.kaggle/kaggle.json` inside the current user's home
# directory.
credentials_path = Path.home() / '.kaggle' / 'kaggle.json'

### Download dataset

In [5]:
dataset_path = Path('us-patent-phrase-to-phrase-matching')

# Download only for local runs, and only when the extracted directory is not
# already present, so re-running the cell does not fetch the data again.
if not is_kaggle and not dataset_path.exists():
    import zipfile, kaggle

    kaggle.api.competition_download_cli(str(dataset_path))
    # The context manager closes the archive handle once extraction is done
    # (previously the ZipFile object was never closed).
    with zipfile.ZipFile(f'{dataset_path}.zip') as archive:
        archive.extractall(dataset_path)

In [6]:
# When running on Kaggle, read the competition files from ../input instead of
# the local download directory, and install the `datasets` package quietly.
if is_kaggle:
    dataset_path = Path('../input/us-patent-phrase-to-phrase-matching')
    ! pip install -q datasets

## Data Exploration

View downloaded files.

In [7]:
!ls {dataset_path}

sample_submission.csv test.csv              train.csv


Load training data.

In [8]:
import pandas as pd

# Read the labelled rows from train.csv and display the resulting frame.
train_csv = dataset_path / 'train.csv'
training_df = pd.read_csv(train_csv)
training_df

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.50
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.50
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.00
...,...,...,...,...,...
36468,8e1386cbefd7f245,wood article,wooden article,B44,1.00
36469,42d9e032d1cd3242,wood article,wooden box,B44,0.50
36470,208654ccb9e14fa3,wood article,wooden handle,B44,0.50
36471,756ec035e694722b,wood article,wooden material,B44,0.75


In [9]:
# include='object' allows strings to be included in the summary.
training_df.describe(include='object')

Unnamed: 0,id,anchor,target,context
count,36473,36473,36473,36473
unique,36473,733,29340,106
top,37d61fd2272659b1,component composite coating,composition,H01
freq,1,152,24,2186


## Preprocess Data

In [10]:
def _training_input_text(sample):
    """Combine one row's context, target and anchor into a single prompt string."""
    return f"TEXT1: {sample['context']}; TEXT2: {sample['target']}; ANC1: {sample['anchor']}"

# axis=1 calls the formatter once per row.
training_df['input'] = training_df.agg(_training_input_text, axis=1)

In [11]:
training_df['input'].head()

0    TEXT1: A47; TEXT2: abatement of pollution; ANC...
1    TEXT1: A47; TEXT2: act of abating; ANC1: abate...
2    TEXT1: A47; TEXT2: active catalyst; ANC1: abat...
3    TEXT1: A47; TEXT2: eliminating process; ANC1: ...
4    TEXT1: A47; TEXT2: forest region; ANC1: abatement
Name: input, dtype: object

## Tokenize and Numericalize Data

Store the training data in a Huggingface dataset.

In [12]:
from datasets import Dataset, DatasetDict

# Convert the pandas frame into a `datasets.Dataset` and display its summary.
training_ds = Dataset.from_pandas(training_df)
training_ds

Dataset({
    features: ['id', 'anchor', 'target', 'context', 'score', 'input'],
    num_rows: 36473
})

The data needs to be tokenized according to the model used.

In [13]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint name shared by the tokenizer created here and the model that is
# loaded later in the "Create Model" section.
model_name = 'microsoft/deberta-v3-small'
tokenizer = AutoTokenizer.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
tokenizer.tokenize('Greetings, fellow humans!')

['▁Greetings', ',', '▁fellow', '▁humans', '!']

Uncommon words are split into fragments.

In [15]:
tokenizer.tokenize('supercalifragilisticexpialidocious')

['▁super', 'cali', 'frag', 'il', 'istic', 'exp', 'ial', 'ido', 'cious']

In [16]:
def tokenize_data(sample):
    """Tokenize the ``'input'`` field of ``sample`` with the notebook's ``tokenizer``.

    ``sample`` is whatever ``Dataset.map`` passes in; only its ``'input'`` entry
    is read. Returns the tokenizer's output unchanged.
    """
    text = sample['input']
    return tokenizer(text)

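`map` applies the above function to every row of the dataset; with `batched=True` the rows are handed over in batches, which lets the tokenizer process them in parallel.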

In [17]:
# `batched=True` hands rows to `tokenize_data` in batches rather than one at a
# time (the progress bar below counts 37 batches for the 36,473 rows).
# Original author's note: this is much faster than actually setting the
# environment variable TOKENIZERS_PARALLELISM to false.
tokenized_ds = training_ds.map(tokenize_data, batched=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/37 [00:00<?, ?ba/s]

In [18]:
first_sample = tokenized_ds[0]
first_sample['input'], first_sample['input_ids']

('TEXT1: A47; TEXT2: abatement of pollution; ANC1: abatement',
 [1,
  54453,
  435,
  294,
  336,
  5753,
  346,
  54453,
  445,
  294,
  47284,
  265,
  6435,
  346,
  23702,
  435,
  294,
  47284,
  2])

Rename the `score` column to `labels`, since Huggingface expects the target column to have that name.

In [19]:
tokenized_ds = tokenized_ds.rename_columns({'score': 'labels'})

## Create Validation Set

Huggingface's `train_test_split` names the held-out validation split `test`, so be careful not to confuse it with the competition's actual test set.

In [20]:
# Hold out 25% of the rows as the 'test' split (used as the validation set);
# the fixed seed keeps the split the same on every run.
ds_dict = tokenized_ds.train_test_split(test_size=0.25, seed=42)
ds_dict

DatasetDict({
    train: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 27354
    })
    test: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9119
    })
})

## Load and Preprocess Test Set

In [21]:
testing_df = pd.read_csv(dataset_path/'test.csv')
# `include='object'` is not needed here: as the summary below shows, test.csv
# only has the string columns id/anchor/target/context (no numeric `score`),
# and `describe()` summarises object columns by default when a frame has no
# numeric ones.
testing_df.describe()

Unnamed: 0,id,anchor,target,context
count,36,36,36,36
unique,36,34,36,29
top,4112d61851461f60,el display,inorganic photoconductor drum,G02
freq,1,2,1,3


In [22]:
# Build the model input for every test row from its context, target and
# anchor; axis=1 evaluates the formatter once per row.
testing_df['input'] = testing_df.agg(
    lambda row: f"TEXT1: {row['context']}; TEXT2: {row['target']}; ANC1: {row['anchor']}",
    axis=1,
)

In [23]:
testing_ds = Dataset.from_pandas(testing_df).map(tokenize_data, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

## Define Metric Function

Pearson Correlation Coefficient

In [24]:
import numpy as np

np.set_printoptions(precision=2, suppress=True)

def pear_corr_dict(valid_pred):
    """Compute the Pearson correlation between predictions and labels.

    Parameters
    ----------
    valid_pred : pair of array-likes
        ``(predictions, labels)``; anything that unpacks into exactly these
        two items works. Both must contain the same number of values.

    Returns
    -------
    dict
        ``{'pearson': r}`` where ``r`` is the off-diagonal entry of
        ``np.corrcoef(predictions, labels)``.
    """
    predictions, labels = valid_pred
    # Flatten both sides so an (n, 1) column of model outputs lines up with an
    # (n,) label vector; np.corrcoef cannot stack the two shapes otherwise.
    predictions = np.asarray(predictions).ravel()
    labels = np.asarray(labels).ravel()
    return {'pearson': np.corrcoef(predictions, labels)[0][1]}

## Create Model

In [None]:
from transformers import Trainer, TrainingArguments

In [None]:
batch_size = 32
epochs = 4

fastai has a learning rate finder, while Huggingface Transformers does not. Therefore, trial and error must be used. One approach is to begin with a small learning rate and then double it for each experiment.

In [None]:
learning_rate = 8e-5

`TrainingArguments` sets up the parameters for the `Trainer`. The arguments below work fine for most cases. It's only the 3 parameters that have been defined above that need to be tweaked.

In [None]:
# Settings shared by every platform. Keeping them in one place stops the two
# platform branches from drifting apart; only the accelerator flag differs.
training_kwargs = dict(
    learning_rate = learning_rate,
    warmup_ratio = 0.1,
    lr_scheduler_type = 'cosine',
    evaluation_strategy = "epoch",
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size*2,
    num_train_epochs = epochs,
    weight_decay = 0.01,
    report_to='none'
)

if environment == 'Darwin':
    # Use MPS device instead of fp16.
    training_kwargs['use_mps_device'] = True
else:
    # Use fp16 instead of MPS device.
    training_kwargs['fp16'] = True

# 'outputs' is the positional output-directory argument, as before.
arguments = TrainingArguments('outputs', **training_kwargs)

In [None]:
# num_labels=1 gives the model a single output, matching the single numeric
# `labels` (score) column it is trained to predict.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)
trainer = Trainer(
    model,
    arguments,
    train_dataset=ds_dict['train'],
    # The 'test' split produced by train_test_split serves as the validation set.
    eval_dataset=ds_dict['test'],
    tokenizer=tokenizer,
    # Pearson correlation between predictions and labels.
    compute_metrics=pear_corr_dict
    )

In [None]:
trainer.train()

## Save Model

In [None]:
trainer.save_model("model/")

## Zip and Download Model

### Zip Up

In [None]:
import zipfile

if is_kaggle:
    directory_to_zip = Path("/kaggle/working/model")

In [None]:
if is_kaggle:
    # Write every direct entry of the model directory into the archive root,
    # using just its file name so no directory prefix is stored.
    with zipfile.ZipFile("patent_model.zip", mode='w') as model_archive:
        for model_file in directory_to_zip.iterdir():
            model_archive.write(model_file, arcname=model_file.name)

In [None]:
if is_kaggle:
    with zipfile.ZipFile("patent_model.zip", mode="r") as archive:
        archive.printdir()

### Create download link since Kaggle's file manager isn't good.

In [None]:
import os

# /kaggle/working is a Kaggle-specific path; changing into it unconditionally
# raises FileNotFoundError on a local run, so guard it like the zip cells.
if is_kaggle:
    os.chdir(r'/kaggle/working')

In [None]:
from IPython.display import FileLink

FileLink(r'patent_model.zip')

## Model Inference

In [1]:
from transformers import AutoModel

In [28]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# `AutoModel` restores only the bare encoder: it discards the fine-tuned
# classifier weights and has no `predict` method. Load the checkpoint with the
# same class that was used for training and run inference through a `Trainer`.
model = AutoModelForSequenceClassification.from_pretrained('patent_model/')
inference_trainer = Trainer(
    model,
    TrainingArguments('outputs', report_to='none'),
    tokenizer=tokenizer
)
# `.predictions` holds the raw model outputs for the test rows.
predictions = inference_trainer.predict(testing_ds).predictions