In [None]:
import platform

# Host operating system name; checked later to choose between the MPS and
# fp16 training arguments.
environment = platform.system()
environment

## Kaggle Setup

In [None]:
import os

# Value of KAGGLE_KERNEL_RUN_TYPE, or '' when the variable is unset; the
# string is used below as a truthy "running on Kaggle" flag.
is_kaggle = os.getenv('KAGGLE_KERNEL_RUN_TYPE', '')

In [None]:
from pathlib import Path

# Location of the Kaggle API credentials file inside the user's home directory.
credentials_path = Path.home() / '.kaggle' / 'kaggle.json'

### Download dataset

In [None]:
dataset_path = Path('us-patent-phrase-to-phrase-matching')

# Outside Kaggle, download and unpack the competition data, but only when the
# dataset directory does not exist yet.
if not is_kaggle and not dataset_path.exists():
    # Imported here so these modules are only needed when a download happens.
    import zipfile, kaggle

    kaggle.api.competition_download_cli(str(dataset_path))
    # The context manager closes the archive handle once extraction finishes.
    with zipfile.ZipFile(f'{dataset_path}.zip') as archive:
        archive.extractall(dataset_path)

In [None]:
if is_kaggle:
    dataset_path = Path('../input/us-patent-phrase-to-phrase-matching')
    ! pip install -q datasets

## Data Exploration

View downloaded files.

In [None]:
# Shell escape: list the contents of the dataset directory ({dataset_path} is
# interpolated from the Python variable).
!ls {dataset_path}

Load training data.

In [None]:
import pandas as pd

# Read the training CSV from the dataset directory and display the frame.
training_df = pd.read_csv(dataset_path / 'train.csv')
training_df

In [None]:
# describe() summarizes only numeric columns by default; include='object'
# makes it summarize the object-dtype (string) columns instead.
training_df.describe(include='object')

## Preprocess Data

In [None]:
# Build the model input text with vectorized string concatenation instead of a
# per-row Python lambda. astype(str) mirrors the str() conversion the original
# f-string applied to each field.
training_df['input'] = (
    'TEXT1: ' + training_df['context'].astype(str)
    + '; TEXT2: ' + training_df['target'].astype(str)
    + '; ANC1: ' + training_df['anchor'].astype(str)
)

In [None]:
# Preview the first few combined input strings.
training_df['input'].head()

## Tokenize and Numericalize Data

Store the training data in a Huggingface dataset.

In [None]:
from datasets import Dataset, DatasetDict

# Wrap the pandas frame in a Dataset and display its summary.
training_ds = Dataset.from_pandas(training_df)
training_ds

The data needs to be tokenized according to the model used.

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint name; the same name is used later to load the model, so the
# tokenizer and the model come from the same checkpoint.
model_name = 'microsoft/deberta-v3-small'
# Load the tokenizer for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Show how an ordinary sentence is split into tokens.
tokenizer.tokenize('Greetings, fellow humans!')

Uncommon words are split into fragments.

In [None]:
# Show how a single long, rare word is tokenized.
tokenizer.tokenize('supercalifragilisticexpialidocious')

In [None]:
def tokenize_data(sample):
    """Tokenize the `input` field of `sample` with the notebook's `tokenizer`.

    Returns whatever the tokenizer call produces for `sample['input']`.
    """
    text = sample['input']
    return tokenizer(text)

`map` applies the above function to every row; with `batched=True`, the rows are passed to the function in batches rather than one at a time.

In [None]:
# batched=True hands tokenize_data a batch of rows per call instead of a single row.
# NOTE(review): the original note here said this is "much faster than actually
# setting the environment variable TOKENIZERS_PARALLELISM to false" — that
# comparison is not verified in this notebook.
tokenized_ds = training_ds.map(tokenize_data, batched=True)

In [None]:
# Take the first tokenized row and show its text next to the token ids
# produced for it.
first_sample = tokenized_ds[0]
first_sample['input'], first_sample['input_ids']

Rename the `score` column to `labels`, since Hugging Face expects a column with that name.

In [None]:
# Rename the `score` column to `labels`.
tokenized_ds = tokenized_ds.rename_column('score', 'labels')

## Create Validation Set

Hugging Face's `train_test_split` names the held-out split `test`, even though it serves as the validation set here. So, be careful!

In [None]:
# Hold out 25% of the rows; the seed is fixed at 42.
ds_dict = tokenized_ds.train_test_split(test_size=0.25, seed=42)
ds_dict

## Load and Preprocess Test Set

In [None]:
testing_df = pd.read_csv(dataset_path/'test.csv')
# `include='object'` is not passed here: when a frame has no numeric columns,
# pandas' describe() falls back to summarizing its object/categorical columns.
# Presumably that is the case for test.csv (no numeric column such as
# `score`) — TODO confirm against the file.
testing_df.describe()

In [None]:
# Build the model input text in the same TEXT1/TEXT2/ANC1 format used for the
# training data, with vectorized string concatenation instead of a per-row
# Python lambda. astype(str) mirrors the str() conversion the original
# f-string applied to each field.
testing_df['input'] = (
    'TEXT1: ' + testing_df['context'].astype(str)
    + '; TEXT2: ' + testing_df['target'].astype(str)
    + '; ANC1: ' + testing_df['anchor'].astype(str)
)

In [None]:
# Wrap the test frame in a Dataset, then tokenize it in batches with the same
# function used for the training data.
testing_ds = Dataset.from_pandas(testing_df)
testing_ds = testing_ds.map(tokenize_data, batched=True)

## Define Metric Function

Pearson Correlation Coefficient

In [None]:
import numpy as np

# Print NumPy floats with 2 digits of precision and without scientific
# notation for small values.
np.set_printoptions(precision=2, suppress=True)

def pear_corr_dict(valid_pred):
    """Compute the Pearson correlation between predictions and labels.

    Args:
        valid_pred: An iterable whose first two items are the predictions and
            the labels (array-likes holding the same number of values).

    Returns:
        A dict with a single key, 'pearson', holding the correlation
        coefficient.
    """
    # Only the first two items are used; any extra fields are ignored.
    predictions, labels, *_ = valid_pred
    # Flatten both sides so predictions shaped (N, 1) — one output column —
    # line up with labels shaped (N,) instead of failing inside np.corrcoef.
    flat_predictions = np.asarray(predictions).reshape(-1)
    flat_labels = np.asarray(labels).reshape(-1)
    return {'pearson': np.corrcoef(flat_predictions, flat_labels)[0][1]}

## Create Model

In [None]:
from transformers import Trainer, TrainingArguments

In [None]:
# Per-device training batch size (evaluation uses twice this value below).
batch_size = 32
# Number of training epochs.
epochs = 4

fastai has a learning rate finder, while Hugging Face Transformers does not. Therefore, trial and error must be used. One approach is to begin with a small learning rate and then double it for each experiment.

In [None]:
# Learning rate handed to TrainingArguments below.
learning_rate = 8e-5

`TrainingArguments` sets up the parameters for the `Trainer`. The arguments below work fine for most cases. It's only the 3 parameters that have been defined above that need to be tweaked.

In [None]:
# Settings shared by both platforms, defined once so the two configurations
# cannot drift apart.
shared_training_kwargs = dict(
    learning_rate = learning_rate,
    warmup_ratio = 0.1,
    lr_scheduler_type = 'cosine',
    evaluation_strategy = "epoch",
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size*2,
    num_train_epochs = epochs,
    weight_decay = 0.01,
    report_to='none'
)

# Only the accelerator flag depends on the host platform.
if environment == 'Darwin':
    # Use MPS device instead of fp16.
    accelerator_kwargs = {'use_mps_device': True}
else:
    # Use fp16 instead of MPS device.
    accelerator_kwargs = {'fp16': True}

# 'outputs' is the first positional argument, exactly as in both original calls.
arguments = TrainingArguments('outputs', **shared_training_kwargs, **accelerator_kwargs)

In [None]:
# Load the checkpoint with num_labels=1.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)

# The 'test' split produced by train_test_split is used as the evaluation set.
trainer = Trainer(
    model=model,
    args=arguments,
    train_dataset=ds_dict['train'],
    eval_dataset=ds_dict['test'],
    tokenizer=tokenizer,
    compute_metrics=pear_corr_dict,
)

In [None]:
# Run training with the arguments configured above (evaluation is set to run every epoch).
trainer.train()

## Save Model

In [None]:
# Save the trained model into the relative directory "model/".
trainer.save_model("model/")

## Zip and Download Model

### Zip Up

In [None]:
import zipfile

# On Kaggle, point at the absolute path of the directory to archive.
if is_kaggle:
    directory_to_zip = Path("/kaggle/working") / "model"

In [None]:
if is_kaggle:
    # Write every entry of the model directory into the archive under its bare
    # file name, so the archive has no parent folders.
    with zipfile.ZipFile("patent_model.zip", mode='w') as model_zip:
        for entry in directory_to_zip.iterdir():
            model_zip.write(entry, arcname=entry.name)

In [None]:
if is_kaggle:
    # Re-open the archive read-only and print its table of contents to check
    # what was written.
    with zipfile.ZipFile("patent_model.zip", mode="r") as archive:
        archive.printdir()

### Create download link since Kaggle's file manager isn't good.

In [None]:
import os

# Guarded like the other Kaggle-only cells: an unconditional chdir raises
# FileNotFoundError on machines where /kaggle/working does not exist.
if is_kaggle:
    os.chdir(r'/kaggle/working')

In [None]:
from IPython.display import FileLink, display

# patent_model.zip is only created on Kaggle, so only offer the link there.
# display() is needed because an expression inside an `if` block is not shown
# automatically as cell output.
if is_kaggle:
    display(FileLink(r'patent_model.zip'))