In [1]:
import platform
environment = platform.system(); environment

'Darwin'

In [2]:
from pathlib import Path

from datasets import Dataset

import pandas as pd

In [3]:
dataset_path = Path('us-patent-phrase-to-phrase-matching')

## Load Tokenizer

In [4]:
from transformers import AutoTokenizer

In [5]:
tokenizer = AutoTokenizer.from_pretrained('patent_model/')

In [6]:
tokenizer.tokenize('Greetings, fellow humans!')

['▁Greetings', ',', '▁fellow', '▁humans', '!']

In [7]:
def tokenize_document(document): return tokenizer(document['input'])

## Load Training Set

In [8]:
training_df = pd.read_csv(dataset_path/'train.csv'); training_df.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0


In [9]:
training_df.describe(include='object')

Unnamed: 0,id,anchor,target,context
count,36473,36473,36473,36473
unique,36473,733,29340,106
top,37d61fd2272659b1,component composite coating,composition,H01
freq,1,152,24,2186


### Process Training Data

In [10]:
training_df['input'] = training_df.agg(lambda document: f"TEXT1: {document['context']}; TEXT2: {document['target']}; ANC1: {document['anchor']}", axis=1)

In [11]:
training_df['input'].head()

0    TEXT1: A47; TEXT2: abatement of pollution; ANC...
1    TEXT1: A47; TEXT2: act of abating; ANC1: abate...
2    TEXT1: A47; TEXT2: active catalyst; ANC1: abat...
3    TEXT1: A47; TEXT2: eliminating process; ANC1: ...
4    TEXT1: A47; TEXT2: forest region; ANC1: abatement
Name: input, dtype: object

#### Tokenize and Numericalize Training Data

In [12]:
training_ds = Dataset.from_pandas(training_df); training_ds

Dataset({
    features: ['id', 'anchor', 'target', 'context', 'score', 'input'],
    num_rows: 36473
})

In [13]:
tokenized_ds = training_ds.map(tokenize_document, batched=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/37 [00:00<?, ?ba/s]

In [14]:
first_document = tokenized_ds[0]
first_document['input'], first_document['input_ids']

('TEXT1: A47; TEXT2: abatement of pollution; ANC1: abatement',
 [1,
  54453,
  435,
  294,
  336,
  5753,
  346,
  54453,
  445,
  294,
  47284,
  265,
  6435,
  346,
  23702,
  435,
  294,
  47284,
  2])

In [15]:
tokenized_ds = tokenized_ds.rename_columns({'score': 'labels'})

## Create Validation Set

In [16]:
ds_dict = tokenized_ds.train_test_split(0.25, seed=42); ds_dict

DatasetDict({
    train: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 27354
    })
    test: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9119
    })
})

## Load Test Set

In [17]:
testing_df = pd.read_csv(dataset_path/'test.csv'); testing_df.head()

Unnamed: 0,id,anchor,target,context
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02
1,09e418c93a776564,adjust gas flow,altering gas flow,F23
2,36baf228038e314b,lower trunnion,lower locating,B60
3,1f37ead645e7f0c8,cap component,upper portion,D06
4,71a5b6ad068d531f,neural stimulation,artificial neural network,H04


In [18]:
testing_df.describe(include='object')

Unnamed: 0,id,anchor,target,context
count,36,36,36,36
unique,36,34,36,29
top,4112d61851461f60,el display,inorganic photoconductor drum,G02
freq,1,2,1,3


### Process Dataset

In [19]:
testing_df['input'] = testing_df.agg(lambda document: f"TEXT1: {document['context']}; TEXT2: {document['target']}; ANC1: {document['anchor']}", axis=1)

In [20]:
testing_ds = Dataset.from_pandas(testing_df).map(tokenize_document, batched=True)
testing_ds

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['id', 'anchor', 'target', 'context', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 36
})

## Define Metric Function

In [21]:
import numpy as np

In [22]:
np.set_printoptions(precision=2, suppress=True)

def pear_corr_dict(prediction):
    return {'pearson': np.corrcoef(*prediction)[0][1]}

## Create Trainer

In [23]:
from transformers import AutoModelForSequenceClassification, AutoConfig, \
    Trainer, TrainingArguments

In [24]:
batch_size, epochs, learning_rate = 32, 4, 8e-5

In [25]:
if environment == 'Darwin':
    # Use MPS device instead of fp16.
    arguments = TrainingArguments(
        'outputs',
        learning_rate = learning_rate,
        warmup_ratio = 0.1,
        lr_scheduler_type = 'cosine',
        # Setting the below parameter to False because it's not working.
        use_mps_device=False,
        evaluation_strategy = "epoch",
        per_device_train_batch_size = batch_size,
        per_device_eval_batch_size = batch_size*2,
        num_train_epochs = epochs,
        weight_decay = 0.01,
        report_to='none'
    )
else:
    # Use fp16 instead of MPS device.
    arguments = TrainingArguments(
        'outputs',
        learning_rate = learning_rate,
        warmup_ratio = 0.1,
        lr_scheduler_type = 'cosine',
        fp16 = True,
        evaluation_strategy = "epoch",
        per_device_train_batch_size = batch_size,
        per_device_eval_batch_size = batch_size*2,
        num_train_epochs = epochs,
        weight_decay = 0.01,
        report_to='none'
    )

In [26]:
config = AutoConfig.from_pretrained('patent_model/')

In [27]:
model = AutoModelForSequenceClassification.from_pretrained('patent_model/',
                                                           config=config)

In [28]:
trainer = Trainer(
    model,
    arguments,
    tokenizer=tokenizer,
    compute_metrics=pear_corr_dict,
    train_dataset=ds_dict['train'],
    eval_dataset=ds_dict['test'],
)

## Finally! Inference.

In [29]:
predictions = trainer.predict(testing_ds).predictions.astype(float)

The following columns in the test set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: id, input, target, anchor, context. If id, input, target, anchor, context are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 36
  Batch size = 64
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  attention_scores = torch.bmm(query_layer, key_layer.transpose(-1, -2)) / torch.tensor(
  score += c2p_att / torch.tensor(scale, dtype=c2p_att.dtype)
  score += p2c_att / torch.tensor(scale, dtype=p2c_att.dtype)


In [30]:
predictions

array([[ 0.35],
       [ 0.76],
       [ 0.56],
       [ 0.31],
       [-0.02],
       [ 0.56],
       [ 0.53],
       [-0.02],
       [ 0.32],
       [ 1.07],
       [ 0.25],
       [ 0.27],
       [ 0.85],
       [ 0.95],
       [ 0.75],
       [ 0.47],
       [ 0.3 ],
       [-0.  ],
       [ 0.62],
       [ 0.39],
       [ 0.49],
       [ 0.28],
       [ 0.17],
       [ 0.27],
       [ 0.55],
       [-0.01],
       [-0.02],
       [-0.03],
       [-0.02],
       [ 0.56],
       [ 0.3 ],
       [-0.01],
       [ 0.73],
       [ 0.54],
       [ 0.49],
       [ 0.25]])

Some are larger than 1 and less than 0. Let's apply a ReLU to fix it, though
there are better ways to do fix it.

In [32]:
predictions = np.clip(predictions, 0, 1); predictions

array([[0.35],
       [0.76],
       [0.56],
       [0.31],
       [0.  ],
       [0.56],
       [0.53],
       [0.  ],
       [0.32],
       [1.  ],
       [0.25],
       [0.27],
       [0.85],
       [0.95],
       [0.75],
       [0.47],
       [0.3 ],
       [0.  ],
       [0.62],
       [0.39],
       [0.49],
       [0.28],
       [0.17],
       [0.27],
       [0.55],
       [0.  ],
       [0.  ],
       [0.  ],
       [0.  ],
       [0.56],
       [0.3 ],
       [0.  ],
       [0.73],
       [0.54],
       [0.49],
       [0.25]])

## Create submission file

In [33]:
import datasets

In [34]:
submission = datasets.Dataset.from_dict({
    'id': testing_ds['id'],
    'score': predictions
})

submission.to_csv('submission.csv', index=False)

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

853