In [24]:
import os
import zipfile
import pathlib
from pathlib import Path
import kaggle
from dotenv import load_dotenv

In [25]:
path = Path('us-patent-phrase-to-phrase-matching')
if not path.exists():
    kaggle.api.competition_download_cli(str(path))
    zipfile.ZipFile(f'{path}.zip').extractall(path)

In [26]:
import pandas as pd

df = pd.read_csv(path/'train.csv')
df.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0


In [27]:
## Create the input 

df['input'] = df['context'] + ' ' + df['anchor'] + ' ' + df['target']
df[['input', 'score']].head()

Unnamed: 0,input,score
0,A47 abatement abatement of pollution,0.5
1,A47 abatement act of abating,0.75
2,A47 abatement active catalyst,0.25
3,A47 abatement eliminating process,0.5
4,A47 abatement forest region,0.0


In [29]:
!pip install datasets transformers
from datasets import Dataset, DatasetDict

ds = Dataset.from_pandas(df[['input', 'score']])
ds



Dataset({
    features: ['input', 'score'],
    num_rows: 36473
})

In [30]:
model_name = "distilbert-base-uncased"

from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [31]:
print(df['input'][0])
tokenizer.tokenize(df['input'][0])

A47 abatement abatement of pollution


['a',
 '##47',
 'aba',
 '##tem',
 '##ent',
 'aba',
 '##tem',
 '##ent',
 'of',
 'pollution']

In [32]:
tokenizer.tokenize('My name is jeff')

['my', 'name', 'is', 'jeff']

In [33]:
# Function to tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['input'], padding="max_length", truncation=True)  

In [34]:
tokenized_ds = ds.map(tokenize_function, batched=True)
tokenized_ds

Map:   0%|          | 0/36473 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'score', 'input_ids', 'attention_mask'],
    num_rows: 36473
})

In [35]:
tokenized_ds = tokenized_ds.rename_columns({'score':'labels'})

In [36]:
eval_df = pd.read_csv(path/'test.csv')
eval_df['input'] = eval_df.context + ' ' + eval_df.target + ' ' + eval_df.anchor
eval_ds = Dataset.from_pandas(eval_df).map(tokenize_function, batched=True)

Map:   0%|          | 0/36 [00:00<?, ? examples/s]

In [37]:
# metric for evaluation
def corr_d(eval_pred):
    import numpy as np
    from scipy.stats import pearsonr
    predictions, labels = eval_pred
    return {'pearson': pearsonr(labels, predictions[:,0])[0]}

In [43]:
from transformers import TrainingArguments,Trainer
batch_size = 60
epochs = 10

lr = 8e-5

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=lr,
    optim="adamw_torch",
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    fp16=False,
    bf16=True,
    # evaluation_strategy="epoch",
    per_device_train_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.01,
    report_to="none")

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds,
    eval_dataset=eval_ds,
    compute_metrics=corr_d,
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
trainer.train()



Step,Training Loss
500,0.0595
1000,0.032
1500,0.02
2000,0.017
2500,0.0164
3000,0.016
3500,0.016
4000,0.016
4500,0.0152
5000,0.016




TrainOutput(global_step=6080, training_loss=0.021229954769736843, metrics={'train_runtime': 1152.0317, 'train_samples_per_second': 316.658, 'train_steps_per_second': 5.278, 'total_flos': 4.831397268532224e+16, 'train_loss': 0.021229954769736843, 'epoch': 10.0})

In [None]:
trainer.evaluate(eval_dataset=tokenized_ds)



{'eval_loss': 0.01203677523881197,
 'eval_pearson': 0.9069551229476929,
 'eval_runtime': 382.5732,
 'eval_samples_per_second': 715.157,
 'eval_steps_per_second': 89.395,
 'epoch': 10.0}

In [None]:
import numpy as np

preds = trainer.predict(eval_ds).predictions.astype(float)
preds = np.clip(preds, 0, 1)
preds[:10]


array([[0.46289062],
       [0.7265625 ],
       [0.3125    ],
       [0.3046875 ],
       [0.30273438],
       [0.58203125],
       [0.51953125],
       [0.2734375 ],
       [0.18457031],
       [1.        ]])

In [None]:
import datasets

submission = datasets.Dataset.from_dict({
    'id': eval_ds['id'],
    'score': preds
})

submission.to_csv('submission.csv', index=False)

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

1061