In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Data Load

In [64]:
df = pd.read_csv('/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv')
df.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0


In [65]:
df.describe(include='object')

Unnamed: 0,id,anchor,target,context
count,36473,36473,36473,36473
unique,36473,733,29340,106
top,8d135da0b55b8c88,component composite coating,composition,H01
freq,1,152,24,2186


## Prompt Structure Feature
Transformer models don’t naturally understand multiple columns unless you combine them into structured text

In [66]:
df['input'] = 'TEXT1: ' + df.context + '; TEXT2: ' + df.target + '; TEXT3: ' + df.anchor

df.input.head()

0    TEXT1: A47; TEXT2: abatement of pollution; TEX...
1    TEXT1: A47; TEXT2: act of abating; TEXT3: abat...
2    TEXT1: A47; TEXT2: active catalyst; TEXT3: aba...
3    TEXT1: A47; TEXT2: eliminating process; TEXT3:...
4    TEXT1: A47; TEXT2: forest region; TEXT3: abate...
Name: input, dtype: object

# Dataset Creation

In [67]:
from datasets import Dataset, DatasetDict

ds = Dataset.from_pandas(df)
ds

Dataset({
    features: ['id', 'anchor', 'target', 'context', 'score', 'input'],
    num_rows: 36473
})

# Tokenization

## Tokenizer Selection

In [68]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

#We choose a pretrained model
model_nm = 'microsoft/deberta-v3-small'

#With this we obtain the tokenizer from the model
tokz = AutoTokenizer.from_pretrained(model_nm)



### Tokenization Explanation

In [69]:
#When the words are well known by the model the segmentatios is done by word.
print(tokz.tokenize('In english the words are well recognized.'))

#But for not known, the words are splitted
print(tokz.tokenize('Obviamente en español no conoce ninguna palabra.'))

print(tokz.tokenize('hello my friends'))

#the tokens correspond from the third vector value onward
print(tokz('hello my friends'))

['▁In', '▁english', '▁the', '▁words', '▁are', '▁well', '▁recognized', '.']
['▁Ob', 'vi', 'amente', '▁en', '▁español', '▁no', '▁con', 'oce', '▁n', 'ing', 'una', '▁palabra', '.']
['▁hello', '▁my', '▁friends']
{'input_ids': [1, 12018, 312, 774, 2], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}


## Defining Tokenization Functions

In [41]:
def tokeniz_funct(x):
    return tokz(x['input'])

In [42]:
tokenized_ds = ds.map(tokeniz_funct, batched=True)

Map:   0%|          | 0/36473 [00:00<?, ? examples/s]

In [43]:
row = tokenized_ds[0]
row

{'id': '37d61fd2272659b1',
 'anchor': 'abatement',
 'target': 'abatement of pollution',
 'context': 'A47',
 'score': 0.5,
 'input': 'TEXT1: A47; TEXT2: abatement of pollution; TEXT3: abatement',
 'input_ids': [1,
  54453,
  435,
  294,
  336,
  5753,
  346,
  54453,
  445,
  294,
  47284,
  265,
  6435,
  346,
  54453,
  508,
  294,
  47284,
  2],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [44]:
tokenized_ds = tokenized_ds.rename_columns({'score':'labels'})

# Test and Validation Sets

## Train - Validation Sets

In [45]:
phrase_dataset = tokenized_ds.train_test_split(0.25, seed=42)
phrase_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 27354
    })
    test: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9119
    })
})

In [46]:
train_ds = phrase_dataset['train']
valid_ds = phrase_dataset['test']

## Test Set

In [47]:
test_df = pd.read_csv('/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv')

test_df.describe()

Unnamed: 0,id,anchor,target,context
count,36,36,36,36
unique,36,34,36,29
top,4112d61851461f60,hybrid bearing,inorganic photoconductor drum,G02
freq,1,2,1,3


In [48]:
test_df['input'] = 'TEXT1: ' + test_df.context + '; TEXT2: ' + test_df.target + '; TEXT3: ' + test_df.anchor
test_ds = Dataset.from_pandas(test_df).map(tokeniz_funct, batched=True)

Map:   0%|          | 0/36 [00:00<?, ? examples/s]

# Metrics

## Correlation Coeficient:
Indicates the relation between sets of values, can be features and label.

In [49]:
lista = [[134,354,674,76],[456,236,767,45]]
test = np.array(lista)

#The format of this function does not help as we get a diagonal symetric matrix
print('Correlation Matrix:')
print(np.corrcoef(test[0],test[1]))

#To get a more useful result we use just one of the diagonal values
print('Correlation Value:')
np.corrcoef(test[0],test[1])[0][1]

Correlation Matrix:
[[1.         0.79181586]
 [0.79181586 1.        ]]
Correlation Value:


0.7918158640315541

### Defining correlation function

In [50]:
#Defining the correlation coeficient function
def corr_coef(x,y):
    return np.corrcoef(x,y)[0][1]

In [51]:
#Defining the correlation dictionary function
#Predictions of model are sometines composed of (prediction, label), *eval_pred unpacks those values in 2 tupples

def corr_dict(eval_pred):
    return {'pearson':corr_coef(*eval_pred)}

### Correlation Plot

In [52]:
def show_corr_plot(df, a, b):
    x,y = df[a],df[b]
    plt.scatter(x, y, alpha=0.5, s=4)
    plt.title('{} vs {}; r:{}'.format(a,b,corr_coef(x,y)))

# Model Training

In [53]:
import transformers
from transformers import TrainingArguments, Trainer
print('transformers version:',transformers.__version__)

transformers version: 4.51.1


### Declaring Training Configuration
We define the training hyperparameters

In [54]:
def get_training_config():
    config = {
        'batch_size': 128,
        'epochs': 4,
        'learning_rate': 8e-5,
        'weight_decay': 0.01 
    }
    return config

config = get_training_config()

### Set Training Arguments

In [55]:
args = TrainingArguments('outputs',
                         learning_rate = config['learning_rate'],
                         warmup_ratio = 0.1,
                         lr_scheduler_type = 'cosine',
                         fp16=True,
                         eval_strategy='epoch',
                         per_device_train_batch_size = config['batch_size'],
                         per_device_eval_batch_size = config['batch_size']*2,
                         num_train_epochs = config['epochs'],
                         weight_decay = config['weight_decay'],
                         report_to = 'none'
                        )

In [56]:
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels = 1)
trainer = Trainer(model,
                  args,
                  train_dataset= train_ds,
                  eval_dataset = valid_ds,
                  tokenizer = tokz,
                  compute_metrics = corr_dict)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(model,


### Training

In [57]:
trainer.train()

Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.028346,0.796515
2,No log,0.02192,0.822037
3,0.031700,0.021853,0.832686
4,0.031700,0.022255,0.833421


TrainOutput(global_step=856, training_loss=0.02404388311867402, metrics={'train_runtime': 259.675, 'train_samples_per_second': 421.357, 'train_steps_per_second': 3.296, 'total_flos': 715555561923540.0, 'train_loss': 0.02404388311867402, 'epoch': 4.0})

# Model Prediction

In [58]:
preds = trainer.predict(valid_ds).predictions.astype(float)
preds

array([0.31420898, 0.23828125, 0.00450897, ..., 0.47265625, 0.50244141,
       0.4675293 ])

In [59]:
preds = np.clip(preds, 0, 1)
preds

array([0.31420898, 0.23828125, 0.00450897, ..., 0.47265625, 0.50244141,
       0.4675293 ])

# File Submission

In [60]:
import datasets

submission = datasets.Dataset.from_dict({
    'id': valid_ds['id'],
    'score': preds
})

submission.to_csv('submission.csv', index=False)

Creating CSV from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

272090