In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
import shutil

from torch.utils.data import DataLoader, Dataset
import datasets, transformers
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

os.environ["WANDB_DISABLED"] = "true"

# Config

In [2]:
class CFG:
    """Static settings shared by the data-loading and training cells."""

    # --- locations -------------------------------------------------------
    input_path = '../input/uspppm-train-data/'
    # Pretrained model from Hugging Face. Alternative path kept for reference:
    #   "../input/uspppm-bert-train-v3/"
    model_path = '../input/bert-for-patents/bert-for-patents/'

    # --- cross-validation ------------------------------------------------
    num_fold = 5

    # --- optimisation ----------------------------------------------------
    epochs = 5
    batch_size = 16
    learning_rate = 1e-5
    weight_decay = 0.01

# Preprocessing

In [3]:
# Read the prepared training table from the configured input folder.
train_df = pd.read_csv(f"{CFG.input_path}uspppm_train.csv")
# Optional CPC-title enrichment, currently switched off:
#titles = pd.read_csv('../input/cpc-codes/titles.csv')
#train_df = train_df.merge(titles, left_on='context', right_on='code')

# Flip to True for a quick smoke run on the first 50 rows only.
debug = False
if debug:
    train_df = train_df.head(50)
    print(len(train_df))
    

# https://www.kaggle.com/code/abhishek/phrase-matching-folds
def create_folds(data, num_splits, num_bins=5, seed=42):
    """Return a copy of ``data`` with a stratified ``fold`` column.

    The continuous ``score`` column is cut into ``num_bins`` bins with
    ``pd.cut`` and the resulting bin ids are used as the stratification
    target for a shuffled ``StratifiedKFold``.

    Args:
        data: DataFrame that contains a numeric ``score`` column.
        num_splits: Number of folds to create.
        num_bins: Number of score bins used for stratification
            (default 5, the value that was previously hard-coded).
        seed: ``random_state`` of the splitter (default 42, the value that
            was previously hard-coded).

    Returns:
        A new DataFrame with the same rows and index as ``data`` plus an
        int64 ``fold`` column holding the validation-fold id of each row.
        The input frame is left untouched.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    result = data.copy()

    # Bin the targets; note that the bins, not the raw scores, are stratified.
    score_bins = pd.cut(result["score"], bins=num_bins, labels=False).to_numpy()

    kf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=seed)

    # split() yields *positional* row indices. Writing them into a plain
    # array (instead of label-based ``.loc``) keeps the assignment correct
    # even when the frame's index is not 0..n-1.
    fold_ids = np.full(len(result), -1, dtype="int64")
    for fold_id, (_, valid_pos) in enumerate(kf.split(X=result, y=score_bins)):
        fold_ids[valid_pos] = fold_id

    result["fold"] = fold_ids
    return result

In [4]:
# sections = {"A" : "Human Necessities", 
#             "B" : "Operations and Transport",
#             "C" : "Chemistry and Metallurgy",
#             "D" : "Textiles",
#             "E" : "Fixed Constructions",
#             "F" : "Mechanical Engineering",
#             "G" : "Physics",
#             "H" : "Electricity",
#             "Y" : "Emerging Cross-Sectional Technologies"}

In [5]:
# #combine anchor, context, title (previously no context) 

# train_df['topic'] = train_df['section'].map(sections).str.lower()

# train_df['input'] = train_df['section'] + " " + train_df['class'].astype(str) + " " + train_df['topic'] + ' ' +\
#                     train_df['title'].str.lower() + " - " + train_df['anchor']
# Attach the cross-validation `fold` column computed by create_folds.
train_df = create_folds(train_df, CFG.num_fold)
# train_df = train_df.drop(['subclass','group','main_group'],axis = 1)

In [6]:
# Do not truncate long text cells when rendering DataFrames (global pandas option).
pd.set_option('display.max_colwidth', None)
# NOTE(review): only the last expression of a cell is rendered, so this head() result is discarded.
train_df.head()
train_df

Unnamed: 0,input,target,score,fold
0,abat a 47 human necessities furnitur domest articl applianc coffe mill spice mill suction cleaner general,abat pollut,0.50,1
1,abat a 47 human necessities furnitur domest articl applianc coffe mill spice mill suction cleaner general,act abat,0.75,3
2,abat a 47 human necessities furnitur domest articl applianc coffe mill spice mill suction cleaner general,activ catalyst,0.25,3
3,abat a 47 human necessities furnitur domest articl applianc coffe mill spice mill suction cleaner general,elimin process,0.50,3
4,abat a 47 human necessities furnitur domest articl applianc coffe mill spice mill suction cleaner general,forest region,0.00,4
...,...,...,...,...
40468,fire cartridg case c 6 chemistry and metallurgy explos match,fire cartridg,0.75,1
40469,calcul analysi h 4 electricity electr communic techniqu,determin analysi,0.75,4
40470,grip surfac d 5 textiles sew embroid tuft,grip layer,0.75,4
40471,sphygmomanomet devic g 5 physics control regul,sphygmomanomet,0.75,2


# Tokenizer

In [None]:
# Module-level tokenizer loaded from the pretrained checkpoint; TrainDataset.__getitem__ reads this global.
# NOTE(review): this cell shows no execution count (In [None]); the training loop assigns `tokenizer` again.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)

# Dataset

In [7]:
class TrainDataset(Dataset):
    """Map-style dataset yielding one tokenized (input, target) pair with its score.

    Args:
        df: DataFrame with ``input``, ``target`` and ``score`` columns.
        tokenizer: Optional callable invoked as ``tokenizer(input_text,
            target_text)`` that returns a mapping of model inputs. When
            omitted, the module-level ``tokenizer`` is looked up each time an
            item is fetched, which is the original behaviour.
    """

    def __init__(self, df, tokenizer=None):
        self.inputs = df['input'].values.astype(str)
        self.targets = df['target'].values.astype(str)
        self.label = df['score'].values
        # Keeping the tokenizer on the instance removes the dependency on
        # hidden notebook state when one is supplied explicitly.
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.inputs)

    def __getitem__(self, item):
        """Return the tokenizer output for row ``item`` plus a float32 ``label``."""
        # Fall back to the global only at access time so a tokenizer defined
        # after the dataset was constructed is still picked up.
        encode = self.tokenizer if self.tokenizer is not None else tokenizer
        encoded = encode(self.inputs[item], self.targets[item])

        return {
            **encoded,
            'label': self.label[item].astype(np.float32),
        }

# Train

In [8]:
def compute_metrics(eval_pred):
    """Compute the Pearson correlation between predictions and labels.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` is reshaped
            to a 1-D array with one value per row before correlating.

    Returns:
        Dict with a single ``'pearson'`` entry.
    """
    raw_preds, gold = eval_pred
    flat_preds = raw_preds.reshape(len(raw_preds))
    corr_matrix = np.corrcoef(flat_preds, gold)
    # Off-diagonal element = correlation between the two series.
    return {'pearson': corr_matrix[0][1]}

In [9]:
# Preview the first rows of the fold-annotated training frame.
train_df.head()

Unnamed: 0,input,target,score,fold
0,abat a 47 human necessities furnitur domest articl applianc coffe mill spice mill suction cleaner general,abat pollut,0.5,1
1,abat a 47 human necessities furnitur domest articl applianc coffe mill spice mill suction cleaner general,act abat,0.75,3
2,abat a 47 human necessities furnitur domest articl applianc coffe mill spice mill suction cleaner general,activ catalyst,0.25,3
3,abat a 47 human necessities furnitur domest articl applianc coffe mill spice mill suction cleaner general,elimin process,0.5,3
4,abat a 47 human necessities furnitur domest articl applianc coffe mill spice mill suction cleaner general,forest region,0.0,4


In [10]:
# Fold-wise fine-tuning. For every fold: train on the remaining folds, let the
# Trainer reload the epoch with the best validation Pearson, save that model,
# and record predictions for the held-out rows (out-of-fold, "OOF").
checkpoint_dir = "/tmp/uspppm"

# The tokenizer does not depend on the fold, so load it once. It stays a
# module-level name because the datasets built below rely on it.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)

# Collect per-fold frames and concatenate once after the loop instead of
# growing a DataFrame with repeated pd.concat calls.
oof_parts = []
for fold in range(CFG.num_fold):
    tr_data = train_df[train_df['fold'] != fold].reset_index(drop=True)
    va_data = train_df[train_df['fold'] == fold].reset_index(drop=True)
    tr_dataset = TrainDataset(tr_data)
    va_dataset = TrainDataset(va_data)

    args = TrainingArguments(
        output_dir=checkpoint_dir,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        lr_scheduler_type='linear',
        learning_rate=CFG.learning_rate,
        per_device_train_batch_size=CFG.batch_size,
        per_device_eval_batch_size=CFG.batch_size,
        num_train_epochs=CFG.epochs,
        weight_decay=CFG.weight_decay,
        metric_for_best_model="pearson",
        load_best_model_at_end=True,
        # Explicitly disable logging integrations, as the deprecation
        # warning for the WANDB_DISABLED environment variable recommends.
        report_to="none",
    )

    # To resume from previously saved per-fold weights instead:
    #model = AutoModelForSequenceClassification.from_pretrained(f'{CFG.model_path}uspppm_{fold}', num_labels=1)
    #tokenizer = AutoTokenizer.from_pretrained(f'{CFG.model_path}uspppm_{fold}')
    # num_labels=1 gives a single-output head (one prediction per pair).
    model = AutoModelForSequenceClassification.from_pretrained(CFG.model_path, num_labels=1)
    trainer = Trainer(
        model,
        args,
        train_dataset=tr_dataset,
        eval_dataset=va_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()
    # Persist the (best) model first, then drop the intermediate checkpoints,
    # so a failed save can never leave the fold without any stored weights.
    trainer.save_model(f"uspppm_{fold}")
    shutil.rmtree(checkpoint_dir)

    outputs = trainer.predict(va_dataset)
    predictions = outputs.predictions.reshape(-1)
    va_data['preds'] = predictions
    oof_parts.append(va_data)

# Same result as the incremental concat (per-fold indices are preserved).
oof_df = pd.concat(oof_parts) if oof_parts else pd.DataFrame()

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Some weights of the model checkpoint at ../input/bert-for-patents/bert-for-patents/ were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from th

Epoch,Training Loss,Validation Loss,Pearson
1,0.0394,0.031174,0.779812
2,0.0273,0.024299,0.823168
3,0.0179,0.023351,0.833184
4,0.0125,0.022361,0.84458
5,0.0097,0.021787,0.846015


***** Running Evaluation *****
  Num examples = 8095
  Batch size = 16
Saving model checkpoint to /tmp/uspppm/checkpoint-2024
Configuration saved in /tmp/uspppm/checkpoint-2024/config.json
Model weights saved in /tmp/uspppm/checkpoint-2024/pytorch_model.bin
tokenizer config file saved in /tmp/uspppm/checkpoint-2024/tokenizer_config.json
Special tokens file saved in /tmp/uspppm/checkpoint-2024/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 8095
  Batch size = 16
Saving model checkpoint to /tmp/uspppm/checkpoint-4048
Configuration saved in /tmp/uspppm/checkpoint-4048/config.json
Model weights saved in /tmp/uspppm/checkpoint-4048/pytorch_model.bin
tokenizer config file saved in /tmp/uspppm/checkpoint-4048/tokenizer_config.json
Special tokens file saved in /tmp/uspppm/checkpoint-4048/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 8095
  Batch size = 16
Saving model checkpoint to /tmp/uspppm/checkpoint-6072
Configuration saved in /tmp/uspppm

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
loading configuration file ../input/bert-for-patents/bert-for-patents/config.json
Model config BertConfig {
  "_name_or_path": "../input/bert-for-patents/bert-for-patents/",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  

Epoch,Training Loss,Validation Loss,Pearson
1,0.0395,0.032953,0.787026
2,0.0256,0.039033,0.821384
3,0.0177,0.025107,0.83723
4,0.0132,0.022506,0.846124
5,0.0094,0.021477,0.848145


***** Running Evaluation *****
  Num examples = 8095
  Batch size = 16
Saving model checkpoint to /tmp/uspppm/checkpoint-2024
Configuration saved in /tmp/uspppm/checkpoint-2024/config.json
Model weights saved in /tmp/uspppm/checkpoint-2024/pytorch_model.bin
tokenizer config file saved in /tmp/uspppm/checkpoint-2024/tokenizer_config.json
Special tokens file saved in /tmp/uspppm/checkpoint-2024/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 8095
  Batch size = 16
Saving model checkpoint to /tmp/uspppm/checkpoint-4048
Configuration saved in /tmp/uspppm/checkpoint-4048/config.json
Model weights saved in /tmp/uspppm/checkpoint-4048/pytorch_model.bin
tokenizer config file saved in /tmp/uspppm/checkpoint-4048/tokenizer_config.json
Special tokens file saved in /tmp/uspppm/checkpoint-4048/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 8095
  Batch size = 16
Saving model checkpoint to /tmp/uspppm/checkpoint-6072
Configuration saved in /tmp/uspppm

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
loading configuration file ../input/bert-for-patents/bert-for-patents/config.json
Model config BertConfig {
  "_name_or_path": "../input/bert-for-patents/bert-for-patents/",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  

Epoch,Training Loss,Validation Loss,Pearson
1,0.04,0.030054,0.789269
2,0.0269,0.025876,0.822838
3,0.0182,0.022168,0.839416
4,0.0128,0.021268,0.84688
5,0.0096,0.021669,0.846682


***** Running Evaluation *****
  Num examples = 8095
  Batch size = 16
Saving model checkpoint to /tmp/uspppm/checkpoint-2024
Configuration saved in /tmp/uspppm/checkpoint-2024/config.json
Model weights saved in /tmp/uspppm/checkpoint-2024/pytorch_model.bin
tokenizer config file saved in /tmp/uspppm/checkpoint-2024/tokenizer_config.json
Special tokens file saved in /tmp/uspppm/checkpoint-2024/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 8095
  Batch size = 16
Saving model checkpoint to /tmp/uspppm/checkpoint-4048
Configuration saved in /tmp/uspppm/checkpoint-4048/config.json
Model weights saved in /tmp/uspppm/checkpoint-4048/pytorch_model.bin
tokenizer config file saved in /tmp/uspppm/checkpoint-4048/tokenizer_config.json
Special tokens file saved in /tmp/uspppm/checkpoint-4048/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 8095
  Batch size = 16
Saving model checkpoint to /tmp/uspppm/checkpoint-6072
Configuration saved in /tmp/uspppm

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
loading configuration file ../input/bert-for-patents/bert-for-patents/config.json
Model config BertConfig {
  "_name_or_path": "../input/bert-for-patents/bert-for-patents/",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  

Epoch,Training Loss,Validation Loss,Pearson
1,0.0411,0.028268,0.788295
2,0.0271,0.02488,0.818615
3,0.0181,0.023058,0.832659
4,0.0136,0.022167,0.84447
5,0.0103,0.021451,0.84788


***** Running Evaluation *****
  Num examples = 8094
  Batch size = 16
Saving model checkpoint to /tmp/uspppm/checkpoint-2024
Configuration saved in /tmp/uspppm/checkpoint-2024/config.json
Model weights saved in /tmp/uspppm/checkpoint-2024/pytorch_model.bin
tokenizer config file saved in /tmp/uspppm/checkpoint-2024/tokenizer_config.json
Special tokens file saved in /tmp/uspppm/checkpoint-2024/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 8094
  Batch size = 16
Saving model checkpoint to /tmp/uspppm/checkpoint-4048
Configuration saved in /tmp/uspppm/checkpoint-4048/config.json
Model weights saved in /tmp/uspppm/checkpoint-4048/pytorch_model.bin
tokenizer config file saved in /tmp/uspppm/checkpoint-4048/tokenizer_config.json
Special tokens file saved in /tmp/uspppm/checkpoint-4048/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 8094
  Batch size = 16
Saving model checkpoint to /tmp/uspppm/checkpoint-6072
Configuration saved in /tmp/uspppm

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
loading configuration file ../input/bert-for-patents/bert-for-patents/config.json
Model config BertConfig {
  "_name_or_path": "../input/bert-for-patents/bert-for-patents/",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  

Epoch,Training Loss,Validation Loss,Pearson
1,0.0405,0.026947,0.796365
2,0.0271,0.024463,0.82926
3,0.0175,0.023255,0.847041
4,0.0133,0.022027,0.851368
5,0.01,0.02056,0.854878


***** Running Evaluation *****
  Num examples = 8094
  Batch size = 16
Saving model checkpoint to /tmp/uspppm/checkpoint-2024
Configuration saved in /tmp/uspppm/checkpoint-2024/config.json
Model weights saved in /tmp/uspppm/checkpoint-2024/pytorch_model.bin
tokenizer config file saved in /tmp/uspppm/checkpoint-2024/tokenizer_config.json
Special tokens file saved in /tmp/uspppm/checkpoint-2024/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 8094
  Batch size = 16
Saving model checkpoint to /tmp/uspppm/checkpoint-4048
Configuration saved in /tmp/uspppm/checkpoint-4048/config.json
Model weights saved in /tmp/uspppm/checkpoint-4048/pytorch_model.bin
tokenizer config file saved in /tmp/uspppm/checkpoint-4048/tokenizer_config.json
Special tokens file saved in /tmp/uspppm/checkpoint-4048/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 8094
  Batch size = 16
Saving model checkpoint to /tmp/uspppm/checkpoint-6072
Configuration saved in /tmp/uspppm

In [11]:
# Overall out-of-fold score: correlate every held-out prediction with its label.
predictions, label = oof_df['preds'].values, oof_df['score'].values
eval_pred = (predictions, label)
compute_metrics(eval_pred)

{'pearson': 0.8486152414944023}

In [12]:
# Write the out-of-fold frame (including the `preds` column) to disk.
oof_df.to_csv('oof_df.csv')