In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
import shutil

from torch.utils.data import DataLoader, Dataset
import datasets, transformers
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from wordcloud import WordCloud, STOPWORDS
import string

from nltk.stem.snowball import SnowballStemmer


# Turn off Weights & Biases reporting before any Trainer is created.
# NOTE(review): the Trainer log captured further down reports this variable as
# deprecated in favour of the `report_to` option — consider migrating.
os.environ["WANDB_DISABLED"] = "true"

# Config

In [2]:
class CFG:
    """Settings shared by the cells below; read as plain class attributes."""

    # --- data location -------------------------------------------------------
    input_path = '../input/us-patent-phrase-to-phrase-matching/'

    # --- checkpoints to run --------------------------------------------------
    # Each entry is a directory prefix; the inference loop appends
    # `uspppm_{fold}` to it for every fold.
    model_path = ["../input/uspppm-bert-train-v3/"]
    # Other combinations kept for reference:
    #   ["../input/uspppm-debertv3large-5folds/","../input/uspppm-bert-for-patents-baseline-5folds/",
    #    "../input/uspppm-debertv3large-5folds-v2/","../input/berttrain/","../input/uspppmdebertav3/",
    #    "../input/uspppm-bert-train-v3/","../input/uspppm-deberta-v3/"]
    #   ["../input/uspppm-debertv3large-5folds/uspppm_1", "../input/berttrain/uspppm_3","../input/berttrain/uspppm_4"]
    #   ["../input/uspppm-bert-for-patents-baseline-5folds/","../input/berttrain/","../input/uspppmdebertav3/"]
    #   ["../input/uspppm-deberta-v3/"]
    #   ["../input/uspppm-bert-train-v3/","../input/uspppm-deberta-v3/"]

    # --- evaluation layout ---------------------------------------------------
    num_fold = 5          # number of fold checkpoints loaded per model_path entry
    self_testing = False  # True: score on a sample of train.csv instead of test.csv

    # --- training hyper-parameters -------------------------------------------
    # NOTE(review): not referenced by any cell visible in this notebook.
    learning_rate = 2e-5
    weight_decay = 0.01
    epochs = 5
    batch_size = 32


# Preproc

In [3]:
# Shared English Snowball stemmer; `stemming` below applies it word by word.
stemmer = SnowballStemmer("english")

def remove_punctuations(text):
    """Return `text` with every character listed in `string.punctuation` deleted."""
    # A translation table that maps each punctuation character to "delete".
    strip_table = str.maketrans('', '', string.punctuation)
    return text.translate(strip_table)

def remove_stopwords(text):
    """Drop the whitespace-separated words of `text` that are in `STOPWORDS`.

    Words are tested as-is (no case folding). Every surviving word is emitted
    with a single leading space, so a non-empty result starts with a space.
    """
    kept_words = (word for word in text.split() if word not in STOPWORDS)
    return ''.join(' ' + word for word in kept_words)

def stemming(text):
    """Stem each whitespace-separated word of `text` with the shared `stemmer`.

    Each stemmed word is emitted with a single leading space, so a non-empty
    result starts with a space.
    """
    return ''.join(' ' + stemmer.stem(word) for word in text.split())

In [4]:

def preproc(modelname, self_testing):
    """Load the evaluation frame and build the `input` text column for one model.

    Args:
        modelname: Entry of `CFG.model_path`. "../input/uspppm-bert-train-v3/"
            selects the punctuation-free, stop-word-free, stemmed format; any
            other value selects "<section> <class> <title> [SEP] <anchor>".
        self_testing: If false, rows come from test.csv. If true, rows are a
            fixed random sample of 128 rows of train.csv (random_state=1996),
            which keeps the `score` column for offline evaluation.

    Returns:
        The rows merged with the titles table, with an added `input` column.
        For the bert-train-v3 format, `anchor`, `target` and `title` are
        replaced by their cleaned versions, a `topic` column is added, and
        `subclass`, `group`, `main_group` and `context` are dropped.
    """
    # Section letter -> section name, used to build the `topic` column.
    sections = {"A" : "Human Necessities",
                "B" : "Operations and Transport",
                "C" : "Chemistry and Metallurgy",
                "D" : "Textiles",
                "E" : "Fixed Constructions",
                "F" : "Mechanical Engineering",
                "G" : "Physics",
                "H" : "Electricity",
                "Y" : "Emerging Cross-Sectional Technologies"}

    titles = pd.read_csv('../input/cpc-codes/titles.csv')

    # Load the source rows exactly once. Previously the bert-train-v3 branch
    # re-read test.csv unconditionally, which ignored `self_testing` (so the
    # returned frame had no `score`) and repeated the read and merge.
    source_file = "train.csv" if self_testing else "test.csv"
    test_df = pd.read_csv(f"{CFG.input_path}{source_file}")
    test_df = test_df.merge(titles, left_on='context', right_on='code')
    if self_testing:
        test_df = test_df.sample(128, random_state = 1996)

    if modelname == "../input/uspppm-bert-train-v3/":
        # target and anchor: strip punctuation, drop stop words, stem
        for column in ('target', 'anchor'):
            test_df[column] = (test_df[column]
                               .apply(remove_punctuations)
                               .apply(remove_stopwords)
                               .apply(stemming))

        # titles: same cleaning, lower-cased before the stop-word lookup
        test_df['title'] = (test_df['title']
                            .apply(remove_punctuations)
                            .str.lower()
                            .apply(remove_stopwords)
                            .apply(stemming))

        test_df = test_df.drop(['subclass','group','main_group', 'context'],axis = 1)
        test_df['class'] = test_df['class'].astype(int)

        # form input: "<anchor> <section> <class> <topic> <title>"
        test_df['topic'] = test_df['section'].map(sections).str.lower()
        test_df['input'] = test_df['anchor'] +' '+ test_df['section'].str.lower() + " " + test_df['class'].astype(str) + " " + test_df['topic'] + ' ' +\
                    test_df['title'].str.lower()
    else:
        test_df['input'] = test_df['section'] + " "+ test_df['class'].astype(str) + " " +test_df['title']+' [SEP] '+test_df['anchor']

    return test_df

#test = preproc(CFG.model_path[0], CFG.self_testing)

In [5]:
#pd.set_option('display.max_colwidth', None)
#test[['input','target']]

# Tokenizer

In [6]:
#tokenizer = AutoTokenizer.from_pretrained(f'{CFG.model_path[]}uspppm_1')

# Dataset

In [7]:
class InferDataset(Dataset):
    """Map-style dataset that yields tokenized (input, target) text pairs.

    Args:
        df: Frame with `input` and `target` columns; both are cast to `str`.
        tokenizer: Optional callable invoked as `tokenizer(input_text,
            target_text)` and returning a mapping. When omitted, the
            module-level name `tokenizer` is looked up each time an item is
            fetched (the original behaviour), so it only has to exist before
            the first item is read.
    """

    def __init__(self, df, tokenizer=None):
        self.inputs = df['input'].values.astype(str)
        self.targets = df['target'].values.astype(str)
        # Explicit dependency; None keeps the legacy global lookup.
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, item):
        # Prefer the injected tokenizer; otherwise fall back to the global one.
        tokenize = self.tokenizer if self.tokenizer is not None else tokenizer
        encoded = tokenize(self.inputs[item], self.targets[item])
        # Copy into a plain dict, as the original did.
        return {**encoded}

# Inference

In [8]:
def compute_metrics(eval_pred):
    """Pearson correlation between predictions and reference labels.

    Args:
        eval_pred: Pair `(predictions, labels)`. `predictions` is reshaped to
            a 1-D array of length `len(predictions)` before the comparison.

    Returns:
        A dict with the single key 'pearson'.
    """
    raw_scores, reference = eval_pred
    flat_scores = raw_scores.reshape(len(raw_scores))
    # corrcoef returns the 2x2 correlation matrix; the off-diagonal entry is r.
    correlation = np.corrcoef(flat_scores, reference)
    return {'pearson': correlation[0][1]}

In [9]:
# Run every fold checkpoint of every configured model over the evaluation rows.
# `predictions` collects one flat score array per (model, fold), in loop order;
# the fold arrays are averaged later, in the submission cell.
predictions = []

n_model = len(CFG.model_path)

for i in range(n_model):
    # Preprocessing depends only on the model path, so build the frame and the
    # dataset once per model instead of once per fold.
    test_df = preproc(CFG.model_path[i], CFG.self_testing)
    te_dataset = InferDataset(test_df)

    for fold in range(CFG.num_fold):
        checkpoint = f'{CFG.model_path[i]}uspppm_{fold}'
        model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)
        # InferDataset reads the module-level `tokenizer` when items are
        # fetched, so this assignment must happen before `trainer.predict`.
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        trainer = Trainer(
                model,
                tokenizer=tokenizer
            )

        outputs = trainer.predict(te_dataset)
        prediction = outputs.predictions.reshape(-1)
        predictions.append(prediction)


Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
***** Running Prediction *****
  Num examples = 36
  Batch size = 8


loading configuration file ../input/uspppm-bert-train-v3/uspppm_1/config.json
Model config BertConfig {
  "_name_or_path": "../input/uspppm-bert-train-v3/uspppm_1",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "regression",
  "torch_dtype": "float32",
  "transformers_version": "4.16.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 39859
}

loading weights file ../input/uspppm-bert-train-v3/uspppm_1/pytorch_model.bin
All model checkpoint weights were used when 

loading configuration file ../input/uspppm-bert-train-v3/uspppm_2/config.json
Model config BertConfig {
  "_name_or_path": "../input/uspppm-bert-train-v3/uspppm_2",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "regression",
  "torch_dtype": "float32",
  "transformers_version": "4.16.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 39859
}

loading weights file ../input/uspppm-bert-train-v3/uspppm_2/pytorch_model.bin
All model checkpoint weights were used when 

loading configuration file ../input/uspppm-bert-train-v3/uspppm_3/config.json
Model config BertConfig {
  "_name_or_path": "../input/uspppm-bert-train-v3/uspppm_3",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "regression",
  "torch_dtype": "float32",
  "transformers_version": "4.16.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 39859
}

loading weights file ../input/uspppm-bert-train-v3/uspppm_3/pytorch_model.bin
All model checkpoint weights were used when 

loading configuration file ../input/uspppm-bert-train-v3/uspppm_4/config.json
Model config BertConfig {
  "_name_or_path": "../input/uspppm-bert-train-v3/uspppm_4",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "regression",
  "torch_dtype": "float32",
  "transformers_version": "4.16.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 39859
}

loading weights file ../input/uspppm-bert-train-v3/uspppm_4/pytorch_model.bin
All model checkpoint weights were used when 

In [10]:
# TODO: assume the predictions follow a Gaussian distribution and calculate percentiles (not implemented yet)



In [11]:
# # # round down within a range
# # # #minmax scaler
# from sklearn.preprocessing import MinMaxScaler
# import bisect
# import collections

# predictions_mean = np.mean(predictions, axis=0)
# # print("predictions_mean:\n", predictions_mean)

# ##Apply Minmax scaler 
# temp1 = predictions_mean.reshape(-1,1)
# temp1 = MinMaxScaler().fit_transform(temp1)
# temp1 = temp1.flatten()
 
    
# n = len(predictions_mean)

# score = [0, 0.25, 0.5, 0.75, 1.00]
# predictions_seg4 = []
# w = 0.01

# for i in range(n):
#     temp = temp1[i]
#     ind = min(bisect.bisect(score,temp),4)
    
#     if ind == 0:
#         temp = 0.0
        
#     elif score[ind]-w<temp<score[ind]+w:
#         temp = score[ind]
#     elif score[ind-1]-w<temp<score[ind-1]+w:
#         temp = score[ind-1]
    
#     if temp>1.0: temp = 1.0
    
#     predictions_seg4.append(temp)
    
# print("predictions_seg4:\n", predictions_seg4)

In [12]:
# #apply majority vote
# n, m = len(predictions), len(predictions[0])

# import bisect
# import collections
# score = [0, 0.25, 0.5, 0.75, 1.00]
# temp1 = [[0]*m for _ in range(n)]

# for j in range(n):
#     for i in range(m):
#         ind = min(bisect.bisect(score,predictions[j][i]),4)
#         if ind == 0:
#             temp1[j][i] = score[ind]
#         else:
#             d = abs(predictions[j][i]-score[ind]) - abs(predictions[j][i]-score[ind-1])
#             temp1[j][i] = score[ind] if d<0 else score[ind-1]  

# print("predictions\n",predictions)
# print("temp1\n",temp1)

# temp2 = []
# for i in range(m):
#     temp2.append([p[i] for p in temp1])
    
# print("temp2\n",temp2)

# predictions_seg0 = []
# for i in range(m):
#     temp = collections.Counter(temp2[i])
#     temp = [[temp[i],i] for i in temp]
#     ind = temp.index(max(temp))
#     predictions_seg0.append(temp[ind][1])

    
# print("predictions_seg0\n", predictions_seg0, len(predictions_seg0))


In [13]:
#segment score to 0, 0.25, 0.5,0.75, 1.00
# import bisect
# score = [0, 0.25, 0.75, 1.00]
# predictions_seg1 = [0]*len(predictions)

# print(predictions)

# for i in range(len(predictions)):
#     ind = min(bisect.bisect(score,predictions[i]),3)
#     if ind == 0:
#         predictions_seg1[i] = score[ind]
#     else:
#         d = abs(predictions[i]-score[ind]) - abs(predictions[i]-score[ind-1])
#         predictions_seg1[i] = score[ind] if d<0 else score[ind-1]  

# print("predictions_seg1\n",predictions_seg1)
# #clip negative values to 0 and values larger than 1 to 1
# predictions_seg2 = [0]*len(predictions)

# for i in range(len(predictions)):
#     if predictions[i] < 0:
#         predictions_seg2[i] = 0
#     elif predictions[i] > 1:
#         predictions_seg2[i] = 1
#     else:
#         predictions_seg2[i] = predictions[i]
        
# print("predictions_seg2\n",predictions_seg2)

# #minmax scaler
# from sklearn.preprocessing import MinMaxScaler

# predictions = np.mean(predictions, axis=0)
# predictions_seg3 = predictions.reshape(-1,1)
# predictions_seg3 = MinMaxScaler().fit_transform(predictions_seg3)
# predictions_seg3 = predictions_seg3.flatten()
# print("predictions_seg3\n",predictions_seg3)

# Self testing

* Individual

In [14]:
#self testing
if CFG.self_testing:
    # Score each checkpoint and each per-model fold average against the sampled
    # labels, then dump the raw predictions for offline analysis.
    m = len(predictions)
    labels = test_df["score"].reset_index(drop = True)

    #individual------------------------------------------------------------
    pearson = []
    print("Individual model:\n")
    for mo in range(m):
        # Compute once and reuse for both the list and the printout.
        metrics = compute_metrics((predictions[mo], labels))
        pearson.append(metrics['pearson'])
        print(mo, metrics)

    # Individual model
    #average within model--------------------------------------------------
    print("\n Average within models:\n")
    # `predictions` holds CFG.num_fold consecutive entries per model, so group
    # by that width rather than a hard-coded 5. A separate name is used so the
    # loaded network in `model` is not overwritten.
    fold_groups = [list(range(start, start + CFG.num_fold))
                   for start in range(0, m, CFG.num_fold)]
    predictions_ave = []
    for mo in fold_groups:
        p = predictions[mo[0]:mo[-1]+1]
        prediction = np.mean(p, axis = 0)
        metrics = compute_metrics((prediction, labels))
        print(mo, metrics)
        predictions_ave.append(prediction)

    # Average several models

    def BackTrac(models, model, m):
        """Add to `models` every non-empty frozenset extending `model` with indices from range(m).

        Only referenced by the commented-out experiment below.
        """
        if len(model) > m:
            return
        if model and not model in models:
            models.add(model)

        for i in range(m):
            if not i in model: BackTrac(models, model.union(frozenset([i])), m)

#     print("\n Average several models:\n")
#     models = set()
#     BackTrac(models,frozenset(),m)
#     models = [list(mo) for mo in models]
#     models.sort(key = lambda x: len(x))
#     #average several models------------------------------------------------
#     for mo in models:
#         prediction = [predictions_ave[m] for m in mo] 
#         prediction = np.mean(prediction, axis=0)
#         eval_pred = prediction, labels
#         print(mo, compute_metrics(eval_pred))

#     # Weighted by Pearson correlation
#     #Weighted average by pearson correlation--------------------------------
#     print("\n Weighted average by pearson correlation:\n")
#     w = np.exp(pearson)/np.sum(np.exp(pearson))

#     prediction = []

#     for i in range(len(predictions[0])):
#         temp = 0
#         for j in range(len(predictions)):
#             temp += predictions[j][i]*w[j]
#         prediction.append(temp)

#     eval_pred = np.array(prediction), labels
#     print(compute_metrics(eval_pred))

    # Save predictions for self testing--------------------------------------
    # One column per checkpoint (named by its position) plus the labels.
    pre = {f"{i}": fold_scores for i, fold_scores in enumerate(predictions)}
    pre["label"] = np.array(labels)
    pre2 = datasets.Dataset.from_dict(pre)

    pre2.to_csv('predictions.csv', index=False)

# Save to submission

In [15]:
# Ensemble: element-wise mean over all per-checkpoint score arrays.
prediction = np.mean(predictions, axis=0)

# Pair each averaged score with the row id of the evaluation frame.
submission_columns = {
    'id': test_df['id'],
    'score': prediction,
}
submission = datasets.Dataset.from_dict(submission_columns)

#submission[['score']] = MinMaxScaler().fit_transform(submission[['score']])
submission.to_csv('submission.csv', index=False)
submission

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['id', 'score'],
    num_rows: 36
})