In [None]:
import pandas as pd
import numpy as np
import random
import gc
import time
import os
import re
import torch
import torch
from torch.utils.data.dataset import Dataset
import torch.nn as nn
from transformers import AutoConfig, AutoModel, AdamW, AutoTokenizer
import sys
import scipy
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
import os
from torch.utils.data import DataLoader
import warnings
import scipy.stats

class CFG:
    """Shared run configuration, read as plain class attributes."""

    # ---- locations ----
    data_dir = '/home/xuming/workspace/us-patent-phrase-to-phrase-matching'  # competition data root
    result_dir = '/home/xuming/workspace/pppm'  # root used by get_model_path

    # ---- runtime ----
    seed = 42  # random seed passed to seed_everything
    k_folds = 5  # number of folds
    n_jobs = 5  # worker count (passed to DataLoader as num_workers)
    device = torch.cuda.is_available()  # True when CUDA is usable
    print_freq = 100  # print frequency

    # ---- model / optimisation ----
    # model name options: electra-large / deberta-v3-large / funnel-large / bert-for-patents
    model_name = 'bert-for-patents'
    base_epoch = 5  # epochs
    batch_size = 32  # batch size
    lr = 1e-5  # learning rate
    seq_length = 200  # sequence length
    max_grad_norm = 1  # gradient clipping
    

def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch random generators with ``seed``.

    Also exports ``PYTHONHASHSEED`` and, when CUDA is available, seeds the
    CUDA generators and switches cuDNN to deterministic, non-benchmark mode.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed every random generator once, at module load, with the configured seed.
seed_everything(CFG.seed)


class KFold(object):
    """
    Fold assignment per group: every row sharing a ``group_col`` value
    receives the same fold id.
    """
    def __init__(self, k_folds=10, flag_name='fold_flag'):
        self.k_folds = k_folds  # number of folds
        self.flag_name = flag_name  # name of the column that receives the fold id

    def group_split(self, train_df, group_col):
        """Return ``train_df`` left-merged with a per-group fold-id column.

        Fold ids ``0..k_folds-1`` are dealt round-robin over the sorted
        unique group values and then shuffled with ``np.random``.
        """
        unique_groups = sorted(set(train_df[group_col]))
        fold_ids = [position % self.k_folds for position, _ in enumerate(unique_groups)]
        np.random.shuffle(fold_ids)
        lookup = pd.DataFrame({group_col: unique_groups, self.flag_name: fold_ids})
        return train_df.merge(lookup, how='left', on=group_col)

def get_data():
    """Load train and test CSVs and attach fold ids and model input text.

    The train frame gets a group split on ``anchor``; the test frame gets a
    zero ``score`` and a ``fold_flag`` of -1. Both are passed through
    ``get_text`` with the same CPC title mapping.

    Returns:
        (train_df, test_df)
    """
    raw_train = pd.read_csv(CFG.data_dir + '/train.csv')
    folded_train = KFold(CFG.k_folds).group_split(raw_train, group_col='anchor')
    cpc_titles = get_cpc_texts()
    train_df = get_text(folded_train, cpc_titles)

    test_df = pd.read_csv(CFG.data_dir + '/test.csv')
    test_df['score'] = 0  # placeholder score for the test rows
    test_df['fold_flag'] = -1  # test rows belong to no fold
    test_df = get_text(test_df, cpc_titles)

    print(train_df.shape, test_df.shape)
    return train_df, test_df

def get_text(df, titles):
    """Build the ``text`` column fed to the tokenizer.

    ``anchor``, ``target`` and the context title are lower-cased (the title
    additionally loses ';' and surrounding whitespace). ``gp_targets`` lists
    the other targets that share the row's (anchor, context) pair. The final
    text is ``anchor[SEP]target[SEP]title[SEP]gp_targets``.

    Note: the lower-cased ``anchor``/``target`` and the ``title`` column are
    written onto the frame that was passed in; the returned frame is the
    joined copy carrying ``gp_targets`` and ``text``.
    """
    def _tidy_title(raw):
        return raw.lower().replace(';', '').replace('  ', ' ').strip()

    def _other_targets(row):
        return ', '.join([t for t in row['gp_targets'] if t != row['target']])

    for column in ('anchor', 'target'):
        df[column] = df[column].apply(lambda value: value.lower())

    # map the CPC context code to its title, then normalise it
    df['title'] = df['context'].map(titles)
    df['title'] = df['title'].apply(_tidy_title)

    # every target seen for the same (anchor, context) pair
    group_keys = ['anchor', 'context']
    grouped_targets = df.groupby(group_keys)['target'].agg(list).rename('gp_targets')
    df = df.join(grouped_targets, on=group_keys)
    df['gp_targets'] = df.apply(_other_targets, axis=1)

    combined = df['anchor']
    for part in ('target', 'title', 'gp_targets'):
        combined = combined + '[SEP]' + df[part]
    df['text'] = combined
    return df

def _cpc_title(code, text):
    """Return the title that follows ``code<TAB><TAB>`` at its first occurrence in ``text``."""
    match = re.search(re.escape(code) + '\t\t(.+)', text)
    if match is None:
        # Same exception type as the former bare ``result[0]`` lookup, but
        # with a message that names the missing code.
        raise IndexError(f'no title found for CPC code {code!r}')
    return match.group(1)


def get_cpc_texts():
    '''
    Build a mapping from CPC class code to descriptive text.

    Class codes (a capital letter followed by digits, e.g. 'A01') are
    collected from the file names in the CPCSchemeXML202105 directory. For
    each section letter the matching title list file is read, and each code
    is mapped to "<section title>. <class title>", e.g.
    {'A01': 'HUMAN NECESSITIES. AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTING; TRAPPING; FISHING'}.

    Raises:
        IndexError: if a section or class code has no title line in the
            title list file.
    '''
    # Raw string: '\d' in a plain string literal is an invalid escape sequence.
    code_pattern = re.compile(r'[A-Z]\d+')
    # all unique cpc codes, like ['A01', 'A21', 'A22', 'A23', 'A24', 'A41', ...]
    contexts = sorted({
        code
        for file_name in os.listdir(f'{CFG.data_dir}/cpc-data/CPCSchemeXML202105')
        for code in code_pattern.findall(file_name)
    })

    results = {}
    for cpc in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'Y']:
        title_file = f'{CFG.data_dir}/cpc-data/CPCTitleList202202/cpc-section-{cpc}_20220201.txt'
        # Explicit encoding so the result does not depend on the platform locale.
        with open(title_file, encoding='utf-8') as f:
            s = f.read()

        # section line looks like "A<TAB><TAB>HUMAN NECESSITIES"
        section_title = _cpc_title(cpc, s)

        for context in contexts:
            if context[0] != cpc:
                continue
            # class line looks like 'A01\t\tAGRICULTURE; FORESTRY; ...'
            results[context] = section_title + ". " + _cpc_title(context, s)
    return results

# Dataset

In [None]:
class PatentDatasetV2(Dataset):
    """Dataset that tokenizes one text per row, padded to a per-row length.

    Args:
        meta_data: frame holding the text column and a ``batch_max_length``
            column. It is copied and re-indexed, so the caller's frame is
            not modified.
        tokenizer: object exposing ``sep_token`` and ``encode_plus``.
        extract_col: name of the column that holds the text to encode.

    Each item is ``(input_ids, attention_mask, target)``; ``target`` is a
    constant float32 ``1`` placeholder, since no score is read here.
    """
    def __init__(self, meta_data: pd.DataFrame, tokenizer, extract_col='text'):
        self.meta_data = meta_data.copy()
        self.meta_data.reset_index(drop=True, inplace=True)
        if tokenizer.sep_token != '[SEP]':
            # Swap the literal '[SEP]' marker for this tokenizer's own
            # separator. This is applied to the column that is actually
            # encoded (extract_col), not to a hard-coded 'text' column.
            self.meta_data[extract_col] = self.meta_data[extract_col].apply(
                lambda x: x.replace('[SEP]', tokenizer.sep_token))

        self.text = self.meta_data[extract_col].values  # texts to encode
        self.batch_max_length = self.meta_data['batch_max_length'].values  # per-row padding length
        self.tokenizer = tokenizer

    def __getitem__(self, index):
        seq = self.text[index]
        target = 1  # placeholder target
        batch_max_len = self.batch_max_length[index]

        encoded = self.tokenizer.encode_plus(
            text=seq,
            add_special_tokens=True,
            max_length=min(batch_max_len, 512),  # pad/truncate length, capped at 512
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        # return_tensors='pt' adds a leading batch axis; take the single row
        input_ids = encoded['input_ids'][0]
        attention_mask = encoded['attention_mask'][0]

        # Shapes recorded by the original author for a collated batch
        # (batch_size=32, max_length=200):
        #   input_ids:      torch.Size([32, 200]), padding is 0
        #   attention_mask: torch.Size([32, 200]), 0 on padding and 1 elsewhere
        #   target:         torch.Size([32])

        return input_ids, attention_mask, np.array(target, dtype=np.float32)

    def __len__(self):
        return len(self.meta_data)

# Models

In [None]:
class PatentModel(nn.Module):
    """Transformer encoder with masked mean pooling and a sigmoid head.

    Args:
        name: model name or path passed to ``AutoConfig`` / ``AutoModel``.
        num_classes: output width of the final linear layer.
        pretrained: when True load encoder weights with ``from_pretrained``,
            otherwise build the encoder from the config only.
    """
    def __init__(self, name, num_classes=1, pretrained=True):
        super().__init__()
        self.config = AutoConfig.from_pretrained(name)
        # NOTE(review): these two values are stored on the module, not on
        # self.config, so they do not reach the encoder. If the intent was
        # to disable dropout they would need to be set on the config —
        # left unchanged here; confirm against the training code.
        self.attention_probs_dropout_prob = 0.
        self.hidden_dropout_prob = 0.
        if pretrained:
            self.encoder = AutoModel.from_pretrained(name, config=self.config)
        else:
            self.encoder = AutoModel.from_config(self.config)
        in_dim = self.encoder.config.hidden_size
        self.last_fc = nn.Linear(in_dim, num_classes)
        torch.nn.init.normal_(self.last_fc.weight, std=0.02)
        self.sig = nn.Sigmoid()

    def forward(self, seq, seq_mask):
        """Return sigmoid scores for a batch.

        Args:
            seq: token ids, (batch, seq_len).
            seq_mask: attention mask with the same shape, 1 on real tokens.

        Returns:
            Tensor of shape (batch,) when ``num_classes`` is 1, otherwise
            (batch, num_classes).
        """
        x = self.encoder(seq, attention_mask=seq_mask)["last_hidden_state"]  # (batch, seq_len, hidden)
        # mean over the unmasked positions only
        x = torch.sum(x * seq_mask.unsqueeze(-1), dim=1) / torch.sum(seq_mask, dim=1).unsqueeze(-1)  # (batch, hidden)
        out = self.last_fc(x)  # (batch, num_classes)
        out = self.sig(out)
        # Drop only the trailing class axis. A bare squeeze() would also drop
        # the batch axis for a single-sample batch and yield a 0-d tensor.
        out = out.squeeze(-1)
        return out

# Utils

In [None]:
def get_sorted_test_df(df, tokenizer, batch_size):
    """Sort rows by descending token count and record each batch's max length.

    Args:
        df: frame with a ``text`` column. An ``input_lengths`` column is
            added to it in place.
        tokenizer: callable returning a mapping with ``input_ids`` for a text.
        batch_size: number of consecutive sorted rows that form one batch.

    Returns:
        (sort_df, length_sorted_idx): the sorted copy with a
        ``batch_max_length`` column, and the positional indices used to sort
        (``np.argsort`` of these indices restores the original order).
    """
    input_lengths = [
        len(tokenizer(text, add_special_tokens=True)['input_ids'])
        for text in df['text'].fillna("").values
    ]
    df['input_lengths'] = input_lengths
    length_sorted_idx = np.argsort([-l for l in input_lengths])

    # Explicit copy: the new column is written to an independent frame and
    # not to a slice of ``df``.
    sort_df = df.iloc[length_sorted_idx].copy()

    sorted_input_length = sort_df['input_lengths'].values
    batch_max_length = np.zeros_like(sorted_input_length)
    # Step through real batch starts only, so no empty trailing slice is
    # produced when the row count is a multiple of batch_size (or zero).
    for start in range(0, len(sorted_input_length), batch_size):
        stop = start + batch_size
        batch_max_length[start:stop] = np.max(sorted_input_length[start:stop])
    sort_df['batch_max_length'] = batch_max_length
    return sort_df, length_sorted_idx

def get_model_path(model_name):
    '''
    Map a supported model name to its directory under ``CFG.result_dir``.

    Raises ValueError (carrying the name) for an unsupported model name.
    '''
    root = CFG.result_dir

    if model_name in ('electra-base', 'electra-large'):
        size = model_name.split('-')[1]  # 'base' or 'large'
        return root + '/electra/' + size + '-discriminator'
    if model_name == 'deberta-v3-large':
        return root + '/deberta-v3-large/'
    if model_name == 'funnel-large':
        return root + '/funnel-large/'
    if model_name == 'bert-for-patents':
        return root + '/bert-for-patents/'

    raise ValueError(model_name)

# Inference

In [None]:
import numpy as np
import pandas as pd
import os
import re
import sys
import gc
import time
from transformers import BertTokenizer, RobertaTokenizerFast, AutoTokenizer
import torch
from torch.utils.data import DataLoader

# Inference-time overrides of the shared configuration.
CFG.batch_size = 32 # batch size
CFG.n_jobs = 4 # worker count (passed to DataLoader as num_workers)
CFG.seq_length = 512 # sequence length
os.environ["TOKENIZERS_PARALLELISM"] = "false" # set the TOKENIZERS_PARALLELISM environment variable to "false"

def predict(model, data_loader):
    """Run ``model`` over ``data_loader`` and return all outputs as one array.

    Args:
        model: module called as ``model(seq, seq_mask)``.
        data_loader: iterable of ``(seq, seq_mask, target)`` tensor batches;
            the target is ignored.

    Returns:
        1-D (or higher) numpy array with the per-batch outputs concatenated
        along the first axis; an empty float32 array if there are no batches.
    """
    model.eval()  # switch to evaluate mode

    # Move batches to wherever the model lives, so a CPU model works too.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    y_pred = []
    with torch.no_grad():
        for batch_data in data_loader:
            seq, seq_mask, _ = (t.to(device) for t in batch_data)
            outputs = model(seq, seq_mask).detach().cpu().numpy()
            # A single-sample batch may come back 0-d; np.concatenate
            # rejects 0-d arrays, so promote to at least 1-D.
            y_pred.append(np.atleast_1d(outputs))

    if not y_pred:
        return np.empty(0, dtype=np.float32)
    return np.concatenate(y_pred)

def get_preds(my_df, my_loader, my_model, model_path, model_name=''):
    """Load a checkpoint into ``my_model`` and predict over ``my_loader``.

    Args:
        my_df: unused; kept for call compatibility.
        my_loader: batches passed on to ``predict``.
        my_model: model whose weights are replaced by the checkpoint's
            ``'state_dict'`` entry; it is then moved to CUDA.
        model_path: path of the checkpoint file read with ``torch.load``.
        model_name: unused; kept for call compatibility.

    Returns:
        The array returned by ``predict``.
    """
    # Load to CPU first: without map_location the tensors are restored onto
    # the device they were saved from, which can fail or waste GPU memory.
    checkpoint = torch.load(model_path, map_location='cpu')
    my_model.load_state_dict(checkpoint['state_dict'])
    my_model = my_model.cuda()
    with torch.no_grad():
        y_pred = predict(my_model, my_loader)
    return y_pred

def predict_model_folds(model_name, test_df, seed=42):
    """Average the test predictions of every fold checkpoint of one model.

    The test rows are sorted by token length for batching, each fold's
    checkpoint is loaded and run, the predictions are put back into the
    original row order, and the per-fold results are averaged.

    Args:
        model_name: a name accepted by ``get_model_path``.
        test_df: test frame with a ``text`` column; a copy is sorted, the
            frame itself is not modified here.
        seed: seed value embedded in the checkpoint file names.

    Returns:
        Array of mean predictions aligned with ``test_df``'s row order.
    """
    model_dir = get_model_path(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)

    sort_df, length_sorted_idx = get_sorted_test_df(test_df.copy(), tokenizer, batch_size=CFG.batch_size)
    test_dataset = PatentDatasetV2(sort_df, tokenizer)
    test_loader = DataLoader(test_dataset, shuffle=False, batch_size=CFG.batch_size,
                             num_workers=CFG.n_jobs, drop_last=False, pin_memory=True)
    # inverse permutation: maps length-sorted predictions back to input order
    restore_order = np.argsort(length_sorted_idx)

    fold_preds = []
    for fold in range(CFG.k_folds):
        model = PatentModel(model_dir, pretrained=False)
        model_path = '{}/models/{}_fold{}_seed{}.pth.tar'.format(CFG.result_dir, model_name, fold, seed)
        print(model_path)
        y_preds = get_preds(test_df, test_loader, model, model_path)
        fold_preds.append(y_preds[restore_order])
        # release the model before the next checkpoint is loaded
        del model
        gc.collect()
        torch.cuda.empty_cache()
    return np.mean(fold_preds, axis=0)


train_df, test_df = get_data()  # load the train and test sets

# One weight per model, in the same order as model_names.
model_names = ['bert-for-patents', 'deberta-v3-large', 'electra-large', 'funnel-large']
ensemble_weight = [0.2, 0.6, 0.1, 0.1]
if len(model_names) != len(ensemble_weight):
    raise ValueError('model_names and ensemble_weight must have the same length')

print('>> predicting...\n')
start = time.time()
res = [predict_model_folds(name, test_df) for name in model_names]
# keep the per-model raw means available under their former names
res1, res2, res3, res4 = res

# ensemble: standardise each model's predictions, then take the weighted sum
res = [(r - r.mean()) / r.std() for r in res]
test_df['score'] = np.sum([r * w for r, w in zip(res, ensemble_weight)], axis=0)
test_df['score'] = (test_df['score'] - test_df['score'].mean()) / test_df['score'].std()
print('>> predicted in {:.1f}s'.format(time.time() - start))

# get submission
print(test_df.shape)
test_df[['id', 'score']].to_csv("submission.csv", index=False)