In [1]:
#这些是你没有import的包
#我的函数为FrameToSet、FrameToSet_test、train_the_deberta、test_the_deberta
import gc
import pyarrow as pa
from transformers import AutoTokenizer, DataCollatorWithPadding, DataCollatorForLanguageModeling
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_metric
import datasets
import transformers
#下面这3段与你一样

In [2]:
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, models, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
import torch
from torch import nn
from torch.utils.data import DataLoader

import math
import random
from tqdm import tqdm
from scipy.stats import pearsonr

import numpy as np
import pandas as pd
import os

In [3]:
for dirname, _, filenames in os.walk('./kaggle/input/us-patent-phrase-to-phrase-matching'):
    for filename in filenames:
        if filename == 'train.csv':
            raw_data = pd.read_csv(os.path.join(dirname, filename))
        elif filename == 'test.csv':
            test_data = pd.read_csv(os.path.join(dirname, filename))
        elif filename == 'sample_submission.csv':
            sample_submission = pd.read_csv(os.path.join(dirname, filename))

In [4]:
def split_train_eval(data, train_per=0.8, train_num=None):
    if train_num is None:
        train_num = int(len(data) * train_per)
        
    raw_data_index = list(range(len(data)))
    random.shuffle(raw_data_index)
    train_data = raw_data.loc[raw_data_index[:train_num]]
    train_data = train_data.sort_index(ascending=True)
    eval_data = raw_data.loc[raw_data_index[train_num:]]
    eval_data = eval_data.sort_index(ascending=True)
    return train_data, eval_data

train_data, eval_data = split_train_eval(raw_data, train_num=len(raw_data) - 37 * 4)

In [5]:
def FrameToSet(data):#将原dataframe转化为模型需要的dataframe
    output_data = pd.DataFrame(columns=['sentence1', 'sentence2', 'context1', 'context2', 'label', 'idx'])
    output_data['sentence1'] = list(data['anchor'])
    output_data['sentence2'] = list(data['target'])
    output_data['label'] = list(data['score'])
    output_data['idx'] = list(np.arange(0,len(output_data['label'])))
    contexts = []
    for x in list(data['context']):
        contexts.append([ord(x[0])-65,10*int(x[1])+int(x[2])])
    output_data['context1'] = list(np.array(contexts)[:,0])
    output_data['context2'] = list(np.array(contexts)[:,1])
    return output_data

In [6]:
def FrameToSet_test(data):#与上函数的不同在于考虑测试集没有score，填充全0列
    output_data = pd.DataFrame(columns=['sentence1', 'sentence2', 'context1', 'context2', 'label', 'idx'])
    output_data['sentence1'] = list(data['anchor'])
    output_data['sentence2'] = list(data['target'])
    output_data['label'] = list([0.0]*len(output_data['sentence2']))
    output_data['idx'] = list(np.arange(0,len(output_data['label'])))
    contexts = []
    for x in list(data['context']):
        contexts.append([ord(x[0])-65,10*int(x[1])+int(x[2])])
    output_data['context1'] = list(np.array(contexts)[:,0])
    output_data['context2'] = list(np.array(contexts)[:,1])
    return output_data

In [7]:
def train_the_deberta(train_data, eval_data, batch_size = 8,
                      num_epoch = 3.0, model_checkpoint = "microsoft/deberta-v3-small",
                      learning_rate = 5e-5, weight_decay = 0.01):
    #DeBERTa模型
    #train_data、eval_data需要最初的dataframe类型
    #batch_size 这个不解释了，int型
    #num_epoch 要求为float类型（没错，它默认是3.0）
    #weight_decay 权值衰减,learning_rate 学习率,
    #这些都是args = TrainingArguments()中的参数，想要调一些细节还可以help(TrainingArguments)查看更多参数
    #返回值为模型(model)
    
    thetrain = datasets.arrow_dataset.Dataset(pa.Table.from_pandas(FrameToSet(train_data)))
    theeval = datasets.arrow_dataset.Dataset(pa.Table.from_pandas(FrameToSet(eval_data)))
    thedict = {"train":thetrain,"validation":theeval}
    dataset = datasets.dataset_dict.DatasetDict(thedict)
    #如果提交代码，运行下面这段(2/3)
   
    gc.collect()
    torch.cuda.empty_cache()
    task = "stsb"
    #List of glue keys
    task_to_keys = {
        "stsb": ("sentence1", "sentence2"),
    }
    
    #Collect sentence keys and labels
    sentence1_key, sentence2_key = task_to_keys[task]
    
    # Number of logits to output
    num_labels = 1
    
    ###  Tokenizing Section  ####
    
    # Create tokenizer for respective model
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True, truncation=True, model_max_length=512)
    
    def tokenizer_func(examples):
        if sentence2_key is None:
            return tokenizer(examples[sentence1_key], truncation=True,)
        return tokenizer(examples[sentence1_key], examples[sentence2_key], truncation=True,)
    
    # tokenize sentence(s)
    encoded_dataset = dataset.map(tokenizer_func, batched=True)
    
    ###  Model Section  ####
    
    # Create model and attach ForSequenceClassification head
    model_deberta = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
    #模型定义在上面这行
    # Type of metric for given task
    metric_name = "pearson"
    
    args = TrainingArguments(
        f"{model_checkpoint}-finetuned-Testing-{task}",
        evaluation_strategy = "epoch",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=weight_decay,
        metric_for_best_model=metric_name,
        eval_accumulation_steps=5,
        learning_rate=learning_rate,
        num_train_epochs=num_epoch
    )
    
    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.squeeze(predictions)
        labels = np.squeeze(labels)
        if len(labels) <= 1:
            return {"peason":1.0}
        #长度低于2，pearsonr会报错
        pea = pearsonr(predictions, labels)[0]
        return {"peason":pea}
    
    validation_key = "validation_mismatched" if task == "mnli-mm" else "validation_matched" if task == "mnli" else "validation"
    trainer = Trainer(
        model_deberta,
        args,
        train_dataset=encoded_dataset["train"],
        eval_dataset=encoded_dataset[validation_key],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    trainer.train()
    trainer.save_model()
    gc.collect()
    torch.cuda.empty_cache()
    return model_deberta

In [8]:
def test_the_deberta(test_data, model_deberta, batch_size = 8,
                      num_epoch = 3.0, model_checkpoint = "microsoft/deberta-v3-small",weight_decay = 0.01):
    #test_data需要最初的dataframe类型
    #model_checkpoint与训练时一致
    #batch_size、num_epoch、weiight_decay不用管，预测不会用到的参数
    #返回值为预测值，类型为np.array，维度为1

    thetest = datasets.arrow_dataset.Dataset(pa.Table.from_pandas(FrameToSet_test(thetest_)))
    thedict = {"test":thetest}
    dataset = datasets.dataset_dict.DatasetDict(thedict)

    gc.collect()
    torch.cuda.empty_cache()

    task = "stsb"
    #List of glue keys
    task_to_keys = {
        "stsb": ("sentence1", "sentence2"),
    }
    #Collect sentence keys and labels
    sentence1_key, sentence2_key = task_to_keys[task]
    
    # Number of logits to output
    num_labels = 1
    
    ###  Tokenizing Section  ####
    
    #Load model    
    # Create tokenizer for respective model
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True, truncation=True, model_max_length=512)
    
    def tokenizer_func(examples):
        if sentence2_key is None:
            return tokenizer(examples[sentence1_key], truncation=True,)
        return tokenizer(examples[sentence1_key], examples[sentence2_key], truncation=True,)
    
    # tokenize sentence(s)
    encoded_dataset = dataset.map(tokenizer_func, batched=True)
    
    ###  Model Section  ####
    metric_name = "pearson"
    
    args = TrainingArguments(
        f"{model_checkpoint}-finetuned-Testing-{task}",
        evaluation_strategy = "epoch",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=0.01,
        metric_for_best_model=metric_name,
        eval_accumulation_steps=5,
        num_train_epochs=num_epoch
    )
    
    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.squeeze(predictions)
        labels = np.squeeze(labels)
        if len(labels) <= 1:
            return {"peason":1.0}
        #长度低于2，pearsonr会报错
        pea = pearsonr(predictions, labels)[0]
        return {"peason":pea}
    
    validation_key = "validation_mismatched" if task == "mnli-mm" else "validation_matched" if task == "mnli" else "validation"
    trainer = Trainer(
        model_deberta,
        args,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    
    ### Predictions  ###
    
    prediction_one = trainer.predict(encoded_dataset["test"])
    #del trainer
    gc.collect()
    torch.cuda.empty_cache()
    return np.squeeze(prediction_one.predictions)

In [9]:
raw_data1, kkkkk = split_train_eval(raw_data, train_per = 0.1)#小样本测试，只取百分之十数据
the_train, thetest_ = split_train_eval(raw_data1)
thetrain_, theeval_ = split_train_eval(the_train)
'''
the_train, thetest_ = split_train_eval(raw_data)#全部样本
thetrain_, theeval_ = split_train_eval(the_train)
'''

'\nthe_train, thetest_ = split_train_eval(raw_data)#全部样本\nthetrain_, theeval_ = split_train_eval(the_train)\n'

In [10]:
###设置###
model_checkpoint = "microsoft/deberta-v3-small"
#有条件用下面这行
#需要下载
#model_checkpoint = "microsoft/deberta-v3-large"
#model_checkpoint = "google/electra-large-discriminator"
#model_checkpoint = "xlnet-large-cased"
batch_size = 8#有条件大一些
num_epoch = 3.0#就是float型，默认为3.0

In [11]:
#模型使用示例
model = train_the_deberta(thetrain_, theeval_, batch_size = batch_size, num_epoch = num_epoch, model_checkpoint = model_checkpoint)
thepredicts = test_the_deberta(thetest_, model, model_checkpoint = model_checkpoint)
pearsonr(thepredicts, np.array(thetest_['score']))[0]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from 

Epoch,Training Loss,Validation Loss,Peason
1,No log,0.038014,0.720784
2,0.044100,0.030662,0.785953
3,0.044100,0.027746,0.801874


The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: context2, idx, context1, sentence1, sentence2. If context2, idx, context1, sentence1, sentence2 are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 584
  Batch size = 8
Saving model checkpoint to microsoft/deberta-v3-small-finetuned-Testing-stsb\checkpoint-500
Configuration saved in microsoft/deberta-v3-small-finetuned-Testing-stsb\checkpoint-500\config.json
Model weights saved in microsoft/deberta-v3-small-finetuned-Testing-stsb\checkpoint-500\pytorch_model.bin
tokenizer config file saved in microsoft/deberta-v3-small-finetuned-Testing-stsb\checkpoint-500\tokenizer_config.json
Special tokens file saved in microsoft/deberta-v3-small-finetuned-Testing-stsb\checkpoint-500\special_tokens_map.json
The following columns in the evaluation set don

  0%|          | 0/1 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: context2, idx, context1, sentence1, sentence2. If context2, idx, context1, sentence1, sentence2 are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 730
  Batch size = 8




0.8426112532787519

In [12]:
'''
#DeBERTa模型       按ABCD分类

ftrain = FrameToSet(thetrain_)
ftest = FrameToSet(thetest_)
feval = FrameToSet(theeval_)
#如果提交代码，运行下面这段(3/3)
'''
'''
ftrain = FrameToSet(thetrain_)
ftest = FrameToSet_test(thetest_)#这行不一样
feval = FrameToSet(theeval_)


pre = []

metric = load_metric('glue', "stsb")#这行需要下载东西##########################################################################
#Load model
#model_checkpoint = "microsoft/deberta-v3-small"
# Create tokenizer for respective model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True, truncation=True, model_max_length=512)
    
def tokenizer_func(examples):
    if sentence2_key is None:
        return tokenizer(examples[sentence1_key], truncation=True,)
    return tokenizer(examples[sentence1_key], examples[sentence2_key], truncation=True,)


gc.collect()
torch.cuda.empty_cache()
'''
#这里注意一下你分类的时候要处理下，训练集只有A~H，但测试集可能有别的
#for i in range(9):
#    if i == 8:#bigger context not in A~H
#        all_id = ftest.loc[ftest["context1"]>7]['idx']
#        if len(all_id) > 0:
#            p = []
#            for idx in all_id:
#                p.append(prediction_one.predictions[idx])
#            pre.append(np.vstack((np.array(p), np.array(ftest.loc[ftest["context1"]>7]['idx']))))
#        break
'''
    thetrain = datasets.arrow_dataset.Dataset(pa.Table.from_pandas(ftrain.loc[ftrain["context1"]==i]))
    thetest = datasets.arrow_dataset.Dataset(pa.Table.from_pandas(ftest.loc[ftest["context1"]==i]))
    theeval = datasets.arrow_dataset.Dataset(pa.Table.from_pandas(feval.loc[feval["context1"]==i]))
    thedict = {"train":thetrain,"validation":theeval,"test":thetest}
    dataset = datasets.dataset_dict.DatasetDict(thedict)
    task = "stsb"
    #List of glue keys
    task_to_keys = {
        "stsb": ("sentence1", "sentence2"),
    }
    #Select task
    #task = "rte"  #cola, mrpc
    #batch_size = 8 #10 normally, 8 for qnli
    
    # Load dataset based on task variable
    #dataset = load_dataset("glue", actual_task)
    
    #Collect sentence keys and labels
    sentence1_key, sentence2_key = task_to_keys[task]
    
    # Number of logits to output
    num_labels = 1
    
    ###  Tokenizing Section  ####
    # tokenize sentence(s)
    encoded_dataset = dataset.map(tokenizer_func, batched=True)
    
    
    #model_checkpoint = "deberta-v3-small_baseline_cola/"
    #model_checkpoint = "deberta-v3-small_baseline_"+actual_task+"/"
    
    ###  Model Section  ####
    
    # Create model and attach ForSequenceClassification head
    model_deberta = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
    ########################模型定义在上面这行
    
    # Type of metric for given task
    metric_name = "pearson"
    
    args = TrainingArguments(
        f"{model_checkpoint}-finetuned-Testing-{task}",
        evaluation_strategy = "epoch",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=0.01,
        metric_for_best_model=metric_name,
        eval_accumulation_steps=5,
        num_train_epochs=num_epoch
    )
    
    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = predictions[:]#, 0]
        pea = pearsonr(predictions, labels)[0]
        return {"peason":pea}#metric.compute(predictions=predictions, references=labels)
    
    validation_key = "validation"
    trainer = Trainer(
        model_deberta,
        args,
        train_dataset=encoded_dataset["train"],
        eval_dataset=encoded_dataset[validation_key],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    trainer.train()#训练过程
    model_deberta.save()

    
    ### Collect Predictions  ###
    
    predicts = trainer.predict(encoded_dataset["test"])
    pre.append(np.vstack((predicts.predictions, np.array(ftest.loc[ftest["context1"]==i]['idx']))))

    
    ## 清理gpu，不然容易炸
    del predicts
    del trainer
    del args
    del model_deberta
    del encoded_dataset
    del dataset
    del thedict
    del theeval
    del thetest
    del thetrain
    gc.collect()
    torch.cuda.empty_cache()
#将各模型结果按原顺序排好
spre = pre[0]
for i in range(1,len(pre)):
    spre = np.concatenate((spre, pre[i]), axis=1)
spre = spre[:,spre[1].argsort()]
#sper[0]是最终的预测结果
pearsonr(spre[0],np.array(ftest["label"]))[0]
'''

'\n    thetrain = datasets.arrow_dataset.Dataset(pa.Table.from_pandas(ftrain.loc[ftrain["context1"]==i]))\n    thetest = datasets.arrow_dataset.Dataset(pa.Table.from_pandas(ftest.loc[ftest["context1"]==i]))\n    theeval = datasets.arrow_dataset.Dataset(pa.Table.from_pandas(feval.loc[feval["context1"]==i]))\n    thedict = {"train":thetrain,"validation":theeval,"test":thetest}\n    dataset = datasets.dataset_dict.DatasetDict(thedict)\n    task = "stsb"\n    #List of glue keys\n    task_to_keys = {\n        "stsb": ("sentence1", "sentence2"),\n    }\n    #Select task\n    #task = "rte"  #cola, mrpc\n    #batch_size = 8 #10 normally, 8 for qnli\n    \n    # Load dataset based on task variable\n    #dataset = load_dataset("glue", actual_task)\n    \n    #Collect sentence keys and labels\n    sentence1_key, sentence2_key = task_to_keys[task]\n    \n    # Number of logits to output\n    num_labels = 1\n    \n    ###  Tokenizing Section  ####\n    # tokenize sentence(s)\n    encoded_dataset = 