## LED: Longformer-Encoder-Decoder
### https://huggingface.co/docs/transformers/v4.40.2/en/model_doc/led#resources

In [2]:
class Args:
    dev_output_dir = '.'
    save_checkpoint_dir = '.'
    test_output_dir = '.'
    model_name = 't5-base'
    train_data_path = '/kaggle/input/scnu-ai-challenge-5/train.json'
    dev_data_path = '/kaggle/input/scnu-ai-challenge-dataset-with-sorted-facts/dev.json'
    test_data_path = '/kaggle/input/scnu-ai-challenge-dataset-with-sorted-facts/test.json'
    default_checkpoint_path='/kaggle/input/led-checkpoint1/checkpoint_best.pkl'
    pg_num=6 #文段数量最大值
    max_source_length = 2048
    max_target_length = 128
    train_batch_size = 16
    predict_batch_size = 16
    seed=48
    device='cuda:0'
    best_score = 0
    scores_list=[]
    dev_god_list=[]
    
class Trainer:
    training_batch=1024 ## 每个epoch训练多少个batch
    testing_batch=32    ## 每次验证使用多少个样例
    updata_batch=8      ## 训练多少个batch更新一次参数
    epochs=3            ## 训练多少个epoch
    
args = Args()
trainer=Trainer()

In [3]:
import random
import os
import numpy as np
import torch
def seed_everything(seed):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
seed_everything(args.seed)

In [4]:
import json
import random
from tqdm import tqdm

pg_num=args.pg_num  #文段数量最大值
def get_datas(data_path,data_type):
    with open(data_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    
    d_len={i:0 for i in range(11)}  # 选取的文段数量分布
    d_level={}  # 各难度文段数量
    answer_len={i:0 for i in range(15)} 
    answers=[]
    contexts=[]
    questions=[]
    context_with_ans=0
    bad_answer_type1={i:[] for i in range(1,6)}
    bad_answer_type2={i:[] for i in range(1,6)}
    bad_answer_type1_num=0
    bad_answer_type2_num=0
    support_fact_num=0
    pred_support_fact_num=0
    for i,d in tqdm(enumerate(data)):
        # 删除训练集中的单跳问题
        if data_type=='train':
            if d['level']=='easy':
                continue
        
        # 验证集只保留难度为hard的文段，结果较为稳定
        if data_type=='dev':
            if d['level']!='hard':
                continue
        
        support_facts_list=[]
        fact_set=set()
        
        if data_type=='train':
            titles=[]
            for item in d['supporting_facts']:
                fact_set.add(item[0])
            support_facts_list=list(fact_set)
            for x in d['context']:
                if x[0] not in support_facts_list:
                    titles.append(x[0])
            num=pg_num-len(support_facts_list)
            support_facts_list.extend(random.choices(titles,k=min(len(titles),num)))
            ## 训练集需要打乱文段顺序避免模型利用位置信息
            random.shuffle(support_facts_list)
        else:
            for item in d['pred_support_facts']:
                if len(support_facts_list)>=pg_num:
                    break
                support_facts_list.append(item[0])
            
        d_len[len(support_facts_list)]+=1   
        
        ## 检查损失信息比例
        if data_type=='dev':
            real_support_fact=set()
            for item in d['supporting_facts']:
                real_support_fact.add(item[0])
            support_fact_num+=len(real_support_fact)
            for x in support_facts_list:
                if x in real_support_fact:
                    pred_support_fact_num+=1
        
        
        answer=d['answer'].replace(',',' , ')
        # 检查答案被包含在多少个文段中，如果答案出现的文段数多于两个，说明答案的质量不佳
        paragraph_with_answer=0
        # 拼接得到context
        context=''   
        for x in d['context']:
            s=' '
            if x[0] in support_facts_list:
                for text in x[1]:
                    s+=text
                ## 段落开头和结尾用特殊字符进行标识
                context+=f'<bop> {x[0]}.\n {s} <eop>\n '
                if s.find(answer)!=-1:
                    paragraph_with_answer+=1
        context=context.replace('.',' . ').replace(',',' , ')
        if context.find(answer)!=-1:
            context_with_ans+=1
        
        context=context.replace(f' {answer} ',f' <ans> {answer} </ans> ')
        answer_token_num=len(answer.split())
        answer_len[answer_token_num]=answer_len.get(answer_token_num,0)+1
        
        ## 删除训练集中答案信息不充分的数据如yes,no
        ## 答案中所有单词都为出现在文段中视为信息不充分
        if answer_token_num<=3:
            if paragraph_with_answer>2:
                bad_answer_type1[answer_token_num].append(answer)
                bad_answer_type1_num+=1
                if data_type=='train':
                    continue
            if context.lower().find(answer.lower())==-1:
                bad_answer_type2[answer_token_num].append(answer)
                bad_answer_type2_num+=1
                if data_type=='train':
                    continue
            
        answer=' '+answer+''  
        contexts.append(context)
        answers.append(answer)
        
        ## 统计问题难度
        if data_type!='test':
            questions.append(d['question']) 
            level=d['level']
            d_level[level]=d_level.get(level,0)+1
        
    if data_type=='dev':
        print('check info loss:',pred_support_fact_num,support_fact_num,pred_support_fact_num/support_fact_num)
    print('chosen paragraph num:',d_len)
    print('counting question level:',d_level)
    print('counting num of answer token:',answer_len)
    print('context_with_ans:',context_with_ans)
    print('bad_answer_num_type1:',bad_answer_type1_num)
    print('bad_answer_num_type2:',bad_answer_type2_num)
    print(bad_answer_type1)
    print(bad_answer_type2)
    return contexts,questions,answers

In [5]:
!pip install pycocoevalcap
!pip install bert_score

Collecting pycocoevalcap
  Downloading pycocoevalcap-1.2-py3-none-any.whl.metadata (3.2 kB)
Collecting pycocotools>=2.0.2 (from pycocoevalcap)
  Downloading pycocotools-2.0.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Downloading pycocoevalcap-1.2-py3-none-any.whl (104.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.3/104.3 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading pycocotools-2.0.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (426 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m426.2/426.2 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pycocotools, pycocoevalcap
Successfully installed pycocoevalcap-1.2 pycocotools-2.0.7
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

In [6]:
import json
import os
from tqdm import tqdm
import numpy as np
import torch
from torch.utils.data import Dataset
from datasets import load_dataset

from transformers import T5ForConditionalGeneration, T5Tokenizer, DataCollatorForSeq2Seq


from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge
from bert_score import BERTScorer

scorers = {
        "Bleu": Bleu(4),
        #"Meteor": Meteor(),
        "Rouge": Rouge(),
    }

bert_scorer = BERTScorer(lang="en",model_type='roberta-large',rescale_with_baseline=True)
# 测评问题的流畅性
def fluencyScore(preds_list, gold_list):
    
    gts = {}
    res = {}
    for i, (p, g) in enumerate(zip(preds_list, gold_list)):
        gts[i] = [p]
        res[i] = [g]
    scores = {}
    for name, scorer in scorers.items():
        score, all_scores = scorer.compute_score(gts, res)
        if isinstance(score, list):
            for i, sc in enumerate(score, 1):
                scores[name + str(i)] = sc
        else:
            scores[name] = score
    return scores

# 测评语义相似度
def SemanticScore(preds_list, gold_list):
    p,r,f1 = bert_scorer.score(preds_list, gold_list, verbose=True)
    bert_score = np.mean(f1.tolist())
    return bert_score

def getTotalScore(preds_list,gold_list):
    bert_score = SemanticScore(preds_list,gold_list)
    scores = fluencyScore(preds_list,gold_list)
    last_score = (bert_score/2+scores['Bleu4']/2)*100
   # print(scores)
    return {'TotalScore':last_score, 
            'BERTScore':bert_score,
            'Bleu1':scores['Bleu1'],
            'Bleu2':scores['Bleu2'],
            'Bleu3':scores['Bleu3'],
            'Bleu4':scores['Bleu4'],
            #'Meteor':scores['Meteor'],
            'Rouge':scores['Rouge'],
           }
    #return {'TotalScore':last_score, 'BERTScore':bert_score,'Bleu4':scores['Bleu4']}
    
def saveJsonResult(generated_questions:list[dict], data_type = 'dev', score_type = 'last'):
    '''
    保存生成结果
    data_type: 'dev'和'test'
    score_type: best（最好结果）和last（最新一次epoch的结果）
    '''
    if data_type == 'dev':
        if score_type != 'best' and score_type != 'last':
            path = os.path.join(args.dev_output_dir,'output_last.json')
        else:
            path = os.path.join(args.dev_output_dir,f'output_{score_type}.json')
    elif data_type == 'test':
        path = os.path.join(args.test_output_dir,'output.json')
    else:
        print("未写明data_type")
        return False
    with open(path, 'w', encoding='utf-8') as json_file:
        json.dump(generated_questions, json_file, ensure_ascii=False, indent=4)
        
print('done')

2024-05-08 11:54:38.387575: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-08 11:54:38.387675: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-08 11:54:38.491056: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


done


In [7]:
'''
数据集构建
'''
import torch
from torch import nn, optim
import torch.nn.functional as F
from transformers import DataCollatorForSeq2Seq
class MyDataSet(torch.utils.data.Dataset):
    def __init__(self,contexts,questions,answers,tokenizer,max_len,data_type):
        self.contexts = contexts
        self.questions = questions
        self.answers=answers
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.data_type=data_type
    def __len__(self):
        return len(self.contexts)
    def __getitem__(self,item):
        context=self.contexts[item]
        answer=self.answers[item]
        source = f'Answer: {answer} [SEP] Context: {context} [SEP] Answer: {answer} [SEP] Question:'
        source_encoding = self.tokenizer(text=source,
                      max_length=args.max_source_length,
                      padding=True,
                      truncation=True,
                      add_special_tokens=True,
                      return_attention_mask=True,
                    )
        
        if self.data_type=='test':
            return {
                #"text":source,
                "input_ids":source_encoding['input_ids'],
               "attention_mask":source_encoding['attention_mask'],
               }
        
        question=self.questions[item]
        target_encoding=self.tokenizer(text=question,
                      max_length=args.max_target_length,
                      padding=True,
                      truncation=True,
                      add_special_tokens=True,
                      return_attention_mask=False,
                                      )
        #print(source)
        return {
                #"text":source,
                "input_ids":torch.LongTensor(source_encoding['input_ids']),
                "attention_mask":torch.LongTensor(source_encoding['attention_mask']),
                "labels":torch.LongTensor(target_encoding['input_ids'])
               }#[source,question,source_encoding['input_ids'].shape,target_encoding['input_ids']]


dev_gold_question_list=[]
def create_data_loader(data_path,data_type,tokenizer,max_len,batch_size=4,shuffle=True):
    contexts,questions,answers=get_datas(data_path,data_type)
    ds = MyDataSet(
                   contexts=contexts,
                   questions=questions,
                   answers=answers,
                   tokenizer = tokenizer,
                   max_len=max_len,
                   data_type=data_type
                  )
    if data_type=='dev':
        global dev_gold_question_list
        dev_gold_question_list=questions
    collate_fn = DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8,padding=True)
    return torch.utils.data.DataLoader(ds,batch_size=batch_size,collate_fn=collate_fn,shuffle=shuffle)


max_len = args.max_source_length
batch_size = args.train_batch_size

print('done!')

done!


In [8]:
'''
启用checkpoints
'''

from transformers import AutoTokenizer, LEDForConditionalGeneration
device=torch.device(args.device)
model_name='allenai/led-base-16384'
print('pretrained model:',model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = LEDForConditionalGeneration.from_pretrained(model_name).to(device)

## 添加特殊字符
tokenizer.add_special_tokens({'additional_special_tokens':["<ans>","</ans>","[SEP]","<eop>","<bop>"]})
model.resize_token_embeddings(len(tokenizer))
print(tokenizer.SPECIAL_TOKENS_ATTRIBUTES)

def load_checkpoint(path=None):
    if path==None:
        path=args.default_checkpoint_path
    print('load from: ',path)
    model.load_state_dict(torch.load(path))
    
mode='load'#'build'
checkpoint=None
print('mode: ',mode)
if mode=='load':
    load_checkpoint(checkpoint)
elif mode=='build':
    trainer.epochs=15

model.gradient_checkpointing_enable()

pretrained model: allenai/led-base-16384


tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/648M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

['bos_token', 'eos_token', 'unk_token', 'sep_token', 'pad_token', 'cls_token', 'mask_token', 'additional_special_tokens']
mode:  load
load from:  /kaggle/input/led-checkpoint1/checkpoint_best.pkl


In [9]:
%%time
train_data_path=args.train_data_path  #'/kaggle/input/scnu-ai-challenge-datasetwith-entity/train.json'
dev_data_path=args.dev_data_path      #'/kaggle/input/scnu-ai-challenge-dataset-with-sorted-facts/dev.json'
test_data_path=args.test_data_path    #'/kaggle/input/scnu-ai-challenge-dataset-with-sorted-facts/test.json'
train_data_loader=create_data_loader(train_data_path,
                                     'train',
                                     tokenizer,
                                     max_len,
                                     batch_size=args.train_batch_size,
                                     shuffle=True)
dev_data_loader=create_data_loader(dev_data_path,
                                     'dev',
                                     tokenizer,max_len,batch_size=args.predict_batch_size,shuffle=False)
test_data_loader=create_data_loader(test_data_path,
                                     'test',
                                     tokenizer,max_len,batch_size=args.predict_batch_size,shuffle=False)


88947it [00:05, 15581.69it/s]


chosen paragraph num: {0: 0, 1: 0, 2: 160, 3: 114, 4: 65, 5: 67, 6: 40, 7: 72180, 8: 0, 9: 0, 10: 0}
counting question level: {'medium': 42425, 'hard': 17544}
counting num of answer token: {0: 0, 1: 24715, 2: 23962, 3: 13566, 4: 6237, 5: 2318, 6: 791, 7: 375, 8: 226, 9: 130, 10: 78, 11: 61, 12: 34, 13: 29, 14: 25, 15: 27, 31: 2, 17: 12, 16: 13, 18: 7, 19: 7, 21: 2, 20: 3, 23: 1, 42: 1, 24: 1, 33: 1, 26: 1, 22: 1}
context_with_ans: 68074
bad_answer_num_type1: 8963
bad_answer_num_type2: 3694
{1: ['yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'P.O.D.', 'Poppy.Computer', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'F.E.A.R.', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'no', 'yes', 

1500it [00:00, 50054.94it/s]


check info loss: 691 720 0.9597222222222223
chosen paragraph num: {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 360, 8: 0, 9: 0, 10: 0}
counting question level: {'hard': 360}
counting num of answer token: {0: 0, 1: 120, 2: 116, 3: 72, 4: 31, 5: 9, 6: 4, 7: 2, 8: 3, 9: 0, 10: 1, 11: 0, 12: 0, 13: 0, 14: 1, 15: 1}
context_with_ans: 336
bad_answer_num_type1: 48
bad_answer_num_type2: 20
{1: ['no', 'tennis', 'no', 'no', 'director', 'Disney', 'Thames', 'no', 'no', 'Kansas', 'British', 'singer', '1995', 'German', 'no', 'no', 'Cantonese', 'Mexico', 'two', 'Irish', 'Russian', 'Detroit', 'Marvel'], 2: ['Love Actually', 'Supreme Court', 'Nikita Khrushchev', 'Golf Digest', 'Bob Stoops', 'North Dakota', 'Boston University', 'Chicago Outfit', 'Porfirio Rubirosa', 'Loughborough University', 'Eugene Onegin', 'Manhattan Center', 'Alan James', 'Relient K', 'National Geographic', 'documentary film', 'Sleeping Beauty', 'professional wrestler', 'Takahiro Moriuchi'], 3: ['The White Stripes', 'University of 

7405it [00:00, 13394.24it/s]

chosen paragraph num: {0: 0, 1: 0, 2: 23, 3: 15, 4: 12, 5: 6, 6: 1, 7: 7348, 8: 0, 9: 0, 10: 0}
counting question level: {}
counting num of answer token: {0: 0, 1: 2510, 2: 2446, 3: 1368, 4: 587, 5: 257, 6: 86, 7: 40, 8: 32, 9: 12, 10: 10, 11: 7, 12: 9, 13: 6, 14: 6, 36: 3, 15: 4, 39: 1, 23: 1, 20: 3, 17: 2, 19: 3, 18: 2, 31: 1, 27: 1, 26: 2, 16: 1, 22: 2, 21: 2, 29: 1}
context_with_ans: 6941
bad_answer_num_type1: 1410
bad_answer_num_type2: 364
{1: ['Europe', 'Bundesliga', 'no', '201', 'black', '1984', 'Masterpiece', 'American', '2010', 'Cumberland', 'Garbage', 'Aeonium', 'no', 'no', 'novelist', 'cocktail', 'Drosera', 'plants', 'singer', 'Eragrostis', 'Australia', 'no', 'no', 'Brazil', 'Aerie', 'Aechmea', 'basketball', '1967', 'opera', 'Canada', 'Indian', 'Cleveland', 'Canada', 'Scientology', 'Wikstroemia', 'leader', 'rifles', 'Diplomacy', '1989', 'no', 'Loa', 'no', 'no', 'American', 'no', 'no', 'no', 'composer', 'singer', '2001', 'Puyi', 'no', 'film', 'India', 'Elamite', 'American', '




In [10]:
len(train_data_loader),len(dev_data_loader)

(14993, 45)

In [11]:
def get_infinite_data(dataloader):
        while True:
            for images in dataloader:
                yield images
train_generator=get_infinite_data(train_data_loader)
def get_batch():
    return train_generator.__next__()

In [15]:
batch=get_batch()
print(batch['input_ids'].shape)
tokenizer.convert_ids_to_tokens(batch['input_ids'][0])

['<s>',
 'Answer',
 ':',
 'Ġ',
 'Ġcomputers',
 'Ġ',
 '[SEP]',
 'ĠContext',
 ':',
 'Ġ',
 '<bop>',
 'ĠApple',
 'ĠII',
 'Ġgraphics',
 'Ġ.',
 'Ġ',
 'Ċ',
 'Ġ',
 'ĠThe',
 'ĠApple',
 'ĠII',
 'Ġgraphics',
 'Ġwere',
 'Ġcomposed',
 'Ġof',
 'Ġidiosyncr',
 'atic',
 'Ġmodes',
 'Ġand',
 'Ġsettings',
 'Ġthat',
 'Ġcould',
 'Ġbe',
 'Ġexploited',
 'Ġ.',
 'Ġ',
 'ĠThis',
 'Ġgraphics',
 'Ġsystem',
 'Ġdebuted',
 'Ġon',
 'Ġthe',
 'Ġoriginal',
 'ĠApple',
 'ĠII',
 'Ġ,',
 'Ġ',
 'Ġcontinued',
 'Ġwith',
 'Ġthe',
 'ĠApple',
 'ĠII',
 'ĠPlus',
 'Ġand',
 'Ġwas',
 'Ġcarried',
 'Ġforward',
 'Ġand',
 'Ġexpanded',
 'Ġwith',
 'Ġthe',
 'ĠApple',
 'ĠII',
 'e',
 'Ġ,',
 'Ġ',
 'ĠEnhanced',
 'ĠII',
 'e',
 'Ġ,',
 'Ġ',
 'ĠII',
 'c',
 'Ġ,',
 'Ġ',
 'ĠII',
 'c',
 'ĠPlus',
 'Ġand',
 'ĠII',
 'Ġ.',
 'Ġ',
 'Ġ',
 '<eop>',
 'Ċ',
 'Ġ',
 '<bop>',
 'ĠP',
 'ec',
 'om',
 'Ġ32',
 'Ġ.',
 'Ġ',
 'Ċ',
 'Ġ',
 'ĠP',
 'ec',
 'om',
 'Ġ32',
 'Ġwas',
 'Ġan',
 'Ġeducational',
 'Ġand',
 '/',
 'or',
 'Ġhome',
 'Ġcomputer',
 'Ġdeveloped',
 'Ġby',
 'ĠEle',
 '

In [12]:
## 评估模型函数
import logging  
logging.getLogger("transformers").setLevel(logging.ERROR)
def eval_model(epoch=1,testing_batch=len(dev_data_loader)):
    model.eval()
    preds_list = []
    if testing_batch<=0:
        testing_batch=len(dev_data_loader)
    with tqdm(total=testing_batch, desc=f'Validation Epoch {epoch}', unit='batch') as pbar:
        for i,batch in enumerate(dev_data_loader):
            if i>testing_batch:break
            with torch.no_grad():
                input_ids = batch['input_ids'].to(device)
                attention_mask=batch['attention_mask'].to(device)
                generated_ids = model.generate(
                    input_ids=input_ids, 
                    attention_mask=attention_mask,
                    max_new_tokens=256,
                    do_sample=False,
                    #num_beams=5
                )
                preds_list.extend(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))
 
            pbar.update(1)
    

    for i in range(20):
        print('pred: ',preds_list[i])
        print("true: ",dev_gold_question_list[i])

    scores = getTotalScore(preds_list, dev_gold_question_list[:len(preds_list)])
    preds_dict = []
    if scores['TotalScore'] > args.best_score:
        torch.save(model.state_dict(), os.path.join(args.save_checkpoint_dir, "checkpoint_best.pkl"))
        print(f"Total score: {args.best_score} -> {scores['TotalScore'] }")
        print(f"checkpoint_best.pdparams已存储至{args.save_checkpoint_dir}")
        args.best_score = scores['TotalScore'] 
    args.scores_list.append(scores)
    print(f"Scores: {scores}")
    return preds_list
pred_list=eval_model(epoch=0,testing_batch=45)

Validation Epoch 0: 100%|██████████| 45/45 [00:39<00:00,  1.14batch/s]

pred:  Are both Plantago and Trichosanthes plantain?
true:  Are Trichosanthes and Plantago both forms of plant life?
pred:  What season of Indian reality TV series did Lopamudra Raut win the award for?
true:  In which season of the Indian reality TV show "Big Boss" did the model Lopamundra Raut participate?
pred:  Who was elected to the Senate in 2012 and was the senior United States Senator from Nevada?
true:  The 2012 United States Senate election in Nevada concluded with a close victory for which current Republican incumbent?
pred:  When did the episode that Moe Szyslak first appeared in originally air?
true:  When was the Simpson's episode broadcasted that introduced the character Morris "Moe" Szyslak?
pred:  Are both The Shipping News and The Smiths shipping news?
true:  Are Shipping News and Gene both rock bands?
pred:  Which American author, professor and literary critic, wrote the science fiction novel "The Einstein Intersection"?
true:  Who is the science fiction writer, Samue




  0%|          | 0/12 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/6 [00:00<?, ?it/s]

done in 2.15 seconds, 167.77 sentences/sec
{'testlen': 5545, 'reflen': 5418, 'guess': [5545, 5185, 4825, 4465], 'correct': [2015, 896, 446, 244]}
ratio: 1.0234403839053112
Total score: 0 -> 25.582630401285368
checkpoint_best.pdparams已存储至.
Scores: {'TotalScore': 25.582630401285368, 'BERTScore': 0.3781976652910493, 'Bleu1': 0.3633904418394295, 'Bleu2': 0.25059152233883275, 'Bleu3': 0.1797173874589759, 'Bleu4': 0.133454942734658, 'Rouge': 0.31402635996852}


In [13]:
from transformers.optimization import Adafactor, AdafactorSchedule
## 选择Adafactor优化器
optimizer = Adafactor(model.parameters(), relative_step=True, warmup_init=True,lr=None,clip_threshold=1.0)
scheduler = AdafactorSchedule(optimizer)

In [14]:
## 开始训练
from tqdm import tqdm
from torch.nn.utils import clip_grad_norm_
from transformers import  get_linear_schedule_with_warmup
import time
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128" 

training_batch=trainer.training_batch
testing_batch=trainer.testing_batch
updata_batch=trainer.updata_batch
epochs=trainer.epochs


training_batch=1024
testing_batch=45
updata_batch=4
epochs=8

for epoch in range(epochs):
    # 训练部分
    model.train()
    epoch_loss = 0
    batch_loss=0
    with tqdm(total=training_batch//updata_batch, desc=f'Epoch {epoch + 1}/{epochs}', unit='batch',mininterval=3) as pbar:
        for i in range(training_batch):
            batch=get_batch()
            input_ids= batch['input_ids'].to(device)
            attention_mask=batch['attention_mask'].to(device)
            labels=batch['labels'].to(device)
            #print(input_ids.shape,labels.shape)
            
            #释放碎片以避免显存不足
            if hasattr(torch.cuda, 'empty_cache'):
                torch.cuda.empty_cache()

            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            labels=labels, 
                            return_dict=True)
            loss = outputs.loss
            loss.backward() 
            loss_t = loss.detach().cpu()
            
            epoch_loss += loss_t[0] if len(loss_t.shape) > 0 else loss_t.numpy()
            batch_loss +=loss_t[0] if len(loss_t.shape) > 0 else loss_t.numpy()
            if (i+1)%updata_batch==0:
                #clip_grad_norm_(model.parameters(), max_norm=1.0, norm_type=2)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                pbar.set_postfix({'loss': f'{ batch_loss/updata_batch:.4f}'})
                batch_loss=0
                pbar.update(1)
    print(f"Average Loss: {epoch_loss / training_batch:.4f}")
    # 验证部分
    eval_model(epoch,testing_batch)
    torch.save(model.state_dict(), os.path.join(args.save_checkpoint_dir, "checkpoint_last.pkl"))

Epoch 1/1: 100%|██████████| 1/1 [00:05<00:00,  5.61s/batch, loss=1.3226]


Average Loss: 1.3226


Validation Epoch 0: 6batch [00:04,  1.27batch/s]                    

pred:  Are both Plantago and Trichosanthes plantain?
true:  Are Trichosanthes and Plantago both forms of plant life?
pred:  What season of Indian reality TV series did Lopamudra Raut win the award for?
true:  In which season of the Indian reality TV show "Big Boss" did the model Lopamundra Raut participate?
pred:  Who was elected to the Senate in 2012 and was the senior United States Senator from Nevada?
true:  The 2012 United States Senate election in Nevada concluded with a close victory for which current Republican incumbent?
pred:  When did the episode that Moe Szyslak first appeared in originally air?
true:  When was the Simpson's episode broadcasted that introduced the character Morris "Moe" Szyslak?
pred:  Are both The Shipping News and The Smiths shipping news?
true:  Are Shipping News and Gene both rock bands?
pred:  Which American author, professor and literary critic, wrote the science fiction novel "The Einstein Intersection"?
true:  Who is the science fiction writer, Samue




  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.31 seconds, 153.23 sentences/sec
{'testlen': 705, 'reflen': 747, 'guess': [705, 657, 609, 561], 'correct': [257, 108, 52, 24]}
ratio: 0.9437751004003431
Scores: {'TotalScore': 23.56846752525788, 'BERTScore': 0.35676889517344534, 'Bleu1': 0.34345602839626704, 'Bleu2': 0.23063665641892167, 'Bleu3': 0.16235161693990027, 'Bleu4': 0.11460045533171223, 'Rouge': 0.3044592733285238}


In [15]:
args.scores_list

[{'TotalScore': 25.582630401285368,
  'BERTScore': 0.3781976652910493,
  'Bleu1': 0.3633904418394295,
  'Bleu2': 0.25059152233883275,
  'Bleu3': 0.1797173874589759,
  'Bleu4': 0.133454942734658,
  'Rouge': 0.31402635996852},
 {'TotalScore': 23.56846752525788,
  'BERTScore': 0.35676889517344534,
  'Bleu1': 0.34345602839626704,
  'Bleu2': 0.23063665641892167,
  'Bleu3': 0.16235161693990027,
  'Bleu4': 0.11460045533171223,
  'Rouge': 0.3044592733285238}]

## 获取测试集结果

In [31]:
# 读取test的ids
with open(test_data_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)
test_id_list = [item['_id'] for item in data]


test_generator=get_infinite_data(test_data_loader)
def get_test_batch():
    return test_generator.__next__()
# 测试
generated_questions = []
generated_questions_dict = []
model.eval()
with tqdm(total=len(test_data_loader), desc=f'Test epoch {1}/{1}', unit='batch') as pbar:
    for i in range(len(test_data_loader)):
        batch=get_test_batch()
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask=batch['attention_mask'].to(device)
            generated_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask,max_new_tokens=256,do_sample=False)
            #preds_list.extend(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))
        #generated_ids = generate_question(batch)
        qs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        generated_questions.extend(qs)
        pbar.update(1)

for _,item in enumerate(generated_questions):
    generated_questions_dict.append({'_id':test_id_list[_],'question':item})

saveJsonResult(generated_questions_dict, data_type = 'test')

Test epoch 1/1: 100%|██████████| 926/926 [12:41<00:00,  1.22batch/s]


## 上传模型到kaggle

In [None]:
use_name='charliepine'
key='3cf117e04af8e9e8d76fb615d1caec0b'
dataset_name='test-dataset-name'
def prepare_dataset_metadata(use_name=use_name,key=key,dataset_name=dataset_name,checkpoint_path='/kaggle/working/checkpoint_best.pkl'):
    os.chdir('/kaggle/working/')
    if not os.path.exists('model'):
        os.mkdir('model')
    try:
        os.rename(checkpoint_path,'/kaggle/working/model/checkpoint_best.pkl')
    except:
        pass
    if not os.path.exists('/root/.kaggle/'):
        os.mkdir('/root/.kaggle')
    os.chdir('/root/.kaggle')
    kaggle_json={
        "username":use_name,
        "key":key
    }
    print(kaggle_json)
    with open('kaggle.json', 'w', encoding='utf-8') as f:
        json.dump(kaggle_json, f, ensure_ascii=False, indent=0)
        
    os.chdir('/kaggle/working/model')
    metadata={
        "title":dataset_name,
        "id": f"{use_name}/{dataset_name}", 
        "licenses": [{"name": "CC0-1.0"}]
    }
    print(metadata)
    with open('dataset-metadata.json', 'w', encoding='utf-8') as f:
        json.dump(metadata, f, ensure_ascii=False, indent=0)
    print('create dataset-metadata.json successful!')
prepare_dataset_metadata()

In [43]:
!kaggle datasets create -p /kaggle/working/model -r skip -u

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Starting upload for file checkpoint_best.pkl
100%|████████████████████████████████████████| 618M/618M [00:25<00:00, 25.7MB/s]
Upload successful: checkpoint_best.pkl (618MB)
Your public Dataset is being created. Please check progress at https://www.kaggle.com/datasets/labi123456/test-dataset-name
