In [1]:
import json
import torch
from torch.utils.data import Dataset
import torch
from transformers import AutoModelForSequenceClassification
from transformers import GPT2LMHeadModel
from transformers import AutoTokenizer, GPT2Tokenizer
import argparse
import json
import os
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

In [15]:
# Separator string placed between prompt segments, keyed by the short
# model-family name that callers pass as args['lmn'].
SepToken = dict(
    bert="[SEP]",
    roberta="</s>",
    electra="[SEP]",
    deberta="[SEP]",
    bart="<s>",
    gpt2="",
)

# Integer class id assigned to each gold label string found in the data files.
Label2num = dict(Entailment=1, Contradiction=0)

In [12]:
class Recorddataset(Dataset):
    """Statements paired with the trial sections they refer to, rendered as text prompts.

    Expected layout under ``path``:

    * ``<split>.json`` -- a mapping ``uuid -> record``. Each record provides
      ``Statement``, ``Section_id``, ``Primary_id`` and ``Type``; records whose
      ``Type`` is ``"Comparison"`` also provide ``Secondary_id``; every split
      except ``'test'`` provides ``Label`` (a key of ``Label2num``).
    * ``CT json/<trial id>.json`` -- a mapping from section name to an iterable
      of strings.

    Args:
        args: mapping read at item-access time; ``args['prompt']`` selects the
            prompt template (0, 1, 2 or 3) and ``args['lmn']`` selects the
            model family for template 1.
        path: root directory of the data set.
        split: ``'train'``, ``'dev'``, ``'test'`` or ``'trn&dev'`` (train.json
            merged with dev.json; dev entries win on duplicate uuids).

    Items are ``(prompt, label_tensor)`` for labelled splits and
    ``(prompt, uuid)`` for the ``'test'`` split.
    """

    def __init__(self, args, path, split='train'):
        super(Recorddataset, self).__init__()
        self.args = args
        self.split = split
        self.statement = []
        self.trail1 = []
        self.trail2 = []
        self.label = []
        self.section = []

        ct_dir = os.path.join(path, "CT json")
        first_file = 'train' if split == 'trn&dev' else split
        self.data = self._read_json(os.path.join(path, f"{first_file}.json"))
        if split == 'trn&dev':
            self.data = {**self.data, **self._read_json(os.path.join(path, 'dev.json'))}
        self.uuid_list = list(self.data.keys())

        # Many statements point at the same trial file; parse each file once.
        trial_cache = {}

        def load_trial(trial_id):
            if trial_id not in trial_cache:
                trial_cache[trial_id] = self._read_json(os.path.join(ct_dir, f"{trial_id}.json"))
            return trial_cache[trial_id]

        for uuid in self.uuid_list:
            record = self.data[uuid]
            self.statement.append(record['Statement'])
            if split != 'test':
                self.label.append(Label2num[record['Label']])
            section = record['Section_id']
            self.section.append(section)
            self.trail1.append(self.format_change(load_trial(record['Primary_id'])[section]))
            if record['Type'] == "Comparison":
                self.trail2.append(self.format_change(load_trial(record['Secondary_id'])[section]))
            else:
                # "_" marks "no second trial"; __getitem__ tests for it.
                self.trail2.append("_")

    @staticmethod
    def _read_json(file_path):
        """Parse one JSON file, always decoding it as UTF-8."""
        with open(file_path, encoding="utf-8") as handle:
            return json.load(handle)

    def __getitem__(self, index):
        """Return ``(prompt, label tensor)``, or ``(prompt, uuid)`` for the test split.

        Raises:
            NotImplementedError: if ``args['prompt']`` is not 0, 1, 2 or 3.
        """
        sent = self._build_prompt(index)
        if self.split != 'test':
            return sent, torch.tensor(self.label[index])
        return sent, self.uuid_list[index]

    def _build_prompt(self, index):
        """Render the text prompt for one example according to ``args['prompt']``."""
        prompt = self.args['prompt']
        if prompt not in (0, 1, 2, 3):
            raise NotImplementedError("Prompt not implemented")

        statement = self.statement[index]
        section = self.section[index]
        first = self.trail1[index]
        second = self.trail2[index]
        has_second = second != '_'
        first_intro = f"the {section} clue of first trail is: "
        second_intro = f"the {section} clue of second trail is: "

        if prompt == 3:
            # Statement only, no trial evidence.
            return "{}".format(statement)

        if prompt == 2:
            parts = [statement, section, first]
            if has_second:
                parts.append(second)
            return " [SEP] ".join(str(part) for part in parts)

        if prompt == 0:
            # Always five segments; a missing second trial shows up as "_".
            parts = (statement, first_intro, first, second_intro, second)
            return ", ".join(str(part) for part in parts)

        # prompt == 1: wording depends on the model family.
        parts = [statement, first_intro, first]
        if has_second:
            parts += [second_intro, second]
        parts = [str(part) for part in parts]

        lmn = self.args['lmn']
        if 'gpt' in lmn:
            return ", ".join(parts) + " <|endoftext|>"

        sep = SepToken[lmn]
        body = f" {sep} ".join(parts)
        if 'bart' in lmn:
            # No closing separator for bart. The two-trial form has always
            # ended with a single trailing space; that is kept so inputs stay
            # identical.
            return body + " " if has_second else body
        return f"{body} {sep}"

    def __len__(self):
        """Number of examples (one per uuid)."""
        return len(self.uuid_list)

    def format_change(self, sentence):
        """Strip each entry of ``sentence`` and concatenate them, each followed by a comma."""
        return "".join(f"{line.strip()}," for line in sentence)

    def get_max_length(self):
        """Largest number of space-separated pieces in any prompt; 0 for an empty dataset."""
        return max((len(self[i][0].split(' ')) for i in range(len(self))), default=0)

In [13]:
class Model(torch.nn.Module):
    """Two-class sequence classifier bundled with the tokenizer that feeds it.

    Args:
        args: mapping; ``args['seed']`` is used to name the saved weights file.
        model_name: name or path handed to ``from_pretrained``. When it
            contains ``'gpt'`` a ``GPT2Tokenizer`` is used and a ``[PAD]``
            token is registered, otherwise ``AutoTokenizer`` is used.
        from_check_point: when True, the tokenizer is read from
            ``tokenizer_dir`` and the weights in ``model_dir`` are loaded on
            top of the freshly built model.
        tokenizer_dir: directory written by ``save_model`` (checkpoint only).
        model_dir: path of the ``.pth`` state dict (checkpoint only).
    """

    def __init__(self, args, model_name, from_check_point=False, tokenizer_dir=None, model_dir=None):
        super(Model, self).__init__()
        assert isinstance(from_check_point, bool)

        self.args = args
        is_gpt = 'gpt' in model_name

        tokenizer_cls = GPT2Tokenizer if is_gpt else AutoTokenizer
        tokenizer_source = tokenizer_dir if from_check_point else model_name
        self.tokenizer = tokenizer_cls.from_pretrained(tokenizer_source, do_lower_case=True)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

        if is_gpt:
            # A fresh tokenizer gets a pad token added here; a tokenizer
            # restored from tokenizer_dir is used exactly as it was saved.
            if not from_check_point:
                self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            self.model.resize_token_embeddings(len(self.tokenizer))
            self.model.config.pad_token_id = self.tokenizer.pad_token_id

        if from_check_point:
            # Load onto CPU so a checkpoint written on a GPU machine can be
            # restored anywhere; the caller moves the module to its device.
            # NOTE(review): torch.load unpickles the file -- only load
            # checkpoints from a trusted source.
            state_dict = torch.load(model_dir, map_location='cpu')
            self.model.load_state_dict(state_dict)

    def forward(self, sent, label, device):
        """Tokenize ``sent``, move the encoding to ``device`` and run the classifier.

        Args:
            sent: text (or batch of texts) accepted by the tokenizer.
            label: forwarded to the model as ``labels``; may be None.
            device: target device for the tokenized batch.

        Returns:
            Whatever the wrapped model returns for this batch.
        """
        # Pad only to the longest sequence in the batch (still truncated at
        # 512 tokens) instead of always padding to 512.
        token = self.tokenizer(sent, padding=True, truncation=True, max_length=512,
                               return_tensors="pt").to(device)
        return self.model(**token, labels=label)

    def save_model(self, dir):
        """Save the tokenizer files and the model state dict into ``dir``.

        The weights go to ``dev_best_seed<seed>.pth`` inside ``dir``; the
        directory is created if it does not exist yet.
        """
        os.makedirs(dir, exist_ok=True)
        self.tokenizer.save_pretrained(dir)
        torch.save(self.model.state_dict(), os.path.join(dir, f"dev_best_seed{self.args['seed']}.pth"))

In [14]:
def _evaluate(model, loader, device):
    """Score ``model`` on a labelled loader without tracking gradients.

    Label id 1 is treated as the positive class.

    Returns:
        (mean loss per batch, F1, precision, recall); each is 0.0 when its
        denominator is zero.
    """
    loss_sum = 0.0
    n_batches = 0
    true_pos = false_pos = false_neg = 0
    with torch.no_grad():
        for sent, label in loader:
            label = label.to(device)
            output = model(sent, label, device)
            loss_sum += output[0].item()
            pred = torch.argmax(output[1], dim=-1)
            true_pos += int(((pred == 1) & (label == 1)).sum())
            false_pos += int(((pred == 1) & (label != 1)).sum())
            false_neg += int(((pred != 1) & (label == 1)).sum())
            n_batches += 1

    precision = true_pos / (true_pos + false_pos) if (true_pos + false_pos) else 0.0
    recall = true_pos / (true_pos + false_neg) if (true_pos + false_neg) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    mean_loss = loss_sum / n_batches if n_batches else 0.0
    return mean_loss, f1, precision, recall


def _predict_test(model, loader, device):
    """Predict a label name for every ``(sentences, uuids)`` batch in ``loader``.

    Returns:
        dict mapping ``str(uuid)`` to ``{"Prediction": "Contradiction" | "Entailment"}``.
    """
    results = {}
    with torch.no_grad():
        for sent, uuid in tqdm(loader):
            # No labels exist for the test split, so none are passed.
            outputs = model(sent, None, device)
            # Without labels there is no loss entry, so the logits are not at
            # index 1; read them by name and fall back to the first element
            # for plain-tuple outputs.
            logits = getattr(outputs, "logits", None)
            if logits is None:
                logits = outputs[0]
            predicted = torch.argmax(logits, dim=-1).tolist()
            # zip over the real batch so a short final batch is handled.
            for key, cls in zip(uuid, predicted):
                results[str(key)] = {"Prediction": 'Contradiction' if cls == 0 else "Entailment"}
    return results


def _write_results(results, file_path):
    """Write a prediction dict to ``file_path`` as indented JSON."""
    with open(file_path, 'w') as jsonFile:
        jsonFile.write(json.dumps(results, indent=4))


def main():
    """Fine-tune the classifier, validate periodically and write test predictions.

    Every ``eval_every`` training steps the dev set is scored; whenever dev F1
    improves, the model is saved and ``results.json`` is rewritten with test
    predictions. In 'mix' mode test predictions are also written after every
    epoch.
    """
    args = {
        'gpu':0,
        'ptlm':'microsoft/deberta-v3-large',
        'lmn':'deberta',
        'data':'./training_data',
        'epoch':40,
        'eval_every':10,
        'prompt':2,
        'mode':'trn',
        'from_check_point':False,
        'tokenizer_dir':'./checkpoint',
        'model_dir':'./models',
        'seed':621
    }

    if torch.cuda.is_available():
        torch.cuda.set_device(args['gpu'])
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    # Any mode other than 'trn' trains on train + dev combined.
    if args['mode'] == 'trn':
        trn_dataset = Recorddataset(args, args['data'], "train")
    else:
        trn_dataset = Recorddataset(args, args['data'], "trn&dev")
    dev_dataset = Recorddataset(args, args['data'], "dev")
    tst_dataset = Recorddataset(args, args['data'], "test")

    trn_loader = DataLoader(trn_dataset, batch_size=4, shuffle=True, drop_last=False)
    dev_loader = DataLoader(dev_dataset, batch_size=1, shuffle=False, drop_last=False)
    tst_loader = DataLoader(tst_dataset, batch_size=4, shuffle=False, drop_last=False)

    seed_val = args['seed']
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)
    output_dir = "./result/{}_prompt{}_mode{}_epoch{}_eval{}/".format(args['ptlm'], args['prompt'], args['mode'],
                                                                      args['epoch'], args['eval_every'])
    os.makedirs(output_dir, exist_ok=True)
    epochs = args['epoch']
    num_total_steps = len(trn_loader) * epochs
    # Warm up over the first eighth of the epochs (rounded down to whole epochs).
    num_warmup_steps = len(trn_loader) * int(args['epoch'] / 8)

    model = Model(args, args['ptlm'], args['from_check_point'], args['tokenizer_dir'], args['model_dir'])
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=5e-6, correct_bias=True)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps,
                                                num_training_steps=num_total_steps)

    best_val, best_val_epoch = 0, 0
    best_recall, best_precision = 0, 0

    # NOTE(review): nothing in this loop calls model.train(); if the backbone
    # is loaded in evaluation mode (the Hugging Face default -- to be
    # confirmed), fine-tuning runs with dropout disabled.
    for epoch in range(epochs):
        total_loss = 0
        progress = tqdm(trn_loader, desc=f'epoch: {epoch + 1}/{epochs}')
        for step, (sent, label) in enumerate(progress):
            label = label.to(device)
            output = model(sent, label, device)
            loss = output[0]
            total_loss += loss.item()
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            if step % args['eval_every'] == 0 and step != 0:
                l, f, p, r = _evaluate(model, dev_loader, device)
                print(
                    f"The Validation result at epoch {epoch + 1} iter {step}: val_loss: {l}, val_f1: {f}, val_precision: {p}, val_recall: {r}")
                if f > best_val:
                    best_val_epoch = epoch + 1
                    best_val = f
                    best_precision = p
                    best_recall = r
                    model.save_model(output_dir)
                    _write_results(_predict_test(model, tst_loader, device),
                                   os.path.join(output_dir, "results.json"))

        print("total_loss_per_epoch: ", total_loss, "best_val", best_val, 'best_r', best_recall, 'best_p',
              best_precision, "best_val_epoch", best_val_epoch)
        if args['mode'] == 'mix':
            _write_results(_predict_test(model, tst_loader, device),
                           os.path.join(output_dir, "epoch{}_results.json".format(epoch + 1)))


if __name__ == '__main__':

    main()

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from 

**********at the epoch**********


epoch: 1/40:   2%|█▍                                                               | 9/425 [30:45<23:41:36, 205.04s/it]


KeyboardInterrupt: 