In [3]:
from my_model import RAEBaseModel
from my_model_utils import set_random_seed, load_and_cache_examples_train, load_and_cache_examples_dev
from transformers import AutoConfig, LongformerTokenizer, AdamW, get_scheduler
import torch
import torch.nn as nn
from tqdm import tqdm



def train(model, train_dataloader, optimizer, lr_scheduler, loss_fn, epochs, eval_steps):
    step = 0
    losses = []
    for epoch in range(epochs):

        model.train()
        
        
        
        for data in tqdm(train_dataloader):
            data = tuple([i.to(model.device) for i in data])
            sent_ids, input_ids, input_attention_mask, input_token_ids, child_start_indices, child_end_indices, head_start_indices, head_end_indices, dep_labels, arcs, sentence_label = data
            
            print(sent_ids)
            print(sent_ids.shape)
            
            #feature['input_tree'] = feature['input_tree'].requires_grad()
            #feature['context_tree'] = feature['context_tree'].requires_grad()
            
            
            output = model(input_ids, input_attention_mask, input_token_ids, child_start_indices, child_end_indices, head_start_indices, head_end_indices, arcs)
            
            loss = loss_fn(output, dep_labels)
            if torch.isnan(loss).any() or torch.isinf(loss).any():
                print("inf or nan in examples")
                continue
            step  +=  1
            
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
                
            
            losses.append(loss.item())

            # loss.backward()

            
            
            if (step + 1) % eval_steps == 0:
            
                print("\n==========================================EVALUATION_RESULTS===============================")
                print("training loss:")
                print(sum(losses)/len(losses))

In [4]:
config = AutoConfig.from_pretrained('allenai/longformer-base-4096')
model = RAEBaseModel(config)
device = torch.device('cuda')
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')



Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
data_path = '../cnndm_generation_rst_parsed_train.json'
feature_path = '../my_model_cnndm_train.pt'
# dev_feature_path = '../xsum_generation_rst_parsed_dev_dev.pt'
model = model.to(device)

train_dataset = load_and_cache_examples_train(data_path, feature_path, 2048, tokenizer)

In [6]:


optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()
from torch.utils.data import DataLoader


lr_scheduler = get_scheduler(
        name="linear", optimizer=optimizer, num_warmup_steps=500, num_training_steps=5 * len(train_dataset)
    )
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
train(model, train_dataloader, optimizer, lr_scheduler=lr_scheduler, loss_fn=loss_fn, epochs=50, eval_steps=5)

  0%|          | 0/27 [00:00<?, ?it/s]

tensor([ 33, 105,  87,  80,  17, 145,  88,   1], device='cuda:0')
torch.Size([8])


  4%|▎         | 1/27 [00:03<01:21,  3.13s/it]

tensor([ 40,  81, 148,  34,  33, 149,  25,   1], device='cuda:0')
torch.Size([8])


  7%|▋         | 2/27 [00:05<01:11,  2.85s/it]

tensor([ 10,  54,  53,  15,  29,  65,  21, 118], device='cuda:0')
torch.Size([8])


 11%|█         | 3/27 [00:07<01:03,  2.64s/it]

tensor([ 36,  90, 102,  31, 156,  44,  77, 150], device='cuda:0')
torch.Size([8])


 15%|█▍        | 4/27 [00:09<00:57,  2.49s/it]


training loss:
0.794073611497879
tensor([100, 116, 115,  81,  76,  95, 156, 103], device='cuda:0')
torch.Size([8])


 19%|█▊        | 5/27 [00:11<00:52,  2.41s/it]

tensor([ 80,  92,  22, 106,  48,  76,  54,  28], device='cuda:0')
torch.Size([8])


KeyboardInterrupt: 