In [1]:
import torch
# Ask PyTorch to release cached, currently unused GPU memory before anything is loaded
# (there is nothing to free on a fresh kernel; this matters when re-running in a used one).
torch.cuda.empty_cache()

In [2]:
import transformers
import datasets
from datasets import load_dataset, load_metric
import logging
from transformers import BertTokenizer, GPT2Tokenizer, GPT2TokenizerFast, EncoderDecoderModel, Trainer, TrainingArguments, BertTokenizerFast
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments,EarlyStoppingCallback
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# INFO level is what lets the per-epoch "Got BLEU ... and SacreBLEU ..." lines logged by
# utils.evaluator show up during training.
logging.basicConfig(level=logging.INFO)

import types
import argparse
import logging
from functools import partial
import json

import torch
from torch import optim
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    BertGenerationConfig,
    BertGenerationEncoder,
    BertTokenizer,
    EncoderDecoderModel,
    EncoderDecoderConfig,
    GPT2LMHeadModel,
    GPT2TokenizerFast,
    Trainer,
    TrainingArguments,
    get_cosine_schedule_with_warmup,
)

import sacrebleu
import random
import numpy as np




In [3]:
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
from datasets import load_dataset

# Load the natural-language -> code corpus. The result is a DatasetDict with
# train / validation / test splits, each holding an 'nl' and a 'code' column.
# NOTE(review): "DJANGO" is presumably a local dataset directory or loading script
# rather than a Hub id — confirm where it resolves from.
raw_dataset = load_dataset("DJANGO")

In [5]:
# Pull the three splits out of the DatasetDict by name.
# NOTE: test_dataset is not referenced again in the cells visible here.
train_dataset, val_dataset, test_dataset = (
    raw_dataset[split_name] for split_name in ("train", "validation", "test")
)
     



In [6]:

raw_dataset

DatasetDict({
    train: Dataset({
        features: ['nl', 'code'],
        num_rows: 11428
    })
    validation: Dataset({
        features: ['nl', 'code'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['nl', 'code'],
        num_rows: 1805
    })
})

In [7]:
from transformers import AutoTokenizer

# Tokenizer belonging to the same pretrained Marian checkpoint as the model loaded below.
# `use_fast = True` is only a request: the repr printed in the next cell reports
# is_fast=False, i.e. the regular (non-fast) MarianTokenizer is what actually gets used.
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-nl", use_fast = True)

In [8]:
tokenizer

MarianTokenizer(name_or_path='Helsinki-NLP/opus-mt-en-nl', vocab_size=67028, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	67027: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [9]:
from transformers import AutoModelForSeq2SeqLM
# Load the pretrained seq2seq weights from the same checkpoint as the tokenizer; this is
# the object the trainer fine-tunes on the nl -> code pairs.
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-nl")

In [10]:
model

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(67028, 512, padding_idx=67027)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(67028, 512, padding_idx=67027)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [11]:
# Import the class itself rather than the module, so the name `evaluator` always refers
# to the metric callable and never to the `utils.evaluator` module.
from utils.evaluator import CodeGenerationEvaluator

# Metric callable handed to the trainer as `compute_metrics`; built from the tokenizer
# and device defined in earlier cells.
evaluator = CodeGenerationEvaluator(tokenizer, device, smooth_bleu=True)

  self.sacre_bleu: Metric = load_metric('sacrebleu')


In [12]:
# Fixed token budgets: every source / target sequence is padded or truncated to exactly
# this many tokens by the mapping function below.
encoder_length = 32
decoder_length = 32
batch_size = 1


def map_to_encoder_decoder_inputs(batch, mask_label_padding=False):
    """Tokenize a batch of nl/code pairs into fixed-length seq2seq features.

    Reads the module-level ``tokenizer``, ``encoder_length`` and ``decoder_length``.

    Args:
        batch: Mapping with an "nl" and a "code" entry, each a list of strings (the
            shape ``Dataset.map(..., batched=True)`` passes in). It is updated in place.
        mask_label_padding: When True, every label position whose decoder attention
            mask is 0 is replaced by -100 so that padding does not contribute to the
            loss. Defaults to False, which keeps the padded token ids in ``labels``.
            Enable it with ``fn_kwargs={"mask_label_padding": True}`` in ``Dataset.map``
            or with ``functools.partial``.
            NOTE(review): anything that decodes label ids (e.g. the ``compute_metrics``
            callable) must map -100 back to a real token id first — confirm the
            evaluator does so before switching this on.

    Returns:
        The same ``batch`` object with "input_ids", "attention_mask",
        "decoder_input_ids", "labels" and "decoder_attention_mask" added.

    Raises:
        AssertionError: If any tokenized sequence does not have the expected length.
    """
    inputs = tokenizer(batch["nl"], padding="max_length", truncation=True, max_length=encoder_length)
    outputs = tokenizer(batch["code"], padding="max_length", truncation=True, max_length=decoder_length)

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["decoder_input_ids"] = outputs.input_ids

    if mask_label_padding:
        # The attention mask, not the pad token id, decides what is excluded: the pad id
        # alone is not a reliable signal for whether a label position is real.
        batch["labels"] = [
            [-100 if mask == 0 else token for mask, token in zip(masks, tokens)]
            for masks, tokens in zip(outputs.attention_mask, outputs.input_ids)
        ]
    else:
        batch["labels"] = outputs.input_ids.copy()
    batch["decoder_attention_mask"] = outputs.attention_mask

    # Sanity check: padding="max_length" plus truncation must yield fixed-length rows.
    assert all(len(ids) == encoder_length for ids in inputs.input_ids)
    assert all(len(ids) == decoder_length for ids in outputs.input_ids)

    return batch
     


In [13]:
# Columns that are exposed to the trainer as torch tensors.
_MODEL_COLUMNS = ["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"]


def _prepare_split(split):
    """Tokenize one dataset split and expose the model columns as torch tensors.

    The raw 'nl' / 'code' text columns are dropped. The default map batch size is used:
    every example is padded to a fixed length, so the encoded rows do not depend on how
    the split is chunked, and larger chunks avoid one function call per example.
    """
    encoded = split.map(
        map_to_encoder_decoder_inputs,
        batched=True,
        remove_columns=['nl', 'code'],
    )
    encoded.set_format(type="torch", columns=_MODEL_COLUMNS)
    return encoded


# Train and validation splits go through the identical preparation.
train_data = _prepare_split(train_dataset)
val_data = _prepare_split(val_dataset)

# Batch-time collation.
# NOTE(review): because `model` is supplied, DataCollatorForSeq2Seq can rebuild
# decoder_input_ids from the labels at batch time (see its documentation); if so, the
# decoder_input_ids column produced during mapping is overwritten — confirm for the
# installed transformers version.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    max_length=512,
)
     


In [14]:
# Training configuration — these values are not really tuned, feel free to change them.
training_args = Seq2SeqTrainingArguments(
    # Output / checkpointing
    output_dir="DJANGO-training",
    overwrite_output_dir=True,
    save_strategy="epoch",
    save_total_limit=2,
    # Evaluation and model selection: evaluate once per epoch and restore the best
    # checkpoint at the end. The selection metric is spelled out explicitly; "loss" with
    # greater_is_better=False is what the library falls back to when
    # load_best_model_at_end=True and no metric is given, so behavior is unchanged.
    # NOTE(review): to select on BLEU instead, use the key returned by the evaluator —
    # verify its exact name first.
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    predict_with_generate=True,
    # Batch size comes from the notebook-level constant instead of a repeated literal.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Optimisation schedule
    num_train_epochs=14,
    learning_rate=1e-5,
    weight_decay=0.01,
    warmup_ratio=0.05,
    # fp16 mixed precision needs a CUDA device, so enable it only when one is available
    # (the notebook otherwise falls back to the CPU).
    fp16=torch.cuda.is_available(),
    seed=1995,
)

# Instantiate the trainer. Early stopping watches the model-selection metric above and
# ends training once it has failed to improve by more than 0.01 for 3 evaluations in a row.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=evaluator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.01)],
)

In [15]:

# Run label: marian-DJANGO-1
# Fine-tune the model. Metrics are reported once per epoch in the table below; early
# stopping can end the run before num_train_epochs (the recorded run ended at epoch 7.0).
trainer.train()



Epoch,Training Loss,Validation Loss,Bleu,Sacrebleu,Bleu-unigram-precision,Bleu-bigram-precision,Bleu-trigram-precision,Rouge-2,Rouge-l,Sacre-unigram-precision,Sacre-bigram-precision,Sacre-trigram-precision
1,0.3488,0.31493,72.509601,74.410591,87.551677,78.489881,71.778868,72.458506,87.895897,88.106515,80.355252,74.926881
2,0.2927,0.256799,76.52572,78.966626,86.698648,79.282917,73.192436,76.169265,89.944295,87.336463,80.960834,76.210631
3,0.183,0.233253,78.632869,80.345448,88.042996,81.243769,76.036343,77.000813,90.530069,88.771251,83.066984,79.089634
4,0.1604,0.221241,79.816368,81.354769,89.377166,83.154979,78.127244,77.432417,90.829688,89.75799,84.373934,80.481249
5,0.1552,0.221766,80.694749,82.162807,90.041262,83.845864,78.832117,78.231007,91.208927,90.370899,85.023826,81.146285
6,0.0894,0.216316,81.272842,82.578538,89.837263,83.916345,78.817524,77.881628,91.154241,90.195483,85.055787,81.097175
7,0.065,0.216034,80.832335,82.576287,89.148051,83.103788,78.006152,78.183603,91.301567,89.504256,84.229271,80.199875


INFO:absl:Using default tokenizer.
INFO:utils.evaluator:Got BLEU of 72.51 and SacreBLEU of 74.41
INFO:absl:Using default tokenizer.
INFO:utils.evaluator:Got BLEU of 76.53 and SacreBLEU of 78.97
INFO:absl:Using default tokenizer.
INFO:utils.evaluator:Got BLEU of 78.63 and SacreBLEU of 80.35
INFO:absl:Using default tokenizer.
INFO:utils.evaluator:Got BLEU of 79.82 and SacreBLEU of 81.35
INFO:absl:Using default tokenizer.
INFO:utils.evaluator:Got BLEU of 80.69 and SacreBLEU of 82.16
INFO:absl:Using default tokenizer.
INFO:utils.evaluator:Got BLEU of 81.27 and SacreBLEU of 82.58
INFO:absl:Using default tokenizer.
INFO:utils.evaluator:Got BLEU of 80.83 and SacreBLEU of 82.58
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].


TrainOutput(global_step=79996, training_loss=0.27258588817216284, metrics={'train_runtime': 9578.5724, 'train_samples_per_second': 16.703, 'train_steps_per_second': 16.703, 'total_flos': 677933400195072.0, 'train_loss': 0.27258588817216284, 'epoch': 7.0})

In [16]:
# Persist the trained model. Called without a path, so it is written to the trainer's
# configured output_dir ("DJANGO-training").
trainer.save_model()