# Check GPUs and Install libraries

In [1]:
#@title 1.1 Check GPU Status
import subprocess
simple_nvidia_smi_display = True#@param {type:"boolean"}
if simple_nvidia_smi_display:
  #!nvidia-smi
  nvidiasmi_output = subprocess.run(['nvidia-smi', '-L'], stdout=subprocess.PIPE).stdout.decode('utf-8')
  print(nvidiasmi_output)
else:
  #!nvidia-smi -i 0 -e 0
  nvidiasmi_output = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE).stdout.decode('utf-8')
  print(nvidiasmi_output)
  nvidiasmi_ecc_note = subprocess.run(['nvidia-smi', '-i', '0', '-e', '0'], stdout=subprocess.PIPE).stdout.decode('utf-8')
  print(nvidiasmi_ecc_note)

GPU 0: NVIDIA L4 (UUID: GPU-cdf97e46-a922-3166-30b1-370d2a8877e2)



# Imports

In [2]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers.pipelines.pt_utils import KeyDataset
import torch
from datasets import Dataset as HFDataset
import numpy as np
import logging
import evaluate

from peft import LoraConfig, get_peft_model
from datasets import load_dataset

from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModel,
    AutoModelForSequenceClassification,
    BertConfig,
    BertModel,
    BertTokenizer,
    BertGenerationEncoder,
    BertGenerationDecoder,
    BertGenerationConfig,
    EncoderDecoderModel,
    EncoderDecoderConfig,
    Trainer,
    TrainingArguments,
    T5Tokenizer,
    T5ForConditionalGeneration,
    AutoTokenizer, 
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    GPT2Model,
    GPT2LMHeadModel,
    GPT2Config,
    GPT2ForSequenceClassification,
    GPT2Tokenizer,
)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sacrebleu import corpus_bleu
from tqdm.autonotebook import tqdm

# Loading and Data pre-processing

In [3]:
train_hf_dataset = HFDataset.load_from_disk("SubCodeXGLUE_train")
validation_hf_dataset = HFDataset.load_from_disk("SubCodeXGLUE_validation")
test_hf_dataset = HFDataset.load_from_disk("SubCodeXGLUE_test")
print(train_hf_dataset)
print(validation_hf_dataset)
print(test_hf_dataset)

Dataset({
    features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', '__index_level_0__'],
    num_rows: 2000
})
Dataset({
    features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', '__index_level_0__'],
    num_rows: 2000
})
Dataset({
    features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', '__index_level_0__'],
    num_rows: 1000
})


In [4]:
# # Load the Tokenizer
# tokenizer_bert = BertTokenizer.from_pretrained("neulab/codebert-java")

# tokenizer_bert.bos_token = tokenizer_bert.cls_token
# tokenizer_bert.eos_token = tokenizer_bert.sep_token

# # make sure GPT2 appends EOS in begin and end
# def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
#     outputs = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
#     return outputs

# GPT2Tokenizer.build_inputs_with_special_tokens = build_inputs_with_special_tokens
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# # set pad_token_id to unk_token_id -> be careful here as unk_token_id == eos_token_id == bos_token_id
# tokenizer.pad_token = tokenizer.eos_token

# config_encoder = BertConfig()
# config_decoder = GPT2Config()

# config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
# model = EncoderDecoderModel(config).from_pretrained("codeBert-GPT2-c2s/model")

EncoderDecoderModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From ðŸ‘‰v4.50ðŸ‘ˆ onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [5]:
# Load the Tokenizer
tokenizer_bert = AutoTokenizer.from_pretrained("neulab/codebert-java")

tokenizer_bert.bos_token = tokenizer_bert.cls_token
tokenizer_bert.eos_token = tokenizer_bert.sep_token

# make sure GPT2 appends EOS in begin and end
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    outputs = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
    return outputs

GPT2Tokenizer.build_inputs_with_special_tokens = build_inputs_with_special_tokens
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# set pad_token_id to unk_token_id -> be careful here as unk_token_id == eos_token_id == bos_token_id
tokenizer.pad_token = tokenizer.eos_token

# Preprocess Function for Code Summarization
def preprocess_function(examples):
    # Create input strings with clear task instructions
    inputs = []
    labels = []

    for i in range(len(examples["code"])):
        code_snippet = examples['code'][i]
        docstring = examples['docstring'][i]
        func_name = examples['func_name'][i]
        docstring_tokens_samples = examples['docstring_tokens'][i][:2]
        suffix = ' '.join(docstring_tokens_samples)

        # Tokenize the code snippet and docstring to ensure they fit within limits
        tokenized_code = tokenizer(code_snippet, truncation=True, max_length=495, return_tensors="pt")
        tokenized_docstring = tokenizer(docstring, truncation=True, max_length=123, return_tensors="pt")

        # Decode back to strings to concatenate
        code_half = tokenizer.decode(tokenized_code["input_ids"][0], skip_special_tokens=True)
        docstring_half = tokenizer.decode(tokenized_docstring["input_ids"][0], skip_special_tokens=True)

        # Concatenate the prompt with the truncated code snippet and docstring
        # code_prompt = (
        #     f"{code_half} SUMMARY: {suffix} "
        # )
        code_prompt = (
            f"Please write a summary for the code of {func_name}: {code_half} <The summary>: "
        )
        
        summary_prompt = (
            f"{docstring_half}"
        )
        input_text = f"{code_prompt}"
        label_text = f"{summary_prompt}"

        inputs.append(input_text)
        labels.append(label_text)

    max_length = 512  # Truncate/pad sequences to this length
    max_length_gpt = 128

    # Tokenize inputs and targets
    tokenized_inputs = tokenizer_bert(
        inputs, truncation=True, padding="max_length", max_length=max_length, return_tensors="pt"
    )
    
    tokenized_labels = tokenizer(
        labels, truncation=True, padding="max_length", max_length=max_length_gpt, return_tensors="pt"
    )
    
    # Align labels with tokenized target inputs
    # labels = tokenized_labels["input_ids"].clone()
    # labels[labels == tokenizer.pad_token_id] = -100  # Replace padding token IDs with -100 for loss computation
    # labels = [
    #     [-100 if mask == 0 else token for mask, token in mask_and_tokens] for mask_and_tokens in [zip(masks, labels) for masks, labels in zip(tokenized_labels.attention_mask, labels)]
    # ]

    # assert all([len(x) == max_length for x in tokenized_inputs.input_ids])
    # assert all([len(x) == max_length_gpt for x in labels])
    
    output_ids = tokenized_labels["input_ids"].clone()
    shifted_input_ids = output_ids.new_zeros(output_ids.shape)
    shifted_input_ids[:, :-1] = output_ids[:, 1:].clone()   # del CLS token
    shifted_input_ids[:, -1] = tokenizer.pad_token_id   # append [PAD] token
    labels = shifted_input_ids

    # We have to make sure that the PAD token is ignored
    # labels[labels == tokenizer.pad_token_id] = -100  # Replace padding token IDs with -100 for loss computation
    labels = [
        [-100 if mask == 0 else token for mask, token in mask_and_tokens] for mask_and_tokens in [zip(masks, labels) for masks, labels in zip(tokenized_labels.attention_mask, labels)]
    ]


    return {
        "input_ids": tokenized_inputs["input_ids"],
        "attention_mask": tokenized_inputs["attention_mask"],
        "decoder_input_ids": tokenized_labels["input_ids"],
        "decoder_attention_mask": tokenized_labels["attention_mask"],
        "labels": labels,
        "input_strings": inputs,
    }

# Load the datasets
tokenized_train_dataset = train_hf_dataset.map(preprocess_function, batched=True, batch_size=4)
tokenized_validation_dataset = validation_hf_dataset.map(preprocess_function, batched=True, batch_size=4)
tokenized_test_dataset = test_hf_dataset.map(preprocess_function, batched=True, batch_size=4)
print(tokenized_train_dataset)
print(tokenized_validation_dataset)
print(tokenized_test_dataset)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', '__index_level_0__', 'input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask', 'labels', 'input_strings'],
    num_rows: 2000
})
Dataset({
    features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', '__index_level_0__', 'input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask', 'labels', 'input_strings'],
    num_rows: 2000
})
Dataset({
    features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', '__index_level_0__', 'input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask', 'labels', 'input_strings'],
    num_rows: 1000
})


In [6]:
tokenizer_bert.decode(tokenized_train_dataset["input_ids"][1][0:], skip_special_tokens=True)

'Please write a summary for the code of Category.translate: public void translate(TranslationService translationService) {\n    if (translationService == null) {\n      description.setValue(descriptionKey.getValue());\n      return;\n    }\n\n    if (!Strings.isNullOrEmpty(descriptionKey.get())) {\n      description.setValue(translationService.translate(descriptionKey.get()));\n    }\n  } <The summary>: '

In [7]:
tokenizer.decode(tokenized_train_dataset["labels"][1][:52], skip_special_tokens=True)

'This internal method is used as a callback for when the translation\nservice or its locale changes. Also applies the translation to all\ncontained sections.\n\n@see com.dlsc.formsfx.model.structure.Group ::translate'

In [8]:
# Load pre-trained model
logging.basicConfig(level=logging.INFO)

bert = AutoModel.from_pretrained("neulab/codebert-java")
gpt2 = GPT2LMHeadModel.from_pretrained("gpt2", add_cross_attention=True, is_decoder=True)

model = EncoderDecoderModel(encoder=bert, decoder=gpt2)

# model.decoder.config.use_cache = False

# set decoding params
model.decoder.config.pad_token_id = model.decoder.config.eos_token_id
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.decoder.config.eos_token_id = tokenizer.eos_token_id

model.config.no_repeat_ngram_size = 2
model.num_beams = 3

Some weights of RobertaModel were not initialized from the model checkpoint at neulab/codebert-java and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.crossattention.c_attn.bias', 'h.0.crossattention.c_attn.weight', 'h.0.crossattention.c_proj.bias', 'h.0.crossattention.c_proj.weight', 'h.0.crossattention.q_attn.bias', 'h.0.crossattention.q_attn.weight', 'h.0.ln_cross_attn.bias', 'h.0.ln_cross_attn.weight', 'h.1.crossattention.c_attn.bias', 'h.1.crossattention.c_attn.weight', 'h.1.crossattention.c_proj.bias', 'h.1.crossattention.c_proj.weight', 'h.1.crossattention.q_attn.bias', 'h.1.crossattention.q_attn.weight', 'h.1.ln_cross_attn.bias', 'h.1.ln_cross_attn.weight', 'h.10.crossattention.c_attn.bias', 'h.10.crossattention.c_

In [None]:
# # Config LoRA specifications
# lora_config = LoraConfig(
#     # r=4096,  # lower the rank
#     r=2048,  # lower the rank
#     lora_alpha=32,
#     # target_modules=["q", "v"],  # apply LoRA to q and v of attention modules
#     # target_modules=["c_attn"],  # target query, key, and value together
#     target_modules=[
#                     "c_attn",
#                     "attn.c_attn",
#                     "attn.q_proj",
#                     "attn.k_proj",
#                     "attn.v_proj",
#                     "attn.c_proj",
#                     "mlp.c_fc",
#                     "mlp.c_proj",
#                     "query", 
#                     "value",
#                     ],  # module names for GPT2 and BERT
#     lora_dropout=0.1,
#     bias="none",
#     # task_type="SEQ_2_SEQ_LM",  # task type set to seq2seq generation
#     # task_type="SEQ_CLS",  # task type set to text classification
#     task_type="CAUSAL_LM",
#     fan_in_fan_out=True,
# )

# # Convert the model to LoRA model
# model = get_peft_model(model, lora_config)

# # Check the number of trainable parameters (for LoRA)
# model.print_trainable_parameters()

In [9]:
# Adjust the batch size
batch_size = 4
training_args = TrainingArguments(
    output_dir="./results/codeBert-GPT2-c2s",
    # eval_strategy="epoch",
    learning_rate=5e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    metric_for_best_model="f1",
    weight_decay=0.01,
    num_train_epochs=12,
    save_strategy="no",
    logging_strategy="steps",
    logging_steps=100,
    report_to="mlflow",  # disable wandb etc.
    fp16=True,  # mixed precision training
    optim="adamw_torch",  # use torch original optimizer
)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # Extract logits from tuple and process predictions
    predictions = predictions[0]  # Extract the array
    predictions = np.argmax(predictions, axis=-1)  # Convert logits to token IDs
    predictions = predictions.tolist()  # Convert to a list

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Format labels for BLEU (expects list of lists for references)
    decoded_labels = [[label] for label in decoded_labels]

    # Compute BLEU score using sacrebleu
    bleu = corpus_bleu(decoded_preds, decoded_labels)

    return {"bleu": bleu.score}

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    # compute_metrics=compute_metrics,
    # train_dataset=tokenized_train_dataset.select(range(100)),
    train_dataset=tokenized_train_dataset,
    # eval_dataset=tokenized_validation_dataset,
)

# Start Training
trainer.train()

# Save the model after LoRA fine-tuing
model.save_pretrained("./codeBert-GPT2-c2s/model")
tokenizer.save_pretrained("./codeBert-GPT2-c2s/tokenizer")

print("LoRA fine-tuning done, model saved!")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
100,4.0555
200,3.668
300,3.6939
400,3.6319
500,3.587
600,2.6797
700,2.6919
800,2.7495
900,2.742
1000,2.7121




LoRA fine-tuning done, model saved!


In [10]:
model.to("cuda:0")
pred = []
for i in tqdm(KeyDataset(tokenized_test_dataset, "input_strings")):
    input_ids = tokenizer_bert(
                i, truncation=True, padding="max_length", max_length=512, return_tensors="pt"
                ).input_ids.to("cuda:0")
    output_ids = model.generate(input_ids,
                            do_sample=True,
                            # tokenizer=tokenizer,
                            max_new_tokens=25,
                            min_new_tokens=12,
                            pad_token_id=tokenizer.eos_token_id,
                            temperature=0.5,
                            num_beams=5,
                            early_stopping=True,
                            no_repeat_ngram_size=3,
                            top_k=50,
                            top_p=0.9,
                           )
    pred.append(tokenizer.decode(output_ids[0], skip_special_tokens=True))

  0%|          | 0/1000 [00:00<?, ?it/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [11]:
google_bleu = evaluate.load("google_bleu")
google_bleu_scores = []
flag = 3
for i, v in enumerate(tqdm(pred)):
    if flag:
        print(f"BERT-GPT2: {[v]}")
        print("\n")
        print(f'Label: {[[tokenized_test_dataset["docstring"][i]]]}')
        print("*" * 30)
        flag -= 1
    google_bleu_scores.append(
                              google_bleu.compute(
                                                  predictions=[v], 
                                                  references=[[tokenized_test_dataset["docstring"][i]]]
                                                  )['google_bleu']
                             )

  0%|          | 0/1000 [00:00<?, ?it/s]

BERT-GPT2: ["Returns a list of class fields. Supports inheritance and doesn't return synthetic fields.\n\n@param beanClass class to be"]


Label: [['Import Pipeline Fragment Configuration & Rules\n\n@param fragmentId Fragment  Id\n@return fragmentEnvelope']]
******************************
BERT-GPT2: ['Sets the publish queue shutdown time.\n\n@param publishQueueShutdowntime the shutdown time to set, parsed']


Label: [['deserialize request command\n\n@return true if deserialize success; false if exception catched']]
******************************
BERT-GPT2: ['Creates a new node as a child of the current node.\n\n@param name the name of the new node\n']


Label: [['Appends TO address by personal name and email address.\n\n@param personalName personal name.\n@param to           email address.\n@return this\n@see #to(EmailAddress)']]
******************************


In [12]:
total_bleu = sum(google_bleu_scores)
average_bleu = total_bleu / len(google_bleu_scores)

print(f"Average BLEU Score: {average_bleu * 100}")

Average BLEU Score: 3.7891931265051997
