# Check GPUs and Install libraries

In [1]:
#@title 1.1 Check GPU Status
import subprocess
simple_nvidia_smi_display = True#@param {type:"boolean"}


def _run_nvidia_smi(*args):
  """Run ``nvidia-smi`` with the given arguments and return its decoded stdout.

  Args:
    *args: Extra command-line arguments appended after ``nvidia-smi``.

  Returns:
    The captured standard output as text, or a short explanatory message
    when the ``nvidia-smi`` executable cannot be found, so the cell does not
    abort with ``FileNotFoundError``.
  """
  try:
    completed = subprocess.run(['nvidia-smi', *args], stdout=subprocess.PIPE)
  except FileNotFoundError:
    return 'nvidia-smi executable not found; cannot query GPU status.\n'
  return completed.stdout.decode('utf-8')


if simple_nvidia_smi_display:
  # Equivalent of `!nvidia-smi -L`: short listing of the visible GPUs.
  nvidiasmi_output = _run_nvidia_smi('-L')
  print(nvidiasmi_output)
else:
  # Equivalent of `!nvidia-smi`: full status table.
  nvidiasmi_output = _run_nvidia_smi()
  print(nvidiasmi_output)
  # Equivalent of `!nvidia-smi -i 0 -e 0`.
  # NOTE(review): `-e 0` looks like a request to change the ECC setting of GPU 0
  # rather than a read-only query -- confirm this is intended.
  nvidiasmi_ecc_note = _run_nvidia_smi('-i', '0', '-e', '0')
  print(nvidiasmi_ecc_note)

GPU 0: NVIDIA L4 (UUID: GPU-df7e561c-3e29-f733-3130-b65c68dce7a7)



# Imports

In [1]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers.pipelines.pt_utils import KeyDataset
import torch
from datasets import Dataset as HFDataset
import numpy as np
import evaluate

from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset

from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModel,
    AutoModelForSequenceClassification,
    BertConfig,
    BertModel,
    Trainer,
    TrainingArguments,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    T5Tokenizer,
    T5ForConditionalGeneration,
    AutoTokenizer, 
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    GPT2Model,
    GPT2Config,
    GPT2ForSequenceClassification,
    GPT2Tokenizer,
    GPT2LMHeadModel,
    GPT2ForQuestionAnswering,
)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sacrebleu import corpus_bleu
from tqdm.autonotebook import tqdm
from accelerate import Accelerator

# Data loading and pre-processing

In [2]:
# Read the three pre-split dataset directories from local disk, in the order
# train / validation / test, and show a summary of each.
train_hf_dataset, validation_hf_dataset, test_hf_dataset = (
    HFDataset.load_from_disk(split_dir)
    for split_dir in (
        "SubCodeXGLUE_train",
        "SubCodeXGLUE_validation",
        "SubCodeXGLUE_test",
    )
)
print(train_hf_dataset, validation_hf_dataset, test_hf_dataset, sep="\n")

Dataset({
    features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', '__index_level_0__'],
    num_rows: 2000
})
Dataset({
    features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', '__index_level_0__'],
    num_rows: 2000
})
Dataset({
    features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', '__index_level_0__'],
    num_rows: 1000
})


In [3]:
# Load the Tokenizer

# make sure GPT2 appends EOS in begin and end
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    """Surround a list of token ids with the tokenizer's BOS and EOS ids.

    Args:
        self: Object exposing ``bos_token_id`` and ``eos_token_id``.
        token_ids_0: List of token ids for the sequence.
        token_ids_1: Accepted for signature compatibility; not used.

    Returns:
        A new list: ``[bos] + token_ids_0 + [eos]``.
    """
    leading = [self.bos_token_id]
    trailing = [self.eos_token_id]
    return leading + token_ids_0 + trailing

# Patch the tokenizer class so that every encoded sequence is wrapped in
# BOS ... EOS by the function defined above, then load the stock GPT-2 vocabulary.
GPT2Tokenizer.build_inputs_with_special_tokens = build_inputs_with_special_tokens
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# GPT-2 doesn't have a pad token by default, so the EOS token is reused for padding.
# Because pad and EOS then share one id, padding has to be told apart from a real
# EOS via the attention mask, not via the token id.
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 doesn't have a pad token by default, so use eos_token
# Four extra marker tokens that delimit the code part and the summary part of a prompt.
# Adding them grows the vocabulary, so the model's embedding matrix must be resized to
# len(tokenizer) before training or inference.
special_tokens = {"additional_special_tokens": ["<|startofcode|>",
                                                "<|endofcode|>", 
                                                "<|startofsummary|>",
                                                "<|endofsummary|>",
                                               ]}
tokenizer.add_special_tokens(special_tokens)

# Alternative: reload the tokenizer saved after fine-tuning instead of rebuilding it.
# tokenizer = AutoTokenizer.from_pretrained("lora-gpt-c2s/tokenizer")

# Preprocess Function for Code Summarization
def preprocess_function(examples, max_code_tokens=919, max_summary_tokens=90, max_length=1024):
    """Turn a batch of (code, docstring) rows into causal-LM training features.

    Each row is rendered as one text sequence::

        <|startofcode|>{code}<|endofcode|><|startofsummary|> SUMMARY: {docstring}<|endofsummary|>

    The sequence is tokenized and padded to ``max_length``. Labels are a copy of
    the input ids with every padded position replaced by -100.

    Args:
        examples: Batch dict with the columns ``"code"``, ``"docstring"`` and
            ``"docstring_tokens"`` (one list entry per row).
        max_code_tokens: Token budget applied to the code snippet before it is
            placed in the prompt.
        max_summary_tokens: Token budget applied to the docstring.
        max_length: Final padded/truncated sequence length.

    Returns:
        Dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"`` tensors of
        shape (batch, max_length), plus ``"input_strings"``: the inference prompt
        for each row, which ends with the first three docstring tokens as a hint.
    """
    training_texts = []
    input_strings = []

    for code_snippet, docstring, docstring_tokens in zip(
        examples["code"], examples["docstring"], examples["docstring_tokens"]
    ):
        # The first three reference tokens are appended to the inference prompt.
        suffix = " ".join(docstring_tokens[:3])

        # Truncate each half on token boundaries, then decode back to text so the
        # halves can be spliced into the prompt template.
        tokenized_code = tokenizer(
            code_snippet, truncation=True, max_length=max_code_tokens, return_tensors="pt"
        )
        tokenized_docstring = tokenizer(
            docstring, truncation=True, max_length=max_summary_tokens, return_tensors="pt"
        )
        code_half = tokenizer.decode(tokenized_code["input_ids"][0], skip_special_tokens=True)
        docstring_half = tokenizer.decode(tokenized_docstring["input_ids"][0], skip_special_tokens=True)

        code_prompt = f"<|startofcode|>{code_half}<|endofcode|><|startofsummary|> SUMMARY: "
        training_texts.append(f"{code_prompt}{docstring_half}<|endofsummary|>")
        input_strings.append(f"{code_prompt}{suffix} ")

    # Inputs and labels come from the same text, so a single tokenizer pass is enough.
    encoded = tokenizer(
        training_texts,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt",
    )

    # Mask padding by attention mask, not by token id: the pad token is the EOS
    # token, and the real EOS at the end of each sequence must stay in the loss.
    labels = encoded["input_ids"].masked_fill(encoded["attention_mask"] == 0, -100)

    return {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
        "labels": labels,
        "input_strings": input_strings,
    }

# Tokenize every split with preprocess_function, four rows per call, and show
# the resulting column layout of each split.
map_options = {"batched": True, "batch_size": 4}
tokenized_train_dataset = train_hf_dataset.map(preprocess_function, **map_options)
tokenized_validation_dataset = validation_hf_dataset.map(preprocess_function, **map_options)
tokenized_test_dataset = test_hf_dataset.map(preprocess_function, **map_options)
print(
    tokenized_train_dataset,
    tokenized_validation_dataset,
    tokenized_test_dataset,
    sep="\n",
)

Dataset({
    features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', '__index_level_0__', 'input_ids', 'attention_mask', 'labels', 'input_strings'],
    num_rows: 2000
})
Dataset({
    features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', '__index_level_0__', 'input_ids', 'attention_mask', 'labels', 'input_strings'],
    num_rows: 2000
})
Dataset({
    features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', '__index_level_0__', 'input_ids', 'attention_mask', 'labels', 'input_strings'],
    num_rows: 1000
})


In [12]:
# Sanity check: decode the input ids of training example 1 back to text with
# special tokens dropped (the `[0:]` slice keeps the whole id sequence).
tokenizer.decode(tokenized_train_dataset["input_ids"][1][0:], skip_special_tokens=True)

'public void translate(TranslationService translationService) {\n    if (translationService == null) {\n      description.setValue(descriptionKey.getValue());\n      return;\n    }\n\n    if (!Strings.isNullOrEmpty(descriptionKey.get())) {\n      description.setValue(translationService.translate(descriptionKey.get()));\n    }\n  } SUMMARY: This internal method is used as a callback for when the translation\nservice or its locale changes. Also applies the translation to all\ncontained sections.\n\n@see com.dlsc.formsfx.model.structure.Group ::translate'

In [13]:
# Sanity check: decode the first 164 label ids of training example 1 with the
# special tokens left visible.
# NOTE(review): 164 is presumably this example's unpadded length -- padded label
# positions hold -100, which is not a token id; confirm before changing the slice.
tokenizer.decode(tokenized_train_dataset["labels"][1][:164], skip_special_tokens=False)

'<|endoftext|> <|startofcode|> public void translate(TranslationService translationService) {\n    if (translationService == null) {\n      description.setValue(descriptionKey.getValue());\n      return;\n    }\n\n    if (!Strings.isNullOrEmpty(descriptionKey.get())) {\n      description.setValue(translationService.translate(descriptionKey.get()));\n    }\n  } <|endofcode|> <|startofsummary|>  SUMMARY: This internal method is used as a callback for when the translation\nservice or its locale changes. Also applies the translation to all\ncontained sections.\n\n@see com.dlsc.formsfx.model.structure.Group ::translate <|endofsummary|> <|endoftext|>'

In [4]:
# Load the pre-trained GPT-2 language model.
# `from_pretrained` is a classmethod, so it is called on the class directly;
# building `GPT2LMHeadModel(configuration)` first would only allocate a randomly
# initialised model that is thrown away immediately.
configuration = GPT2Config()  # default config object, kept as a notebook-level name
model = GPT2LMHeadModel.from_pretrained("gpt2")

# The tokenizer reuses EOS as the pad token, so mirror that in the model config.
model.config.pad_token_id = tokenizer.eos_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.config.decoder_start_token_id = tokenizer.bos_token_id

# Generation defaults. All of them are stored on `model.config`; attributes set
# directly on the model object (e.g. `model.num_beams`) are not read by anything.
# Arguments passed explicitly at generation time still take precedence.
model.config.no_repeat_ngram_size = 2
model.config.num_beams = 3
model.config.max_length = 25
model.config.min_length = 12
model.config.early_stopping = True

# The tokenizer gained extra special tokens; grow the embedding matrix to match.
model.resize_token_embeddings(len(tokenizer))

Embedding(50261, 768)

In [10]:
# Load pre-trained model
# configuration = GPT2Config()
# base_model = "gpt2"
# adapter_model = "lora-gpt-c2s/model"

# model = GPT2LMHeadModel(configuration).from_pretrained("gpt2")
# model.config.pad_token_id = model.config.eos_token_id
# model.resize_token_embeddings(len(tokenizer))
# model = PeftModel.from_pretrained(model, adapter_model)

In [5]:
# Configure LoRA for GPT-2.
# NOTE(review): r=512 is a large rank -- with these targets roughly 38% of all
# parameters are trainable -- and the LoRA scaling factor lora_alpha / r is
# 32 / 512 = 0.0625. Confirm both are intended.
lora_config = LoraConfig(
    r=512,
    lora_alpha=32,
    # Only module names that exist in GPT-2 blocks are listed: the fused q/k/v
    # projection (`c_attn`), the attention output projection and the two MLP
    # projections. GPT-2 has no separate q_proj / k_proj / v_proj modules.
    target_modules=[
        "c_attn",
        "attn.c_proj",
        "mlp.c_fc",
        "mlp.c_proj",
    ],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    # GPT-2's Conv1D layers store weights as (fan_in, fan_out).
    fan_in_fan_out=True,
)

# Wrap the base model so that only the LoRA adapter weights are trained.
model = get_peft_model(model, lora_config)

# Report how many parameters are trainable.
model.print_trainable_parameters()

trainable params: 75,497,472 || all params: 199,940,352 || trainable%: 37.7600


In [6]:
# Stand-alone Accelerate handle for this notebook.
accelerator = Accelerator()

# Define accelerator_config as a dictionary.
# It is currently not used: the `accelerator_config=` argument below is commented out.
accelerator_config = {
    "split_batches": True,
    "even_batches": True,
    "use_seedable_sampler": True
}

# model.resize_token_embeddings(len(tokenizer))

# Batch size per device, shared by training and evaluation.
batch_size = 4
training_args = Seq2SeqTrainingArguments(
    output_dir="./results/lora-gpt-c2s",
    # Evaluation during training is switched off (the strategy below is commented out).
    # eval_strategy="epoch",
    learning_rate=5e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # NOTE(review): with evaluation off and save_strategy="no" this setting
    # presumably has no effect -- confirm.
    metric_for_best_model="bleu",
    weight_decay=0.01,
    num_train_epochs=6,
    # No intermediate checkpoints are written; the adapter is saved manually after training.
    # save_strategy="epoch",
    save_strategy="no",
    logging_strategy="steps",
    logging_steps=100,
    report_to="mlflow",  # log to MLflow only (no wandb or other integrations)
    fp16=True,  # mixed precision training
    optim="adamw_torch",  # use torch original optimizer
    # accelerator_config=accelerator_config,
)

def compute_metrics(eval_pred):
    """Compute corpus-level BLEU between greedy predictions and the labels.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is either the
            logits array or a tuple whose first element is the logits array
            (batch, sequence, vocabulary). ``labels`` holds token ids, with -100
            at positions excluded from the loss.

    Returns:
        ``{"bleu": <sacreBLEU corpus score>}``.
    """
    predictions, labels = eval_pred

    # Some models return extra outputs next to the logits; the logits come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predicted_ids = np.argmax(predictions, axis=-1).tolist()  # logits -> token ids

    # -100 marks ignored label positions and is not a token id, so replace it
    # with the pad token, which is dropped again by skip_special_tokens below.
    label_ids = np.where(np.asarray(labels) == -100, tokenizer.pad_token_id, labels).tolist()

    decoded_preds = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # sacreBLEU expects a list of reference *streams*, each aligned one-to-one
    # with the hypotheses. There is one reference per example, so one stream.
    bleu = corpus_bleu(decoded_preds, [decoded_labels])

    return {"bleu": bleu.score}

# Define Trainer
trainer = accelerator.prepare(
    Seq2SeqTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_dataset,
    # eval_dataset=tokenized_validation_dataset,
                  )
                             )

# Start Training
trainer.train()

# Save the model after LoRA fine-tuing
model.save_pretrained("./lora-gpt-c2s/model", save_embedding_layers=True)
tokenizer.save_pretrained("./lora-gpt-c2s/tokenizer")

print("LoRA fine-tuning done, model saved!")

# test_results = []
# chunk_size = 2
# for i in tqdm(range(0, len(tokenized_test_dataset), chunk_size)):
#     small_test_dataset = tokenized_test_dataset.select(range(i, min(i + chunk_size, len(tokenized_test_dataset))))
#     with torch.no_grad():
#         predictions, labels, metrics = trainer.predict(small_test_dataset)
#         test_results.append(metrics)
#     torch.cuda.empty_cache()

Step,Training Loss
100,7.6146
200,2.0163
300,1.901
400,1.8723
500,1.7978
600,1.7511
700,1.692
800,1.7611
900,1.655
1000,1.6859


LoRA fine-tuning done, model saved!


In [13]:
import statistics

# Number of docstring tokens per test example, and summary statistics over them.
test_lengths = list(map(len, tokenized_test_dataset['docstring_tokens']))
length_mean = statistics.mean(test_lengths)
length_stdev = statistics.stdev(test_lengths)
length_variance = statistics.variance(test_lengths)
print(f'mean: {length_mean}')
print(f'stdev: {length_stdev}')
print(f'variance: {length_variance}')

mean: 12.353
stdev: 8.579393742974908
variance: 73.60599699699699


In [None]:
# Reload the fine-tuned artefacts for inference.
# NOTE(review): AutoTokenizer may return a different tokenizer class than the
# patched GPT2Tokenizer used for training, in which case the BOS/EOS wrapping
# would not be applied to prompts -- confirm.
tokenizer = AutoTokenizer.from_pretrained("lora-gpt-c2s/tokenizer")

base_model = "gpt2"
adapter_model = "lora-gpt-c2s/model"
configuration = GPT2Config()  # default config object, kept as a notebook-level name

# `from_pretrained` is a classmethod; call it on the class so no throwaway
# randomly initialised model is built first.
model = GPT2LMHeadModel.from_pretrained(base_model)
# The embedding matrix must match the extended vocabulary before the adapter,
# which was saved together with the embedding layers, is loaded on top.
model.resize_token_embeddings(len(tokenizer))

# Attach the fine-tuned LoRA adapter. Without this step the notebook would
# evaluate plain GPT-2 with untrained embeddings for the new special tokens.
model = PeftModel.from_pretrained(model, adapter_model)
_ = model.eval()  # inference mode (disables dropout); `_ =` suppresses the module repr

In [9]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Text-generation pipeline used to produce a summary for each prompt.
# Sampling is enabled (do_sample=True) and no random seed is set in this cell,
# so generated summaries -- and the scores derived from them -- can differ
# between runs.
summarization_pipeline = pipeline(
                                  "text-generation",
                                  do_sample=True,
                                  model=model,
                                  tokenizer=tokenizer,
                                  # Generate between 9 and 25 new tokens per prompt.
                                  max_new_tokens=25,
                                  min_new_tokens=9,
                                  truncation=True,
                                  device=device,
                                  # The tokenizer reuses EOS as its pad token.
                                  pad_token_id=tokenizer.eos_token_id,
                                  # Return only the continuation, not the prompt.
                                  return_full_text=False,
                                  temperature=0.3,
                                  num_beams=4,
                                  # early_stopping=True,
                                  no_repeat_ngram_size=2,
                                  top_k=50,
                                  top_p=0.9,
                                 )
# pipeline.model = PeftModel.from_pretrained(model, adapter_model)
# pipeline.model.resize_token_embeddings(len(tokenizer))
                                            
# code_snippet = "public static void main(String[] args){System.out.println('Hello, world.');}"
# code_snippet = "private void setNodekeyInJsonResponse(String service) throws Exception { String filename = this.baseDirectory + service + '.json'; Scanner s = new Scanner(new File(filename)); PrintWriter fw = new PrintWriter(new File(filename + '.new')); while (s.hasNextLine()) { fw.println(s.nextLine().replaceAll('NODEKEY', this.key)); } s.close(); fw.close(); (new File(filename + '.new')).renameTo(new File(filename)); }"
# result = summarization_pipeline(f"Please write a summary for the following Java code snippet: {code_snippet}. A summary of the above Java code snippet is that:")
# result = summarization_pipeline("A summary of the above code snippet is that:")
# output = summarization_pipeline(f"{code_snippet} SUMMARY: ")
# generated_text = output[0]['generated_text']
# print(generated_text)

# Score every generated summary against its reference docstring with the
# Google-BLEU metric, printing the first few pairs for a visual check.
google_bleu = evaluate.load("google_bleu")
google_bleu_scores = []
num_previews = 3  # how many (prediction, reference) pairs to print

# Read the reference column once instead of indexing the dataset column on
# every loop iteration.
reference_docstrings = tokenized_test_dataset["docstring"]

for i, v in enumerate(tqdm(summarization_pipeline(KeyDataset(tokenized_test_dataset, "input_strings")))):
    generated_summary = v[0]['generated_text']
    reference = reference_docstrings[i]
    if i < num_previews:
        print(f"GPT2: {[generated_summary]}")
        print("\n")
        print(f'Label: {[[reference]]}')
        print("*" * 30)
    # One prediction with one reference per call gives a per-example score.
    google_bleu_scores.append(
        google_bleu.compute(
            predictions=[generated_summary],
            references=[[reference]],
        )['google_bleu']
    )

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCaus

  0%|          | 0/1000 [00:00<?, ?it/s]

GPT2: ['@param fragmentName\n@return\nThe fragment name of the fragment.']


Label: [['Import Pipeline Fragment Configuration & Rules\n\n@param fragmentId Fragment  Id\n@return fragmentEnvelope']]
******************************
GPT2: ['\nThis method is used to check if the request is valid.\n\n@return']


Label: [['deserialize request command\n\n@return true if deserialize success; false if exception catched']]
******************************
GPT2: ['@param to\nThe address of the email address\n@return']


Label: [['Appends TO address by personal name and email address.\n\n@param personalName personal name.\n@param to           email address.\n@return this\n@see #to(EmailAddress)']]
******************************


In [10]:
# Mean of the per-example Google-BLEU scores, reported on a 0-100 scale.
num_scored = len(google_bleu_scores)
total_bleu = sum(google_bleu_scores)
average_bleu = total_bleu / num_scored
average_bleu_percent = average_bleu * 100

print(f"Average BLEU Score: {average_bleu_percent}")

Average BLEU Score: 6.008568243719998
