# Check GPUs and Install libraries

In [1]:
#@title 1.1 Check GPU Status
import subprocess

simple_nvidia_smi_display = True#@param {type:"boolean"}


def _run_nvidia_smi(*args):
  """Run ``nvidia-smi`` with ``args`` and return its decoded stdout.

  Returns a short notice instead of raising when the ``nvidia-smi``
  executable is not installed, so the notebook can still be run on a
  machine without an NVIDIA driver.
  """
  try:
    completed = subprocess.run(['nvidia-smi', *args], stdout=subprocess.PIPE)
  except FileNotFoundError:
    return "nvidia-smi not found: no NVIDIA driver/GPU available in this runtime."
  return completed.stdout.decode('utf-8')


if simple_nvidia_smi_display:
  # Compact listing, one line per GPU (same as `!nvidia-smi -L`).
  nvidiasmi_output = _run_nvidia_smi('-L')
  print(nvidiasmi_output)
else:
  # Full status table first ...
  nvidiasmi_output = _run_nvidia_smi()
  print(nvidiasmi_output)
  # ... then the output of `!nvidia-smi -i 0 -e 0`, which asks nvidia-smi to
  # set the ECC configuration of GPU index 0 to 0 (disabled).
  nvidiasmi_ecc_note = _run_nvidia_smi('-i', '0', '-e', '0')
  print(nvidiasmi_ecc_note)

GPU 0: NVIDIA L4 (UUID: GPU-417126cd-a631-a8e0-0982-caf3b63f21a4)



# Imports

In [10]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
from datasets import Dataset as HFDataset
import numpy as np

from peft import LoraConfig, get_peft_model
from datasets import load_dataset
import evaluate

from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    BertConfig,
    BertModel,
    Trainer,
    TrainingArguments,
    T5Tokenizer,
    T5ForConditionalGeneration,
    AutoTokenizer, 
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    GPT2Model,
    GPT2Config,
    GPT2ForSequenceClassification,
    GPT2Tokenizer,
)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sacrebleu import corpus_bleu
from tqdm.autonotebook import tqdm

# Data loading and pre-processing

In [7]:
# Read the three pre-saved SubCodeXGLUE splits from disk (train, validation, test order)
train_hf_dataset, validation_hf_dataset, test_hf_dataset = map(
    HFDataset.load_from_disk,
    ("SubCodeXGLUE_train", "SubCodeXGLUE_validation", "SubCodeXGLUE_test"),
)

# Show the schema and row count of each split
for loaded_split in (train_hf_dataset, validation_hf_dataset, test_hf_dataset):
    print(loaded_split)

Dataset({
    features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', '__index_level_0__'],
    num_rows: 2000
})
Dataset({
    features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', '__index_level_0__'],
    num_rows: 2000
})
Dataset({
    features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', '__index_level_0__'],
    num_rows: 1000
})


In [8]:
# Pre-process the original data
def preprocess_function(examples):
    """Tokenize a batch of code/docstring pairs for seq2seq fine-tuning.

    Args:
        examples: A batch (mapping of column name -> list of values) that
            provides the "func_name", "code" and "docstring" columns.

    Returns:
        dict with "input_ids" and "attention_mask" for the prompt built from
        the function name and code, and "labels" holding the token ids of the
        prefixed docstring. Every sequence is truncated/padded to 512 tokens.
    """
    max_length = 512

    inputs = [
        "Summarize the CODE into a DOCSTRING. FUNCTION NAME: " + func_name + " CODE: " + code
        for func_name, code in zip(examples["func_name"], examples["code"])
    ]
    targets = ["DOCSTRING: " + docstring for docstring in examples["docstring"]]

    # Tokenize the prompts once and reuse the encoding for both output fields
    # (previously the same batch was tokenized twice).
    encoded_inputs = tokenizer(inputs, truncation=True, padding="max_length", max_length=max_length)

    # NOTE(review): padded label positions keep the pad token id, so the loss
    # presumably also covers padding. Masking them with -100 would require
    # compute_metrics to map -100 back before decoding -- confirm before changing.
    encoded_targets = tokenizer(targets, truncation=True, padding="max_length", max_length=max_length)

    return {
        "input_ids": encoded_inputs["input_ids"],
        "attention_mask": encoded_inputs["attention_mask"],
        "labels": encoded_targets["input_ids"],
    }

# Tokenizer that matches the T5-base checkpoint
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")

# Run preprocess_function over every split, four examples per batch
map_options = {"batched": True, "batch_size": 4}
tokenized_train_dataset = train_hf_dataset.map(preprocess_function, **map_options)
tokenized_validation_dataset = validation_hf_dataset.map(preprocess_function, **map_options)
tokenized_test_dataset = test_hf_dataset.map(preprocess_function, **map_options)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [5]:
# Start from the pre-trained T5-base seq2seq checkpoint
base_model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")

# LoRA adapter settings.
# NOTE(review): r=4096 is a very large rank -- the recorded output of this cell
# reports roughly 67% of all parameters as trainable. An r=48 variant was tried
# earlier; confirm the high rank is intentional.
lora_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",   # seq2seq generation task
    target_modules=["q", "v"],  # adapt the q and v attention projections
    r=4096,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

# Wrap the base model with the LoRA adapters
model = get_peft_model(base_model, lora_config)

# Report how many parameters are trainable after wrapping
model.print_trainable_parameters()

trainable params: 452,984,832 || all params: 675,888,384 || trainable%: 67.0207


In [6]:
# Per-device batch size shared by training and evaluation
batch_size = 4
training_args = TrainingArguments(
    output_dir="./results/lora-t5-c2s",
    # eval_strategy="epoch",  # evaluation during training is currently disabled
    learning_rate=5e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Must name a key returned by compute_metrics, which reports "bleu"
    # (the previous value "f1" is never produced).
    metric_for_best_model="bleu",
    weight_decay=0.01,
    num_train_epochs=6,
    save_strategy="epoch",  # write a checkpoint at the end of every epoch
    logging_strategy="steps",
    logging_steps=10,
    report_to="mlflow",  # log to MLflow only (no wandb etc.)
    fp16=True,  # mixed precision training
    optim="adamw_torch",  # use torch original optimizer
)

# Load the "google_bleu" metric used by compute_metrics
google_bleu = evaluate.load("google_bleu")

def compute_metrics(eval_pred):
    """Compute Google BLEU between greedy (argmax) predictions and the labels.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is either
            the logits array itself or a tuple whose first element is the
            logits array; ``labels`` holds the reference token ids.

    Returns:
        dict: ``{"bleu": <google_bleu score>}``.
    """
    predictions, labels = eval_pred

    # The model may return extra outputs next to the logits (as a tuple with
    # the logits first). Only unwrap in that case: indexing a bare logits
    # array with [0] would silently select the first example instead.
    logits = predictions[0] if isinstance(predictions, tuple) else predictions
    predicted_ids = np.argmax(logits, axis=-1).tolist()  # logits -> token ids

    # -100 marks label positions ignored by the loss and is not a valid token
    # id; map it to the pad token so decoding works and the position is skipped.
    label_ids = np.asarray(labels)
    label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
    decoded_refs = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # google_bleu expects a list of reference lists (one list per prediction)
    score = google_bleu.compute(
        predictions=decoded_preds,
        references=[[ref] for ref in decoded_refs],
    )

    return {"bleu": score["google_bleu"]}

# Build the Trainer; no eval_dataset is supplied, so only training runs here
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_train_dataset,
    compute_metrics=compute_metrics,
)

# Run the fine-tuning loop
trainer.train()

# Persist the fine-tuned LoRA model and the tokenizer side by side
model_dir = "./lora-t5-c2s/model"
tokenizer_dir = "./lora-t5-c2s/tokenizer"
model.save_pretrained(model_dir)
tokenizer.save_pretrained(tokenizer_dir)

print("LoRA fine-tuning done, model saved!")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
10,14.1061
20,7.2385
30,1.0685
40,0.5495
50,0.6997
60,0.5106
70,0.4834
80,0.4167
90,0.3537
100,0.3338


LoRA fine-tuning done, model saved!


In [3]:
# Reload the fine-tuned model from the checkpoint written during training
checkpoint_dir = "results/lora-t5-c2s/checkpoint-3000"
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_dir)

# Switch off use_cache in the model's generation config
model.generation_config.use_cache = False

In [11]:
# Load the "google_bleu" metric through the `evaluate` library; compute_metrics uses it below
google_bleu = evaluate.load("google_bleu")

def compute_metrics(eval_pred):
    """Compute Google BLEU between greedy (argmax) predictions and the labels.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is either
            the logits array itself or a tuple whose first element is the
            logits array; ``labels`` holds the reference token ids.

    Returns:
        dict: ``{"bleu": <google_bleu score>}``.
    """
    predictions, labels = eval_pred

    # The model may return extra outputs next to the logits (as a tuple with
    # the logits first). Only unwrap in that case: indexing a bare logits
    # array with [0] would silently select the first example instead.
    logits = predictions[0] if isinstance(predictions, tuple) else predictions
    predicted_ids = np.argmax(logits, axis=-1).tolist()  # logits -> token ids

    # -100 marks label positions ignored by the loss and is not a valid token
    # id; map it to the pad token so decoding works and the position is skipped.
    label_ids = np.asarray(labels)
    label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
    decoded_refs = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # google_bleu expects a list of reference lists (one list per prediction)
    score = google_bleu.compute(
        predictions=decoded_preds,
        references=[[ref] for ref in decoded_refs],
    )

    return {"bleu": score["google_bleu"]}

# Evaluation-only Trainer: no TrainingArguments are passed, so the library defaults apply
trainer = Trainer(compute_metrics=compute_metrics, model=model)

# Run prediction over the test split in slices of `chunk_size` examples and keep
# the metrics dict of every slice (presumably chunked to bound memory use -- confirm).
chunk_size = 2
test_results = []
num_test_examples = len(tokenized_test_dataset)
for start in tqdm(range(0, num_test_examples, chunk_size)):
    stop = min(start + chunk_size, num_test_examples)
    small_test_dataset = tokenized_test_dataset.select(range(start, stop))
    with torch.no_grad():
        predictions, labels, metrics = trainer.predict(small_test_dataset)
    test_results.append(metrics)
    # Release cached GPU memory before the next slice
    torch.cuda.empty_cache()

  0%|          | 0/500 [00:00<?, ?it/s]

In [13]:
# Average the per-chunk 'test_bleu' values collected during prediction.
# Note: this is the mean of the chunk-level scores, not one score over the whole test set.
chunk_bleu_scores = [chunk_metrics['test_bleu'] for chunk_metrics in test_results]
total_bleu = sum(chunk_bleu_scores)
average_bleu = total_bleu / len(chunk_bleu_scores)

print(f"Average BLEU Score: {average_bleu * 100}")

Average BLEU Score: 19.48574479301977
