# Check GPUs and Install libraries

In [1]:
#@title 1.1 Check GPU Status
import subprocess
simple_nvidia_smi_display = True#@param {type:"boolean"}
if simple_nvidia_smi_display:
  #!nvidia-smi
  nvidiasmi_output = subprocess.run(['nvidia-smi', '-L'], stdout=subprocess.PIPE).stdout.decode('utf-8')
  print(nvidiasmi_output)
else:
  #!nvidia-smi -i 0 -e 0
  nvidiasmi_output = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE).stdout.decode('utf-8')
  print(nvidiasmi_output)
  nvidiasmi_ecc_note = subprocess.run(['nvidia-smi', '-i', '0', '-e', '0'], stdout=subprocess.PIPE).stdout.decode('utf-8')
  print(nvidiasmi_ecc_note)

GPU 0: NVIDIA L4 (UUID: GPU-471c91dd-7b13-e725-8b8c-200c4cd9f9ba)



# Imports

In [8]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
from datasets import Dataset as HFDataset

from peft import LoraConfig, get_peft_model
from datasets import load_dataset

from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    BertConfig,
    BertModel,
    Trainer,
    TrainingArguments,
    T5Tokenizer,
    T5ForConditionalGeneration,
    AutoTokenizer, 
    AutoModelForCausalLM,
    GPT2Model,
    GPT2Config,
    GPT2ForSequenceClassification,
    GPT2Tokenizer,
)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Define dataset class

In [4]:
dataset = load_dataset("nchen909/bigclonebench-processed")
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'func1', 'func2', 'id'],
        num_rows: 901028
    })
    validation: Dataset({
        features: ['label', 'func1', 'func2', 'id'],
        num_rows: 830832
    })
    test: Dataset({
        features: ['label', 'func1', 'func2', 'id'],
        num_rows: 415416
    })
})

In [24]:
# Convert to Pandas DataFrame for easier manipulation

# df = dataset["train"].to_pandas() # train
# df = dataset["validation"].to_pandas() # validation
# df = dataset["test"].to_pandas() # test

# Specify the column for stratification
# label_column = "label"  # Replace with the correct column name for stratification
# if label_column not in df.columns:
#     raise ValueError(f"Column '{label_column}' not found in the dataset.")

# Stratified sampling using sklearn
# Adjust the test_size to control the size of the sub-dataset
# sub_df, _ = train_test_split(df, stratify=df[label_column], test_size=0.99778, random_state=42) # train
# sub_df, _ = train_test_split(df, stratify=df[label_column], test_size=0.997593, random_state=42) # validation
# sub_df, _ = train_test_split(df, stratify=df[label_column], test_size=0.997593, random_state=42) # test

# Convert back to a Hugging Face Dataset (optional)
# sub_dataset = HFDataset.from_pandas(sub_df)

# Output the sub-dataset
# print(sub_dataset)

Dataset({
    features: ['label', 'func1', 'func2', 'id', '__index_level_0__'],
    num_rows: 999
})


# Data pre-processing

In [2]:
train_hf_dataset = HFDataset.load_from_disk("SubBigCloneBench_train")
validation_hf_dataset = HFDataset.load_from_disk("SubBigCloneBench_validation")
test_hf_dataset = HFDataset.load_from_disk("SubBigCloneBench_test")
print(train_hf_dataset)
print(validation_hf_dataset)
print(test_hf_dataset)

Dataset({
    features: ['label', 'func1', 'func2', 'id', '__index_level_0__'],
    num_rows: 2000
})
Dataset({
    features: ['label', 'func1', 'func2', 'id', '__index_level_0__'],
    num_rows: 1999
})
Dataset({
    features: ['label', 'func1', 'func2', 'id', '__index_level_0__'],
    num_rows: 999
})


In [4]:
# Pre-process the original data
def preprocess_function(examples):
    inputs = ["code_1: " + examples["func1"][i] + " code_2: " + examples["func2"][i] for i in range(len(examples["func1"]))]
    targets = [i for i in examples["label"]]
    # targets = [str(i) for i in examples["label"]]
    
    max_length = 512
    return {"input_ids": tokenizer(inputs, truncation=True, padding="max_length", max_length=max_length)["input_ids"],
            "attention_mask": tokenizer(inputs, truncation=True, padding="max_length", max_length=max_length)["attention_mask"],
            "labels": torch.tensor(targets, dtype=torch.long),
            # "labels": tokenizer(targets, truncation=True, padding="max_length", max_length=256)["input_ids"]
        }

# Load the Tokenizer
tokenizer = AutoTokenizer.from_pretrained("neulab/codebert-java")

# Pre-process data
tokenized_train_dataset = train_hf_dataset.map(preprocess_function, batched=True, batch_size=4)
tokenized_validation_dataset = validation_hf_dataset.map(preprocess_function, batched=True, batch_size=4)
tokenized_test_dataset = test_hf_dataset.map(preprocess_function, batched=True, batch_size=4)

In [6]:
# Load pre-trained model
num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained("neulab/codebert-java", num_labels=num_labels)

# Config LoRA specifications
lora_config = LoraConfig(
    # r=8,  # lower the rank
    r=32,  # lower the rank
    lora_alpha=32,
    # target_modules=["q", "v"],  # apply LoRA to q and v of attention modules
    # target_modules=["c_attn"],  # target query, key, and value together
    target_modules=["query", "value"],  # correct module names for BERT
    lora_dropout=0.1,
    bias="none",
    # task_type="SEQ_2_SEQ_LM",  # task type set to seq2seq generation
    task_type="SEQ_CLS",  # task type set to text classification
)

# Convert the model to LoRA model
model = get_peft_model(model, lora_config)

# Check the number of trainable parameters (for LoRA)
model.print_trainable_parameters()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at neulab/codebert-java and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,771,778 || all params: 126,418,948 || trainable%: 1.4015


In [9]:
# Adjust the batch size
batch_size = 4
training_args = TrainingArguments(
    output_dir="./results/lora-codeBert-ccd",
    eval_strategy="epoch",
    learning_rate=5e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    metric_for_best_model="f1",
    weight_decay=0.01,
    num_train_epochs=6,
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10,
    report_to="mlflow",  # disable wandb etc.
    fp16=True,  # mixed precision training
    optim="adamw_torch",  # use torch original optimizer
)

def compute_metrics(pred):
    labels = pred.label_ids
    # Extract logits from the predictions & assuming logits are the first element in the tuple
    logits = pred.predictions[0] if isinstance(pred.predictions, tuple) else pred.predictions
    preds = logits.argmax(-1)
    f1 = f1_score(labels, preds, average="macro")
    acc = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average="macro")
    recall = recall_score(labels, preds, average="macro")
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
)

# Start Training
trainer.train()

# Save the model after LoRA fine-tuing
model.save_pretrained("./lora-codeBert-ccd/model")
tokenizer.save_pretrained("./lora-codeBert-ccd/tokenizer")

print("LoRA fine-tuning done, model saved!")

test_results = trainer.predict(tokenized_test_dataset)
predictions, labels, metrics = test_results
print(metrics)

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2781,0.265732,0.905453,0.785143,0.865176,0.816943
2,0.2476,0.492029,0.894947,0.771835,0.903506,0.814807
3,0.0327,0.607807,0.889445,0.765609,0.906918,0.809281
4,0.1471,0.439257,0.887944,0.762226,0.894554,0.804272
5,0.3147,0.506364,0.887444,0.762487,0.900839,0.805393
6,0.141,0.454641,0.888944,0.76287,0.890199,0.804225


LoRA fine-tuning done, model saved!


{'test_loss': 0.3830234110355377, 'test_accuracy': 0.9179179179179179, 'test_precision': 0.8130501339456564, 'test_recall': 0.915601131302183, 'test_f1': 0.8518961801588105, 'test_runtime': 8.8728, 'test_samples_per_second': 112.591, 'test_steps_per_second': 28.176}
