## Setting up

In [1]:
# %%capture
# %pip install -U bitsandbytes
# %pip install -U transformers
# %pip install -U accelerate
# %pip install -U peft
# %pip install -U trl

In [2]:
# import wandb

# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()

# wb_token = user_secrets.get_secret("wandb")

# wandb.login(key=wb_token)
# run = wandb.init(
#     project='Fine-tune llama-3.1-8b-it on Sentiment Analysis Dataset',
#     job_type="training",
#     anonymous="allow"
# )

In [3]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          pipeline,
                          logging)
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split

  warn(


## Loading and processing the dataset

In [4]:
%pwd

'/root/autodl-tmp'

In [5]:
neg = pd.read_csv(r'/root/reinforcement_commit/datasets/ffmpeg.csv')
pos = pd.read_csv(r'/root/reinforcement_commit/datasets/qemu.csv', encoding='utf_8_sig')
df = pd.concat([neg[['commit_msg','patch','vulnerability']],pos[['commit_msg','patch','vulnerability']]],axis=0)
df.fillna('', inplace=True)
# 1是100%的意思
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.rename(columns={'vulnerability':'label','patch':'diff','commit_msg':'message'},inplace=True)
label2id={0:'negative',1:'positive'}
df = df.replace({"label": label2id})
df =  df[df['diff'].str.len()<1024]
df 

Unnamed: 0,message,diff,label
2,MIPS: fix yield handling The parameter for yi...,diff --git a/target-mips/op_helper.c b/target-...,negative
3,jvdec: avoid unsigned overflow in comparison ...,diff --git a/libavformat/jvdec.c b/libavformat...,positive
6,target-ppc: Bug Fix: mulldo OV Detection Fix ...,diff --git a/target-ppc/int_helper.c b/target-...,positive
8,do boundary check based on absolute value (Gl...,diff --git a/hw/cirrus_vga.c b/hw/cirrus_vga.c...,positive
13,ide/isa: Replace unchecked qdev_init() by qde...,diff --git a/hw/ide/isa.c b/hw/ide/isa.c\ninde...,positive
...,...,...,...
25863,s390-ccw.img: Detect devices with stsch. stsc...,diff --git a/pc-bios/s390-ccw/main.c b/pc-bios...,negative
25864,faxcompr: fix out of array read Signed-off-by...,diff --git a/libavcodec/faxcompr.c b/libavcode...,negative
25868,migration: don't close a file descriptor whil...,diff --git a/migration/migration.c b/migration...,positive
25870,"Fix headers so that ""make checkheaders"" passe...",diff --git a/libavcodec/amr.h b/libavcodec/amr...,negative


In [6]:
# Shuffle the DataFrame and select only 3000 rows
df = df.sample(frac=1, random_state=85).reset_index(drop=True).head(3000)

# Split the DataFrame
train_size = 0.8
eval_size = 0.1

# Calculate sizes
train_end = int(train_size * len(df))
eval_end = train_end + int(eval_size * len(df))

# Split the data
X_train = df[:train_end]
X_eval = df[train_end:eval_end]
X_test = df[eval_end:]

# Define the prompt generation functions
def generate_prompt(data_point):
    return f"""
            Classify the text into negative, positive, and return the answer as the corresponding security patch identification label.
text: {data_point["diff"]}
label: {data_point["label"]}""".strip()

def generate_test_prompt(data_point):
    return f"""
            Classify the text into negative, positive, and return the answer as the corresponding security patch identification label.
text: {data_point["diff"]}
label: """.strip()

# Generate prompts for training and evaluation data
X_train.loc[:,'text'] = X_train.apply(generate_prompt, axis=1)
X_eval.loc[:,'text'] = X_eval.apply(generate_prompt, axis=1)

# Generate test prompts and extract true labels
y_true = X_test.loc[:,'label']
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["text"])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.loc[:,'text'] = X_train.apply(generate_prompt, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_eval.loc[:,'text'] = X_eval.apply(generate_prompt, axis=1)


In [7]:
X_train.label.value_counts()

label
negative    1269
positive    1131
Name: count, dtype: int64

In [8]:
y_true.value_counts()

label
negative    158
positive    142
Name: count, dtype: int64

In [9]:
# Convert to datasets
train_data = Dataset.from_pandas(X_train[["text"]])
eval_data = Dataset.from_pandas(X_eval[["text"]])

In [10]:
train_data['text'][3]



## Loading the model and tokenizer

In [11]:
base_model_name = "/root/autodl-tmp/models/Llama3"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype="float16",
    quantization_config=bnb_config,
)

model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [12]:
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

tokenizer.pad_token_id = tokenizer.eos_token_id

## Model evalution before fine-tuning

In [13]:
def predict(test, model, tokenizer):
    y_pred = []
    categories = ["negative", "positive"]

    for i in tqdm(range(len(test))):
        prompt = test.iloc[i]["text"]
        pipe = pipeline(task="text-generation",
                        model=model,
                        tokenizer=tokenizer,
                        max_new_tokens=2,
                        temperature=0.1)

        result = pipe(prompt)
        answer = result[0]['generated_text'].split("label:")[-1].strip()

        # Determine the predicted category
        for category in categories:
            if category.lower() in answer.lower():
                y_pred.append(category)
                break
        else:
            y_pred.append("none")

    return y_pred

In [14]:
y_pred = predict(X_test, model, tokenizer)

  0%|          | 0/300 [00:00<?, ?it/s]Device set to use cuda:0
  0%|          | 1/300 [00:00<03:34,  1.39it/s]Device set to use cuda:0
  1%|          | 2/300 [00:00<01:52,  2.64it/s]Device set to use cuda:0
  1%|          | 3/300 [00:01<01:20,  3.69it/s]Device set to use cuda:0
  1%|▏         | 4/300 [00:01<01:06,  4.43it/s]Device set to use cuda:0
  2%|▏         | 5/300 [00:01<00:59,  4.97it/s]Device set to use cuda:0
  2%|▏         | 6/300 [00:01<00:54,  5.43it/s]Device set to use cuda:0
  2%|▏         | 7/300 [00:01<00:49,  5.89it/s]Device set to use cuda:0
  3%|▎         | 8/300 [00:01<00:47,  6.10it/s]Device set to use cuda:0
  3%|▎         | 9/300 [00:01<00:46,  6.30it/s]Device set to use cuda:0
  3%|▎         | 10/300 [00:02<00:46,  6.26it/s]Device set to use cuda:0
  4%|▎         | 11/300 [00:02<00:46,  6.20it/s]Device set to use cuda:0
  4%|▍         | 12/300 [00:02<00:44,  6.46it/s]Device set to use cuda:0
  4%|▍         | 13/300 [00:02<00:42,  6.69it/s]Device set to use cud

In [24]:
def evaluate(y_true, y_pred):
    labels = ["negative", "positive"]
    mapping = {label: idx for idx, label in enumerate(labels)}

    def map_func(x):
        return mapping.get(x, -1)  # Map to -1 if not found, but should not occur with correct data

    y_true_mapped = np.vectorize(map_func)(y_true)
    y_pred_mapped = np.vectorize(map_func)(y_pred)

    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true_mapped, y_pred=y_pred_mapped)
    print(f'Accuracy: {accuracy:.3f}')

    # Generate accuracy report
    unique_labels = set(y_true_mapped)  # Get unique labels

    for label in unique_labels:
        label_indices = [i for i in range(len(y_true_mapped)) if y_true_mapped[i] == label]
        label_y_true = [y_true_mapped[i] for i in label_indices]
        label_y_pred = [y_pred_mapped[i] for i in label_indices]
        label_accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {labels[label]}: {label_accuracy:.3f}')

    # Generate classification report
    class_report = classification_report(y_true=y_true_mapped, y_pred=y_pred_mapped, target_names=labels, digits=4, labels=list(range(len(labels))))
    print('\nClassification Report:')
    print(class_report)

    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true_mapped, y_pred=y_pred_mapped, labels=list(range(len(labels))))
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [25]:
evaluate(y_true, y_pred)

Accuracy: 0.530
Accuracy for label negative: 0.671
Accuracy for label positive: 0.373

Classification Report:
              precision    recall  f1-score   support

    negative     0.5436    0.6709    0.6006       158
    positive     0.5048    0.3732    0.4291       142

    accuracy                         0.5300       300
   macro avg     0.5242    0.5221    0.5149       300
weighted avg     0.5252    0.5300    0.5194       300


Confusion Matrix:
[[106  52]
 [ 89  53]]


## Extracting the linear modules names

In [26]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

In [27]:
modules = find_all_linear_names(model)
modules

['base_layer']

## Setting up the model

In [19]:
output_dir="llama-3.1-fine-tuned-model"

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules,
)

training_arguments = TrainingArguments(
    output_dir=output_dir,                    # directory to save and repository id
    num_train_epochs=1,                       # number of training epochs
    per_device_train_batch_size=1,            # batch size per device during training
    gradient_accumulation_steps=8,            # number of steps before performing a backward/update pass
    gradient_checkpointing=True,              # use gradient checkpointing to save memory
    optim="paged_adamw_32bit",
    logging_steps=1,
    learning_rate=2e-4,                       # learning rate, based on QLoRA paper
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,                        # max gradient norm based on QLoRA paper
    max_steps=-1,
    warmup_ratio=0.03,                        # warmup ratio based on QLoRA paper
    group_by_length=False,
    lr_scheduler_type="cosine",               # use cosine learning rate scheduler
    report_to="none",                  # report metrics to w&b
    eval_strategy="steps",              # save checkpoint every epoch
    eval_steps = 0.2
)

trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    processing_class=tokenizer
)

Adding EOS to train dataset:   0%|          | 0/2400 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2400 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/2400 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/300 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/300 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/300 [00:00<?, ? examples/s]

## Model Training

In [20]:
# Train model
trainer.train()

Step,Training Loss,Validation Loss
60,1.0381,1.094074
120,1.0085,1.069437
180,0.9991,1.050757
240,1.1255,1.039348
300,0.9564,1.036687


TrainOutput(global_step=300, training_loss=1.082589714328448, metrics={'train_runtime': 1475.434, 'train_samples_per_second': 1.627, 'train_steps_per_second': 0.203, 'total_flos': 2.5538663477673984e+16, 'train_loss': 1.082589714328448})

In [21]:
# wandb.finish()
model.config.use_cache = True

## Saving the model and tokenizer

In [22]:
# Save trained model and tokenizer
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

('llama-3.1-fine-tuned-model/tokenizer_config.json',
 'llama-3.1-fine-tuned-model/special_tokens_map.json',
 'llama-3.1-fine-tuned-model/chat_template.jinja',
 'llama-3.1-fine-tuned-model/tokenizer.json')

## Testing model after fine-tuning

In [23]:
y_pred = predict(X_test, model, tokenizer)
evaluate(y_true, y_pred)

  0%|          | 0/300 [00:00<?, ?it/s]Device set to use cuda:0
  0%|          | 1/300 [00:00<01:51,  2.69it/s]Device set to use cuda:0
  1%|          | 2/300 [00:00<01:39,  2.99it/s]Device set to use cuda:0
  1%|          | 3/300 [00:00<01:28,  3.35it/s]Device set to use cuda:0
  1%|▏         | 4/300 [00:01<01:24,  3.49it/s]Device set to use cuda:0
  2%|▏         | 5/300 [00:01<01:26,  3.43it/s]Device set to use cuda:0
  2%|▏         | 6/300 [00:01<01:27,  3.36it/s]Device set to use cuda:0
  2%|▏         | 7/300 [00:02<01:28,  3.31it/s]Device set to use cuda:0
  3%|▎         | 8/300 [00:02<01:25,  3.41it/s]Device set to use cuda:0
  3%|▎         | 9/300 [00:02<01:23,  3.49it/s]Device set to use cuda:0
  3%|▎         | 10/300 [00:02<01:22,  3.51it/s]Device set to use cuda:0
  4%|▎         | 11/300 [00:03<01:24,  3.42it/s]Device set to use cuda:0
  4%|▍         | 12/300 [00:03<01:21,  3.52it/s]Device set to use cuda:0
  4%|▍         | 13/300 [00:03<01:20,  3.55it/s]Device set to use cud

Accuracy: 0.530
Accuracy for label negative: 0.671
Accuracy for label positive: 0.373

Classification Report:
              precision    recall  f1-score   support

    negative       0.54      0.67      0.60       158
    positive       0.50      0.37      0.43       142

    accuracy                           0.53       300
   macro avg       0.52      0.52      0.51       300
weighted avg       0.53      0.53      0.52       300


Confusion Matrix:
[[106  52]
 [ 89  53]]



