## Setting up

In [1]:
# %%capture
# %pip install -U bitsandbytes
# %pip install -U transformers
# %pip install -U accelerate
# %pip install -U peft
# %pip install -U trl

In [2]:
# import wandb

# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()

# wb_token = user_secrets.get_secret("wandb")

# wandb.login(key=wb_token)
# run = wandb.init(
#     project='Fine-tune llama-3.1-8b-it on Sentiment Analysis Dataset',
#     job_type="training",
#     anonymous="allow"
# )

In [3]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          pipeline,
                          logging)
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split

  warn(


## Loading and processing the dataset

In [4]:
%pwd

'/root/autodl-tmp'

In [5]:
neg = pd.read_csv('/root/reinforcement_commit/datasets/negative+CC-900repos.csv')
neg['label'] = 0
pos = pd.read_csv('/root/reinforcement_commit/datasets/positive+CC-900repos.csv', encoding='utf_8_sig')
pos['label'] = 1
df = pd.concat([neg[['github','message','diff','label']],pos[['github','message','diff','label']]],axis=0)
df.fillna('', inplace=True)
# 1是100%的意思
shuffled_df = df.sample(frac=1, random_state=42).reset_index(drop=True)

label2id={0:'negative',1:'positive'}
df = shuffled_df.replace({"label": label2id})
# df = df.sample(1000,random_state=42)
df =  df[df['diff'].str.len()<1024]
df['project_name'] = df['github'].str.extract(r'github\.com/([^/]+)')
# df= df.sample(1000)
df 

Unnamed: 0,github,message,diff,label,project_name
1,https://github.com/gosa-project/gosa-core/comm...,escape html entities to fix xss at the login s...,diff --git a/html/index.php b/html/index.php\n...,positive,gosa-project
2,https://github.com/dgl/cgiirc/commit/dd8d50752...,0.5.11,diff --git a/client-perl.cgi b/client-perl.cgi...,negative,dgl
3,https://github.com/GNOME/libxml2/commit/b215c2...,Fix cleanup of attributes in XML reader\n\nxml...,diff --git a/xmlreader.c b/xmlreader.c\nindex ...,negative,GNOME
5,https://github.com/pluck-cms/pluck/commit/c140...,prevent seoname crashing the site issue #92,diff --git a/data/inc/editpage.php b/data/inc/...,negative,pluck-cms
13,https://github.com/lightSAML/lightSAML/commit/...,Merge pull request #89 from frostieDE/xmlsecli...,diff --git a/composer.json b/composer.json\nin...,negative,lightSAML
...,...,...,...,...,...
10099,https://github.com/antlarr/audiofile/commit/25...,clamp index values to fix index overflow in IM...,diff --git a/libaudiofile/modules/IMA.cpp b/li...,positive,antlarr
10101,https://github.com/lingej/pnp4nagios/commit/09...,Merge pull request #103 from awiddersheim/misc...,diff --git a/.gitignore b/.gitignore\nindex bf...,negative,lingej
10109,https://github.com/open-classifieds/openclassi...,Merge pull request #3146 from oliverds/master\...,diff --git a/oc/classes/image.php b/oc/classes...,negative,open-classifieds
10110,https://github.com/opnsense/core/commit/2573b7...,firmware: do not show subscription key on firm...,diff --git a/src/opnsense/scripts/firmware/pro...,negative,opnsense


In [6]:
# Shuffle the DataFrame and select only 3000 rows
df = df.sample(frac=1, random_state=85).reset_index(drop=True).head(3000)

# Split the DataFrame
train_size = 0.8
eval_size = 0.1

# Calculate sizes
train_end = int(train_size * len(df))
eval_end = train_end + int(eval_size * len(df))

# Split the data
X_train = df[:train_end]
X_eval = df[train_end:eval_end]
X_test = df[eval_end:]

# Define the prompt generation functions
def generate_prompt(data_point):
    return f"""
            Classify the text into negative, positive, and return the answer as the corresponding security patch identification label.
text: {data_point["diff"]}
label: {data_point["label"]}""".strip()

def generate_test_prompt(data_point):
    return f"""
            Classify the text into negative, positive, and return the answer as the corresponding security patch identification label.
text: {data_point["diff"]}
label: """.strip()

# Generate prompts for training and evaluation data
X_train.loc[:,'text'] = X_train.apply(generate_prompt, axis=1)
X_eval.loc[:,'text'] = X_eval.apply(generate_prompt, axis=1)

# Generate test prompts and extract true labels
y_true = X_test.loc[:,'label']
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["text"])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.loc[:,'text'] = X_train.apply(generate_prompt, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_eval.loc[:,'text'] = X_eval.apply(generate_prompt, axis=1)


In [7]:
X_train.label.value_counts()

label
negative    1759
positive     641
Name: count, dtype: int64

In [8]:
y_true.value_counts()

label
negative    216
positive     84
Name: count, dtype: int64

In [9]:
# Convert to datasets
train_data = Dataset.from_pandas(X_train[["text"]])
eval_data = Dataset.from_pandas(X_eval[["text"]])

In [10]:
train_data['text'][3]

"Classify the text into negative, positive, and return the answer as the corresponding security patch identification label.\ntext: diff --git a/rbd-target-api.py b/rbd-target-api.py\nindex 450f96f..394c7e0 100755\n--- a/rbd-target-api.py\n+++ b/rbd-target-api.py\n@@ -2004,7 +2004,8 @@ def main():\n     # request makes further api requests\n     app.run(host='0.0.0.0',\n             port=settings.config.api_port,\n-            debug=True,\n+            debug=settings.config.debug,\n+            use_evalex=False,\n             threaded=True,\n             use_reloader=False,\n             ssl_context=context)\n\nlabel: positive"

## Loading the model and tokenizer

In [11]:
base_model_name = "/root/autodl-tmp/models/Llama3"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype="float16",
    quantization_config=bnb_config,
)

model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [12]:
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

tokenizer.pad_token_id = tokenizer.eos_token_id

## Model evalution before fine-tuning

In [13]:
def predict(test, model, tokenizer):
    y_pred = []
    categories = ["negative", "positive"]

    for i in tqdm(range(len(test))):
        prompt = test.iloc[i]["text"]
        pipe = pipeline(task="text-generation",
                        model=model,
                        tokenizer=tokenizer,
                        max_new_tokens=2,
                        temperature=0.1)

        result = pipe(prompt)
        answer = result[0]['generated_text'].split("label:")[-1].strip()

        # Determine the predicted category
        for category in categories:
            if category.lower() in answer.lower():
                y_pred.append(category)
                break
        else:
            y_pred.append("none")

    return y_pred

In [14]:
y_pred = predict(X_test, model, tokenizer)

  0%|          | 0/300 [00:00<?, ?it/s]Device set to use cuda:0
  0%|          | 1/300 [00:00<02:34,  1.93it/s]Device set to use cuda:0
  1%|          | 2/300 [00:00<01:24,  3.54it/s]Device set to use cuda:0
  1%|          | 3/300 [00:00<01:01,  4.81it/s]Device set to use cuda:0
  1%|▏         | 4/300 [00:00<00:50,  5.82it/s]Device set to use cuda:0
  2%|▏         | 5/300 [00:00<00:44,  6.59it/s]Device set to use cuda:0
  2%|▏         | 6/300 [00:01<00:41,  7.16it/s]Device set to use cuda:0
  2%|▏         | 7/300 [00:01<00:40,  7.31it/s]Device set to use cuda:0
  3%|▎         | 8/300 [00:01<00:37,  7.79it/s]Device set to use cuda:0
  3%|▎         | 9/300 [00:01<00:35,  8.10it/s]Device set to use cuda:0
  3%|▎         | 10/300 [00:01<00:33,  8.55it/s]Device set to use cuda:0
  4%|▎         | 11/300 [00:01<00:34,  8.35it/s]Device set to use cuda:0
  4%|▍         | 12/300 [00:01<00:35,  8.14it/s]Device set to use cuda:0
  4%|▍         | 13/300 [00:01<00:35,  8.16it/s]Device set to use cud

In [15]:
def evaluate(y_true, y_pred):
    labels = ["negative", "positive"]
    mapping = {label: idx for idx, label in enumerate(labels)}

    def map_func(x):
        return mapping.get(x, -1)  # Map to -1 if not found, but should not occur with correct data

    y_true_mapped = np.vectorize(map_func)(y_true)
    y_pred_mapped = np.vectorize(map_func)(y_pred)

    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true_mapped, y_pred=y_pred_mapped)
    print(f'Accuracy: {accuracy:.3f}')

    # Generate accuracy report
    unique_labels = set(y_true_mapped)  # Get unique labels

    for label in unique_labels:
        label_indices = [i for i in range(len(y_true_mapped)) if y_true_mapped[i] == label]
        label_y_true = [y_true_mapped[i] for i in label_indices]
        label_y_pred = [y_pred_mapped[i] for i in label_indices]
        label_accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {labels[label]}: {label_accuracy:.3f}')

    # Generate classification report
    class_report = classification_report(y_true=y_true_mapped, y_pred=y_pred_mapped, target_names=labels, digits=4, labels=list(range(len(labels))))
    print('\nClassification Report:')
    print(class_report)

    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true_mapped, y_pred=y_pred_mapped, labels=list(range(len(labels))))
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [16]:
evaluate(y_true, y_pred)

Accuracy: 0.043
Accuracy for label negative: 0.023
Accuracy for label positive: 0.095

Classification Report:
              precision    recall  f1-score   support

    negative       0.56      0.02      0.04       216
    positive       0.11      0.10      0.10        84

   micro avg       0.16      0.04      0.07       300
   macro avg       0.33      0.06      0.07       300
weighted avg       0.43      0.04      0.06       300


Confusion Matrix:
[[ 5 63]
 [ 4  8]]


## Extracting the linear modules names

In [17]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

In [18]:
modules = find_all_linear_names(model)
modules

['k_proj', 'v_proj', 'o_proj', 'down_proj', 'q_proj', 'gate_proj', 'up_proj']

## Setting up the model

In [19]:
output_dir="llama-3.1-fine-tuned-model"

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules,
)

training_arguments = TrainingArguments(
    output_dir=output_dir,                    # directory to save and repository id
    num_train_epochs=1,                       # number of training epochs
    per_device_train_batch_size=1,            # batch size per device during training
    gradient_accumulation_steps=8,            # number of steps before performing a backward/update pass
    gradient_checkpointing=True,              # use gradient checkpointing to save memory
    optim="paged_adamw_32bit",
    logging_steps=1,
    learning_rate=2e-4,                       # learning rate, based on QLoRA paper
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,                        # max gradient norm based on QLoRA paper
    max_steps=-1,
    warmup_ratio=0.03,                        # warmup ratio based on QLoRA paper
    group_by_length=False,
    lr_scheduler_type="cosine",               # use cosine learning rate scheduler
    report_to="none",                  # report metrics to w&b
    eval_strategy="steps",              # save checkpoint every epoch
    eval_steps = 0.2
)

trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    processing_class=tokenizer
)

Adding EOS to train dataset:   0%|          | 0/2400 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2400 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/2400 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/300 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/300 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/300 [00:00<?, ? examples/s]

## Model Training

In [20]:
# Train model
trainer.train()

Step,Training Loss,Validation Loss
60,1.382,1.249078
120,1.146,1.228941
180,1.1074,1.21604
240,1.214,1.209024
300,1.2777,1.207538


TrainOutput(global_step=300, training_loss=1.2456631338596345, metrics={'train_runtime': 1427.2075, 'train_samples_per_second': 1.682, 'train_steps_per_second': 0.21, 'total_flos': 2.44072322574336e+16, 'train_loss': 1.2456631338596345})

In [21]:
# wandb.finish()
model.config.use_cache = True

## Saving the model and tokenizer

In [22]:
# Save trained model and tokenizer
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

('llama-3.1-fine-tuned-model/tokenizer_config.json',
 'llama-3.1-fine-tuned-model/special_tokens_map.json',
 'llama-3.1-fine-tuned-model/chat_template.jinja',
 'llama-3.1-fine-tuned-model/tokenizer.json')

## Testing model after fine-tuning

In [23]:
y_pred = predict(X_test, model, tokenizer)
evaluate(y_true, y_pred)

  0%|          | 0/300 [00:00<?, ?it/s]Device set to use cuda:0
  0%|          | 1/300 [00:00<01:52,  2.66it/s]Device set to use cuda:0
  1%|          | 2/300 [00:00<01:34,  3.16it/s]Device set to use cuda:0
  1%|          | 3/300 [00:01<01:40,  2.95it/s]Device set to use cuda:0
  1%|▏         | 4/300 [00:01<01:34,  3.13it/s]Device set to use cuda:0
  2%|▏         | 5/300 [00:01<01:29,  3.29it/s]Device set to use cuda:0
  2%|▏         | 6/300 [00:01<01:27,  3.34it/s]Device set to use cuda:0
  2%|▏         | 7/300 [00:02<01:25,  3.43it/s]Device set to use cuda:0
  3%|▎         | 8/300 [00:02<01:23,  3.50it/s]Device set to use cuda:0
  3%|▎         | 9/300 [00:02<01:24,  3.43it/s]Device set to use cuda:0
  3%|▎         | 10/300 [00:03<01:24,  3.44it/s]Device set to use cuda:0
  4%|▎         | 11/300 [00:03<01:20,  3.57it/s]Device set to use cuda:0
  4%|▍         | 12/300 [00:03<01:19,  3.60it/s]Device set to use cuda:0
  4%|▍         | 13/300 [00:03<01:19,  3.60it/s]Device set to use cud

Accuracy: 0.803
Accuracy for label negative: 0.903
Accuracy for label positive: 0.548

Classification Report:
              precision    recall  f1-score   support

    negative       0.84      0.90      0.87       216
    positive       0.69      0.55      0.61        84

    accuracy                           0.80       300
   macro avg       0.76      0.73      0.74       300
weighted avg       0.79      0.80      0.80       300


Confusion Matrix:
[[195  21]
 [ 38  46]]



