# Utils

In [None]:
!nvidia-smi

# Install requirements

Run this if you are using a Kaggle notebook (pay attention to the version of torch!).

In [None]:
%pip install -U pip
%pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu118
%pip install transformers==4.41.2
%pip install datasets==2.19.2
%pip install accelerate==0.30.1
%pip install openpyxl==3.1.3
%pip install matplotlib==3.7.5
%pip install scikit-learn==1.2.2
%pip install tensorboard==2.15.1
%pip install bitsandbytes==0.43.1
%pip install peft==0.11.1
%pip install trl==0.9.4

If you are running everything locally, run this. Remember to create a venv!

In [None]:
%pip install -U pip
%pip install torch==2.3.1
%pip install transformers==4.42.3
%pip install datasets==2.20.0
%pip install accelerate==0.31.0
%pip install colored==2.2.4
%pip install openpyxl==3.1.5
%pip install matplotlib==3.9.1
%pip install scikit-learn==1.5.1
%pip install seaborn==0.13.2
%pip install tensorboard==2.17.0
%pip install bitsandbytes==0.43.1
%pip install peft==0.11.1
%pip install trl==0.9.4

# Load model and tokenizer 

Login to Hugging Face (this is required to download the model).

In [None]:
from huggingface_hub import notebook_login

notebook_login()

Use the same seed everywhere.

In [None]:
import random
import torch
import numpy as np

MY_SEED = 1337

# Seed the Python and NumPy generators with the shared value...
for seed_rng in (random.seed, np.random.seed):
    seed_rng(MY_SEED)

# ...and torch last, kept as the final bare expression so the cell output is unchanged.
torch.manual_seed(MY_SEED)

Load the model and the tokenizer, using an appropriate BitsAndBytes configuration for quantization (4bit). 

Flash Attention 2 is currently disabled (the `attn_implementation` argument is commented out below); whether to enable it is still an open decision.

In [None]:
from transformers import (
    BitsAndBytesConfig,
    AutoTokenizer,
    AutoModelForCausalLM
) 

# Hugging Face Hub id of the base checkpoint that is loaded (and later fine-tuned) in this notebook.
model_path = "meta-llama/Meta-Llama-3-8B-Instruct"

# 4-bit NF4 quantization with double quantization; computation runs in bfloat16.
# NOTE(review): bfloat16 is hard-coded here, while the training cell falls back to fp16 when
# `torch.cuda.is_bf16_supported()` is False — confirm this is intended on GPUs without bf16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = torch.bfloat16
)

# Fast tokenizer with a dedicated "<|pad|>" padding token, padding on the right.
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast = True)
tokenizer.add_special_tokens({"pad_token" : "<|pad|>"})
tokenizer.padding_side = "right"

# Load the quantized model; `device_map = "auto"` leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config = bnb_config,
    #attn_implementation = "flash_attention_2",
    device_map = "auto"
)
# Resize the embeddings to the tokenizer's new length (pad token added above),
# rounded up to a multiple of 8.
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of = 8)

In [None]:
model

In [None]:
model.config

# Dataset download & preprocessing

Download [ReFair US dataset](https://anonymous.4open.science/r/ReFAIR-Toward-a-Context-Aware-Fairness-Recommender-in-Requirement-Engineering-18C7/README.md).

In [None]:
!wget -O synthetic_user_stories.xlsx https://anonymous.4open.science/r/ReFAIR-Toward-a-Context-Aware-Fairness-Recommender-in-Requirement-Engineering-18C7/3.%20Source%20Code/ReFair/datasets/Synthetic%20User%20Stories.xlsx

Convert the .xlsx file to a .csv one, then load it.

In [None]:
import pandas as pd
from datasets import load_dataset

# Downloaded spreadsheet and the CSV file it is converted to
xlsx_path = "synthetic_user_stories.xlsx"
csv_path = "synthetic_user_stories.csv"

# Read the "Dataset" sheet, write it out as CSV (no index column), then load that CSV
df = pd.read_excel(xlsx_path, sheet_name = "Dataset")
df.to_csv(csv_path, index = False)

dataset = load_dataset("csv", data_files = csv_path)
dataset

In [None]:
dataset["train"][ : 5]

Create a dataframe using the loaded dataset.

In [None]:
# New snake_case column name -> header used in the loaded dataset
column_names = {
    "domain_cluster": "Domain Cluster",
    "topic": "Topic",
    "domain": "Domain",
    "ml_task": "Machine Learning Task",
    "user_story": "User Story",
}

# One record per example, keeping only the columns above (in this order)
rows = [
    {target: item[source] for target, source in column_names.items()}
    for item in dataset["train"]
]
df = pd.DataFrame(rows)

In [None]:
df.head()

Check null values (just to be safe).

In [None]:
df.isnull().value_counts()

Add a "text" column containing the entire prompt for each element (see example below).

In [None]:
def format_example(row: dict):
    """Render one dataset row as a full chat transcript used as training text.

    The transcript has three turns: a fixed system message, a user request built
    from the row's machine learning task and application domain, and the row's
    user story as the assistant answer.

    Args:
        row: Row-like object exposing the "ml_task", "domain" and "user_story" keys.

    Returns:
        The conversation formatted by the tokenizer's chat template, as text
        (not tokenized).
    """
    request_parts = (
        f"Considering the following machine learning technique: {row['ml_task']} in the field of machine learning. ",
        f"Can you provide me with a specific user story for the following application domain? {row['domain']}",
    )
    turns = (
        ("system", "You are a helpful AI assistant"),
        ("user", "".join(request_parts)),
        ("assistant", row['user_story']),
    )
    conversation = [{"role": role, "content": content} for role, content in turns]
    return tokenizer.apply_chat_template(conversation, tokenize = False)

df["text"] = df.apply(format_example, axis = 1)

In [None]:
print(df.iloc[0]["text"])

Add a "token_count" column that keeps track of the number of tokens for each element (see example below).

In [None]:
def count_tokens(row: dict) -> int:
    """Return the number of tokens in the row's already formatted "text" prompt.

    The text was produced by `tokenizer.apply_chat_template`, so it already carries
    the special tokens emitted by the chat template. The tokenizer is therefore asked
    not to add its own special tokens, which would count an extra beginning-of-text
    token. This matches the trainer configuration, which tokenizes the same field
    with `add_special_tokens` disabled.

    Args:
        row: Row-like object exposing the "text" key.

    Returns:
        The token count of `row["text"]`.
    """
    return tokenizer(row["text"], add_special_tokens = False, return_length = True)["length"][0]

df["token_count"] = df.apply(count_tokens, axis = 1)

In [None]:
print(df.iloc[0]["token_count"])

In [None]:
df.head()

In [None]:
df.token_count

Plot histogram of the "token_count".

In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter

# Give every row the same weight 1/N so the bar heights are fractions of the dataset
token_counts = df.token_count
uniform_weights = np.ones(len(token_counts)) / len(token_counts)

plt.hist(token_counts, weights = uniform_weights)
# Render the fractional y axis as percentages (1 == 100%)
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
plt.xlabel("Tokens")
plt.ylabel("Percentage")
plt.show()

In [None]:
# Maximum prompt length (exclusive), in tokens, used to filter the dataset
LIMIT = 128

# Count the rows strictly below the limit once, using LIMIT instead of a repeated literal
rows_below_limit = int((df.token_count < LIMIT).sum())

print(f"Number of rows: {len(df)}")
print(f"Number of rows (token_count < {LIMIT}): {rows_below_limit}")
print(f"% of rows (token_count < {LIMIT}): {(rows_below_limit / len(df)) * 100}")

Discard prompts with 128 tokens or more (only prompts with fewer than 128 tokens are kept).

In [None]:
# Keep only the prompts strictly shorter than LIMIT tokens (LIMIT is defined in the previous cell)
df = df[df.token_count < LIMIT]
df.shape

In [None]:
# Bin edges at -0.5, 0.5, ..., 8.5: nine unit-wide bins centred on the integers 0..8
cluster_bin_edges = np.arange(10) - 0.5

plt.hist(df.domain_cluster, bins = cluster_bin_edges, rwidth = 0.8)
plt.xticks(rotation = 45, ha = "right", fontsize = 10)
plt.xlabel('Domain cluster')
plt.ylabel('US Number')
plt.show()

Generate train set, validation set, test set.

In [None]:
from sklearn.model_selection import train_test_split

# First cut: 80% train / 20% held out, stratified on the domain cluster
train, validation_and_test = train_test_split(
    df,
    test_size = 0.2,
    stratify = df["domain_cluster"],
    random_state = MY_SEED,
)

# Second cut: the held-out part is split 80/20 again, i.e. roughly 16% validation
# and 4% test of the whole dataset.
# NOTE(review): confirm that such a small test share is intended.
holdout_clusters = validation_and_test["domain_cluster"]
validation, test = train_test_split(
    validation_and_test,
    test_size = 0.2,
    stratify = holdout_clusters,
    random_state = MY_SEED,
)

In [None]:
# Report the size of every split, both absolute and as a percentage of the filtered dataset
for split_name, split_df in (("Train", train), ("Validation", validation), ("Test", test)):
    print(f"{split_name} set elements: {len(split_df)}, {(len(split_df) / len(df)) * 100}%")

In [None]:
# Order every split by domain cluster
train, validation, test = [
    split.sort_values(by = ["domain_cluster"])
    for split in (train, validation, test)
]

In [None]:
unique_labels = np.unique(np.concatenate([test['domain_cluster'], train['domain_cluster'], validation['domain_cluster']]))

fig, axs = plt.subplots(1, 3, figsize = (12, 8))

# One panel per split, all drawn with the same bins, ticks and labels
tick_positions = np.arange(len(unique_labels))
panels = (
    ("Train Dataset", train),
    ("Validation Dataset", validation),
    ("Test Dataset", test),
)
for ax, (panel_title, split) in zip(axs, panels):
    ax.hist(split["domain_cluster"], bins = (np.arange(10) - 0.5), rwidth = 0.8)
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(unique_labels, rotation = 45, ha = "right", fontsize = 10)
    ax.set_xlabel("Domain cluster")
    ax.set_ylabel("US Number")
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()


In [None]:
# Persist each split as JSON Lines (one record per line)
for file_name, split in (("train.json", train), ("val.json", validation), ("test.json", test)):
    split.to_json(file_name, orient = "records", lines = True)

In [None]:
# Split name -> JSON Lines file written in the previous cell
split_files = {
    "train": "train.json",
    "validation": "val.json",
    "test": "test.json",
}
dataset = load_dataset("json", data_files = split_files)

In [None]:
dataset

# Test original model

In [None]:
from transformers import pipeline

# Text-generation pipeline around the base model, before any LoRA fine-tuning is applied.
# `return_full_text = False` returns only the newly generated text; generation is capped
# at 128 new tokens.
pipe = pipeline(
    task = "text-generation",
    model = model,
    tokenizer = tokenizer,
    max_new_tokens = 128,
    return_full_text = False
)

In [None]:
def create_test_prompt(row):
    """Build the inference prompt (system turn + user turn) for one dataset row.

    Unlike the training text, no assistant answer is included; the chat template
    is asked to append the generation prompt so the model writes the answer itself.

    Args:
        row: Row-like object exposing the "ml_task" and "domain" keys.

    Returns:
        The prompt formatted by the tokenizer's chat template, as text (not tokenized).
    """
    user_message = (
        f"Considering the following machine learning technique: {row['ml_task']} in the field of machine learning. "
        f"Can you provide me with a specific user story for the following application domain? {row['domain']}"
    )
    chat = [
        dict(role = "system", content = "You are a helpful AI assistant"),
        dict(role = "user", content = user_message),
    ]
    return tokenizer.apply_chat_template(chat, tokenize = False, add_generation_prompt = True)


In [None]:
from tqdm import tqdm

# Columns copied unchanged from the test set into the report
copied_columns = ("domain_cluster", "topic", "domain", "ml_task")

rows = []
for row in tqdm(dataset["test"]):
    prompt = create_test_prompt(row)
    # The pipeline returns a list of generations; only the first one is kept
    generated_text = pipe(prompt)[0]["generated_text"]

    record = {column: row[column] for column in copied_columns}
    record["prompt"] = prompt
    record["original_user_story"] = row["user_story"]
    record["untrained_model_user_story"] = generated_text
    rows.append(record)

report_df = pd.DataFrame(rows)

In [None]:
# Save the baseline generations to disk (without the index column) before training starts.
report_df.to_csv("report_temp.csv", index = None)

# LoRA setup

In [None]:
from peft import LoraConfig, TaskType, prepare_model_for_kbit_training, get_peft_model

# LoRA adapter configuration for causal language modelling: rank 32, alpha 16,
# dropout 0.05, no bias parameters, adapters requested on "all-linear" modules.
lora_config = LoraConfig(
    r = 32,
    lora_alpha = 16,
    target_modules = "all-linear", # ["self_attn.q_proj","self_attn.k_proj","self_attn.v_proj","self_attn.o_proj","mlp.gate_proj","mlp.up_proj","mlp.down_proj"]
    lora_dropout = 0.05,
    bias = "none",
    task_type = TaskType.CAUSAL_LM,
)
# Prepare the quantized model for k-bit training, then wrap it with the LoRA adapters.
# Note that `model` is rebound here: from this cell on it refers to the PEFT-wrapped model.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

In [None]:
model.print_trainable_parameters()

# Train the model

Launch TensorBoard.

In [None]:
%load_ext tensorboard
%tensorboard --logdir "./output/runs"

[Train the model on the completions (the assistant responses) only](https://huggingface.co/docs/trl/sft_trainer#train-on-completions-only).

In [None]:
from trl import DataCollatorForCompletionOnlyLM

# Marker the collator looks for to find where the completion starts.
# NOTE(review): "<|end_header_id|>" closes every role header in the chat template, not only
# the assistant one — presumably the collator anchors on the last occurrence; confirm against
# the TRL implementation.
response_template = "<|end_header_id|>"
data_collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer = tokenizer)

If you are using a Kaggle notebook (GPU T4 x 2), you can use a higher batch size. This configuration works locally with an RTX 3060 (12 GB of VRAM).

In [None]:
from trl import SFTConfig, SFTTrainer

# Use bf16 when the GPU supports it, otherwise fall back to fp16 (exactly one of the two is True).
bf16 = torch.cuda.is_bf16_supported()
fp16 = not bf16

# Supervised fine-tuning configuration: one epoch over the "text" column, sequences capped at
# 128 tokens, logs reported to TensorBoard and checkpoints written under ./output/.
sft_config = SFTConfig(
    dataset_text_field = "text",
    max_seq_length = 128,
    bf16 = bf16,
    fp16 = fp16,
    num_train_epochs = 1,
    # 2 sequences per step x 4 accumulation steps = 8 sequences per optimizer update (per device)
    per_device_train_batch_size = 2,
    per_device_eval_batch_size = 2,
    gradient_accumulation_steps = 4,
    optim = "paged_adamw_8bit",
    # Evaluation and checkpointing share the same step-based schedule (both set to 0.2)
    eval_strategy = "steps",
    eval_steps = 0.2,
    save_strategy = "steps",
    save_steps = 0.2,
    logging_steps = 10,
    learning_rate = 1e-4,
    # NOTE(review): warmup_ratio is set together with a "constant" scheduler — confirm the
    # warmup is actually applied ("constant_with_warmup" may be what was intended).
    warmup_ratio = 0.1,
    save_total_limit = 2,
    lr_scheduler_type = "constant",
    report_to = "tensorboard",
    save_safetensors = True,
    seed = MY_SEED,
    output_dir = "./output/",
    # The "text" column already contains the chat-template markup, so no extra special tokens
    # or concatenation token are requested from the trainer's tokenization step.
    dataset_kwargs = {
        "add_special_tokens": False,
        "append_concat_token": False
    }
)

# Trainer over the train/validation splits, using the completion-only collator defined above.
trainer = SFTTrainer(
    model = model,
    args = sft_config,
    train_dataset = dataset["train"],
    eval_dataset = dataset["validation"],
    tokenizer = tokenizer,
    data_collator = data_collator
)

In [None]:
trainer.train()

In [None]:
trainer.save_model("Llama-3-8B-Instruct-Refair-FAIRWAY")

# Merge LoRA with the original model

Warning: this step requires a large amount of VRAM. I was only able to run it on a Kaggle notebook. 

Otherwise you could load the quantized model and merge it with the LoRA, but I think it is very likely that the results will have a lower quality.

Try to free up some VRAM. In the worst case, manually reboot the kernel.

In [None]:
import gc

# Deleting `model` and `tokenizer` alone cannot release the weights: the generation pipeline,
# the trainer and the data collator created earlier keep their own references to them,
# so those names are dropped as well.
STALE_REFERENCES = ("pipe", "trainer", "data_collator", "model", "tokenizer")
for stale_name in STALE_REFERENCES:
    # pop() instead of `del`, so re-running the cell (or running it after a partial run)
    # does not raise NameError
    globals().pop(stale_name, None)

gc.collect()

In [None]:
# Free up some VRAM
torch.cuda.empty_cache()

In [None]:
!nvidia-smi

These imports are repeated here in case something crashed and you only need to merge the base model with the LoRA adapter.

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

Merge.

In [None]:
# `torch` is imported here too: this cell uses `torch.float16` and must also work when it is
# run right after a kernel restart, with only the merge-related imports executed.
import torch
from peft import PeftModel

# Local directory written by `trainer.save_model(...)`; the tokenizer and the LoRA adapter
# are both loaded from it
adapter_dir = "Llama-3-8B-Instruct-Refair-FAIRWAY"

tokenizer = AutoTokenizer.from_pretrained(adapter_dir)

# Load the base model in float16 (no quantization config) for the merge
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    torch_dtype = torch.float16,
    device_map = "auto",
)

# Resize the embeddings to the tokenizer's length, as was done before training,
# then attach the adapter and fold its weights into the base model.
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of = 8)
model = PeftModel.from_pretrained(model, adapter_dir)
model = model.merge_and_unload(progressbar = True)

# Upload model and tokenizer to Hugging Face

In [None]:
# Upload the merged model in shards of at most 5GB.
# NOTE(review): `tokenizer=` does not look like a documented argument of `push_to_hub` —
# confirm whether it has any effect; the tokenizer is pushed separately in the next cell.
model.push_to_hub("Llama-3-8B-Instruct-Refair-FAIRWAY", tokenizer = tokenizer, max_shard_size = "5GB")

In [None]:
tokenizer.push_to_hub("Llama-3-8B-Instruct-Refair-FAIRWAY")

# Model evaluation

Reload dataset.

In [None]:
# Imported here as well so the evaluation section can run on its own after a kernel restart
from datasets import load_dataset

# Reload the three splits from the JSON Lines files written during preprocessing
dataset = load_dataset(
    "json",
    data_files = {"train": "train.json", "validation": "val.json", "test": "test.json"},
)

Let's use our model.

In [None]:
# `torch` is needed for the compute dtype below; it is imported here so the evaluation
# section can run on its own after a kernel restart.
import torch
from transformers import (
    BitsAndBytesConfig,
    AutoTokenizer,
    AutoModelForCausalLM
)

# Hub repository the fine-tuned model and its tokenizer are loaded from
finetuned_repo = "DG266/Llama-3-8B-Instruct-Refair-FAIRWAY"

# Same 4-bit NF4 / double-quantization / bfloat16-compute setup used for the base model
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(finetuned_repo, use_fast = True)

model = AutoModelForCausalLM.from_pretrained(
    finetuned_repo,
    quantization_config = bnb_config,
    device_map = "auto"
)

Check the performance of the fine-tuned model on the test set.

In [None]:
from transformers import pipeline

# Text-generation pipeline around the fine-tuned model loaded above, with the same settings
# used for the baseline run: only the newly generated text is returned, capped at 128 new tokens.
pipe = pipeline(
    task = "text-generation",
    model = model,
    tokenizer = tokenizer,
    max_new_tokens = 128,
    return_full_text = False
)

In [None]:
def create_test_prompt(row):
    """Build the inference prompt (system turn + user turn) for one dataset row.

    This repeats the definition used for the baseline run, so the fine-tuned model
    is queried with prompts of exactly the same shape.

    Args:
        row: Row-like object exposing the "ml_task" and "domain" keys.

    Returns:
        The prompt formatted by the tokenizer's chat template, as text (not tokenized),
        ending with the generation prompt.
    """
    question = f"Considering the following machine learning technique: {row['ml_task']} in the field of machine learning. "
    question += f"Can you provide me with a specific user story for the following application domain? {row['domain']}"

    turns = []
    for role, content in (("system", "You are a helpful AI assistant"), ("user", question)):
        turns.append({"role": role, "content": content})

    return tokenizer.apply_chat_template(turns, tokenize = False, add_generation_prompt = True)


In [None]:
from tqdm import tqdm

# One generation per test example, in dataset order; only the first returned sequence is kept
generated_user_stories = [
    pipe(create_test_prompt(row))[0]["generated_text"]
    for row in tqdm(dataset["test"])
]

In [None]:
# After a kernel restart the in-memory baseline report is gone; restore it from the CSV
# written at the end of the baseline run. An existing `report_df` is left untouched.
if "report_df" not in globals():
    import pandas as pd
    report_df = pd.read_csv("report_temp.csv")

report_df.head()

In [None]:
report_df["trained_model_user_story"] = generated_user_stories

In [None]:
report_df.head()

In [None]:
report_df.to_csv("report.csv", index = None)