In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Installing dependencies



In [2]:
!pip install -U -q transformers
!pip install -U -q peft
!pip install -U -q bitsandbytes
!pip install -U -q trl
!pip install -U -q datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.8/245.8 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.4/103.4 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━

In [3]:
PROJECT_ROOT_PATH = "/content/drive/MyDrive/NLP_COURSE"
OUTPUT_DIR = f"{PROJECT_ROOT_PATH}/output"
DATASET_PATH = f"{PROJECT_ROOT_PATH}/finetuning_dataset.jsonl"
import os
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [4]:
%ls "/content/drive/MyDrive/NLP_COURSE"
%cd "/content/drive/MyDrive/NLP_COURSE"

credentials.py  finetuning_dataset.jsonl  [0m[01;34mlogs[0m/    [01;34m__pycache__[0m/
finetune.ipynb  Inference.ipynb           [01;34moutput[0m/
/content/drive/MyDrive/NLP_COURSE


In [5]:
# @title Dependencies

import sys, os
from pprint import pprint
from pathlib import Path
from tqdm import tqdm, tqdm_pandas
tqdm.pandas()

import json
import numpy as np
import pandas as pd

import datasets as ds
from datasets import load_from_disk
import torch
from trl import SFTTrainer
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    )

from transformers import (
    BitsAndBytesConfig,
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    LlamaForCausalLM,
)

from credentials import HF_TOKEN
from huggingface_hub import login
login(HF_TOKEN)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Data Preparation

In [6]:
df = pd.read_json("finetuning_dataset.jsonl", lines=True, orient="record")
df.head()

Unnamed: 0,id,question,answer
0,1,What are the best women shoes from Geox?,Here are some women shoes from Geox that you m...
1,2,Recommend me some men shoes from Under Armour.,Here are some men shoes from Under Armour that...
2,3,unisex baby shoes from Clarks,Here are some unisex baby shoes from Clarks th...
3,4,What are the best men shoes from Skechers?,Here are some men shoes from Skechers that you...
4,5,Recommend me some women shoes from PUMA.,Here are some women shoes from PUMA that you m...


In [7]:
prompt_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant for Amazon Shoe department. Your task is to help the customers find the best shoes of the world! <|eot_id|><|start_header_id|>user<|end_header_id|>

{0} <|eot_id|><|start_header_id|>assistant<|end_header_id|>

Hello! {1} <|eot_id|>"""

def apply_template(sample_series):
    question = sample_series["question"]
    answer = sample_series["answer"]

    instruction = prompt_template.format(question, answer)

    return instruction


df["text"] = df.progress_apply(apply_template, axis=1)


100%|██████████| 2234/2234 [00:00<00:00, 49063.12it/s]


In [8]:
df.head()

Unnamed: 0,id,question,answer,text
0,1,What are the best women shoes from Geox?,Here are some women shoes from Geox that you m...,<|begin_of_text|><|start_header_id|>system<|en...
1,2,Recommend me some men shoes from Under Armour.,Here are some men shoes from Under Armour that...,<|begin_of_text|><|start_header_id|>system<|en...
2,3,unisex baby shoes from Clarks,Here are some unisex baby shoes from Clarks th...,<|begin_of_text|><|start_header_id|>system<|en...
3,4,What are the best men shoes from Skechers?,Here are some men shoes from Skechers that you...,<|begin_of_text|><|start_header_id|>system<|en...
4,5,Recommend me some women shoes from PUMA.,Here are some women shoes from PUMA that you m...,<|begin_of_text|><|start_header_id|>system<|en...


In [9]:
my_dataset = ds.Dataset.from_pandas(df[["text"]])
my_dataset

Dataset({
    features: ['text'],
    num_rows: 2234
})

In [10]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

def find_target_modules(model):
    # Initialize a Set to Store Unique Layers
    unique_layers = set()

    # Iterate Over All Named Modules in the Model
    for name, module in model.named_modules():
        # Check if the Module Type Contains 'Linear4bit'
        if "Linear4bit" in str(type(module)):
            # Extract the Type of the Layer
            layer_type = name.split('.')[-1]

            # Add the Layer Type to the Set of Unique Layers
            unique_layers.add(layer_type)

    # Return the Set of Unique Layers Converted to a List
    return list(unique_layers)

In [11]:
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# load the base model in 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    use_auth_token=True,
)
model.config.use_cache = False

# load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_fast=True,
    trust_remote_code=True,
    use_auth_token=True,
)
tokenizer.pad_token = tokenizer.eos_token

model = prepare_model_for_kbit_training(model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
find_target_modules(model)

['o_proj', 'up_proj', 'v_proj', 'q_proj', 'k_proj', 'down_proj', 'gate_proj']

In [13]:
config = LoraConfig(
    r = 8,
    lora_alpha=32,
    target_modules=find_target_modules(model),
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 20971520 || all params: 4561571840 || trainable%: 0.4597432800707574


In [14]:
def formatting_func(sample):
    return sample["text"]


training_args = TrainingArguments(
    # BATCH SIZES
    per_device_train_batch_size=8,
    per_device_eval_batch_size=1,

    # OUTPUT DIRECTORY TO SAVE THE MODEL
    output_dir=OUTPUT_DIR,

    # No Evaluation DURING TRAINING
    do_eval=False,
    eval_strategy="no",

    # TAKING SNAPSHOST DURING TRAINING
    save_steps=10,
    save_total_limit=1,


    num_train_epochs=1,
    learning_rate=2e-4,
    bf16=True,

    # LOGGING
    logging_steps=1,
    logging_dir="./logs",
    seed=42,
)

trainer = SFTTrainer(
    model=model,
    train_dataset=my_dataset,
    packing=False,
    max_seq_length=300,
    tokenizer=tokenizer,
    formatting_func=formatting_func,
    args=training_args,
)
model.config.use_cache = False
trainer.train()

# Saving the final version of model
trainer.save_model(f"{OUTPUT_DIR}/ft_amazon")



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/2234 [00:00<?, ? examples/s]



Step,Training Loss
1,4.1168
2,3.208
3,2.7672
4,2.224
5,2.233
6,1.7462
7,1.7528
8,1.5806
9,1.8053
10,1.5263




In [15]:
# def tokenize_fn(example):
#     example["length"] = tokenizer(example["text"], return_tensors="pt").input_ids.shape[1]
#     return example

In [32]:
def get_text(question):
    text = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

    You are a helpful assistant for Amazon Shoe department. Your task is to help the customers find the best shoes of the world! <|eot_id|><|start_header_id|>user<|end_header_id|>

    {0} <|eot_id|><|start_header_id|>assistant<|end_header_id|>""".format(question)
    return text


def generate_answer(model,
                    tokenizer,
                    question):

    model = model.eval()

    text = get_text(question)
    inputs = tokenizer(text, return_tensors="pt").to("cuda")

    generation_config = {
        "do_sample": True,
        "temperature": 0.6,
        "top_p": 0.9,
        "max_length":300,
        "num_return_sequences":1,
    }

    with torch.no_grad():
        outputs = model.generate(**inputs,**generation_config)

    decoded_output = tokenizer.decode(outputs[0])

    split_token = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
    decoded_output = decoded_output.split(split_token)[-1]
    return decoded_output.replace("<|eot_id|>", "")


In [17]:
pprint(df.iloc[0]["answer"])

('Here are some women shoes from Geox that you might like:\n'
 "- Geox Women's D Smeraldo a Low-Top Sneakers (£47.28 - £144.69)\n"
 '- Geox New Club C Women Loafers (£51.25 - £79.62)\n'
 "- Geox Women's D Nhenbus C Low-Top Sneakers (£61.28 - £100.51)\n"
 "- Geox Women's D Nebula C Sneaker (£54.04 - £103.43)")


In [38]:
output = generate_answer(model,
                         tokenizer,
                         "What are the best women shoes from Geox?")
pprint(output)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


('\n'
 '\n'
 'Hello! Here are some women shoes from Geox that you might like:\n'
 "- Geox Women's D Nebula a Sneaker (£35.00 - £108.51)\n"
 "- Geox Women's D Rubidia a Sneaker (£32.99 - £61.51)\n"
 "- Geox Women's D Kaula a Sneaker (£25.00 - £61.51)\n"
 "- Geox Women's D Kaula a Sneaker (£25.00 - £61.51)\n"
 "- Geox Women's D Rubidia a Sneaker (£32.99 - £61.51)\n"
 "- Geox Women's D Rubidia a Sneaker (£32.99 - £61.51) ")


In [33]:
output = generate_answer(model,
                         tokenizer,
                         "Who are you?")
pprint(output)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


('\n'
 '\n'
 "Hello! I'm your helpful assistant for the Amazon Shoe department. My task is "
 "to help you find the best shoes from around the world! Whether you're "
 'looking for a new pair of running shoes, a stylish pair of boots, or a '
 "comfortable pair of sandals, I'm here to help. What kind of shoes are you "
 'looking for today? ')


In [36]:
output = generate_answer(model,
                         tokenizer,
                         "Tell me a story about VRL LAB.")
pprint(output)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


('\n'
 '\n'
 "Hello! Here's a story about VRL LAB:\n"
 'VRL LAB is a company that specializes in creating high-quality virtual '
 'reality shoes. Their shoes are designed to provide the best possible '
 'experience for those who want to explore the world of virtual reality.\n'
 'One day, a customer walked into the VRL LAB store and asked the sales '
 'associate, "Can you show me some shoes from VRL LAB?" The sales associate '
 'smiled and said, "Hello! Here are some shoes from VRL LAB that you might '
 'find interesting:\n'
 "- VRL LAB Men's VRL 1.0 Trainers (£59.90 - £74.90)\n"
 "- VRL LAB Men's VRL 1.0 Trainers (£59.90 - £74.90)\n"
 '- VRL LAB Men\'s VRL 1.0 Trainers (£59.90 - £74.90)" ')


In [37]:
output = generate_answer(model,
                         tokenizer,
                         "Why black people are like that :/")
pprint(output)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


('\n'
 '\n'
 'Hello! Here are some shoes from black that you might like:\n'
 "- New Balance Men's 574v2 Sneaker (£40.30 - £260.19)\n"
 "- adidas Men's Terrex Ax2r Mid Waterproof Hiking Boot, Black (£103.43 - "
 '£162.50)\n'
 "- adidas Men's Tour360 Xt Spikeless Boa Golf Shoe (£130.00 - £270.50)\n"
 "- adidas Men's Terrex Ax2r Mid Waterproof Hiking Boot, Black (£103.43 - "
 '£162.50)\n'
 "- adidas Men's Terrex Ax2r Mid Waterproof Hiking Boot, Black (£103.43 - "
 '£162.50)\n'
 "- adidas Men's Terrex Ax2r Mid Waterproof Hiking Boot, Black (£103.43 - "
 '£162.50) ')
