In [1]:
import os

# Point the Hugging Face cache root at the workspace volume. This runs in the
# first cell, ahead of the cell that imports `transformers`.
os.environ.update({"HF_HOME": "/workspace/hf/"})

In [2]:
import os

# Expose only GPU index 0 to this process; assigned before `torch` is imported.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /workspace/generative_models/.venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.9
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /workspace/generative_models/.venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Build the `max_memory` map: every visible GPU index gets the same budget,
# namely the free memory reported by `torch.cuda.mem_get_info()` truncated to
# whole GiB, minus 2.
n_gpus = torch.cuda.device_count()
free_bytes, _total_bytes = torch.cuda.mem_get_info()
free_in_GB = int(free_bytes / 1024**3)
max_memory = {gpu_index: f"{free_in_GB-2}GB" for gpu_index in range(n_gpus)}
max_memory

{0: '41GB'}

In [4]:
# Hub checkpoint id shared by the model and its tokenizer.
checkpoint = "tiiuae/falcon-7b"

# Load the causal-LM weights with 8-bit loading enabled and the whole model
# mapped to device 0. `trust_remote_code=True` is passed through as well.
model_kwargs = dict(load_in_8bit=True, device_map={"": 0}, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(checkpoint, **model_kwargs)

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Loading checkpoint shards: 100%|██████████| 2/2 [00:59<00:00, 29.53s/it]


In [None]:
from peft import prepare_model_for_kbit_training

# Turn on gradient checkpointing, then pass the loaded model through PEFT's
# k-bit training preparation; `model` is rebound to whatever that call returns.
# NOTE(review): this cell is recorded as `In [None]` (no execution count) while
# its neighbours are numbered 4 and 5, so it does not appear to have been run
# in the saved session -- confirm it executes under Restart & Run All.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [5]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: any object whose ``named_parameters()`` yields ``(name, param)``
            pairs where ``param`` provides ``numel()`` and ``requires_grad``.

    Prints a single line with the trainable element count, the total element
    count and the trainable share in percent. A model without any parameters
    reports ``0.0`` percent instead of raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio so an empty model does not divide by zero.
    percent = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {percent}"
    )

In [6]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings: r=16, lora_alpha=32, 5% dropout, bias="none", applied
# only to modules named "query_key_value", for a causal-LM task.
lora_settings = dict(
    task_type="CAUSAL_LM",
    target_modules=["query_key_value"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)
config = LoraConfig(**lora_settings)

# Wrap the current model with the adapters and report its trainable share.
model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 4718592 || all params: 6926439296 || trainable%: 0.06812435363037071


In [7]:
# Verifying the datatypes: tally parameter elements per dtype, then print each
# dtype with its element count and its fraction of the overall total.
dtypes = {}
for _name, param in model.named_parameters():
    dtypes[param.dtype] = dtypes.get(param.dtype, 0) + param.numel()
total = sum(dtypes.values())
for dtype_key, n_elements in dtypes.items():
    print(dtype_key, n_elements, n_elements / total)

torch.float16 295768960 0.042701444040779475
torch.int8 6625951744 0.9566173124229168
torch.float32 4718592 0.0006812435363037072


In [8]:
import transformers
from datasets import load_dataset, Dataset
import pandas as pd
import numpy as np

In [9]:
# extract, load, and transform OpenAssistant/oasst1 chatbot dataset

# raise pandas' display limits so printed frames are easier to read
display_options = {
    "display.max_rows": 500,
    "display.max_columns": 500,
    "display.width": 1000,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [10]:
# fetch the OpenAssistant/oasst1 dataset through Hugging Face `datasets`
ds = load_dataset("OpenAssistant/oasst1")
# materialise the "train" split as a pandas DataFrame
train_split = ds["train"]
df = train_split.to_pandas()

Found cached dataset parquet (/workspace/hf/datasets/OpenAssistant___parquet/OpenAssistant--oasst1-2960c57d7e52ab15/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 2/2 [00:00<00:00,  6.56it/s]


In [11]:
# Flatten every message tree into a single "<human>: ... / <bot>: ..." transcript,
# following the rank-0.0 reply at each assistant turn.


def _top_ranked(replies):
    """Return the rows of ``replies`` whose ``rank`` equals 0.0 (possibly none)."""
    return replies[replies["rank"] == 0.0]


def _tree_to_text(tree):
    """Render one message tree as alternating ``<human>:`` / ``<bot>:`` lines.

    Args:
        tree: the rows of a single message tree, already sorted by
            ``created_date``. The first row is treated as the root prompt.

    Returns:
        The transcript string, or ``None`` when the root prompt has no reply
        with rank 0.0 (such trees are skipped by the caller).
    """
    root = tree.iloc[0]
    bot = _top_ranked(tree[tree.parent_id == root.message_id])
    if bot.empty:
        return None
    text = "<human>: " + root.text
    text += "\n" + "<bot>: " + bot.iloc[0].text

    while True:
        # next prompt: the first (earliest-created) reply to the current bot message
        prompts = tree[tree.parent_id == bot.message_id.iloc[0]]
        if prompts.empty:
            break
        prompt = prompts.iloc[0]
        text += "\n" + "<human>: " + prompt.text

        # rank-0.0 answer to that prompt; when there is none the transcript
        # ends on the human turn, as before
        bot = _top_ranked(tree[tree.parent_id == prompt.message_id])
        if bot.empty:
            break
        text += "\n" + "<bot>: " + bot.iloc[0].text
    return text


# Sorted unique tree ids; no longer drives the loop, but the name stays bound
# in case other cells reference it.
message_tree_ids = np.unique(np.array(df["message_tree_id"]))
messages = {"message_tree_id": [], "message_tree_text": []}

# One groupby pass replaces a full-frame `df.query` per tree id. Groups are
# visited in sorted key order and each group is sorted by creation time before
# it is walked.
for message_tree_id, one_message_tree in df.groupby("message_tree_id"):
    text = _tree_to_text(one_message_tree.sort_values("created_date"))
    if text is None:
        continue
    messages["message_tree_id"].append(message_tree_id)
    messages["message_tree_text"].append(text)

message_df = pd.DataFrame.from_dict(messages)

In [12]:
# spot-check one flattened conversation at a fixed position

i = 41
sample_row = message_df.iloc[i]
print(sample_row.message_tree_text)
print('\n')
print(sample_row)

# check whole dataset

# message_df

<human>: What is the capital of Japan?
<bot>: Tokyo is the capital of Japan.
<human>: And what is the capital of the Moon?
<bot>: Since the Moon is not currently inhabited by humans, there is currently no capital of the Moon.


message_tree_id                   01505c0f-2d68-4206-acfc-7ab39033c75a
message_tree_text    <human>: What is the capital of Japan?\n<bot>:...
Name: 41, dtype: object
