<a href="https://colab.research.google.com/github/9-coding/Google_ML_Bootcamp_5th/blob/main/experiment/gemma2_without_unsloth.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U accelerate
%pip install -U peft
%pip install -U trl

In [2]:
import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

In [3]:
base_model = "google/gemma-2-2b-it"
new_model = "Gemma-2-2b-it-sql-generator"

# Loading the model and tokenizer

In [4]:
from huggingface_hub import login
hf_token = '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'

login(hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [5]:
torch_dtype = torch.float16
attn_implementation = "eager"

In [6]:
# QLoRA config
#bnb_config = BitsAndBytesConfig(
#    load_in_4bit=True,
#    bnb_4bit_quant_type="nf4",
#    bnb_4bit_compute_dtype=torch_dtype,
#    bnb_4bit_use_double_quant=True,
#)

bnb_config = BitsAndBytesConfig(load_in_8bit=True)

In [7]:
# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

# Adding the adapter to the layer

In [8]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear8bitLt
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    return list(lora_module_names)

def print_mdules(model):
    lora_module_names = set()
    for name, module in model.named_modules():
        print(f'name:{name} - module:{module}')

def find_all_linear_names_old(model):
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear) or isinstance(module, torch.nn.Embedding) or isinstance(module, torch.nn.Conv2d) or isinstance(module, transformers.pytorch_utils.Conv1D):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    return list(lora_module_names)

modules = find_all_linear_names(model)

In [9]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
model, tokenizer = setup_chat_format(model, tokenizer)
model = get_peft_model(model, peft_config)

## Loading the dataset


In [10]:
from datasets import load_dataset

if False:
    ds = load_dataset("b-mc2/sql-create-context", split='train')
else:
    ds = load_dataset("Clinton/Text-to-sql-v1", split='train')



README.md:   0%|          | 0.00/118 [00:00<?, ?B/s]

texttosqlv2.jsonl:   0%|          | 0.00/635M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/262208 [00:00<?, ? examples/s]

In [11]:
count = 10000
iteration = 0
start_index = count * iteration
end_index = min(start_index + count, len(ds))
print(f'{start_index} ~ {end_index}')

0 ~ 10000


In [12]:
ds = ds.select(range(start_index, end_index))
ds

Dataset({
    features: ['instruction', 'input', 'response', 'source', 'text'],
    num_rows: 10000
})

In [13]:
def format_chat_template(row):
    row_json = [
        {"role": "system", "content": f"""
Use the below SQL tables schemas paired with instruction that describes a task. make SQL query that appropriately completes the request for the provided tables. And make SQL query according the steps.
{row["input"]}
step 1. check columns that user wants.
step 2. check condition that user wants.
step 3. make SQL query to get every information that user wants.
"""},
        {"role": "user", "content": row["instruction"]},
        {"role": "assistant", "content": row["response"]}
    ]
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

ds = ds.map(
    format_chat_template,
    num_proc=4,
)

ds

Map (num_proc=4):   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset({
    features: ['instruction', 'input', 'response', 'source', 'text'],
    num_rows: 10000
})

In [14]:
ds['text'][0]

'<|im_start|>system\n\nUse the below SQL tables schemas paired with instruction that describes a task. make SQL query that appropriately completes the request for the provided tables. And make SQL query according the steps.\nCREATE TABLE table_name_77 (\n    home_team VARCHAR,\n    away_team VARCHAR\n)\nstep 1. check columns that user wants.\nstep 2. check condition that user wants.\nstep 3. make SQL query to get every information that user wants.\n<|im_end|>\n<|im_start|>user\nName the home team for carlton away team<|im_end|>\n<|im_start|>assistant\nSELECT home_team FROM table_name_77 WHERE away_team = "carlton"<|im_end|>\n'

In [15]:
ds = ds.train_test_split(test_size=0.01)
ds

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'response', 'source', 'text'],
        num_rows: 9900
    })
    test: Dataset({
        features: ['instruction', 'input', 'response', 'source', 'text'],
        num_rows: 100
    })
})

## Complaining and training the model

In [16]:
import torch.cuda
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig
from tensorflow.keras.optimizers import Adam

In [18]:
# Setting Hyperparamter
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="adamw_8bit",
    num_train_epochs=1,
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=10,
    warmup_steps=10,
    #max_steps=1000,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="none"
)

tokenizer.padding_side = 'right'
# Setting sft parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    peft_config=peft_config,
    max_seq_length= 512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)

model.config.use_cache = False
trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/9900 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss
990,0.4224,0.368355
1980,0.4161,0.34165
2970,0.3133,0.315836
3960,0.1037,0.298805
4950,0.3523,0.291324




TrainOutput(global_step=4950, training_loss=0.34239731673038365, metrics={'train_runtime': 10670.0741, 'train_samples_per_second': 0.928, 'train_steps_per_second': 0.464, 'total_flos': 3.400077859883827e+16, 'train_loss': 0.34239731673038365, 'epoch': 1.0})

In [19]:
model.config.use_cache = True

In [20]:
trainer.model.save_pretrained(new_model)
trainer.model.push_to_hub(new_model, use_temp_dir=False)



adapter_model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/9-coding/Gemma-2-2b-it-sql-generator/commit/617c9d4956c4887485660888663de9f8bf11c5dd', commit_message='Upload model', commit_description='', oid='617c9d4956c4887485660888663de9f8bf11c5dd', pr_url=None, pr_revision=None, pr_num=None)