In [1]:
# kaggle format
%%capture
!pip install pip3-autoremove
!pip-autoremove torch torchvision torchaudio -y
!pip install unsloth

import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Colab format
'''
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Install Flash Attention 2 for softcapping support
import torch
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install --no-deps packaging ninja einops "flash-attn>=2.6.3"
'''

In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gemma-2-2b-it-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


==((====))==  Unsloth 2024.9.post3: Fast Gemma2 patching. Transformers = 4.45.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.22G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/209 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.9.post3 patched 26 layers with 26 QKV layers, 26 O layers and 26 MLP layers.


In [4]:
from huggingface_hub import login
hf_token = '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'

login(hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [5]:
torch_dtype = torch.float16
attn_implementation = "eager"

In [6]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear8bitLt
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    return list(lora_module_names)

def print_mdules(model):
    lora_module_names = set()
    for name, module in model.named_modules():
        print(f'name:{name} - module:{module}')

def find_all_linear_names_old(model):
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear) or isinstance(module, torch.nn.Embedding) or isinstance(module, torch.nn.Conv2d) or isinstance(module, transformers.pytorch_utils.Conv1D):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    return list(lora_module_names)

modules = find_all_linear_names(model)

In [7]:
from datasets import load_dataset

if False:
    ds = load_dataset("b-mc2/sql-create-context", split='train')
else:
    ds = load_dataset("Clinton/Text-to-sql-v1", split='train')

README.md:   0%|          | 0.00/118 [00:00<?, ?B/s]

texttosqlv2.jsonl:   0%|          | 0.00/635M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/262208 [00:00<?, ? examples/s]

In [8]:
count = 10000
iteration = 0
start_index = count * iteration
end_index = min(start_index + count, len(ds))
print(f'{start_index} ~ {end_index}')

0 ~ 10000


In [9]:
ds = ds.select(range(start_index, end_index))
ds

Dataset({
    features: ['instruction', 'input', 'response', 'source', 'text'],
    num_rows: 10000
})

In [10]:
def format_chat_template(row):
    row_json = [
        {"role": "user", "content": f"""
Use the below SQL tables schemas paired with instruction that describes a task. make SQL query that appropriately completes the request for the provided tables. And make SQL query according the steps.
{row["input"]}
step 1. check columns that user wants.
step 2. check condition that user wants.
step 3. make SQL query to get every information that user wants.

{row["instruction"]}
"""},
        {"role": "model", "content": row["response"]}
    ]
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

ds = ds.map(
    format_chat_template,
    num_proc=4,
)

ds

Map (num_proc=4):   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset({
    features: ['instruction', 'input', 'response', 'source', 'text'],
    num_rows: 10000
})

In [11]:
ds['text'][0]

'<bos><start_of_turn>user\nUse the below SQL tables schemas paired with instruction that describes a task. make SQL query that appropriately completes the request for the provided tables. And make SQL query according the steps.\nCREATE TABLE table_name_77 (\n    home_team VARCHAR,\n    away_team VARCHAR\n)\nstep 1. check columns that user wants.\nstep 2. check condition that user wants.\nstep 3. make SQL query to get every information that user wants.\n\nName the home team for carlton away team<end_of_turn>\n<start_of_turn>model\nSELECT home_team FROM table_name_77 WHERE away_team = "carlton"<end_of_turn>\n'

In [12]:
ds = ds.train_test_split(test_size=0.01)
ds

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'response', 'source', 'text'],
        num_rows: 9900
    })
    test: Dataset({
        features: ['instruction', 'input', 'response', 'source', 'text'],
        num_rows: 100
    })
})

In [13]:
import torch.cuda
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig
from tensorflow.keras.optimizers import Adam

In [16]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

training_arguments = TrainingArguments(
    output_dir='output',
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim = "adamw_8bit",
    num_train_epochs=1,
    eval_strategy="steps",
    eval_steps=0.2,
    warmup_steps = 10,
#    max_steps = 1000,
    learning_rate = 2e-4,
    fp16 = not is_bfloat16_supported(),
    bf16 = is_bfloat16_supported(),
    logging_steps = 10,
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = training_arguments,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Map (num_proc=2):   0%|          | 0/9900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

In [17]:
model.config.use_cache = False
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 9,900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 4,950
 "-____-"     Number of trainable parameters = 20,766,720


Step,Training Loss,Validation Loss
990,0.3598,0.322311
1980,0.2849,0.29361
2970,0.294,0.274669
3960,0.3294,0.262802
4950,0.2867,0.257756


TrainOutput(global_step=4950, training_loss=0.34689224175732547, metrics={'train_runtime': 6651.3311, 'train_samples_per_second': 1.488, 'train_steps_per_second': 0.744, 'total_flos': 6.1175363889024e+16, 'train_loss': 0.34689224175732547, 'epoch': 1.0})

In [18]:
FastLanguageModel.for_inference(model)
table_schema = 'CREATE TABLE person ( name VARCHAR, age INTEGER, address VARCHAR )'
user_qery = 'people whoes ages are older than 27 and name starts with k'

messages = [
    {
        "role": "user",
        "content": f"""Use the below SQL tables schemas paired with instruction that describes a task. make SQL query that appropriately completes the request for the provided tables. And make SQL query according the steps.
{table_schema}
step 1. check columns that user wants.
step 2. check condition that user wants.
step 3. make SQL query to get every information that user wants.

{user_qery}
"""
    }
]

formated_messages = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, return_tensors="pt")
input_ids = tokenizer(formated_messages, return_tensors="pt")

In [19]:
formated_messages

'<bos><start_of_turn>user\nUse the below SQL tables schemas paired with instruction that describes a task. make SQL query that appropriately completes the request for the provided tables. And make SQL query according the steps.\nCREATE TABLE person ( name VARCHAR, age INTEGER, address VARCHAR )\nstep 1. check columns that user wants.\nstep 2. check condition that user wants.\nstep 3. make SQL query to get every information that user wants.\n\npeople whoes ages are older than 27 and name starts with k<end_of_turn>\n<start_of_turn>model\n'

In [35]:
response = model.generate(**input_ids, max_new_tokens=32, repetition_penalty=1.1)
response = str(tokenizer.batch_decode(response))
#print(str(response))
print(f'assistant: {response[response.find("<start_of_turn>model")+22:-1].strip()}')

assistant: SELECT T1.name FROM person AS T1 JOIN person AS T2 ON T1.age > 27 WHERE T1.name LIKE '%k"


In [36]:
model.save_pretrained("gemma-2-2b-it-nl2sql") # Local saving
tokenizer.save_pretrained("gemma-2-2b-it-nl2sql")
model.push_to_hub("gemma-2-2b-it-nl2sql") # Online saving
tokenizer.push_to_hub("gemma-2-2b-it-nl2sql") # Online saving

README.md:   0%|          | 0.00/583 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/83.1M [00:00<?, ?B/s]

Saved model to https://huggingface.co/gemma-2-2b-it-nl2sql


No files have been modified since last commit. Skipping to prevent empty commit.


In [37]:
model.save_pretrained_merged("gemma-2-2b-it-nl2sql", tokenizer, save_method = "merged_16bit",)
model.push_to_hub_merged("gemma-2-2b-it-nl2sql", tokenizer, save_method = "merged_16bit")

Unsloth: You have 2 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which will take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 2.2G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 19.31 out of 31.36 RAM for saving.


100%|██████████| 26/26 [00:00<00:00, 35.83it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving gemma-2-2b-it-nl2sql/pytorch_model-00001-of-00002.bin...
Unsloth: Saving gemma-2-2b-it-nl2sql/pytorch_model-00002-of-00002.bin...
Done.
Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 19.29 out of 31.36 RAM for saving.


100%|██████████| 26/26 [00:00<00:00, 38.34it/s]


Unsloth: Saving to organization with address 9-coding/gemma-2-2b-it-nl2sql
Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving to organization with address 9-coding/gemma-2-2b-it-nl2sql
Unsloth: Saving 9-coding/gemma-2-2b-it-nl2sql/pytorch_model-00001-of-00002.bin...
Unsloth: Saving 9-coding/gemma-2-2b-it-nl2sql/pytorch_model-00002-of-00002.bin...
Unsloth: Uploading all files... Please wait...


  0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/241M [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/None/gemma-2-2b-it-nl2sql
