In [None]:
!pip install datasets
!pip install transformers -U
!pip install accelerate -U
!pip install trl
!pip install bitsandbytes
!pip uninstall -y wandb


Found existing installation: wandb 0.22.1
Uninstalling wandb-0.22.1:
  Successfully uninstalled wandb-0.22.1


In [None]:
!pip install peft



In [None]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
from datasets import load_dataset

DATASET_NAME = "ChrisHayduk/Llama-2-SQL-Dataset"
dataset = load_dataset(DATASET_NAME)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
full_training_dataset = dataset["train"]
shuffled = full_training_dataset.shuffle()
training_dataset = shuffled.select(range(1000)) # selects only the first 1k examples for fine tuning.

In [None]:
import bitsandbytes as bnb
from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16"
)

In [None]:
import transformers
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

MODEL_NAME = "NousResearch/Llama-2-7b-hf"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quantization_config,
    device_map="auto")

model.config.use_cache = True

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def construct_datapoint(x):
    combined = x['input'] + x['output']
    return tokenizer(combined, padding=True)

training_dataset = training_dataset.map(construct_datapoint)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
print(training_dataset)

Dataset({
    features: ['input', 'output', 'input_ids', 'attention_mask'],
    num_rows: 1000
})


In [None]:
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training
)


peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=['q_proj', 'k_proj', 'down_proj', 'v_proj', 'gate_proj', 'o_proj'],
    lora_dropout=0.05,  #nerfs the model, prevents overfitting
    task_type="CAUSAL_LM"
)

model = prepare_model_for_kbit_training(model)  #4bit training, defined earlier
model = get_peft_model(model, peft_config)  #all layers except attention layers are frozen

generation_configuration = model.generation_config
generation_configuration.pad_token_id = tokenizer.eos_token_id
generation_configuration.eos_token_id = tokenizer.eos_token_id
generation_configuration.max_new_tokens = 256
generation_configuration.temperature = 0.7
generation_configuration.top_p = 0.9
generation_configuration.do_sample = True

# Given a list of probs, we need to choose the next token

In [None]:
def generate(prompt):

    generation_configuration.max_new_tokens = 20

    encoded = tokenizer.encode(prompt, add_special_tokens=True, return_tensors = "pt").to(device)
    with torch.inference_mode():
        out = model.generate(input_ids=encoded, generation_config=generation_configuration, repetition_penalty=2.0)
    string_decoded = tokenizer.decode(out[0], clean_up_tokenization_spaces=True)
    print(string_decoded)


In [None]:
generate('Tonights the night we')

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


<s> Tonights the night we take it higher,
 Bedeutet das im Englischen „we are taking things to a whole


In [None]:
train_arguments = transformers.TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # simulate a larger batch size
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=True,
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    output_dir="fine_tuning")

trainer = transformers.Trainer(
    model=model,
    train_dataset=training_dataset,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
    args=train_arguments
)

model.config.use_cache = False


In [None]:


trainer.train()


  return fn(*args, **kwargs)


Step,Training Loss


TrainOutput(global_step=250, training_loss=0.6514481811523437, metrics={'train_runtime': 866.7589, 'train_samples_per_second': 1.154, 'train_steps_per_second': 0.288, 'total_flos': 5028690650480640.0, 'train_loss': 0.6514481811523437, 'epoch': 1.0})

In [None]:
evaluation_dataset = dataset["eval"].shuffle()

sample_sql_question = evaluation_dataset[0]["input"]
correct_answer = evaluation_dataset[0]["output"]

generate(sample_sql_question)



`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
Caching is incompatible with gradient checkpointing in LlamaDecoderLayer. Setting `past_key_values=None`.
  return fn(*args, **kwargs)


<s> Below is an instruction that describes a SQL generation task, paired with an input that provides further context about the available table schemas. Write SQL code that appropriately answers the request.

### Instruction:
What was the smallest crowd of vfl park?

### Input:
CREATE TABLE table_name_83 (crowd INTEGER, venue VARCHAR)

### Response:  SELECT belowbelowBel above Below Below Below Below Below Below Below Below Below Below Below Below Below write below


In [None]:
correct_answer

# Despite barebones training, we should be able to see clear signs of
# of successful verticalisation: in this case, understanding of SQL syntax.

'SELECT MIN(crowd) FROM table_name_83 WHERE venue = "vfl park"'