https://ai.google.dev/gemma/docs/core/huggingface_text_finetune_qlora

https://huggingface.co/datasets/philschmid/gretel-synthetic-text-to-sql

In [None]:
import json
from random import randint
import re

from datasets import load_dataset

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForImageTextToText, BitsAndBytesConfig, pipeline

from peft import LoraConfig, PeftModel

from trl import SFTConfig, SFTTrainer

In [2]:
system_prompt = """Some system prompt"""
n_example = 4


def create_conversation(sample):
    return {
        "messages": [
            # Gemma 3 doesn't support system prompts per se
            # {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": sample['parent_text'],
            },
            {"role": "assistant", "content": sample['comment_body']},
        ]
    }


def filter_out_nulls(sample):
    return sample['parent_text'] is not None and sample['comment_body'] is not None


dataset = load_dataset("csv", name="csv-for-gemma3-hf", split="train", data_files="conversations.csv")
dataset = dataset.filter(filter_out_nulls)
dataset = dataset.shuffle(seed=42)
column_names_orig = dataset.column_names
print(column_names_orig)
print(json.dumps(dataset[n_example], indent=2))

['timestamp', 'comment_id', 'comment_body', 'parent_text']
{
  "timestamp": "2019-05-02 08:52:13 UTC",
  "comment_id": "emau7u9",
  "comment_body": "Reasons are overrated.\n\nWe just are.",
  "parent_text": "There's no reason for us to exist."
}


In [3]:
dataset = dataset.map(create_conversation, remove_columns=column_names_orig, batched=False)
dataset = dataset.train_test_split(test_size=0.1)
print(json.dumps(dataset['train'][n_example], indent=2))

{
  "messages": [
    {
      "content": "We only know that the machine displaying the board is running Ubuntu. The thousands of machines actually running the AlphaGo might be running something else altogether.",
      "role": "user"
    },
    {
      "content": "I wouldn't be surprised if server side is also Ubuntu. AlphaGo is just a bunch of instances in Google cloud, and Ubuntu is one of the most popular OS choices for cloud computing.",
      "role": "assistant"
    }
  ]
}


In [4]:
# Hugging Face model id
# model_id = "google/gemma-3-1b-it-qat-q4_0-unquantized"
model_id = "google/gemma-3-4b-it-qat-q4_0-unquantized"
# model_id = "google/gemma-3-27b-it-qat-q4_0-unquantized"

# Select model class based on id
if model_id == "google/gemma-3-1b-it-qat-q4_0-unquantized":
    model_class = AutoModelForCausalLM
else:
    model_class = AutoModelForImageTextToText

# Check if GPU benefits from bfloat16
if torch.cuda.get_device_capability()[0] >= 8:
    torch_dtype = torch.bfloat16
else:
    torch_dtype = torch.float16

# Define model init arguments
model_kwargs = dict(
    attn_implementation="eager",  # Use "flash_attention_2" when running on Ampere or newer GPU
    torch_dtype=torch_dtype,  # What torch dtype to use, defaults to auto
    device_map="auto",  # Let torch decide how to load the model
)

# BitsAndBytesConfig: Enables 4-bit quantization to reduce model size/memory usage
model_kwargs["quantization_config"] = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=model_kwargs['torch_dtype'],
    bnb_4bit_quant_storage=model_kwargs['torch_dtype'],
)

# Load model and tokenizer
model = model_class.from_pretrained(model_id, **model_kwargs)
tokenizer = AutoTokenizer.from_pretrained(model_id)  # Load the Instruction Tokenizer to use the official Gemma template

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=16,
    bias="none",
    target_modules="all-linear",
    task_type="CAUSAL_LM",
    # make sure to save the lm_head and embed_tokens as you train the special tokens
    modules_to_save=["lm_head", "embed_tokens"],
)

In [6]:
args = SFTConfig(
    output_dir="gemma-text-to-sql",  # directory to save and repository id
    max_seq_length=512,  # max sequence length for model and packing of the dataset
    packing=True,  # Groups multiple samples in the dataset into a single sequence
    # full training run
    # num_train_epochs=1,  # number of training epochs
    # save_steps=1000,
    # short training run
    max_steps=100,
    save_steps=10,
    per_device_train_batch_size=1,  # batch size per device during training
    gradient_accumulation_steps=2,  # number of steps before performing a backward/update pass
    gradient_checkpointing=True,  # use gradient checkpointing to save memory
    # optim="adamw_torch_fused",
    optim="adamw_8bit",
    logging_strategy="steps",
    logging_steps=1,  # log every 10 steps
    save_strategy="steps",
    learning_rate=2e-4,  # learning rate, based on QLoRA paper
    fp16=True if torch_dtype == torch.float16 else False,  # use float16 precision
    bf16=True if torch_dtype == torch.bfloat16 else False,  # use bfloat16 precision
    max_grad_norm=0.3,  # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,  # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",  # use constant learning rate scheduler
    weight_decay=0.01,
    seed=42,
    push_to_hub=False,
    report_to="tensorboard",  # report metrics to tensorboard
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": True,  # Add EOS token as separator token between examples
    },
)

In [7]:
# Create Trainer object
trainer = SFTTrainer(
    model=model, args=args, train_dataset=dataset['train'], peft_config=peft_config, processing_class=tokenizer
)



Tokenizing train dataset:   0%|          | 0/39505 [00:00<?, ? examples/s]

Packing train dataset:   0%|          | 0/39505 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [8]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
1,9.0601
2,8.4148
3,8.3008
4,7.2554
5,7.6637
6,6.0631
7,6.2383
8,6.3768
9,6.2906
10,5.7523


TrainOutput(global_step=100, training_loss=5.679462494850159, metrics={'train_runtime': 224.4429, 'train_samples_per_second': 0.891, 'train_steps_per_second': 0.446, 'total_flos': 2656534017893184.0, 'train_loss': 5.679462494850159})

In [9]:
trainer.save_model()

In [10]:
# free the memory again
del model
del trainer
torch.cuda.empty_cache()

In [11]:
# Load Model base model
model = model_class.from_pretrained(model_id, low_cpu_mem_usage=True)

# Merge LoRA and base model and save
peft_model = PeftModel.from_pretrained(model, args.output_dir)
merged_model = peft_model.merge_and_unload()
merged_model.save_pretrained("merged_model", safe_serialization=True, max_shard_size="2GB")

processor = AutoTokenizer.from_pretrained(args.output_dir)
processor.save_pretrained("merged_model")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

('merged_model/tokenizer_config.json',
 'merged_model/special_tokens_map.json',
 'merged_model/chat_template.jinja',
 'merged_model/tokenizer.json')

In [12]:
# Load Model with PEFT adapter
model = model_class.from_pretrained(
    args.output_dir,
    device_map="auto",
    torch_dtype=torch_dtype,
    attn_implementation="eager",
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
# Load the model and tokenizer into the pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Load a random sample from the test dataset
rand_idx = randint(0, len(dataset["test"]))
test_sample = dataset["test"][rand_idx]

# Convert as test example into a prompt with the Gemma template
stop_token_ids = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<end_of_turn>")]
prompt = pipe.tokenizer.apply_chat_template(test_sample["messages"][:2], tokenize=False, add_generation_prompt=True)

# Generate our query.
outputs = pipe(
    prompt,
    max_new_tokens=256,
    do_sample=False,
    temperature=0.1,
    top_k=50,
    top_p=0.1,
    eos_token_id=stop_token_ids,
    disable_compile=True,
)

print()
print(f"Prompt:\n{test_sample['messages'][0]['content']}")
print()
print(f"Original Answer:\n{test_sample['messages'][1]['content']}")
print()
print(f"Generated Answer:\n{outputs[0]['generated_text'][len(prompt):].strip()}")

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



Prompt:
I've got a question, how do you see the interference pattern for the double slit with electrons? Once again this is just basic bottom of the barrel ideas I'd thought of after seeing a couple of the mit opencourseware quantum lectures but I was thinking maybe after passing the slit the electrons hit, well, a fluorescent screen where it would light up where they hit? That idea also came from the crt too but when I think about the Hitachi technology double slit experiment video, I could see it being something like that

Original Answer:
Yeah, fluorescence might work. Or you could have some kind of moving probe, getting hit by electrons, generating a slight current, which varies when you move the probe.

Regardless, to truly show quantum effects, you need to shoot only one thing at a time. *This is extraordinarily hard to do*, because it's very, very hard to measure what comes out at the other end. You will have lots of noise, and not much signal.

In normal conditions, when you s