# Colab: Fine-tune with LoRA (PEFT) on your combined dataset
 - Upload `combined_dataset.jsonl` (JSONL with {"input": "...", "output": "..."} per line)
 - Uses 4-bit quantization (bitsandbytes) + LoRA adapters to keep GPU use small
 - Conservative defaults: small LR, few epochs. Adjust carefully.

In [None]:
# (install dependencies) - run once
# NOTE: restart the runtime after this cell if it changed the version of an
# already-imported package.
!pip install -q --upgrade pip
# core libs
!pip install -q transformers accelerate datasets peft bitsandbytes safetensors
# optional: PEFT from the GitHub `main` branch (this replaces the PyPI release
# installed by the line above), plus the `evaluate` package
!pip install -q "git+https://github.com/huggingface/peft.git@main"
!pip install -q evaluate

[31mERROR: Operation cancelled by user[0m[31m
[0mTraceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/cli/base_command.py", line 107, in _run_wrapper
    status = _inner_run()
             ^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/cli/base_command.py", line 98, in _inner_run
    return self.run(options, args)
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/cli/req_command.py", line 71, in wrapper
    return func(self, options, args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/commands/install.py", line 393, in run
    requirement_set = resolver.resolve(
                      ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/resolution/resolvelib/resolver.py", line 79, in resolve
    collected = self.factory.collect_root_requirements(root_reqs)
                ^^^^^^^^^^^^^^^

KeyboardInterrupt: 

In [None]:
# (choose model + runtime hints)
# Pick a model that fits your runtime and set MODEL_NAME to its Hugging Face repo id.
# - Smaller / safer: a ~3B model, such as the OpenLLaMA 3B default below
# - Larger (try if you have luck on a Colab T4): a Mistral-7B-Instruct checkpoint (may OOM)
# - If 7B OOMs, fall back to a 3B–4B model
MODEL_NAME = "openlm-research/open_llama_3b_v2"  # <-- change to e.g. a Mistral-7B-Instruct repo id if you want (it may OOM)
OUTPUT_DIR = "./lora_adapter"  # directory the save cell writes the LoRA adapter to
DEVICE = "cuda"


In [None]:
# If the model requires an HF token or you want to push to the Hub, enter it here.
# Just press Enter to skip: later cells treat an empty value as "no token".
from getpass import getpass

# strip() removes the whitespace/newline that often comes along when a token is
# pasted, so a blank-looking entry stays falsy and a real token is sent clean.
HF_TOKEN = getpass("Enter your Hugging Face token (if needed):").strip()

In [None]:
# (upload or mount dataset)
# NOTE: this cell only imports the Colab `files` helper and records the dataset
# path; it does not upload or mount anything itself. Make sure the file is
# already present at the path below before running the following cells.
from google.colab import files
# after upload, use the uploaded filename below

dataset_file = "combined_dataset.jsonl"  # fallback; change to your path

print("Using dataset file:", dataset_file)

# Option B (alternative): mount Google Drive and point `dataset_file` at the
# file's location under the mount (no mounting code is included in this cell).


Using dataset file: combined_dataset.jsonl


In [None]:
# (quick peek at dataset)
!sed -n '1,5p' {dataset_file}
# show count
!wc -l {dataset_file}


{"input": "ME: ??", "output": "HER: ??"}
{"input": "ME: This too", "output": "HER: ThTs kirby"}
{"input": "ME: What about this", "output": "HER: Idk"}
{"input": "ME: ððððð", "output": "HER: ?"}
{"input": "ME: Wrong one", "output": "HER: Yes"}
59911 combined_dataset.jsonl


In [None]:
# (load dataset with datasets library)
from datasets import load_dataset
# Read the JSONL file with the generic "json" loader. As the printed summary
# shows, a single data file ends up under the "train" split.
ds = load_dataset("json", data_files=dataset_file, field=None)
# The dataset will have fields "input" and "output"
print(ds)


DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 59911
    })
})


In [None]:
# (preview few examples)
# Show up to five rows, clipping each field to 200 characters.
for ex in ds["train"].select(range(min(5, len(ds["train"])))):
    for label, key in (("IN:", "input"), ("OUT:", "output")):
        value = ex.get(key)
        # A null or non-string field cannot be sliced; print its repr instead
        # of crashing (the tokenization step tolerates such rows as well).
        print(label, value[:200] if isinstance(value, str) else repr(value))
    print("----")


IN: ME: ??
OUT: HER: ??
----
IN: ME: This too
OUT: HER: ThTs kirby
----
IN: ME: What about this
OUT: HER: Idk
----
IN: ME: ððððð
OUT: HER: ?
----
IN: ME: Wrong one
OUT: HER: Yes
----


In [None]:
# (tokenizer + model load - quantized + prepare for k-bit training)
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import LlamaTokenizer, LlamaForCausalLM

# Llama-specific slow tokenizer. NOTE(review): if MODEL_NAME is changed to a
# non-Llama architecture, switch this to AutoTokenizer.
tokenizer = LlamaTokenizer.from_pretrained(MODEL_NAME, use_fast=False)

# The weights are loaded exactly once, directly in 4-bit (below). A separate
# full fp16 copy of the model is not needed by any later cell and would only
# take several GB of GPU memory away from training.

# Bitsandbytes config for 4-bit loading (conservative settings)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

print("Loading model (quantized 4-bit). This may take a while...")
# NOTE(review): trust_remote_code=True allows the model repo to run its own
# Python code at load time; keep it only for repositories you trust.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    token=HF_TOKEN if HF_TOKEN else None,  # empty token -> anonymous access
)
print("Model loaded.")

tokenizer_config.json:   0%|          | 0.00/593 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/512k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/330 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message
`torch_dtype` is deprecated! Use `dtype` instead!


config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/6.85G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.85G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Loading model (quantized 4-bit). This may take a while...
Model loaded.


In [None]:
# (prepare for int/8/4-bit training and apply LoRA)
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

# prepare model for k-bit + gradient checkpoints
# (`model` is rebound to the object returned by the PEFT helper)
model = prepare_model_for_kbit_training(model)

# LoRA config - conservative defaults
# No `target_modules` is passed, so the choice of adapted layers is left to PEFT.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the prepared model with the LoRA adapters and report how many parameters
# will actually be trained (about 2.66M of 3.43B in the recorded run).
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 2,662,400 || all params: 3,429,136,000 || trainable%: 0.0776


In [None]:
# %% (tokenize dataset into input_ids and labels)
from transformers import default_data_collator

# Tokenization function: produce single string combining input and output for causal LM.
# Format: "<s>IN: ...\nOUT: ..." or a template you prefer.
def preprocess_function(examples, max_length=256):
    """Tokenize a batch of input/output pairs for causal-LM fine-tuning.

    Each pair is rendered as
    ``"### Instruction:\\n{input}\\n\\n### Response:\\n{output}"`` followed by
    the tokenizer's EOS token, so the model can learn where a reply ends.

    Args:
        examples: Batch mapping with parallel ``"input"`` and ``"output"``
            lists, as passed by ``Dataset.map(..., batched=True)``.
        max_length: Length every sequence is truncated/padded to.

    Returns:
        The tokenizer output with an added ``"labels"`` entry. Labels equal
        ``input_ids`` except at padding positions, which are set to -100 so
        they are ignored by the loss.
    """
    # Some tokenizers define no EOS token; append nothing in that case.
    eos = tokenizer.eos_token or ""

    texts = []
    for inp, out in zip(examples["input"], examples["output"]):
        # Null / non-string fields are treated as empty text.
        inp = inp if isinstance(inp, str) else ""
        out = out if isinstance(out, str) else ""
        texts.append(f"### Instruction:\n{inp}\n\n### Response:\n{out}{eos}")

    tokenized = tokenizer(
        texts, truncation=True, max_length=max_length, padding="max_length"
    )

    # Mask by attention_mask rather than by token id: the pad token may be the
    # same id as EOS, and the real EOS at the end of the reply must still be
    # learned. Without this mask, short messages padded to max_length make the
    # loss mostly about predicting padding.
    tokenized["labels"] = [
        [tok if keep else -100 for tok, keep in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized

# The tokenizer may ship without a pad token; fall back to EOS so that
# padding="max_length" in preprocess_function works.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize the train split in batches, drop the raw text columns, then
# shuffle with a fixed seed so runs are reproducible.
tokenized_ds = (
    ds["train"]
    .map(
        preprocess_function,
        batched=True,
        remove_columns=ds["train"].column_names,
    )
    .shuffle(seed=42)
)
print(tokenized_ds)

Map:   0%|          | 0/59911 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 59911
})


In [None]:
# %% (training args and trainer)
from transformers import Trainer, TrainingArguments

# Hyper-parameters for the LoRA run. Checkpoints go to ./lora_training; the
# final adapter is saved separately by a later cell.
training_args = TrainingArguments(
    output_dir="./lora_training",
    per_device_train_batch_size=2,   # try 2 on Colab T4, reduce to 1 if OOM
    gradient_accumulation_steps=8,   # effective batch = 2 * 8 = 16 sequences per optimizer step
    warmup_steps=50,
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=20,
    save_strategy="epoch",           # NOTE: with one epoch, nothing is checkpointed until the end
    save_total_limit=2,
    optim="paged_adamw_32bit",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds,
    # Examples are already padded to a fixed length, so the default collator
    # only has to stack them into tensors.
    data_collator=default_data_collator,
    # `processing_class` is the current name for the deprecated `tokenizer`
    # argument (which triggered a warning at this call).
    processing_class=tokenizer,
)

# Start training
trainer.train()

  trainer = Trainer(
  return fn(*args, **kwargs)


Step,Training Loss
20,6.0503
40,1.5156
60,0.3933
80,0.3435
100,0.3505


KeyboardInterrupt: 

In [None]:
# (save LoRA adapter)
# Only the (small) PEFT adapter weights are written to OUTPUT_DIR.
model.save_pretrained(OUTPUT_DIR)
print("Saved LoRA adapter to", OUTPUT_DIR)

# The tokenizer goes to a sibling directory, in case it was changed.
tokenizer.save_pretrained(f"{OUTPUT_DIR}_tokenizer")


In [None]:
# (inference demo - load base model + adapter for chatting)
# Reload base model (quantized) then apply adapter for inference
from peft import PeftModel

# load base model again (quantized) - if still in memory you can reuse `model` but here we show reload
# Reload the 4-bit base model; device_map="auto" already places it on the GPU.
base = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    # `token` is the same keyword the training-time load uses; it replaces the
    # deprecated `use_auth_token`.
    token=HF_TOKEN if HF_TOKEN else None,
)

# Attach the saved LoRA adapter. No explicit `.to("cuda")` afterwards: the
# quantized base is already device-mapped, so moving it again is redundant.
instruct_model = PeftModel.from_pretrained(base, OUTPUT_DIR)

def chat(prompt, max_new_tokens=64, temperature=0.9):
    """Generate a continuation of *prompt* with the LoRA-adapted model.

    Args:
        prompt: Text the model should continue.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. A value > 0 turns sampling on;
            0 or None falls back to greedy decoding.

    Returns:
        The decoded output sequence (prompt plus continuation) with special
        tokens removed.
    """
    # Put the inputs on whatever device the model actually lives on.
    encoded = tokenizer(prompt, return_tensors="pt").to(instruct_model.device)

    gen_kwargs = {"max_new_tokens": max_new_tokens}
    if temperature is not None and temperature > 0:
        # temperature only takes effect when sampling is enabled
        gen_kwargs.update(do_sample=True, temperature=temperature)
    else:
        gen_kwargs["do_sample"] = False

    out = instruct_model.generate(**encoded, **gen_kwargs)
    return tokenizer.decode(out[0], skip_special_tokens=True)

# Use the same "### Instruction / ### Response" template the model was
# fine-tuned on; the trailing "HER:" lets it continue with the reply text.
prompt = "### Instruction:\nME: are you coming to class?\n\n### Response:\nHER:"
print(chat(prompt))


In [None]:
# %% [markdown]
# ## Troubleshooting notes
# - If you get OOM: reduce `per_device_train_batch_size` to 1 and raise `gradient_accumulation_steps` to keep the same effective batch size, or switch to a smaller model (3–4B).
# - If tokenizer/model uses different special tokens, adjust templates and tokenization truncation.
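# - The LoRA config above leaves `target_modules` unset (equivalent to `target_modules=None`), so PEFT chooses the layers. If that does not match the model internals, inspect `model.named_modules()` and set `target_modules` explicitly (e.g. the q/k/v/o attention projections).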
# - If Trainer errors due to optimizer/accelerate mismatch, consider using `accelerate launch train_script.py` pattern instead of `Trainer` in Colab.
