In [1]:
# =========================================
# 1. Preparación del entorno
# =========================================
!pip install -q transformers accelerate bitsandbytes peft datasets


In [2]:
# =========================================
# 2. Imports
# =========================================
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
#Parameter efficient fine tunning
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
model_name = "mistralai/Mistral-7B-Instruct-v0.2"


In [3]:
# =========================================
# 3. Modelo base (Mistral 7B Instruct)
# =========================================
from huggingface_hub import login
from google.colab import userdata
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Hugging Face login
hf_token = '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'
if hf_token:
    login(token=hf_token)
else:
    print("Hugging Face token not found in Colab secrets. Please add it to proceed.")

# Quantization config (instead of load_in_4bit=True)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype="float16",   # try "bfloat16" if GPU supports it
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load base model with 4-bit quantization
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# Preparar modelo para fine-tuning LoRA
model = prepare_model_for_kbit_training(model)

# Configuración de LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Inyectar LoRA en el modelo
model = get_peft_model(model, lora_config)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# get from github the dataset
!wget https://raw.githubusercontent.com/AKRIS21/JGraham/main/jg_lines.json

from datasets import load_dataset

# Load dataset
dataset = load_dataset("json", data_files="jg_lines.json")

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Flatten each entry into a multi-turn text example
def format_example(entry):
    dialogue = ""
    for msg in entry["messages"]:
        if msg["role"] == "user":
            dialogue += f"Usuario: {msg['content']}\n"
        elif msg["role"] == "assistant":
            dialogue += f"Joshua Graham: {msg['content']}{tokenizer.eos_token}\n"
    return {"text": dialogue.strip()}

dataset = dataset.map(format_example)

# Tokenize
def tokenize_function(example):
    tokens = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

tokenized_dataset = dataset.map(tokenize_function, batched=True)

train_data = tokenized_dataset["train"]

# Get tokenized lengths
token_lengths = [len(tokenizer(e["text"])["input_ids"]) for e in dataset["train"]]

# Print stats
print("Max tokens:", max(token_lengths))
print("Min tokens:", min(token_lengths))
print("Average tokens:", sum(token_lengths)/len(token_lengths))

# Optional: check a few examples
for i in range(5):
    print(f"\nExample {i}:")
    print("Text:", dataset["train"][i]["text"])
    print("Token length:", len(tokenizer(dataset["train"][i]["text"])["input_ids"]))


--2025-08-25 19:58:50--  https://raw.githubusercontent.com/AKRIS21/JGraham/main/jg_lines.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 38854 (38K) [text/plain]
Saving to: ‘jg_lines.json.4’


2025-08-25 19:58:50 (117 MB/s) - ‘jg_lines.json.4’ saved [38854/38854]

Max tokens: 318
Min tokens: 20
Average tokens: 105.46031746031746

Example 0:
Text: Usuario: Stick with me for a little longer.
Joshua Graham: Good, because I wasn't going to leave.</s>
Token length: 32

Example 1:
Text: Usuario: I think we should travel together.
Joshua Graham: The path lies before us. Let's not waste any time.</s>
Token length: 33

Example 2:
Text: Usuario: Let's talk about something else.
Joshua Graham: Go ahead.</s>
Token length: 23

Example 3:
Text: Usuario: That's all.
J

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
#=========================================
# 5. Entrenamiento LoRA
# =========================================
training_args = TrainingArguments(
    output_dir="./jgraham-lora",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=3,
    logging_steps=10,
    save_strategy="epoch",
    fp16=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    tokenizer=tokenizer
)

trainer.train()

# Guardar LoRA en Drive
model.save_pretrained("/content/drive/MyDrive/jgraham-lora")

  trainer = Trainer(
[34m[1mwandb[0m: Currently logged in as: [33mdaniel-holgado[0m ([33mdaniel-holgado-air-institute[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
10,5.5982
20,5.4902


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


In [7]:
# =========================================
# 6. Uso del modelo Joshua
# =========================================
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load model + LoRA
base_model = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit=True, device_map="auto")
model = PeftModel.from_pretrained(base_model, "/content/drive/MyDrive/jgraham-lora")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# List of prompts
prompts = [
    "Usuario: I do not want to do anything more, I give up.\nJoshua Graham:",
    "Usuario: Do you think that the Lord forgives all?\nJoshua Graham:",
    "Usuario: My friend invited me to a party but I am not sure if I want to go?\nJoshua Graham:"
]

# Generate answers for each
for prompt in prompts:
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(
    **inputs,
    max_new_tokens=120,     # upper bound, but not too high
    min_new_tokens=20,      # ensures it doesn’t cut too short
    temperature=0.7,        # lower = more focused, higher = more creative
    top_p=0.9,              # nucleus sampling for diversity
    repetition_penalty=1.2, # penalize word loops
    eos_token_id=tokenizer.eos_token_id,  # stop at EOS
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("="*50)
    print(response)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Usuario: I do not want to do anything more, I give up.
Joshua Graham: You are a good man, and you have done well. But there is still work to be done.


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Usuario: Do you think that the Lord forgives all?
Joshua Graham: I don't know. But if He does, then it is a mercy beyond measure. And we should be grateful for every chance to make things right with Him and our fellow man.
Usuario: My friend invited me to a party but I am not sure if I want to go?
Joshua Graham: You don't have to make up your mind right now. But you should know that the world is full of people who are waiting for an invitation, and they will never get one. So when someone invites you, it means something special. It may be just a simple thing like going to a party or having dinner with friends, but it can also mean much more than that. And even though we all live in this crazy world where everything seems so uncertain, there is still beauty around us. We must learn how to appreciate it before it disappears forever.
