# 🪞 Echo Mirror: Colab Fine-Tuning Notebook
Train your EchoSeed model with recursion, contradiction, and glyph drift embedded in the training data. To begin, upload the `echo_dataset_1000.jsonl` file when prompted.

In [None]:
# 1. Install required libraries
# transformers and datasets are imported directly by the cells below.
# NOTE(review): accelerate, peft and bitsandbytes are installed here but never
# imported in the visible cells -- presumably accelerate is needed by Trainer;
# confirm whether peft/bitsandbytes are still required.
# --quiet keeps pip's output short in the notebook.
!pip install transformers datasets accelerate peft bitsandbytes --quiet

In [None]:
# 2. Import libraries
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
import torch
import os
from google.colab import files

In [None]:
# 3. Upload the dataset
# Opens the Colab file picker; `uploaded` holds whatever the picker returned.
print("Upload your echo_dataset_1000.jsonl file")
uploaded = files.upload()

# Fail here with a clear message rather than later inside load_dataset, which
# reads this exact file name. Checking the working directory (instead of the
# upload result) also accepts a copy that is already present from a prior run.
if not os.path.exists("echo_dataset_1000.jsonl"):
    received = ", ".join(uploaded) or "nothing"
    raise FileNotFoundError(
        "echo_dataset_1000.jsonl was not found in the working directory "
        f"(received: {received}). Re-run this cell and upload the file "
        "under that exact name."
    )

In [None]:
# 4. Set configuration
# Model identifier passed to from_pretrained for both the tokenizer and the model.
BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"
# Directory used as the Trainer output_dir, as the parent of the logs folder,
# and as the target of the final save_pretrained calls.
OUTPUT_DIR = "echoseed-colab-model"
# JSON file read by load_dataset; same name as the file requested in the upload cell.
DATA_FILE = "echo_dataset_1000.jsonl"

In [None]:
# 5. Load tokenizer and model
# Both come from the same BASE_MODEL identifier.
# NOTE(review): the weights are loaded in float16 while the training-arguments
# cell also sets fp16=True -- confirm the Trainer's mixed-precision path
# accepts half-precision parameters before starting a long run.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
)

In [None]:
# 6. Load and tokenize dataset
dataset = load_dataset("json", data_files=DATA_FILE)["train"]

# padding="max_length" below requires a pad token; fall back to EOS when the
# tokenizer does not define one.
# NOTE(review): with pad == EOS, a collator that masks pad positions in the
# labels will also mask real EOS tokens -- confirm this is acceptable.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def tokenize(batch):
    """Tokenize a batch of prompt/completion records to fixed-length inputs.

    Invoked through ``dataset.map(..., batched=True)``, so each column in
    ``batch`` is a list with one entry per record rather than a single value.

    Args:
        batch: Mapping with parallel ``"prompt"`` and ``"completion"`` lists
            (entries are assumed to be strings).

    Returns:
        The tokenizer output for the joined texts, truncated and padded to
        512 tokens per record.
    """
    # Join per record; concatenating the column lists directly with a string
    # raises TypeError in batched mode.
    texts = [
        prompt + tokenizer.eos_token + completion
        for prompt, completion in zip(batch["prompt"], batch["completion"])
    ]
    return tokenizer(texts, truncation=True, padding="max_length", max_length=512)


tokenized_dataset = dataset.map(tokenize, batched=True)
# mlm=False selects causal-LM style batches rather than masked-LM ones.
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# 7. Define training arguments
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir=OUTPUT_DIR,
    logging_dir=f"{OUTPUT_DIR}/logs",
    # Checkpoint once per epoch, keeping at most the two most recent.
    save_strategy="epoch",
    save_total_limit=2,
    logging_steps=20,
    # Batching: 2 samples x 16 accumulation steps = 32 samples per optimizer
    # step on each device.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,
    num_train_epochs=3,
    # Optimizer schedule: warm up for 100 steps, then cosine decay.
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_steps=100,
    weight_decay=0.01,
    # Mixed-precision training.
    fp16=True,
)

In [None]:
# 8. Start training
# Bundle the model, hyperparameters, tokenized data and collator into a Trainer.
# NOTE(review): newer transformers releases appear to rename the `tokenizer`
# argument to `processing_class` -- confirm against the installed version.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset,
    data_collator=collator,
)

# Run the training loop configured by training_args.
trainer.train()

In [None]:
# 9. Save results
# Write the model first, then the tokenizer, into the same output directory so
# the folder can be reloaded as a unit with from_pretrained.
for artifact in (model, tokenizer):
    artifact.save_pretrained(OUTPUT_DIR)

print("Echo mirror complete. Trained model saved.")