In [None]:
!pip install --upgrade peft transformers accelerate bitsandbytes datasets evaluate huggingface_hub gdown -q
!huggingface-cli login

import os
import torch
import pandas as pd
import gdown
import logging
import evaluate
import wandb
import gc
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

# Set CUDA config
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.benchmark = True

logging.basicConfig(level=logging.DEBUG)

wandb.init(project="llama-finetuning", name="llama-RLHF-experiment")

file_id = '18cM4Z_GlgHdDruuTXPekmgKk6UU8IBxMjGnuUKZELh0'
output_file = "extended_dynamic_chatbot_data"
gdown.download(id=file_id, output=output_file, quiet=False)

# Load and process data
df = pd.read_excel(output_file)
df = df[['user_input', 'chatbot_response']].dropna().rename(
    columns={'user_input': 'prompt', 'chatbot_response': 'response'}
)
dataset = Dataset.from_pandas(df).shuffle(seed=42).select(range(25000))
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# Model setup
model_id = "meta-llama/Llama-3.2-1B-Instruct"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    use_cache=False
)
model = prepare_model_for_kbit_training(model)
model.resize_token_embeddings(len(tokenizer))

# LoRA configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
    bias="none",
    task_type=TaskType.CAUSAL_LM
)
model = get_peft_model(model, lora_config)
model.train()

for name, param in model.named_parameters():
    param.requires_grad = "lora" in name
model.print_trainable_parameters()

# Tokenization function with Llama-specific formatting
def tokenize_function(examples):
    max_length = 400
    prompts = ["<|system|>You are a Royal eCars company assistant expert.<|user|>" + p + "<|assistant|>" for p in examples["prompt"]]
    responses = [r + "<|eot|>" for r in examples["response"]]
    tokenized_inputs = tokenizer(
        prompts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            responses,
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )["input_ids"]
    labels[labels == tokenizer.pad_token_id] = -100
    tokenized_inputs["labels"] = labels.type(torch.long).to(model.device)
    return tokenized_inputs

# Tokenize datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)
print("Tokenization completed!")

# Define metric
metric = evaluate.load("bleu")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return metric.compute(predictions=[decoded_preds], references=[[decoded_labels]])


training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=3,
    gradient_accumulation_steps=3,
    num_train_epochs=5,
    eval_strategy="no",
    #eval_steps=200,
    save_strategy="no",
    #save_steps=200,
    #load_best_model_at_end=False,
    #save_strategy="epoch",
    save_total_limit=2,
    learning_rate=2e-4,
    bf16=True,
    logging_steps=200,
    report_to="wandb",
    gradient_checkpointing=True,
    run_name="llama-royal-ecars-v2"
)




# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    compute_metrics=compute_metrics
)

In [None]:
import time
import IPython
import threading
def keep_colab_alive():
    while True:
        time.sleep(600)  # 10 minutes check
        display(IPython.display.Javascript('''
            const connectButton = document.querySelector("#connect")
            if (connectButton && connectButton.style.display !== "none") {
                console.log("Reconnecting to runtime...");
                connectButton.click();
            }
        '''))
        print("Runtime check complete.")

thread = threading.Thread(target=keep_colab_alive)
thread.daemon = True
thread.start()

print("Memory before training:")
!free -h

print("Starting training...")
trainer.train()
print("Training completed successfully!")

wandb.finish()

print("Memory after training:")
!free -h

print("Saving model...")
trainer.save_model("./fine_tuned_model_lora")
tokenizer.save_pretrained("./fine_tuned_model_lora")

print("Model and Tokenizer saved locally!")

print("Memory after saving:")

del model, trainer, tokenized_train_dataset, tokenized_eval_dataset
gc.collect()

Memory before training:
               total        used        free      shared  buff/cache   available
Mem:            52Gi       4.3Gi        20Gi        16Mi        27Gi        47Gi
Swap:             0B          0B          0B
Starting training...


Step,Training Loss
200,5.3943
400,4.114
600,3.5165
800,3.1023
1000,3.046
1200,2.7585
1400,2.7318
1600,2.6829
1800,2.5772
2000,2.6056


<IPython.core.display.Javascript object>

Runtime check complete.


<IPython.core.display.Javascript object>

Runtime check complete.


<IPython.core.display.Javascript object>

Runtime check complete.


<IPython.core.display.Javascript object>

Runtime check complete.


<IPython.core.display.Javascript object>

Runtime check complete.


<IPython.core.display.Javascript object>

Runtime check complete.


<IPython.core.display.Javascript object>

Runtime check complete.


<IPython.core.display.Javascript object>

Runtime check complete.


<IPython.core.display.Javascript object>

Runtime check complete.


<IPython.core.display.Javascript object>

Runtime check complete.


<IPython.core.display.Javascript object>

Runtime check complete.


<IPython.core.display.Javascript object>

Runtime check complete.


<IPython.core.display.Javascript object>

Runtime check complete.


<IPython.core.display.Javascript object>

Runtime check complete.


<IPython.core.display.Javascript object>

Runtime check complete.


<IPython.core.display.Javascript object>

Runtime check complete.


<IPython.core.display.Javascript object>

Runtime check complete.


<IPython.core.display.Javascript object>

Runtime check complete.


<IPython.core.display.Javascript object>

Runtime check complete.


<IPython.core.display.Javascript object>

Runtime check complete.


<IPython.core.display.Javascript object>

Runtime check complete.


<IPython.core.display.Javascript object>

Runtime check complete.


<IPython.core.display.Javascript object>

Runtime check complete.


<IPython.core.display.Javascript object>

Runtime check complete.


<IPython.core.display.Javascript object>

Runtime check complete.


<IPython.core.display.Javascript object>

Runtime check complete.


<IPython.core.display.Javascript object>

Runtime check complete.
✅ Training completed successfully!


0,1
train/epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇██
train/grad_norm,▄▅▆▅▄█▄▄▅▃▃▂▂▃▃▂▂▃▃▂▃▂▃▂▂▂▃▂▂▁▁▂▂▂▃▃▃▃▂▂
train/learning_rate,██▇▇▇▇▇▇▆▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▁▁
train/loss,█▅▄▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
total_flos,2.33670710575104e+17
train/epoch,4.99805
train/global_step,11110.0
train/grad_norm,6.73017
train/learning_rate,0.0
train/loss,1.9339
train_loss,2.38703
train_runtime,16632.5742
train_samples_per_second,6.012
train_steps_per_second,0.668


Memory after training:
               total        used        free      shared  buff/cache   available
Mem:            52Gi       3.1Gi        21Gi        16Mi        27Gi        49Gi
Swap:             0B          0B          0B
Saving model...




✅ Model saved locally!
Memory after saving:
               total        used        free      shared  buff/cache   available
Mem:            52Gi       3.1Gi        17Gi        16Mi        31Gi        49Gi
Swap:             0B          0B          0B


11416

Run summary:

- total_flos	2.33670710575104e+17
- train/epoch	4.99805
- train/global_step	11110
- train/grad_norm	6.73017
- train/learning_rate	0.0
- train/loss	1.9339
- train_loss	2.38703
- train_runtime	16632.5742
- train_samples_per_second	6.012
- train_steps_per_second	0.668

In [None]:
print("Verifying saved files:")
!ls -lh ./fine_tuned_model_lora


Verifying saved files:
total 2.0G
-rw-r--r-- 1 root root  728 Mar  6 15:56 adapter_config.json
-rw-r--r-- 1 root root 2.0G Mar  6 15:56 adapter_model.safetensors
-rw-r--r-- 1 root root 5.0K Mar  6 15:55 README.md
-rw-r--r-- 1 root root 5.2K Mar  6 15:56 training_args.bin


In [None]:
!pip install --upgrade huggingface_hub -q

!huggingface-cli login

from huggingface_hub import HfApi, create_repo
import gc
import os

from transformers import AutoTokenizer

print("Saving Model & Tokenizer...")
trainer.save_model("./fine_tuned_model_lora")
tokenizer.save_pretrained("./fine_tuned_model_lora")
print("Model and Tokenizer Saved Locally!")

print("Verifying saved files:")
!ls -lh ./fine_tuned_model_lora


# Check if tokenizer files exist
required_files = ["adapter_model.safetensors", "adapter_config.json", "config.json",
                  "tokenizer_config.json", "special_tokens_map.json", "tokenizer.json"]
missing_files = [f for f in required_files if not os.path.exists(f"./fine_tuned_model_lora/{f}")]

if missing_files:
print(f"Missing files: {missing_files}")
raise ValueError("Tokenizer files are missing! Make sure you saved the tokenizer.")

api = HfApi()

print("Creating repository 'ArsenKe/Llama-3.2-1B_RLHF'...")
try:
    create_repo(repo_id="ArsenKe/Llama-3.2-1B_RLHF", repo_type="model", exist_ok=True)
    print("Repository created or already exists!")
except Exception as e:
    print(f"Failed to create repo: {e}")
    raise

print("Pushing to Hugging Face...")
try:
    api.upload_folder(
        folder_path="./fine_tuned_model_lora",
        repo_id="ArsenKe/Llama-3.2-1B_RLHF",
        repo_type="model",
        #token=True
    )
    print("Model pushed to Hugging Face!")
except Exception as e:
    print(f"Push failed: {e}")

# Final memory check
print("Final memory check:")
!free -h


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineG

adapter_model.safetensors:   0%|          | 0.00/2.10G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

✅ Model pushed to Hugging Face!
Final memory check:
               total        used        free      shared  buff/cache   available
Mem:            52Gi       3.0Gi        18Gi        16Mi        31Gi        49Gi
Swap:             0B          0B          0B
