# Training Model

In [None]:
import os
import json
import torch
from huggingface_hub import login
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)
from datasets import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

########################
# CONFIG
########################
HF_TOKEN = "YOUR_TOKEN_HERE"  # Replace with your Hugging Face token
MODEL_NAME = "llama-2-13b-chat"
REPO_ID = f"meta-llama/{MODEL_NAME}-hf"
CACHE_DIR = "D:/huggingface_cache"
DATASET_PATH = "datasets/no_gpu_limit_500.json" # colab_gpu_limited_100.json # no_gpu_limit_500
N_SAMPLES = 500 #! change to 100 for colab_gpu_limited_100.json
OUTPUT_DIR = f"trained_models/rerun_old_version/{MODEL_NAME}-lora-output-{N_SAMPLES}"
FINAL_DIR = f"trained_models/rerun_old_version/{MODEL_NAME}-lora-final-{N_SAMPLES}"

# Quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

# LoRA config
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

########################
# DEVICE MANAGEMENT
########################
def set_device():
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print("▶ Using GPU:", torch.cuda.get_device_name(0))
        torch.cuda.empty_cache()
        return device
    else:
        raise RuntimeError("GPU not available, but required for this setup.")

########################
# DATA LOADING
########################
def load_data(file_path, n_samples=N_SAMPLES):
    with open(file_path, 'r') as f:
        data = json.load(f)
    dataset = {
        "question": [item["question"] for item in data],
        "answer": [item["answer"] for item in data]
    }
    ds = Dataset.from_dict(dataset)
    if len(ds) < n_samples:
        print(f"▶ Warning: Dataset has {len(ds)} samples, less than requested {n_samples}")
        n_samples = len(ds)
    return ds.shuffle(seed=42).select(range(n_samples))

########################
# TRAINING FUNCTION
########################
def train_model(data_path=DATASET_PATH, n_samples=N_SAMPLES):
    device = set_device()
    login(token=HF_TOKEN)

    print(f"▶ Loading JSON dataset from {data_path}...")
    train_ds = load_data(data_path, n_samples=n_samples)
    print(f"▶ Using {len(train_ds)} examples for training")

    expected_columns = {"question", "answer"}
    if not all(col in train_ds.column_names for col in expected_columns):
        raise ValueError(f"Dataset must contain {expected_columns}, but found {train_ds.column_names}")

    tokenizer = AutoTokenizer.from_pretrained(REPO_ID, token=HF_TOKEN, cache_dir=CACHE_DIR)
    tokenizer.pad_token = tokenizer.eos_token

    print(f"▶ Loading {MODEL_NAME} in 4-bit...")
    model = AutoModelForCausalLM.from_pretrained(
        REPO_ID,
        quantization_config=bnb_config,
        device_map="auto",
        token=HF_TOKEN,
        cache_dir=CACHE_DIR,
        low_cpu_mem_usage=True
    )
    model = prepare_model_for_kbit_training(model)

    print("▶ Applying LoRA adapters...")
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    def tokenize_function(examples):
        # Enhanced prompt engineering for better fine-tuning
        texts = [f"[INST] {q} [/INST] {a}" for q, a in zip(examples["question"], examples["answer"])]
        tokenized = tokenizer(texts, truncation=True, max_length=512, padding="max_length")
        tokenized["labels"] = tokenized["input_ids"].copy()
        return tokenized

    print("▶ Tokenizing dataset...")
    tokenized_train = train_ds.map(tokenize_function, batched=True, remove_columns=train_ds.column_names)

    # Updated training arguments for hyperparameter tuning
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        overwrite_output_dir=True,
        num_train_epochs=3,  # Increased epochs for better tuning
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8, #! chaneg to "16" to reduce memory usage
        fp16=True,
        logging_steps=10,
        save_strategy="epoch",
        save_total_limit=1,
        learning_rate=2e-5,  # Adjusted learning rate
        warmup_steps=50,     # Added warmup for stability
        weight_decay=0.01    # Added regularization
    )

    print("▶ Initializing Trainer...")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        tokenizer=tokenizer
        # processing_class=tokenizer  # Replace tokenizer=tokenizer #! do it later
    )

    print("▶ Starting training with hyperparameter tuning...")
    trainer.train()

    print(f"▶ Saving LoRA model to {FINAL_DIR}...")
    model.save_pretrained(FINAL_DIR)
    tokenizer.save_pretrained(FINAL_DIR)

    print(f"▶ Merging LoRA weights...")
    merged_model = model.merge_and_unload()
    merged_dir = FINAL_DIR + "_merged"
    merged_model.save_pretrained(merged_dir)
    tokenizer.save_pretrained(merged_dir)

    print(f"▶ Training complete. Model saved in {FINAL_DIR}, merged in {merged_dir}")
    print("▶ Next steps: Evaluate using ROUGE-L, ROUGE-1, ROUGE-2, BERTScore, and BLEU metrics")

if __name__ == "__main__":
    train_model(data_path=DATASET_PATH, n_samples=N_SAMPLES)

  from .autonotebook import tqdm as notebook_tqdm


▶ Using GPU: NVIDIA GeForce RTX 3060
▶ Loading JSON dataset from datasets/no_gpu_limit_500.json...
▶ Using 500 examples for training
▶ Loading llama-2-13b-chat in 4-bit...


Loading checkpoint shards: 100%|██████████| 3/3 [00:29<00:00,  9.67s/it]


▶ Applying LoRA adapters...
trainable params: 6,553,600 || all params: 13,022,417,920 || trainable%: 0.0503
▶ Tokenizing dataset...


Map: 100%|██████████| 500/500 [00:00<00:00, 2309.54 examples/s]
  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


▶ Initializing Trainer...
▶ Starting training with hyperparameter tuning...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
10,2.9778
20,2.8052
30,2.8499
40,2.8512
50,2.0352
60,1.5173
70,1.075
80,0.9879
90,0.9005
100,0.926


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


▶ Saving LoRA model to trained_models/rerun_old_version/llama-2-13b-chat-lora-final-500...
▶ Merging LoRA weights...




▶ Training complete. Model saved in trained_models/rerun_old_version/llama-2-13b-chat-lora-final-500, merged in trained_models/rerun_old_version/llama-2-13b-chat-lora-final-500_merged
▶ Next steps: Evaluate using ROUGE-L, ROUGE-1, ROUGE-2, BERTScore, and BLEU metrics


# Evaluation
### Evaluate the model after train with out is an excel file. also summary in the terminal

In [None]:
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset
import evaluate  # Hugging Face's evaluate library
from bert_score import score as bert_score
import pandas as pd
from datetime import datetime

########################
# CONFIG
########################
HF_TOKEN = "YOUR_TOKEN_HERE"  # Replace with your Hugging Face token
MODEL_NAME = "llama-2-13b-chat"
MERGED_MODEL_DIR = f"trained_models/rerun_old_version/{MODEL_NAME}-lora-final-500_merged" #!!! Change to your merged model path
CACHE_DIR = "D:/huggingface_cache"
TEST_DATASET_PATH = "datasets/test_data.json"  # Replace with your test dataset path
REPORT_OUTPUT = f"evaluation_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"

# Evaluation settings
MAX_LENGTH = 512
BATCH_SIZE = 1

########################
# DEVICE MANAGEMENT
########################
def set_device():
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print("▶ Using GPU:", torch.cuda.get_device_name(0))
        torch.cuda.empty_cache()
        return device
    else:
        print("▶ No GPU available, using CPU.")
        return torch.device("cpu")

########################
# DATA LOADING
########################
def load_test_data(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    dataset = {
        "question": [item["question"] for item in data],
        "answer": [item["answer"] for item in data]
    }
    return Dataset.from_dict(dataset)

########################
# EVALUATION FUNCTION
########################
def evaluate_model(test_data_path=TEST_DATASET_PATH):
    device = set_device()

    # Load the merged fine-tuned model and tokenizer
    print(f"▶ Loading fine-tuned model from {MERGED_MODEL_DIR}...")
    tokenizer = AutoTokenizer.from_pretrained(MERGED_MODEL_DIR, cache_dir=CACHE_DIR)
    model = AutoModelForCausalLM.from_pretrained(
        MERGED_MODEL_DIR,
        device_map="auto",
        cache_dir=CACHE_DIR,
        low_cpu_mem_usage=True
    ).to(device)
    model.eval()

    # Load test dataset
    print(f"▶ Loading test dataset from {test_data_path}...")
    test_ds = load_test_data(test_data_path)
    print(f"▶ Evaluating on {len(test_ds)} examples")

    # Prepare metrics
    rouge = evaluate.load("rouge")
    bleu = evaluate.load("bleu")
    predictions = []
    references = []

    # Generate predictions
    print("▶ Generating model predictions...")
    for example in test_ds:
        input_text = f"[INST] {example['question']} [/INST]"
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=MAX_LENGTH).to(device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=MAX_LENGTH,
                num_return_sequences=1,
                do_sample=False
            )
        pred = tokenizer.decode(outputs[0], skip_special_tokens=True).replace(input_text, "").strip()
        predictions.append(pred)
        references.append(example["answer"])

    # Compute ROUGE scores
    print("▶ Computing ROUGE scores...")
    rouge_scores = rouge.compute(predictions=predictions, references=references)

    # Compute BLEU score
    print("▶ Computing BLEU score...")
    bleu_score = bleu.compute(predictions=predictions, references=references)

    # Compute BERTScore
    print("▶ Computing BERTScore...")
    P, R, F1 = bert_score(predictions, references, lang="en", verbose=True)
    bertscore_results = {
        "precision": P.mean().item(),
        "recall": R.mean().item(),
        "f1": F1.mean().item()
    }

    # Compile results
    results = {
        "ROUGE-1": rouge_scores["rouge1"],
        "ROUGE-2": rouge_scores["rouge2"],
        "ROUGE-L": rouge_scores["rougeL"],
        "BLEU": bleu_score["bleu"],
        "BERTScore_Precision": bertscore_results["precision"],
        "BERTScore_Recall": bertscore_results["recall"],
        "BERTScore_F1": bertscore_results["f1"]
    }

    # Export to CSV
    print(f"▶ Exporting evaluation report to {REPORT_OUTPUT}...")
    results_df = pd.DataFrame([results])
    results_df.to_csv(REPORT_OUTPUT, index=False)

    # Print summary
    print("\n▶ Evaluation Summary:")
    for metric, value in results.items():
        print(f"  {metric}: {value:.4f}")

    return results

if __name__ == "__main__":
    # Replace with your actual test dataset path
    evaluate_model(test_data_path="datasets/test_data.json")

  from .autonotebook import tqdm as notebook_tqdm


▶ Using GPU: NVIDIA GeForce RTX 3060
▶ Loading fine-tuned model from trained_models/rerun_old_version/llama-2-13b-chat-lora-final-500_merged...


Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.15s/it]


▶ Loading test dataset from datasets/test_data.json...
▶ Evaluating on 10 examples


Downloading builder script: 100%|██████████| 6.27k/6.27k [00:00<00:00, 16.4MB/s]
Downloading builder script: 100%|██████████| 5.94k/5.94k [00:00<00:00, 9.31MB/s]
Downloading extra modules: 4.07kB [00:00, 6.78MB/s]                   
Downloading extra modules: 100%|██████████| 3.34k/3.34k [00:00<00:00, 5.54MB/s]


▶ Generating model predictions...
▶ Computing ROUGE scores...
▶ Computing BLEU score...
▶ Computing BERTScore...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  1.15it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 25.12it/s]

done in 0.92 seconds, 10.87 sentences/sec
▶ Exporting evaluation report to evaluation_report_20250506_040845.csv...

▶ Evaluation Summary:
  ROUGE-1: 0.3709
  ROUGE-2: 0.1083
  ROUGE-L: 0.1874
  BLEU: 0.0539
  BERTScore_Precision: 0.8523
  BERTScore_Recall: 0.8746
  BERTScore_F1: 0.8632





# Chat Bot
### Run and chat with the bot in terminal
- Answer start with: "Bot:"

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Configuration
MODEL_PATH = "trained_models/rerun_old_version/llama-2-13b-chat-lora-final-500_merged"  # Path to your fine-tuned 13B model
CACHE_DIR = "D:/huggingface_cache"
MAX_LENGTH = 768  # Adjust based on memory constraints (256, 512, 765, 1024, 1280, 1536, ...)

# Device setup
def set_device():
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print("▶ Using GPU:", torch.cuda.get_device_name(0))
        torch.cuda.empty_cache()
        return device
    else:
        print("▶ No GPU available, using CPU.")
        return torch.device("cpu")

# Load the model and tokenizer
def load_model_and_tokenizer():
    device = set_device()
    print(f"▶ Loading model and tokenizer from {MODEL_PATH}...")

    try:
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_PATH,
            cache_dir=CACHE_DIR,
            local_files_only=True,
            use_fast=True
        )
    except Exception as e:
        print(f"Error loading tokenizer: {e}")
        raise

    try:
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_PATH,
            device_map="auto",
            cache_dir=CACHE_DIR,
            local_files_only=True,
            load_in_4bit=True,  # Use 4-bit quantization
            bnb_4bit_compute_dtype=torch.float16
        ).to(device)
        model.eval()
    except Exception as e:
        print(f"Error loading model: {e}")
        raise

    return model, tokenizer, device

# Generate a response from the model
def generate_response(model, tokenizer, device, user_input):
    # Format the input as per the model's expected prompt
    input_text = f"[INST] {user_input} [/INST]"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=MAX_LENGTH).to(device)

    try:
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=MAX_LENGTH,
                num_return_sequences=1,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id
            )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True).replace(input_text, "").strip()
        return response
    except Exception as e:
        print(f"Error generating response: {e}")
        return "Sorry, I encountered an error while generating a response."

# Main chatbot loop
def run_chatbot():
    model, tokenizer, device = load_model_and_tokenizer()
    print("▶ Chatbot is ready! Type 'exit' to quit.")

    while True:
        user_input = input("You: ")
        if user_input.lower() == "exit":
            print("▶ Goodbye!")
            break

        response = generate_response(model, tokenizer, device, user_input)
        print(f"Bot: {response}")

if __name__ == "__main__":
    run_chatbot()

▶ Using GPU: NVIDIA GeForce RTX 3060
▶ Loading model and tokenizer from trained_models/rerun_old_version/llama-2-13b-chat-lora-final-500_merged...


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.17s/it]


▶ Chatbot is ready! Type 'exit' to quit.




Bot: Sorry to hear that you're feeling stressed and overwhelmed. It's completely normal to feel this way at times, and there are many things you can do to manage your stress and improve your well-being. Here are some suggestions that might help:

1. Take a break: Sometimes, we just need a break from our daily routines to recharge and refocus. Take some time off from work or other responsibilities to relax and do something you enjoy.
2. Practice self-care: Make sure you're taking care of yourself physically, emotionally, and mentally. This can include things like getting enough sleep, eating healthy foods, exercising regularly, and finding time for activities that bring you joy and relaxation.
3. Identify the sources of your stress: Is there something specific that's causing you stress? If so, see if you can address it or find ways to manage it. Sometimes, just identifying the source of our stress can help us feel more in control.
4. Seek support: Talk to a trusted friend, family member