### Necessary Imports and set-up of the paths like - 
base model path, adapter path, dataset path, model save path

In [None]:
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import math
import json
import os
from tqdm import tqdm
import gc 
BASE_MODEL_NAME = "unsloth/zephyr-sft-bnb-4bit"
FINAL_ADAPTER_PATH = "E:/papers-i-implement/LoRA/main_scripts/results_optimized_all_tones_dynamic/final_model"
SMALL_ADAPTER_PATH = "E:/papers-i-implement/LoRA/models/results_optimized_all_tones/final_model"
FINETUNED_ADAPTER_PATH = "E:/papers-i-implement/LoRA/main_scripts/results_optimized_all_tones_dynamic/checkpoint-1200" 
DATA_TEST_PATH = "E:/papers-i-implement/poet/data/conversation_data_test.json"
MAX_SEQ_LENGTH = 2048 

NUM_COMPARISON_SAMPLES = 10
COMPARISON_OUTPUT_FILE = "post_hoc_model_comparison_sequential_2.json"
PERPLEXITY_BATCH_SIZE = 2 
TARGET_COMPARISON_TONE = "poetic"


### Model & Tokenizer Loader
This function loads either:

A base model (optionally in 4-bit for efficiency), or A fine-tuned adapter (LoRA) on top of a base model.

Key Features:

Uses 4-bit quantization (nf4, bfloat16) if enabled.

Handles missing eos_token or pad_token in tokenizer.

Automatically maps to available device (GPU/CPU).

Cleans up memory on failure when loading adapters.

Returns:
(model, tokenizer) — ready for evaluation or inference.

In [None]:

def load_model_tokenizer_for_analysis(model_name_or_path, is_adapter_path=False, base_model_name_for_adapter=None, load_in_4bit=True):
    print("-" * 50)
    if is_adapter_path:
        print(f"Loading FINE-TUNED model (Base: {base_model_name_for_adapter} + Adapters: {model_name_or_path})")
        if base_model_name_for_adapter is None:
             raise ValueError("base_model_name_for_adapter must be provided when loading adapters.")
    
        base_model, tokenizer = load_model_tokenizer_for_analysis(base_model_name_for_adapter, load_in_4bit=load_in_4bit)
        
        try:
             model = PeftModel.from_pretrained(base_model, model_name_or_path)
             print("LoRA adapters loaded successfully.")
             return model, tokenizer
        except Exception as e:
            print(f"Error loading adapters from {model_name_or_path}: {e}")
            del base_model, tokenizer
            gc.collect()
            torch.cuda.empty_cache()
            raise e 
    else:
        print(f"Loading BASE model: {model_name_or_path}")
        bnb_config = None
        dtype = None
        if load_in_4bit:
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_use_double_quant=True,
            )
            dtype = torch.bfloat16

        model = AutoModelForCausalLM.from_pretrained(
            model_name_or_path,
            quantization_config=bnb_config,
            torch_dtype=dtype,
            device_map="auto", 
        )
        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)

        if tokenizer.eos_token is None: tokenizer.add_special_tokens({'eos_token': '<|endoftext|>'})
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            if hasattr(model, 'config'): 
                 model.config.pad_token_id = tokenizer.pad_token_id

        print(f"Model {model_name_or_path} and tokenizer loaded onto device: {model.device}")
        return model, tokenizer


### Text Generation Function
Generates responses from the model based on user prompts and a target tone.

Key Steps:

Formats each prompt with USER, TONE, and ASSISTANT tags.

Uses model.generate() with top-k/top-p sampling for creative outputs.

Truncates long inputs to half the max sequence length.

Returns a list of generated responses.

In [None]:

def generate_text(model, tokenizer, prompts, tone, max_new=256):
    outputs = []
    model.eval()
    with torch.no_grad():
        for prompt in tqdm(prompts, desc=f"Generating ({tone})"):
            formatted_prompt = f"USER: {prompt}\n TONE:{tone} \nASSISTANT: "
            inputs = tokenizer(formatted_prompt, return_tensors="pt", truncation=True, max_length=MAX_SEQ_LENGTH // 2).to(model.device)

            generated_ids = model.generate(
                **inputs,
                max_new_tokens=max_new,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id,
                do_sample=True, top_k=50, top_p=0.9
            )
            output_text = tokenizer.decode(generated_ids[0, inputs['input_ids'].shape[1]:], skip_special_tokens=True)
            outputs.append(output_text.strip())
    return outputs


### Perplexity Calculation
Evaluates how well the model predicts a given dataset by computing perplexity.

How it works:

Batches the tokenized inputs and pads them.

Computes loss using the model in evaluation mode.

Aggregates total loss and valid tokens across batches.

Returns perplexity = exp(avg_loss).

In [None]:

def calculate_perplexity(model, tokenizer, tokenized_dataset, batch_size=4):
    model.eval()
    total_loss = 0
    total_tokens = 0
    print(f"\nCalculating perplexity with batch size {batch_size}...")

    with torch.no_grad():
        for i in tqdm(range(0, len(tokenized_dataset), batch_size), desc="Perplexity Batches"):
            batch_indices = range(i, min(i + batch_size, len(tokenized_dataset)))
            batch_texts = [tokenized_dataset[j]['input_ids'] for j in batch_indices]

            batch_padded = tokenizer.pad({"input_ids": batch_texts}, padding=True, return_tensors="pt").to(model.device)

            outputs = model(**batch_padded, labels=batch_padded["input_ids"])
            loss = outputs.loss

            valid_tokens_in_batch = batch_padded["attention_mask"].sum().item()
            if valid_tokens_in_batch > 0:
                total_loss += loss.item() * valid_tokens_in_batch
                total_tokens += valid_tokens_in_batch
            del batch_padded, outputs, loss

    if total_tokens == 0:
        print("Warning: No valid tokens found for perplexity calculation.")
        return None

    avg_loss = total_loss / total_tokens
    perplexity = math.exp(avg_loss)
    print(f"Calculated Average Loss: {avg_loss:.4f}")
    gc.collect()
    torch.cuda.empty_cache()
    return perplexity


### Load Comparison Prompts
Loads a small set of prompts from the test dataset for output comparison.

Key Steps:

Selects NUM_COMPARISON_SAMPLES prompts from the dataset.

Filters out prompts that contain "conversation" with an "input" field.

Optionally retrieves reference responses in the target tone for evaluation.

Outputs:

comparison_prompts: List of selected prompt texts.

comparison_prompts_data_refs: Corresponding target-tone references (if available).

Useful for evaluating different models or adapter outputs side-by-side.

In [None]:


# 1. Load Comparison Prompts
print(f"\nSelecting {NUM_COMPARISON_SAMPLES} comparison samples...")
comparison_prompts = []
comparison_prompts_data_refs = []
try:
    # Load only the necessary part of the test data
    comparison_raw_data = load_dataset("json", data_files=DATA_TEST_PATH, split=f"train[:{NUM_COMPARISON_SAMPLES * 2}]")
    for example in comparison_raw_data:
         if 'conversation' in example and 'input' in example['conversation']:
             comparison_prompts.append(example['conversation']['input'])
             comparison_prompts_data_refs.append(example['conversation'].get('responses', {}).get(TARGET_COMPARISON_TONE, "N/A"))
         if len(comparison_prompts) >= NUM_COMPARISON_SAMPLES:
            break
    comparison_prompts = comparison_prompts[:NUM_COMPARISON_SAMPLES] # Ensure exact number
    comparison_prompts_data_refs = comparison_prompts_data_refs[:NUM_COMPARISON_SAMPLES]
    print(f"Selected {len(comparison_prompts)} prompts.")
except Exception as e:
    print(f"Error loading test data for comparison: {e}")
    comparison_prompts = []
    comparison_prompts_data_refs = []



Selecting 10 comparison samples...
Selected 10 prompts.


###  Load & Preprocess Test Data for Perplexity
Prepares test data samples for evaluating model perplexity.

Steps Involved:

Loads the full test set from JSON.

Filters and formats each example into:

php-template
Copy
Edit
USER: "< input >"
TONE: "< target_tone >"
ASSISTANT: "< response >< eos >"
Only keeps samples with valid user input and target-tone response.

Uses a temporary tokenizer to ensure correct formatting and special tokens.

In [None]:

# 2. Load and Preprocess Data for Perplexity
print("\nLoading and preprocessing test data for perplexity calculation...")
tokenized_perplexity_dataset = None
try:
    raw_test_data = load_dataset("json", data_files=DATA_TEST_PATH, split="train")

    temp_tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, trust_remote_code=True)
    if temp_tokenizer.eos_token is None: temp_tokenizer.add_special_tokens({'eos_token': '<|endoftext|>'})

    def format_prompt_ppl(example, response_type=TARGET_COMPARISON_TONE):
        conv = example.get('conversation')
        if not conv or 'input' not in conv or 'responses' not in conv or response_type not in conv['responses']:
            return {"text": None}
        user_input = conv['input']
        response = conv['responses'][response_type]
        formatted_text = f"USER: {user_input}\n TONE:{response_type} \nASSISTANT: {response}{temp_tokenizer.eos_token}"
        return {"text": formatted_text}

    formatted_test_data = raw_test_data.map(
        lambda x: format_prompt_ppl(x), num_proc=1
    ).filter(lambda x: x['text'] is not None)

    perplexity_ready_data = formatted_test_data
    print(f"Loaded {len(perplexity_ready_data)} samples for perplexity calculation.")
    del temp_tokenizer 
    gc.collect()

except Exception as e:
    print(f"Error loading or processing test data for perplexity: {e}")
    perplexity_ready_data = None



Loading and preprocessing test data for perplexity calculation...
Loaded 2600 samples for perplexity calculation.


### Phase 1:
This analysis the base model and calculates - 
perplexity, 
loss, 


In [7]:
# --- Main Analysis Steps (Sequential) ---

baseline_outputs = []
base_perplexity = None
finetuned_outputs = []
ft_perplexity = None

# == Phase 1: Base Model Analysis ==
print("\n" + "="*20 + " Phase 1: Base Model Analysis " + "="*20)
if comparison_prompts or perplexity_ready_data:
    model = None
    tokenizer = None
    try:
        # Load Base Model + Tokenizer
        model, tokenizer = load_model_tokenizer_for_analysis(BASE_MODEL_NAME)

        # Generate Baseline Outputs
        if comparison_prompts:
            print("\n--- Generating Baseline Outputs ---")
            baseline_outputs = generate_text(model, tokenizer, comparison_prompts, TARGET_COMPARISON_TONE)

        # Calculate Base Perplexity
        if perplexity_ready_data:
            print("\n--- Tokenizing Perplexity Data (Base Model) ---")
            # NOW tokenize using the loaded tokenizer
            tokenized_perplexity_dataset = perplexity_ready_data.map(
                 lambda examples: tokenizer(examples["text"], truncation=True, max_length=MAX_SEQ_LENGTH, padding=False),
                 batched=True, num_proc=1, remove_columns=perplexity_ready_data.column_names
            )
            print("--- Calculating Base Model Perplexity ---")
            base_perplexity = calculate_perplexity(model, tokenizer, tokenized_perplexity_dataset, PERPLEXITY_BATCH_SIZE)
            print(f"Base Model Perplexity: {base_perplexity:.4f}" if base_perplexity else "Base Model Perplexity: Error")
            del tokenized_perplexity_dataset # Clean up tokenized data for this phase
            gc.collect()

    except Exception as e:
        print(f"An error occurred during Base Model analysis: {e}")
    finally:
        # *** CRITICAL: Unload Base Model ***
        print("\n--- Unloading Base Model ---")
        del model
        del tokenizer
        gc.collect() # Force garbage collection
        torch.cuda.empty_cache() # Release GPU memory
        print("Base Model unloaded.")
else:
    print("Skipping Base Model analysis due to lack of comparison prompts or perplexity data.")




--------------------------------------------------
Loading BASE model: unsloth/zephyr-sft-bnb-4bit




Model unsloth/zephyr-sft-bnb-4bit and tokenizer loaded onto device: cuda:0

--- Generating Baseline Outputs ---


Generating (poetic): 100%|██████████| 10/10 [08:51<00:00, 53.18s/it]



--- Tokenizing Perplexity Data (Base Model) ---
--- Calculating Base Model Perplexity ---

Calculating perplexity with batch size 2...


Perplexity Batches:   0%|          | 0/1300 [00:00<?, ?it/s]You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Perplexity Batches: 100%|██████████| 1300/1300 [18:30<00:00,  1.17it/s]


Calculated Average Loss: 2.3826
Base Model Perplexity: 10.8328

--- Unloading Base Model ---
Base Model unloaded.


### Phase 2:
This analysis the finetuned model and calculates - 
perplexity, 
loss, 


In [None]:

# == Phase 2: Fine-tuned Model Analysis ==
print("\n" + "="*20 + " Phase 2: Fine-tuned Model Analysis " + "="*20)
if comparison_prompts or perplexity_ready_data:
    model = None
    tokenizer = None
    try:
        # Load Fine-tuned Model (Base + Adapters) + Tokenizer
        model, tokenizer = load_model_tokenizer_for_analysis(
            FINAL_ADAPTER_PATH,
            is_adapter_path=True,
            base_model_name_for_adapter=BASE_MODEL_NAME
        )

        # Generate Fine-tuned Outputs
        if comparison_prompts:
            print("\n--- Generating Fine-tuned Outputs ---")
            finetuned_outputs = generate_text(model, tokenizer, comparison_prompts, TARGET_COMPARISON_TONE)

        # Calculate Fine-tuned Perplexity
        if perplexity_ready_data:
            print("\n--- Tokenizing Perplexity Data (Fine-tuned Model) ---")
            tokenized_perplexity_dataset = perplexity_ready_data.map(
                 lambda examples: tokenizer(examples["text"], truncation=True, max_length=MAX_SEQ_LENGTH, padding=False),
                 batched=True, num_proc=1, remove_columns=perplexity_ready_data.column_names
            )
            print("--- Calculating Fine-tuned Model Perplexity ---")
            ft_perplexity = calculate_perplexity(model, tokenizer, tokenized_perplexity_dataset, PERPLEXITY_BATCH_SIZE)
            print(f"Fine-tuned Model Perplexity: {ft_perplexity:.4f}" if ft_perplexity else "Fine-tuned Model Perplexity: Error")
            del tokenized_perplexity_dataset # Clean up tokenized data
            gc.collect()

    except Exception as e:
        print(f"An error occurred during Fine-tuned Model analysis: {e}")
    finally:
        # *** CRITICAL: Unload Fine-tuned Model ***
        print("\n--- Unloading Fine-tuned Model ---")
        del model # This should release the PeftModel and the underlying base
        del tokenizer
        gc.collect() # Force garbage collection
        torch.cuda.empty_cache() # Release GPU memory
        print("Fine-tuned Model unloaded.")
else:
     print("Skipping Fine-tuned Model analysis due to lack of comparison prompts or perplexity data.")




--------------------------------------------------
Loading FINE-TUNED model (Base: unsloth/zephyr-sft-bnb-4bit + Adapters: E:/papers-i-implement/LoRA/main_scripts/results_optimized_all_tones_dynamic/final_model)
--------------------------------------------------
Loading BASE model: unsloth/zephyr-sft-bnb-4bit




Model unsloth/zephyr-sft-bnb-4bit and tokenizer loaded onto device: cuda:0
LoRA adapters loaded successfully.

--- Generating Fine-tuned Outputs ---


Generating (poetic): 100%|██████████| 10/10 [03:19<00:00, 19.96s/it]



--- Tokenizing Perplexity Data (Fine-tuned Model) ---
--- Calculating Fine-tuned Model Perplexity ---

Calculating perplexity with batch size 2...


Perplexity Batches:   0%|          | 0/1300 [00:00<?, ?it/s]You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Perplexity Batches: 100%|██████████| 1300/1300 [22:09<00:00,  1.02s/it]


Calculated Average Loss: 1.7373
Fine-tuned Model Perplexity: 5.6818

--- Unloading Fine-tuned Model ---
Fine-tuned Model unloaded.


### Some examples with prompts and outputs 

In [None]:



# == Phase 3: Comparison and Saving ==
print("\n" + "="*20 + " Phase 3: Final Comparison " + "="*20)

# Display Perplexity Summary
print("\n--- Perplexity Summary ---")
print(f"Base Model Perplexity: {base_perplexity:.4f}" if base_perplexity else "Base Model Perplexity: Not calculated or Error")
print(f"Fine-tuned Model Perplexity: {ft_perplexity:.4f}" if ft_perplexity else "Fine-tuned Model Perplexity: Not calculated or Error")

# Save and Display Generation Comparison
print("\n--- Generation Comparison Results ---")
comparison_results = []
# Ensure we have prompts and both sets of outputs before proceeding
if comparison_prompts and baseline_outputs and finetuned_outputs and len(comparison_prompts) == len(baseline_outputs) == len(finetuned_outputs):
    for i in range(len(comparison_prompts)):
        result = {
            "prompt": comparison_prompts[i],
            "reference_output": comparison_prompts_data_refs[i],
            "baseline_output": baseline_outputs[i],
            "finetuned_output": finetuned_outputs[i],
        }
        comparison_results.append(result)
        print("-" * 30)
        print(f"Prompt {i+1}: {result['prompt']}")
        print(f"Reference ({TARGET_COMPARISON_TONE}): {result['reference_output']}")
        print(f"Baseline Output: {result['baseline_output']}")
        print(f"Fine-tuned Output: {result['finetuned_output']}")
        print("-" * 30 + "\n")

    # Save comparison to JSON file
    try:
        with open(COMPARISON_OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(comparison_results, f, indent=4, ensure_ascii=False)
        print(f"Comparison results saved to {COMPARISON_OUTPUT_FILE}")
    except Exception as e:
        print(f"Error saving comparison results: {e}")
elif not comparison_prompts:
     print("No comparison prompts were loaded.")
else:
    print("Could not generate full comparison results (missing baseline or fine-tuned outputs, or length mismatch).")
    print(f"Prompts: {len(comparison_prompts)}, Baseline Outputs: {len(baseline_outputs)}, Finetuned Outputs: {len(finetuned_outputs)}")






--- Perplexity Summary ---
Base Model Perplexity: 10.8328
Fine-tuned Model Perplexity: 5.7182

--- Generation Comparison Results ---
------------------------------
Prompt 1: What does the concept of home evoke for you - is it a physical place, a sense of belonging, or a feeling of security, or is it something more abstract?
Reference (poetic): A whispered promise of solace, a haven from life's tempests, home is a tapestry woven from threads of memory and longing, a sanctuary where love and laughter entwine, where the heart finds a sense of belonging, a refuge from the world's wild, wandering soul.
Baseline Output: Home, oh sweet abode,
A shelter that molds us whole,
A place where secrets are told,
Where love and warmth take hold.

It's a physical place we call,
Yet its essence transcends walls,
Where memories rise tall,
And our hearts and souls enthrall.

It's the soft rustle of leaves,
Whispering stories in our ears,
The scent of a mother's ease,
And the laughter of loved ones near.