In [None]:
import torch
import glob, os
import pandas as pd
import gc
from transformers import pipeline
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
from transformers import TrainingArguments, Trainer, StoppingCriteria, StoppingCriteriaList
import torch.nn.functional as F

# Configure the CUDA caching allocator (expandable segments) to reduce fragmentation.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# On a GPU runtime: release cached memory, wait for pending work, then report the device.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    torch.cuda.synchronize()
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"CUDA available. GPU: {torch.cuda.get_device_name(0)}")
    print(f"Total GPU memory: {gpu_props.total_memory / 1024**3:.2f} GB")


CUDA available. GPU: Tesla T4
Total GPU memory: 14.74 GB


In [None]:
# Mount Google Drive at /content/drive (requires the google.colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!unzip /content/drive/MyDrive/daic_data/daic_data.zip

In [None]:
# !unzip /content/drive/MyDrive/daic_data/tiny_llama_instruction_tuned_old.zip

## Data processing

In [None]:
def get_questions_answers_df(transcripts_dir):
  """Build a question/answer table from the transcript files in a directory.

  Every ``*.csv`` file in ``transcripts_dir`` is read as a tab-separated table
  with ``speaker``, ``start_time``, ``stop_time`` and ``value`` columns.
  Consecutive rows by the same speaker are merged into one utterance, and each
  'Participant' utterance that directly follows an 'Ellie' utterance in the
  same file becomes one question/answer row.

  Args:
    transcripts_dir: Directory holding the transcript files. The part of each
      file name before the first underscore must be an integer participant id.

  Returns:
    DataFrame with columns ``participant_id``, ``question``, ``answer`` and
    ``start_time`` (start of the answer). The index is not reset after
    filtering, so it is not contiguous.

  Raises:
    ValueError: If no transcript files are found in ``transcripts_dir``.
  """
  transcripts_files = glob.glob(os.path.join(transcripts_dir, "*.csv"))
  if not transcripts_files:
    # pd.concat would raise an opaque "No objects to concatenate" here.
    raise ValueError(f"No transcript CSV files found in {transcripts_dir!r}")

  # Load and concatenate all transcript files, remembering the origin of each row
  df = pd.concat(
    (
      pd.read_csv(file, sep="\t", encoding="utf-8-sig").assign(source=os.path.basename(file))
      for file in transcripts_files
    ),
    ignore_index=True
  )

  # A new block starts whenever the speaker changes. Blocks are later grouped
  # together with `source`, so an id shared across a file boundary is harmless.
  df['block_id'] = (df['speaker'] != df['speaker'].shift(1)).cumsum()

  def join_utterances(values):
    # Drop missing cells first; astype(str) alone would turn them into "nan".
    return ' '.join(values.dropna().astype(str))

  # Merge contiguous segments by the same speaker into a single utterance
  df = df.groupby(['source', 'block_id']).agg(
    speaker=('speaker', 'first'),
    start_time=('start_time', 'min'),
    stop_time=('stop_time', 'max'),
    value=('value', join_utterances)
  )

  # Sort by participant file and time
  df = df.sort_values(by=['source', 'start_time']).reset_index()

  # The per-source shift leaves the first utterance of each file without a
  # predecessor, so a question is never taken from a different transcript.
  per_source = df.groupby('source')
  df['prev_speaker'] = per_source['speaker'].shift(1)
  df['prev_value'] = per_source['value'].shift(1)

  is_answer = (df['speaker'] == 'Participant') & (df['prev_speaker'] == 'Ellie')

  df = df[is_answer].copy()
  df = df.rename(columns={
    'prev_value': 'question',  # The previous Ellie utterance is the question
    'value': 'answer',         # The current Participant utterance is the answer
  })

  df['participant_id'] = df['source'].str.split("_").str[0].astype(int)
  return df[['participant_id', 'question', 'answer', 'start_time']]

def add_labels_to_df(qa_df, labels_dir):
  """Attach depression labels and the split name to each question/answer row.

  Reads ``train.csv``, ``dev.csv`` and ``test.csv`` from ``labels_dir``,
  normalises their column names (both the ``PHQ8_*`` and ``PHQ_*`` spellings
  are accepted) and left-joins the result onto ``qa_df``.

  Args:
    qa_df: DataFrame with a ``participant_id`` column.
    labels_dir: Directory containing the three split label files.

  Returns:
    ``qa_df`` with ``depression_label``, ``depression_severity`` and ``split``
    columns added. Rows whose participant is in no label file get NaN there.
  """
  column_aliases = {
    "Participant_ID": "participant_id",
    "PHQ8_Binary": "depression_label",
    "PHQ8_Score": "depression_severity",
    "PHQ_Binary": "depression_label",
    "PHQ_Score": "depression_severity",
  }
  label_columns = ["participant_id", "depression_label", "depression_severity"]

  # Collect the per-split frames and concatenate once. This avoids seeding the
  # result with an empty DataFrame and assigning a column on a sliced frame.
  split_frames = [
    pd.read_csv(os.path.join(labels_dir, f"{split}.csv"))
      .rename(columns=column_aliases)[label_columns]
      .assign(split=split)
    for split in ('train', 'dev', 'test')
  ]
  all_labels_df = pd.concat(split_frames, ignore_index=True)

  return pd.merge(qa_df, all_labels_df, on="participant_id", how="left")

def format_input(df, row, n_context=3):
  """Render the instruction prompt for one question/answer row.

  The prompt is a fixed instruction paragraph that embeds the row's
  ``depression_severity``, followed by an ``### Input:`` section. That section
  holds up to ``n_context`` earlier Q/A pairs of the same participant (rows of
  ``df`` whose index is smaller than ``row.name``) and the current question,
  and ends with an open ``A:``.

  Args:
    df: Frame with ``participant_id``, ``question`` and ``answer`` columns.
    row: The row of ``df`` to build the prompt for.
    n_context: Maximum number of preceding Q/A pairs to include.

  Returns:
    The prompt text as a single string.
  """
  same_participant = df['participant_id'] == row['participant_id']
  earlier_rows = df.index < row.name
  history = df[same_participant & earlier_rows].tail(n_context)

  turns = []
  for _, earlier in history.iterrows():
    q = str(earlier.get("question", "")).strip()
    a = str(earlier.get("answer", "")).strip()
    turns.append(f"Q: {q}\nA: {a}")

  if turns:
    context = "[START]\n" + "\n".join(turns)
  else:
    context = "[START]\n"

  question = str(row.get("question", "")).strip()

  instruction = (
    "### Instruction:\n"
    "You are analyzing a therapeutic interview between a virtual interviewer (Ellie) and a participant.\n"
    "The participant has a PHQ-8 score ranging from 0 (no depression) to 24 (severe depression). "
    f"This participant’s score is {row['depression_severity']}. "
    "Scores of 10 or higher are typically considered indicative of depression.\n"
    "Given the participant’s previous responses and their PHQ score, "
    "predict how they might answer the next question in a coherent and realistic way. "
    "Use natural, casual language. Avoid overly formal styles. "
    "Tolerate some irregularities (omissions, repetitions, filler words).\n\n"
  )

  return instruction + f"### Input:\n{context}\nQ: {question}\nA:"

class InstructionDataset(Dataset):
    """Prompt/response pairs tokenized for causal-LM fine-tuning.

    Each sample is the prompt from ``format_input`` followed by
    ``"\\n\\n### Response:\\n<answer> [END]"``. Only the part after the prompt
    contributes to the loss: prompt tokens and padding positions are labelled
    ``-100``.

    Args:
        dataframe: Frame with ``participant_id``, ``start_time``, ``question``,
            ``answer`` and ``depression_severity`` columns.
        tokenizer: Callable tokenizer exposing ``eos_token_id``.
        max_length: Length every sample is truncated / padded to.
    """

    def __init__(self, dataframe, tokenizer, max_length=512):
        # format_input selects context by index order, so sort and re-index first.
        dataframe = dataframe.sort_values(
            by=['participant_id', 'start_time']
        ).reset_index(drop=True)

        self.df = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

        self.samples = []

        # Rows with an empty or very short answer (< 3 characters) are not
        # used as training targets; they can still serve as context for later rows.
        skipped = 0
        for _, row in self.df.iterrows():
            response = str(row.get("answer", "")).strip()

            if len(response) < 3:
                skipped += 1
                continue

            prompt = format_input(self.df, row)
            full = f"{prompt}\n\n### Response:\n{response} [END]"
            self.samples.append((prompt, full))

        if skipped > 0:
            print(f"Warning: Skipped {skipped} samples with empty or very short responses")

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``labels`` and ``attention_mask`` tensors for sample ``idx``."""
        prompt, full_text = self.samples[idx]

        encoded_full = self.tokenizer(
            full_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        input_ids = encoded_full["input_ids"].squeeze(0)
        attention_mask = encoded_full["attention_mask"].squeeze(0)

        # Token count of the prompt alone (no padding), capped at max_length.
        # NOTE(review): labels[:prompt_len] assumes the tokenizer pads on the
        # right — confirm padding_side for the tokenizer in use.
        prompt_len = len(
            self.tokenizer(prompt, truncation=True, max_length=self.max_length)["input_ids"]
        )

        labels = input_ids.clone()
        labels[:prompt_len] = -100          # ignore instruction & input tokens
        labels[attention_mask == 0] = -100  # ignore padding, otherwise the loss is computed on pad tokens

        # A prompt that fills the whole window leaves no supervised position and
        # an all -100 label row gives a NaN loss; keep one valid label in that case.
        if (labels != -100).sum().item() == 0:
            labels[-1] = self.tokenizer.eos_token_id

        return {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": attention_mask
        }

def load_daic_data(tokenizer, data_dir="./daic_data/", should_create_csv=False, return_splits=False):
  """Load the DAIC transcripts and labels and wrap them in InstructionDataset objects.

  Transcripts are read from ``<data_dir>/transcripts`` and labels from
  ``<data_dir>/labels``. Rows without a numeric ``depression_severity`` are
  dropped before any dataset is built.

  Args:
    tokenizer: Tokenizer instance handed to InstructionDataset.
    data_dir: Directory containing the ``transcripts`` and ``labels`` folders.
    should_create_csv: If True, write the merged table to
      ``questions_and_answers.csv`` in the working directory.
    return_splits: If True, return train, validation, and test datasets separately.

  Returns:
    If return_splits=False: a single InstructionDataset with all data.
    If return_splits=True: tuple of (train_dataset, val_dataset, test_dataset).

  Raises:
    ValueError: With return_splits=True, when a split has no rows, or when the
      validation or test dataset ends up empty.
  """
  qa_df = add_labels_to_df(
    get_questions_answers_df(os.path.join(data_dir, "transcripts")),
    os.path.join(data_dir, "labels"),
  )

  # Rows without a severity score cannot be turned into a prompt.
  initial_count = len(qa_df)
  qa_df = qa_df.dropna(subset=['depression_severity']).copy()
  filtered_count = len(qa_df)
  if initial_count != filtered_count:
    print(f"Warning: Filtered out {initial_count - filtered_count} rows with missing depression_severity")

  # Coerce to numeric; anything unparseable becomes NaN and is dropped as well.
  qa_df['depression_severity'] = pd.to_numeric(qa_df['depression_severity'], errors='coerce')
  qa_df = qa_df.dropna(subset=['depression_severity']).copy()

  if should_create_csv:
    qa_df.to_csv("questions_and_answers.csv", index=False, encoding="utf-8-sig")

  if not return_splits:
    return InstructionDataset(qa_df, tokenizer)

  split_names = ('train', 'dev', 'test')
  frames = {split: qa_df[qa_df['split'] == split].copy() for split in split_names}

  # Every split must contain data; checked in train, dev, test order.
  empty_messages = {
    'train': "Train dataset is empty after filtering!",
    'dev': "Validation dataset is empty after filtering! Check if 'dev' split exists in labels.",
    'test': "Test dataset is empty after filtering! Check if 'test' split exists in labels.",
  }
  for split in split_names:
    if len(frames[split]) == 0:
      raise ValueError(empty_messages[split])

  # Defensive re-check for NaN severities inside each split.
  display_names = {'train': 'train', 'dev': 'validation', 'test': 'test'}
  nan_counts = {split: frames[split]['depression_severity'].isna().sum() for split in split_names}
  for split in split_names:
    if nan_counts[split] > 0:
      print(f"Warning: {nan_counts[split]} {display_names[split]} samples still have NaN depression_severity")
      frames[split] = frames[split].dropna(subset=['depression_severity']).copy()

  # Drop answers shorter than three characters before creating datasets.
  for split in split_names:
    frame = frames[split]
    frames[split] = frame[frame['answer'].astype(str).str.strip().str.len() >= 3].copy()

  train_df, val_df, test_df = (frames[split] for split in split_names)
  print(f"After filtering empty answers - Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

  train_dataset = InstructionDataset(train_df, tokenizer)
  val_dataset = InstructionDataset(val_df, tokenizer)
  test_dataset = InstructionDataset(test_df, tokenizer)

  print(f"Train samples: {len(train_dataset)}")
  print(f"Validation samples: {len(val_dataset)}")
  print(f"Test samples: {len(test_dataset)}")

  if len(val_dataset) == 0:
    raise ValueError("Validation dataset is empty! Cannot proceed with validation.")
  if len(test_dataset) == 0:
    raise ValueError("Test dataset is empty! Cannot proceed with test evaluation.")

  return train_dataset, val_dataset, test_dataset

In [None]:
def get_tokenizer_and_early_model(model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0"):
    """Load the tokenizer and base causal LM and register the ``[END]`` marker.

    The tokenizer falls back to its EOS token for padding when it has no pad
    token, and ``[END]`` is added as an additional special token. The model's
    embedding matrix is resized to the new vocabulary size and its config
    receives the tokenizer's pad token id.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.

    Returns:
        Tuple ``(tokenizer, model, model_name)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Register [END] so the end-of-answer marker is a single token.
    tokenizer.add_special_tokens({"additional_special_tokens": ["[END]"]})

    load_kwargs = {"load_in_8bit": False, "device_map": "auto"}
    model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

    # The vocabulary grew by one token, so the embeddings must grow with it.
    model.resize_token_embeddings(len(tokenizer))
    model.config.pad_token_id = tokenizer.pad_token_id

    return tokenizer, model, model_name

def get_lora_model(model):
  """Wrap ``model`` with a LoRA adapter on the ``q_proj`` / ``v_proj`` modules.

  Uses rank 8, alpha 16, dropout 0.05 and no bias training. The wrapped model
  is put in train mode and its trainable parameter count is printed.

  NOTE(review): only q_proj/v_proj are adapted, so the embedding rows added for
  ``[END]`` are presumably not trained here — confirm how they are persisted.

  Args:
    model: Base causal LM to adapt.

  Returns:
    The PEFT-wrapped model.
  """
  adapter_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,  # rank
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "v_proj"],  # depends on model architecture
  )
  peft_model = get_peft_model(model, adapter_config)

  # Gradient checkpointing is intentionally not enabled here.
  peft_model.train()

  # Count the parameters that will actually receive gradients.
  trainable_params = 0
  for param in peft_model.parameters():
    if param.requires_grad:
      trainable_params += param.numel()
  print(f"Trainable parameters: {trainable_params:,}")

  peft_model.print_trainable_parameters()
  return peft_model

def validate_dataset(dataset, tokenizer, name="dataset", sample_size=10):
    """Check that the first samples of a dataset have at least one supervised label.

    A sample whose labels are all ``-100`` contributes no loss term and can
    make the loss NaN.

    Args:
        dataset: Indexable dataset whose items hold a ``"labels"`` tensor.
        tokenizer: Tokenizer instance (unused, kept for call compatibility).
        name: Name of dataset for logging.
        sample_size: Maximum number of leading samples to check.

    Returns:
        True if every checked sample has at least one label different from
        ``-100`` (also when nothing was checked), otherwise False.
    """
    print(f"\nValidating {name}...")

    # Report the number actually inspected, which can be smaller than sample_size.
    checked = max(0, min(sample_size, len(dataset)))
    invalid_samples = []

    for i in range(checked):
        labels = dataset[i]["labels"]
        if (labels != -100).sum().item() == 0:
            invalid_samples.append(i)
            print(f"  Warning: Sample {i} has no valid labels (all -100)")

    if invalid_samples:
        print(f"  Found {len(invalid_samples)} invalid samples in first {checked} samples")
        print("  This may cause NaN validation loss!")
    else:
        print(f"  ✓ All {checked} checked samples have valid labels")

    return not invalid_samples

def compute_metrics(eval_pred):
    """Placeholder metric function: warn about NaN predictions, return a dummy metric.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` may itself
            be a tuple, in which case its first element is used.

    Returns:
        Always ``{"eval_loss": 0.0}``; the returned value is a constant and
        does not depend on the inputs.
    """
    import numpy as np

    predictions, labels = eval_pred

    # Some models return a tuple of outputs; use the first entry.
    predictions = predictions[0] if isinstance(predictions, tuple) else predictions

    contains_nan = np.isnan(predictions).any()
    if contains_nan:
        print("Warning: NaN values detected in predictions during evaluation")
        predictions = np.nan_to_num(predictions, nan=0.0)

    # Dummy value only; this function computes no loss itself.
    return {"eval_loss": 0.0}

def fine_tune_model(
    model,
    tokenizer,
    train_dataset,
    output_dir="./tiny_llama_instruction_tuned",
    eval_dataset=None,
    test_dataset=None,
):
    """Fine-tune ``model`` with the Hugging Face ``Trainer`` and save it to ``output_dir``.

    Runs one training epoch. After ``trainer.train()`` returns, the model and
    the tokenizer are written to ``output_dir`` with ``save_pretrained``.

    Args:
        model: Model to fine-tune
        tokenizer: Tokenizer instance; its pad/eos ids are used by the collator
        train_dataset: Training dataset
        output_dir: Output directory for checkpoints and the final model
        eval_dataset: Optional validation dataset for monitoring during training
        test_dataset: Optional test dataset. It is only passed through
            ``validate_dataset``; the post-training test evaluation below is
            commented out.

    Returns:
        None.
    """

    # NOTE(review): `if eval_dataset` tests truthiness, so a dataset object that
    # defines __len__ and is empty is treated like None — confirm this is intended.
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=4,
        eval_accumulation_steps=4,
        warmup_steps=50,
        num_train_epochs=1,
        learning_rate=1e-4,
        lr_scheduler_type="cosine",  # Use cosine learning rate schedule
        fp16=True,
        logging_steps=50,
        save_steps=200,
        eval_strategy="steps" if eval_dataset else "no",  # Enable evaluation during training
        eval_steps=200,  # Evaluate every 200 steps
        save_total_limit=3,
        load_best_model_at_end=True if eval_dataset else False,  # Load best model based on eval loss
        metric_for_best_model="eval_loss" if eval_dataset else None,
        greater_is_better=False,  # Lower loss is better
        max_grad_norm=1.0,  # Gradient clipping to prevent exploding gradients
        dataloader_pin_memory=False,  # Disable pinning to save memory
        remove_unused_columns=False,  # Keep all columns for debugging
        report_to="none",  # Disable wandb/tensorboard to avoid issues
        dataloader_num_workers=0,  # Disable multiprocessing to save memory
        prediction_loss_only=True,  # Only compute loss during eval, don't store predictions (saves memory)
    )

    # Ensure model is in train mode before training
    model.train()

    def collator(batch):
        """Stack per-sample tensors into a batch and repair rows without any valid label."""
        batch_dict = {
            "input_ids": torch.stack([x["input_ids"] for x in batch]),
            "labels": torch.stack([x["labels"] for x in batch]),
            "attention_mask": torch.stack([x["attention_mask"] for x in batch]),
        }

        # Check for samples with no valid labels (all -100) and fix them.
        # `labels` is the tensor stored in batch_dict, so the in-place writes
        # below change the returned batch.
        labels = batch_dict["labels"]
        input_ids = batch_dict["input_ids"]
        fixed_count = 0

        for i in range(len(batch)):
            valid_labels = (labels[i] != -100).sum().item()
            if valid_labels == 0:
                fixed_count += 1
                # Make at least one label valid to prevent NaN
                # Find first non-pad token and make it a valid label
                for j in range(len(labels[i])):
                    if input_ids[i][j] != tokenizer.pad_token_id:
                        labels[i][j] = input_ids[i][j]
                        break
                # If still all padding, use EOS token
                if (labels[i] != -100).sum().item() == 0:
                    labels[i][-1] = tokenizer.eos_token_id

        if fixed_count > 0:
            print(f"Warning: Fixed {fixed_count} samples in batch with no valid labels")

        return batch_dict

    # Spot-check the first 20 samples of each provided dataset before training
    print("\nValidating datasets before training...")
    validate_dataset(train_dataset, tokenizer, "train_dataset", sample_size=20)
    if eval_dataset:
        validate_dataset(eval_dataset, tokenizer, "eval_dataset", sample_size=20)
    if test_dataset:
        validate_dataset(test_dataset, tokenizer, "test_dataset", sample_size=20)

    # Create a custom Trainer that clears memory before evaluation
    class MemoryEfficientTrainer(Trainer):
        def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"):
            # Clear memory before evaluation
            clear_memory()
            return super().evaluate(eval_dataset, ignore_keys, metric_key_prefix)

    # NOTE(review): with prediction_loss_only=True the Trainer presumably never
    # calls compute_metrics — confirm against the installed transformers version.
    trainer = MemoryEfficientTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=collator,
        compute_metrics=compute_metrics if eval_dataset else None,
    )

    trainer.train()

    # Test-set evaluation is currently disabled (kept below for reference).
    # Evaluate on test dataset after training if provided
    # if test_dataset:
    #     print("\n" + "=" * 80)
    #     print("EVALUATING ON TEST DATASET")
    #     print("=" * 80)
    #     clear_memory()

    #     # Evaluate on test set
    #     test_metrics = trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")

    #     print("\nTest Dataset Results:")
    #     for key, value in test_metrics.items():
    #         print(f"  {key}: {value:.4f}")

    #     clear_memory()

    # Persist the model and the tokenizer side by side
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

def use_tokenizer(tokenizer, text, max_length=512):
  """Tokenize ``text`` truncated and padded to exactly ``max_length`` tokens.

  Args:
    tokenizer: Callable tokenizer accepting ``truncation``, ``padding`` and
      ``max_length`` keyword arguments.
    text: Text (or list of texts) to tokenize.
    max_length: Target sequence length; defaults to 512.

  Returns:
    Whatever the tokenizer call returns.
  """
  return tokenizer(text, truncation=True, padding='max_length', max_length=max_length)

# Stopping criteria for [END] token
class EndTokenStoppingCriteria(StoppingCriteria):
    """Stop generation when the latest token of the first sequence is the end token.

    The sequence length seen on the first call is remembered as the baseline;
    stopping is allowed only once the sequence has grown by at least
    ``min_tokens`` beyond it.

    NOTE(review): the baseline is never reset, so an instance reused for a
    second generation keeps the first call's length — confirm callers create a
    fresh instance per generation.
    """

    def __init__(self, end_token_id, min_tokens=5):
        self.end_token_id = end_token_id
        # Growth (in tokens) required before a stop is honoured
        self.min_tokens = min_tokens
        # Sequence length observed on the first __call__; None until then
        self.initial_length = None

    def __call__(self, input_ids, scores, **kwargs):
        # input_ids has shape [batch_size, sequence_length]; only row 0 is inspected.
        batch_size, seq_len = input_ids.shape[0], input_ids.shape[1]
        if batch_size <= 0 or seq_len <= 0:
            return False

        if self.initial_length is None:
            self.initial_length = seq_len

        # Too few tokens since the baseline: keep generating.
        if seq_len - self.initial_length < self.min_tokens:
            return False

        return input_ids[0][-1].item() == self.end_token_id

def create_stopping_criteria(tokenizer, min_tokens=5):
    """Build a StoppingCriteriaList that stops generation at the ``[END]`` token.

    When ``[END]`` is unknown to the tokenizer (id is None or the unknown-token
    id), or looking it up raises, the tokenizer's EOS token id is used instead.

    Args:
        tokenizer: Tokenizer instance
        min_tokens: Minimum number of tokens to generate before allowing stop (default: 5)

    Returns:
        StoppingCriteriaList holding one EndTokenStoppingCriteria.
    """
    def build(token_id):
        return StoppingCriteriaList([EndTokenStoppingCriteria(token_id, min_tokens=min_tokens)])

    try:
        end_token_id = tokenizer.convert_tokens_to_ids("[END]")
        end_token_missing = end_token_id is None or end_token_id == tokenizer.unk_token_id
        if end_token_missing:
            print("Warning: [END] token not found in tokenizer. Stopping criteria will not work correctly.")
            print(f"Available special tokens: {tokenizer.special_tokens_map}")
            # Fall back to the EOS token
            end_token_id = tokenizer.eos_token_id
            print(f"Using EOS token ({end_token_id}) as fallback for stopping criteria.")
        return build(end_token_id)
    except Exception as e:
        print(f"Error creating stopping criteria: {e}")
        # Same fallback as above: stop on EOS
        return build(tokenizer.eos_token_id)

def clear_memory():
    """Run the garbage collector and, when CUDA is available, empty its cache."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

def load_finetuned_model(model_name, tokenizer, checkpoint_path=None):
    """Load a fresh base model and attach a fine-tuned LoRA adapter for inference.

    The base model is loaded anew on every call (float16 with CUDA, float32
    otherwise), moved to ``cuda:0`` or the CPU, and resized to the tokenizer's
    vocabulary when the sizes differ.

    Args:
        model_name: Base model name
        tokenizer: Tokenizer instance; its length sets the embedding size
        checkpoint_path: Adapter checkpoint path; when falsy the adapter is
            read from ``./tiny_llama_instruction_tuned``

    Returns:
        Tuple ``(lora, base)``: the adapter-wrapped model in eval mode and the
        underlying base model, so the caller can release both.
    """
    cuda_ok = torch.cuda.is_available()
    device = "cuda:0" if cuda_ok else "cpu"
    print(f"Device set to use {device}")

    # No device_map: the model is placed explicitly on one device below.
    weights_dtype = torch.float16 if cuda_ok else torch.float32
    base = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=weights_dtype,
        device_map=None
    )
    base = base.to(device)

    # Match the embedding matrix to the tokenizer (e.g. after [END] was added).
    embedding_rows = base.get_input_embeddings().weight.shape[0]
    if len(tokenizer) != embedding_rows:
        base.resize_token_embeddings(len(tokenizer))

    adapter_path = checkpoint_path if checkpoint_path else "./tiny_llama_instruction_tuned"
    lora = PeftModel.from_pretrained(base, adapter_path, device_map=None)
    lora = lora.to(device)

    lora.eval()
    print("Model set to eval mode for inference")

    return lora, base

def generate_response(model, tokenizer, prompt, max_new_tokens=100, stopping_criteria=None,
                      do_sample=True, temperature=0.7, top_p=0.9, top_k=50, repetition_penalty=1.1):
    """Generate text for ``prompt`` with a temporary text-generation pipeline.

    Prints debugging information about the prompt and the generated tokens. The
    pipeline is dropped and memory is cleared on the way out, including on error.

    Args:
        model: Model to use for generation
        tokenizer: Tokenizer instance
        prompt: Input prompt
        max_new_tokens: Maximum number of tokens to generate
        stopping_criteria: Optional stopping criteria; built with
            ``create_stopping_criteria(tokenizer)`` when None
        do_sample: Whether to use sampling (default: True)
        temperature: Sampling temperature (default: 0.7)
        top_p: Nucleus sampling parameter (default: 0.9)
        top_k: Top-k sampling parameter (default: 50)
        repetition_penalty: Repetition penalty (default: 1.1)

    Returns:
        The full generated text, prompt included (``return_full_text=True``).
    """
    pipe = None
    try:
        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

        if stopping_criteria is None:
            stopping_criteria = create_stopping_criteria(tokenizer)

        # Debug output: generation settings and prompt size
        print(f"Generating with parameters: do_sample={do_sample}, temperature={temperature}, top_p={top_p}, top_k={top_k}")
        print(f"Input prompt length: {len(prompt)} characters")

        input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]
        print(f"Input token IDs shape: {input_ids.shape}")
        print(f"Input token IDs (first 20): {input_ids[0][:20].tolist()}")

        generation_kwargs = dict(
            max_new_tokens=max_new_tokens,
            stopping_criteria=stopping_criteria,
            do_sample=do_sample,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            repetition_penalty=repetition_penalty,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            return_full_text=True,
        )
        res = pipe(prompt, **generation_kwargs)
        generated_text = res[0]["generated_text"]

        # Debug output: re-tokenize the result and show what follows the prompt
        prompt_token_count = input_ids.shape[1]
        full_ids = tokenizer(generated_text, return_tensors="pt")["input_ids"]
        new_tokens = full_ids[0][prompt_token_count:]
        print(f"Generated token IDs shape: {new_tokens.shape}")
        print(f"Generated token IDs (first 20): {new_tokens[:20].tolist()}")
        print(f"Full generated text length: {len(generated_text)} characters")
        print(f"Full generated text (first 200 chars): {generated_text[:200]}")

        return generated_text
    finally:
        # Drop the pipeline reference and release cached memory
        if pipe is not None:
            del pipe
        clear_memory()

def extract_response_only(full_output, prompt):
    """Return the text after ``prompt`` in ``full_output``.

    When ``full_output`` starts with ``prompt``, the remainder is returned with
    surrounding whitespace stripped; otherwise ``full_output`` is returned
    unchanged.
    """
    if not full_output.startswith(prompt):
        return full_output
    remainder = full_output[len(prompt):]
    return remainder.strip()

def unload_model(model):
    """Move ``model`` to the CPU (when it has a ``cpu`` method) and clear memory.

    Does nothing when ``model`` is None. Only this function's own reference is
    deleted; references held by the caller are not affected.
    """
    if model is None:
        return
    if hasattr(model, 'cpu'):
        model.cpu()
    del model
    clear_memory()

def evaluate_model_on_test(model, tokenizer, test_dataset, output_dir="./tiny_llama_instruction_tuned"):
    """Run a loss-only evaluation of ``model`` on ``test_dataset`` via ``Trainer``.

    Args:
        model: Model to evaluate
        tokenizer: Tokenizer instance (not used inside this function)
        test_dataset: Test dataset whose items hold ``input_ids``, ``labels``
            and ``attention_mask`` tensors
        output_dir: Output directory for the TrainingArguments the Trainer requires

    Returns:
        Dictionary of metrics returned by ``trainer.evaluate`` with the
        ``"test"`` key prefix.
    """
    from transformers import TrainingArguments, Trainer

    def collator(batch):
        # Stack each field of the per-sample dicts into one batch tensor.
        return {
            field: torch.stack([sample[field] for sample in batch])
            for field in ("input_ids", "labels", "attention_mask")
        }

    # Minimal arguments: batch size 1, loss only, no external reporting.
    eval_args = TrainingArguments(
        output_dir=output_dir,
        per_device_eval_batch_size=1,
        eval_accumulation_steps=4,
        fp16=True,
        dataloader_num_workers=0,
        prediction_loss_only=True,
        report_to="none",
    )
    trainer = Trainer(
        model=model,
        args=eval_args,
        data_collator=collator,
        compute_metrics=compute_metrics,
    )

    # Free cached memory on both sides of the evaluation pass.
    clear_memory()
    test_metrics = trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")
    clear_memory()

    return test_metrics

In [None]:
# Release cached memory first so the figures printed below start from a clean state.
banner = '=' * 80
print(banner)
print('CLEARING GPU MEMORY BEFORE TRAINING')
print(banner)
clear_memory()

has_gpu = torch.cuda.is_available()
bytes_per_gb = 1024**3

if has_gpu:
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    total = torch.cuda.get_device_properties(0).total_memory / bytes_per_gb
    reserved = torch.cuda.memory_reserved() / bytes_per_gb
    free = total - reserved
    print(f'GPU memory - Total: {total:.2f} GB, Reserved: {reserved:.2f} GB, Free: {free:.2f} GB')
    if free < 8.0:
        print(f'\n⚠️  WARNING: Only {free:.2f} GB free. Consider restarting kernel to clear all memory.')

print('\nLoading tokenizer and model...')
tokenizer, model, model_name = get_tokenizer_and_early_model()

# Memory footprint of the base model
if has_gpu:
    allocated = torch.cuda.memory_allocated() / bytes_per_gb
    reserved = torch.cuda.memory_reserved() / bytes_per_gb
    print(f'After loading model - Allocated: {allocated:.2f} GB, Reserved: {reserved:.2f} GB')

print('\nLoading datasets (train, validation, and test)...')
train_dataset, val_dataset, test_dataset = load_daic_data(tokenizer, should_create_csv=False, return_splits=True)

print('\nGetting LoRA model...')
model = get_lora_model(model)

# Memory footprint once the LoRA adapter is attached
if has_gpu:
    allocated = torch.cuda.memory_allocated() / bytes_per_gb
    reserved = torch.cuda.memory_reserved() / bytes_per_gb
    print(f'After LoRA - Allocated: {allocated:.2f} GB, Reserved: {reserved:.2f} GB')

print('\nFine-tuning model with validation monitoring and test evaluation...')
fine_tune_model(model, tokenizer, train_dataset, eval_dataset=val_dataset, test_dataset=test_dataset)

CLEARING GPU MEMORY BEFORE TRAINING
GPU memory - Total: 14.74 GB, Reserved: 0.00 GB, Free: 14.74 GB

Loading tokenizer and model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


After loading model - Allocated: 4.11 GB, Reserved: 4.98 GB

Loading datasets (train, validation, and test)...
After filtering empty answers - Train: 5953, Val: 1785, Test: 2588
Train samples: 5953
Validation samples: 1785
Test samples: 2588

Getting LoRA model...
Trainable parameters: 1,126,400
trainable params: 1,126,400 || all params: 1,101,178,880 || trainable%: 0.1023
After LoRA - Allocated: 4.11 GB, Reserved: 4.98 GB

Fine-tuning model with validation monitoring and test evaluation...

Validating datasets before training...

Validating train_dataset...
  ✓ All 20 checked samples have valid labels

Validating eval_dataset...
  ✓ All 20 checked samples have valid labels

Validating test_dataset...


The model is already on multiple devices. Skipping the move to device specified in `args`.


  ✓ All 20 checked samples have valid labels


Step,Training Loss,Validation Loss
200,0.4234,1.166789
400,0.431,1.002135
600,0.429,0.964098




### Best model

In [None]:
# Hand-written prompt reused by the generation cells below (it is passed to
# generate_response and to test_all_checkpoints).
# Layout: an instruction paragraph that states the participant's PHQ-8 score (3),
# an "### Input:" section holding the dialogue history after a [START] marker and
# ending on an unanswered "Q:", then an empty "### Response:" section.
# NOTE(review): the missing spaces after some periods ("way.Use", "styles.Tolerate")
# presumably mirror the template used to build the training data — confirm before
# tidying, because any change to this text changes what the model is conditioned on.
request = """
You are analyzing a therapeutic interview between a virtual interviewer (Ellie) and a participant.
The participant has a PHQ-8 score ranging from 0 (no depression) to 24 (severe depression). This participant’s score is 3. Scores of 10 or higher are typically considered indicative of depression.
Given the participant’s previous responses and their PHQ score, predict how they might answer the next question in a coherent and realistic way.Use natural, casual language. Avoid overly formal styles.Tolerate some irregularities (omissions, repetitions, filler words) given the conversational context.

### Input:
[START]
Q: right there are always trade offs in life aren't there
A: yeah
Q: what made you decide to do that
A: so um i think i think my in my life i knew that i there's a lot of things i have <ha> there's more dislikes <laughter> than likes so i kinda narrowed it down to what am i good at and what am i not good at and what am i gonna work well or who who am i gonna work well with and who will i not work well with so i kind of i kinda sorted out and then the list kind of mmm kind of answered itself so
Q: that sounds really hard
A: no it i don't think it was hard but it was just but i think it was a real reality check and i think it it's kind of a good thing 'cause sometimes trying to conform to doing things that doesn't really fit you doesn't make sense it's like trying to shove a a round peg into a square a square hole and it's like it just no matter how you try to shove it in it's not gonna go in so sometimes it's just might as well go down a path that seems to work better for you
Q: right that makes sense what's one of your most memorable experiences

### Response:
"""

In [None]:
# Load the final fine-tuned model, generate one answer for `request`, print the
# full output and the extracted response, and always release both models.
print("Loading final model...")
model, base_model = load_finetuned_model(model_name, tokenizer)

try:
    # Test-set evaluation is intentionally not performed in this cell; only a
    # single example generation is run, so no evaluation banners are printed.

    # Generate example response
    print("\n" + "=" * 80)
    print("GENERATING EXAMPLE RESPONSE")
    print("=" * 80)
    print("\nGenerating response...")
    # Use the same stopping criteria as test_all_checkpoints so this sample is
    # produced under the same generation settings as the per-checkpoint samples.
    full_output = generate_response(
        model,
        tokenizer,
        request,
        max_new_tokens=100,
        stopping_criteria=create_stopping_criteria(tokenizer),
    )
    response_only = extract_response_only(full_output, request)

    print("=" * 80)
    print("FULL OUTPUT:")
    print("=" * 80)
    print(full_output)
    print("\n" + "=" * 80)
    print("RESPONSE ONLY:")
    print("=" * 80)
    print(response_only)
finally:
    # Clean up models from memory
    unload_model(model)
    unload_model(base_model)
    # Also drop the notebook-level references so the objects can be
    # garbage-collected; unload_model only receives its own reference
    # (TODO confirm it does not already clear these names itself).
    model = base_model = None
    print("\nModels unloaded from memory.")


`torch_dtype` is deprecated! Use `dtype` instead!


Loading final model...
Device set to use cuda:0


Device set to use cuda:0


Model set to eval mode for inference

Loading test dataset for evaluation...

EVALUATING FINAL MODEL ON TEST DATASET

GENERATING EXAMPLE RESPONSE

Generating response...
Generating with parameters: do_sample=True, temperature=0.7, top_p=0.9, top_k=50
Input prompt length: 1697 characters
Input token IDs shape: torch.Size([1, 470])
Input token IDs (first 20): [1, 29871, 13, 3492, 526, 29537, 292, 263, 266, 1572, 412, 329, 293, 15593, 1546, 263, 6901, 1006, 29894, 15580]
Generated token IDs shape: torch.Size([100])
Generated token IDs (first 20): [398, 474, 1016, 29915, 29873, 1073, 565, 372, 29915, 29879, 1063, 529, 348, 28327, 29958, 474, 2099, 474, 723, 1827]
Full generated text length: 2102 characters
Full generated text (first 200 chars): 
You are analyzing a therapeutic interview between a virtual interviewer (Ellie) and a participant.
The participant has a PHQ-8 score ranging from 0 (no depression) to 24 (severe depression). This par
FULL OUTPUT:

You are analyzing a therapeutic in

In [None]:
def test_all_checkpoints(model_name, tokenizer, prompt, test_dataset=None, output_dir="./tiny_llama_instruction_tuned"):
    """Test all checkpoint models and return results. Memory-efficient version.
    Each model is loaded, tested, and immediately unloaded to save memory.

    Args:
        model_name: Base model name
        tokenizer: Tokenizer instance
        prompt: Prompt to test with
        test_dataset: Optional test dataset for evaluation
        output_dir: Directory containing checkpoints
    """
    checkpoint_folders = sorted([
        f for f in os.listdir(output_dir)
        if f.startswith("checkpoint-") and os.path.isdir(os.path.join(output_dir, f))
    ])

    if not checkpoint_folders:
        print("No checkpoints found.")
        return {}

    results = {}
    stopping_criteria = create_stopping_criteria(tokenizer)

    print(f"Found {len(checkpoint_folders)} checkpoints. Testing each (memory-efficient mode)...\n")

    for i, folder in enumerate(checkpoint_folders, 1):
        checkpoint_path = os.path.join(output_dir, folder)
        print(f"[{i}/{len(checkpoint_folders)}] Testing {folder}...")

        model = None
        base_model = None
        try:
            # Load model (returns both lora and base for cleanup)
            model, base_model = load_finetuned_model(model_name, tokenizer, checkpoint_path)

            # Generate response
            full_output = generate_response(model, tokenizer, prompt, max_new_tokens=100, stopping_criteria=stopping_criteria)
            response_only = extract_response_only(full_output, prompt)

            # Evaluate on test dataset if provided
            test_metrics = None
            # if test_dataset is not None:
            #     print(f"  Evaluating {folder} on test dataset...")
            #     test_metrics = evaluate_model_on_test(model, tokenizer, test_dataset, output_dir)

            results[folder] = {
                "full_output": full_output,
                "response_only": response_only,
                "test_metrics": test_metrics
            }

            if test_metrics:
                print(f"  Test loss: {test_metrics.get('test_loss', 'N/A'):.4f}")

            print(f"✓ {folder} completed")

        except Exception as e:
            print(f"✗ Error testing {folder}: {e}")
            results[folder] = {"error": str(e)}
        finally:
            # Always unload models after each checkpoint to free memory
            if model is not None:
                unload_model(model)
            if base_model is not None:
                unload_model(base_model)
            print(f"  Memory freed after {folder}\n")

    return results

# Fetch the data splits again and keep only the third one (the test split).
print("Loading test dataset for checkpoint evaluation...")
_, _, test_dataset = load_daic_data(tokenizer, should_create_csv=False, return_splits=True)

# Run the shared prompt through every saved checkpoint.
checkpoint_results = test_all_checkpoints(model_name, tokenizer, request, test_dataset=test_dataset)

# Report: one section per checkpoint, error entries first handled via a guard.
heavy_rule = "=" * 80
light_rule = "-" * 80

print("\n" + heavy_rule)
print("CHECKPOINT COMPARISON")
print(heavy_rule)
for ckpt_name, ckpt_result in checkpoint_results.items():
    if "error" in ckpt_result:
        print(f"\n{ckpt_name}: ERROR - {ckpt_result['error']}")
        continue

    print(f"\n{ckpt_name}:")
    print(light_rule)

    # Metrics are only present when test evaluation produced something.
    metrics = ckpt_result.get("test_metrics")
    if metrics:
        print("Test Metrics:")
        for metric_name, metric_value in metrics.items():
            print(f"  {metric_name}: {metric_value:.4f}")
        print()

    # The extracted answer for this checkpoint.
    print("Generated Response:")
    print(ckpt_result["response_only"])
    print()



Loading test dataset for checkpoint evaluation...
After filtering empty answers - Train: 5953, Val: 1785, Test: 2588
Train samples: 5953
Validation samples: 1785
Test samples: 2588
Found 3 checkpoints. Testing each (memory-efficient mode)...

[1/3] Testing checkpoint-400...
Device set to use cuda:0


Device set to use cuda:0


Model set to eval mode for inference
Generating with parameters: do_sample=True, temperature=0.7, top_p=0.9, top_k=50
Input prompt length: 1697 characters
Input token IDs shape: torch.Size([1, 470])
Input token IDs (first 20): [1, 29871, 13, 3492, 526, 29537, 292, 263, 266, 1572, 412, 329, 293, 15593, 1546, 263, 6901, 1006, 29894, 15580]
Generated token IDs shape: torch.Size([100])
Generated token IDs (first 20): [29875, 4140, 474, 508, 1827, 393, 697, 310, 278, 1556, 26959, 519, 27482, 393, 474, 29915, 345, 750, 338, 746]
Full generated text length: 2111 characters
Full generated text (first 200 chars): 
You are analyzing a therapeutic interview between a virtual interviewer (Ellie) and a participant.
The participant has a PHQ-8 score ranging from 0 (no depression) to 24 (severe depression). This par
✓ checkpoint-400 completed
  Memory freed after checkpoint-400

[2/3] Testing checkpoint-600...
Device set to use cuda:0


Device set to use cuda:0


Model set to eval mode for inference
Generating with parameters: do_sample=True, temperature=0.7, top_p=0.9, top_k=50
Input prompt length: 1697 characters
Input token IDs shape: torch.Size([1, 470])
Input token IDs (first 20): [1, 29871, 13, 3492, 526, 29537, 292, 263, 266, 1572, 412, 329, 293, 15593, 1546, 263, 6901, 1006, 29894, 15580]
Generated token IDs shape: torch.Size([100])
Generated token IDs (first 20): [398, 474, 1348, 278, 278, 931, 393, 474, 2355, 17285, 515, 590, 937, 4982, 1156, 318, 29882, 1023, 2440, 474]
Full generated text length: 2139 characters
Full generated text (first 200 chars): 
You are analyzing a therapeutic interview between a virtual interviewer (Ellie) and a participant.
The participant has a PHQ-8 score ranging from 0 (no depression) to 24 (severe depression). This par
✓ checkpoint-600 completed
  Memory freed after checkpoint-600

[3/3] Testing checkpoint-745...
Device set to use cuda:0


Device set to use cuda:0


Model set to eval mode for inference
Generating with parameters: do_sample=True, temperature=0.7, top_p=0.9, top_k=50
Input prompt length: 1697 characters
Input token IDs shape: torch.Size([1, 470])
Input token IDs (first 20): [1, 29871, 13, 3492, 526, 29537, 292, 263, 266, 1572, 412, 329, 293, 15593, 1546, 263, 6901, 1006, 29894, 15580]
Generated token IDs shape: torch.Size([100])
Generated token IDs (first 20): [16099, 474, 508, 29915, 29873, 1348, 310, 385, 7271, 474, 3926, 750, 29871, 338, 697, 310, 278, 1556, 26959, 519]
Full generated text length: 1965 characters
Full generated text (first 200 chars): 
You are analyzing a therapeutic interview between a virtual interviewer (Ellie) and a participant.
The participant has a PHQ-8 score ranging from 0 (no depression) to 24 (severe depression). This par
✓ checkpoint-745 completed
  Memory freed after checkpoint-745


CHECKPOINT COMPARISON

checkpoint-400:
--------------------------------------------------------------------------------

In [None]:
# Recursively archive /content/tiny_llama_instruction_tuned into
# tiny_llama_instruction_tuned.zip in the current working directory
# (presumably so the run's outputs can be copied off the Colab VM — confirm).
!zip -r tiny_llama_instruction_tuned.zip /content/tiny_llama_instruction_tuned