In [2]:
import torch
import glob, os
import pandas as pd
import gc
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from transformers import StoppingCriteria, StoppingCriteriaList

# Set CUDA memory allocation to reduce fragmentation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Clear CUDA cache at startup
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    print(f"CUDA available. GPU: {torch.cuda.get_device_name(0)}")
    print(f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")


CUDA available. GPU: Tesla T4
Total GPU memory: 14.74 GB


# Dataset Augmentation

This notebook loads a finetuned model and generates augmented answers for each question in the dataset.


## Data loading


In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

In [4]:
# !unzip /content/drive/MyDrive/daic_data/daic_data.zip
# !unzip /content/drive/MyDrive/daic_data/finetuned_llama.zip

# Gabriel:
# !unzip /content/drive/MyDrive/daic_data.zip
# !unzip /content/drive/MyDrive/finetuned_llama.zip

In [9]:
# Utility functions from daic_finetune.ipynb

def get_questions_answers_df(transcripts_dir):
  transcripts_files = glob.glob(os.path.join(transcripts_dir, "*.csv"))

  # Load and concatenate all transcript files
  df = pd.concat(
    (
      pd.read_csv(file, sep="\t", encoding="utf-8-sig").assign(source=os.path.basename(file))
      for file in transcripts_files
    ),
    ignore_index=True
  )

  # Create block_id to identify contiguous speaker segments
  df['block_id'] = (df['speaker'] != df['speaker'].shift(1)).cumsum()

  # Aggregate by source and block_id to merge contiguous segments by the same speaker
  df = df.groupby(['source', 'block_id']).agg(
    speaker=('speaker', 'first'),
    start_time=('start_time', 'min'),
    stop_time=('stop_time', 'max'),
    value=('value', lambda x: ' '.join(x.astype(str)))
  )

  # Sort by participant and time
  df = df.sort_values(by=['source', 'start_time']).reset_index()

  # Add previous speaker and value columns only if the previous source is the same
  df['prev_speaker'] = df.groupby('source')['speaker'].shift(1)
  df['prev_value'] = df.groupby('source')['value'].shift(1)

  is_answer = (
    (df['speaker'] == 'Participant') &
    (df['prev_speaker'] == 'Ellie') &
    (df['source'] == df['source'].shift(1))
  )

  df = df[is_answer].copy()
  df = df.rename(columns={
    'prev_value': 'question', # The previous Ellie utterance is the question
    'value': 'answer',            # The current Participant utterance is the answer
  })

  df['participant_id'] = df['source'].str.split("_").str[0].astype(int)
  df = df[['participant_id', 'question', 'answer', 'start_time']]

  return df

def add_labels_to_df(qa_df, labels_dir):
  splits = ['train', 'dev', 'test']

  all_labels_df = pd.DataFrame()
  for split in splits:
    split_labels_df = pd.read_csv(os.path.join(labels_dir, f"{split}.csv"))
    split_labels_df = split_labels_df.rename(columns={
      "Participant_ID": "participant_id",
      "PHQ8_Binary": "depression_label",
      "PHQ8_Score": "depression_severity",
      "PHQ_Binary": "depression_label",
      "PHQ_Score": "depression_severity",
    })
    split_labels_df = split_labels_df[["participant_id", "depression_label", "depression_severity"]]
    split_labels_df["split"] = split
    all_labels_df = pd.concat([all_labels_df, split_labels_df], ignore_index=True)

  merged_df = pd.merge(qa_df, all_labels_df, on="participant_id", how="left")
  return merged_df

def format_input(df, row, n_context=3):
  past_pairs = df[
    (df['participant_id'] == row['participant_id']) &
    (df.index < row.name)
  ].tail(n_context)

  context_lines = []
  for _, past_row in past_pairs.iterrows():
    q = str(past_row.get("question", "")).strip()
    a = str(past_row.get("answer", "")).strip()
    context_lines.append(f"Q: {q}\nA: {a}")

  context = "[START]\n" + "\n".join(context_lines) if context_lines else "[START]\n"

  instruction = (
    "### Instruction:\n"
    "You are analyzing a therapeutic interview between a virtual interviewer (Ellie) and a participant.\n"
    "The participant has a PHQ-8 score ranging from 0 (no depression) to 24 (severe depression). "
    f"This participant’s score is {row['depression_severity']}. "
    "Scores of 10 or higher are typically considered indicative of depression.\n"
    "Given the participant’s previous responses and their PHQ score, "
    "predict how they might answer the next question in a coherent and realistic way. "
    "Use natural, casual language. Avoid overly formal styles. "
    "Tolerate some irregularities (omissions, filler words).\n\n"
  )

  question = str(row.get("question", "")).strip()

  input_text = f"### Input:\n{context}\nQ: {question}\nA:"

  return instruction + input_text


# Stopping criteria for [END] token
class EndTokenStoppingCriteria(StoppingCriteria):
    def __init__(self, end_token_id, eos_token_id=None, min_tokens=5):
        self.end_token_id = end_token_id
        self.eos_token_id = eos_token_id  # Also stop on EOS token as fallback
        self.min_tokens = min_tokens  # Minimum tokens to generate before allowing stop
        self.initial_length = None

    def __call__(self, input_ids, scores, **kwargs):
        # Stop if the last generated token is [END]
        # input_ids is a tensor of shape [batch_size, sequence_length]
        # Check the last token of the first (and only) sequence
        if input_ids.shape[0] > 0 and input_ids.shape[1] > 0:
            # Track initial length on first call
            if self.initial_length is None:
                self.initial_length = input_ids.shape[1]

            # Calculate how many new tokens have been generated
            new_tokens_count = input_ids.shape[1] - self.initial_length

            # Get the last generated token
            last_token = input_ids[0][-1].item()

            # Always stop on EOS token (as fallback)
            if self.eos_token_id is not None and last_token == self.eos_token_id:
                return True

            # Only stop on [END] if we've generated at least min_tokens
            if new_tokens_count >= self.min_tokens:
                return last_token == self.end_token_id
        return False

    def reset(self):
        """Reset the initial length for a new generation."""
        self.initial_length = None

def create_stopping_criteria(tokenizer, min_tokens=5):
    """Create stopping criteria that stops at [END] token.

    Args:
        tokenizer: Tokenizer instance
        min_tokens: Minimum number of tokens to generate before allowing stop (default: 5)
    """
    try:
        end_token_id = tokenizer.convert_tokens_to_ids("[END]")
        eos_token_id = tokenizer.eos_token_id

        if end_token_id is None or end_token_id == tokenizer.unk_token_id:
            print("Warning: [END] token not found in tokenizer. Stopping criteria will use EOS token only.")
            print(f"Available special tokens: {tokenizer.special_tokens_map}")
            # Fallback: use EOS token instead
            end_token_id = eos_token_id

        # Create stopping criteria with both [END] and EOS tokens
        return StoppingCriteriaList([EndTokenStoppingCriteria(end_token_id, eos_token_id=eos_token_id, min_tokens=min_tokens)])
    except Exception as e:
        print(f"Error creating stopping criteria: {e}")
        # Fallback: use EOS token
        end_token_id = tokenizer.eos_token_id
        eos_token_id = tokenizer.eos_token_id
        return StoppingCriteriaList([EndTokenStoppingCriteria(end_token_id, eos_token_id=eos_token_id, min_tokens=min_tokens)])

def clear_memory():
    """Clear GPU and CPU memory cache."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
        torch.cuda.synchronize()

def reset_cuda_memory():
    """Aggressively clear all CUDA memory."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
        torch.cuda.synchronize()
        torch.cuda.reset_peak_memory_stats()

def load_daic_data(data_dir="./daic_data/", should_create_csv=False):
  """Load DAIC data and return a DataFrame with the same filtering as daic_finetune.ipynb.

  Args:
    data_dir: Directory containing data
    should_create_csv: Whether to save CSV file

  Returns:
    DataFrame with questions, answers, labels, and splits
  """
  transcripts_dir = os.path.join(data_dir, "transcripts")
  labels_dir = os.path.join(data_dir, "labels")

  qa_df = get_questions_answers_df(transcripts_dir)
  qa_df = add_labels_to_df(qa_df, labels_dir)

  # Filter out rows with missing depression_severity (NaN values cause NaN loss)
  initial_count = len(qa_df)
  qa_df = qa_df.dropna(subset=['depression_severity']).copy()
  filtered_count = len(qa_df)

  if initial_count != filtered_count:
    print(f"Warning: Filtered out {initial_count - filtered_count} rows with missing depression_severity")

  # Ensure depression_severity is numeric
  qa_df['depression_severity'] = pd.to_numeric(qa_df['depression_severity'], errors='coerce')
  qa_df = qa_df.dropna(subset=['depression_severity']).copy()

  # Sort by participant and time to maintain interview order
  qa_df = qa_df.sort_values(by=['participant_id', 'start_time']).reset_index(drop=True)

  if should_create_csv:
    qa_df.to_csv("questions_and_answers.csv", index=False, encoding="utf-8-sig")

  return qa_df

def load_finetuned_model(model_name, tokenizer, checkpoint_path=None):
    """Load a finetuned model from checkpoint or final model.
    Note: Base model is loaded fresh each time to avoid PEFT weight conflicts.

    Args:
        model_name: Base model name
        tokenizer: Tokenizer instance
        checkpoint_path: Path to checkpoint, or None for final model
    """
    # Determine device
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    print(f"Device set to use {device}")

    # Always load base model fresh to avoid PEFT weight conflicts
    # (PEFT models modify base model in place, so we can't reuse it)
    # Use explicit device placement instead of device_map="auto" to avoid multi-device issues
    base = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map=None  # Don't use device_map to avoid offloading issues
    )

    # Manually move to device
    base = base.to(device)

    # Resize token embeddings if [END] token was added
    if len(tokenizer) != base.get_input_embeddings().weight.shape[0]:
        base.resize_token_embeddings(len(tokenizer))

    if checkpoint_path:
        lora = PeftModel.from_pretrained(base, checkpoint_path, device_map=None)
    else:
        lora = PeftModel.from_pretrained(base, "./tiny_llama_instruction_tuned", device_map=None)

    # Ensure model is on the correct device
    lora = lora.to(device)

    # Set model to eval mode for proper inference behavior
    lora.eval()
    print("Model set to eval mode for inference")

    return lora, base  # Return both so we can clean up base separately


def generate_response(model, tokenizer, prompt, pipe, max_new_tokens=50, stopping_criteria=None,
                      do_sample=True, temperature=0.7, top_p=0.9, top_k=50, repetition_penalty=1.1):
    """Generate a response using the model. Cleans up pipeline after use.

    Args:
        model: Model to use for generation
        tokenizer: Tokenizer instance
        prompt: Input prompt
        max_new_tokens: Maximum number of tokens to generate
        stopping_criteria: Optional stopping criteria
        do_sample: Whether to use sampling (default: True)
        temperature: Sampling temperature (default: 0.7)
        top_p: Nucleus sampling parameter (default: 0.9)
        top_k: Top-k sampling parameter (default: 50)
        repetition_penalty: Repetition penalty (default: 1.1)
    """

    if pipe is None:
      pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

    try:
        if stopping_criteria is None:
            stopping_criteria = create_stopping_criteria(tokenizer)
        else:
            # Reset stopping criteria for new generation
            for criteria in stopping_criteria:
                if hasattr(criteria, 'reset'):
                    criteria.reset()

        # Print generation parameters for debugging
        # print(f"Generating with parameters: do_sample={do_sample}, temperature={temperature}, top_p={top_p}, top_k={top_k}")
        # print(f"Input prompt length: {len(prompt)} characters")

        # Tokenize prompt to check input
        input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]
        # print(f"Input token IDs shape: {input_ids.shape}")
        # print(f"Input token IDs (first 20): {input_ids[0][:20].tolist()}")

        # Generate with proper parameters
        res = pipe(
            prompt,
            max_new_tokens=max_new_tokens,
            stopping_criteria=stopping_criteria,
            do_sample=do_sample,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            repetition_penalty=repetition_penalty,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            return_full_text=True
        )

        generated_text = res[0]["generated_text"]

        # Debug: Check generated tokens
        full_ids = tokenizer(generated_text, return_tensors="pt")["input_ids"]
        new_tokens = full_ids[0][input_ids.shape[1]:]

        return generated_text
    finally:
        # Clean up pipeline
        if pipe is not None:
            del pipe
        clear_memory()

def extract_response_only(full_output, prompt):
    """Extract only the generated response part, removing the prompt."""
    if full_output.startswith(prompt):
        return full_output[len(prompt):].strip()
    return full_output

def clean_augmented_answer(answer):
    """Clean the augmented answer by removing [END] token and extra whitespace."""
    if answer:
        # Remove [END] token if present
        answer = answer.replace("[END]", "").strip()
        # Remove any trailing incomplete sentences or fragments
        return answer
    return ""

def calculate_augmentation_multipliers(df, target_ratio=0.5, min_multiplier=0, max_multiplier=5):
    """Calculate how many times each sample should be augmented based on class distribution.

    Args:
        df: DataFrame with 'depression_label' column
        target_ratio: Target ratio of depressed samples (0.5 = 50/50 balance)
        min_multiplier: Minimum augmentation multiplier (all samples get at least this)
        max_multiplier: Maximum augmentation multiplier (cap to avoid excessive augmentation)

    Returns:
        Dictionary mapping depression_label to augmentation multiplier
    """
    # Calculate current class distribution
    label_counts = df['depression_label'].value_counts().sort_index()
    total_samples = len(df)

    print(f"\nCurrent class distribution:")
    for label, count in label_counts.items():
        percentage = (count / total_samples) * 100
        label_name = "Depressed" if label == 1 else "Non-depressed"
        print(f"  {label_name} (label={label}): {count} samples ({percentage:.1f}%)")

    # Determine target counts
    if len(label_counts) == 2:
        # Binary classification
        non_depressed_count = label_counts.get(0, 0)
        depressed_count = label_counts.get(1, 0)

        # Calculate target counts based on target_ratio
        if target_ratio == 0.5:
            # Balance to 50/50
            target_depressed = max(non_depressed_count, depressed_count)
            target_non_depressed = target_depressed
        else:
            # Use target_ratio to determine balance
            target_depressed = int(total_samples * target_ratio)
            target_non_depressed = total_samples - target_depressed

        # Calculate multipliers
        if depressed_count > 0:
            depressed_multiplier = max(min_multiplier, min(max_multiplier,
                target_depressed / depressed_count))
        else:
            depressed_multiplier = min_multiplier

        if non_depressed_count > 0:
            non_depressed_multiplier = max(min_multiplier, min(max_multiplier,
                target_non_depressed / non_depressed_count))
        else:
            non_depressed_multiplier = min_multiplier

        # Subtract 1 from multipliers and set non-depressed to 0
        # This means: non-depressed = 0 (no augmentation), depressed = original - 1
        non_depressed_multiplier = 0  # Don't augment non-depressed samples
        depressed_multiplier = max(0, depressed_multiplier - 1)  # Subtract 1, ensure >= 0

        multipliers = {0: non_depressed_multiplier, 1: depressed_multiplier}

        print(f"\nAugmentation multipliers:")
        print(f"  Non-depressed (label=0): {non_depressed_multiplier:.2f}x (no augmentation)")
        print(f"  Depressed (label=1): {depressed_multiplier:.2f}x")

        # Calculate expected final distribution
        expected_non_depressed = int(non_depressed_count * non_depressed_multiplier)
        expected_depressed = int(depressed_count * depressed_multiplier)
        expected_total = expected_non_depressed + expected_depressed

        print(f"\nExpected final distribution:")
        if expected_total > 0:
            print(f"  Non-depressed: {expected_non_depressed} samples ({(expected_non_depressed/expected_total*100):.1f}%)")
            print(f"  Depressed: {expected_depressed} samples ({(expected_depressed/expected_total*100):.1f}%)")
        else:
            print(f"  Non-depressed: {expected_non_depressed} samples (original: {non_depressed_count})")
            print(f"  Depressed: {expected_depressed} samples (original: {depressed_count})")

    else:
        # Fallback: augment all samples equally
        print(f"\nWarning: Unexpected number of classes. Using equal augmentation.")
        multipliers = {label: min_multiplier for label in label_counts.index}

    return multipliers


## Load Finetuned Model


In [6]:
# Configuration
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
checkpoint_path = '/content/tiny_llama_instruction_tuned/checkpoint-400'  # Set to checkpoint path if you want to use a specific checkpoint, else uses final model
model_dir = "/content/tiny_llama_instruction_tuned"  # Directory where finetuned model is saved
data_dir = "/content/daic_data/"
output_csv_path = "./augmented_dataset.csv"

# Clear memory before loading
print("Clearing GPU memory...")
reset_cuda_memory()

if torch.cuda.is_available():
    total = torch.cuda.get_device_properties(0).total_memory / 1024**3
    reserved = torch.cuda.memory_reserved() / 1024**3
    free = total - reserved
    print(f"GPU memory - Total: {total:.2f} GB, Reserved: {reserved:.2f} GB, Free: {free:.2f} GB")

print("\nLoading tokenizer...")
# Try to load tokenizer from saved model directory first (has [END] token)
if os.path.exists(model_dir) and os.path.exists(os.path.join(model_dir, "tokenizer_config.json")):
    print(f"Loading tokenizer from {model_dir}...")
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
else:
    print(f"Loading tokenizer from {model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Add [END] as a special token if not already present
    if "[END]" not in tokenizer.get_vocab():
        special_tokens_dict = {"additional_special_tokens": ["[END]"]}
        tokenizer.add_special_tokens(special_tokens_dict)

print("\nLoading finetuned model...")
model, base_model = load_finetuned_model(model_name, tokenizer, checkpoint_path)

# Check memory after loading
if torch.cuda.is_available():
    allocated = torch.cuda.memory_allocated() / 1024**3
    reserved = torch.cuda.memory_reserved() / 1024**3
    print(f"\nModel loaded. Memory - Allocated: {allocated:.2f} GB, Reserved: {reserved:.2f} GB")
print("Model loaded successfully!")


Clearing GPU memory...
GPU memory - Total: 14.74 GB, Reserved: 0.00 GB, Free: 14.74 GB

Loading tokenizer...
Loading tokenizer from /content/tiny_llama_instruction_tuned...

Loading finetuned model...
Device set to use cuda:0


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`torch_dtype` is deprecated! Use `dtype` instead!
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Model set to eval mode for inference

Model loaded. Memory - Allocated: 2.06 GB, Reserved: 3.34 GB
Model loaded successfully!


## Load Original Dataset


In [7]:
print("Loading original dataset...")
qa_df = load_daic_data(data_dir=data_dir, should_create_csv=False)

print(f"Total samples: {len(qa_df)}")
print(f"Participants: {qa_df['participant_id'].nunique()}")
print(f"\nSplit distribution:")
print(qa_df['split'].value_counts())
print(f"\nFirst few rows:")
print(qa_df.head())


Loading original dataset...
Total samples: 10664
Participants: 186

Split distribution:
split
train    6149
test     2683
dev      1832
Name: count, dtype: int64

First few rows:
   participant_id                                           question  \
0             300  hi i'm ellie thanks for coming in today i was ...   
1             300          that's good where are you from originally   
2             300                       really why'd you move to l_a   
3             300                                how do you like l_a   
4             300     what are some things you really like about l_a   

                                              answer  start_time  \
0                                               good      62.328   
1                                    atlanta georgia      68.978   
2                     um my parents are from here um      75.028   
3                                          i love it      83.808   
4  i like the weather i like the opportunities u

## Augment Dataset


In [10]:
# Updated augmentation function with dynamic class balancing
def augment_dataset_balanced(df, model, tokenizer, progress_interval=50,
                    balance_classes=True, target_ratio=0.5,
                    min_multiplier=1, max_multiplier=5):
    """
    Augment dataset by generating new answers for each question.
    Optionally balances classes by augmenting depressed samples more.

    Args:
        df: DataFrame with questions and answers
        model: Finetuned model for generation
        tokenizer: Tokenizer instance
        progress_interval: Print progress every N samples
        balance_classes: If True, augment depressed samples more to balance dataset
        target_ratio: Target ratio of depressed samples (0.5 = 50/50 balance)
        min_multiplier: Minimum augmentation multiplier (all samples get at least this)
        max_multiplier: Maximum augmentation multiplier (cap to avoid excessive augmentation)

    Returns:
        DataFrame with original and augmented answers
    """
    augmented_data = []
    stopping_criteria = create_stopping_criteria(tokenizer)

    # Calculate augmentation multipliers if balancing
    if balance_classes and 'depression_label' in df.columns:
        multipliers = calculate_augmentation_multipliers(
            df, target_ratio=target_ratio,
            min_multiplier=min_multiplier, max_multiplier=max_multiplier
        )
    else:
        # Augment all samples once
        multipliers = {label: 1 for label in df.get('depression_label', [0, 1]).unique()}
        print(f"\nAugmenting all samples equally (1x each)")

    # Calculate total work to be done
    total_augmentations = 0
    for idx, row in df.iterrows():
        label = row.get('depression_label', 0)
        multiplier = multipliers.get(label, 1)
        total_augmentations += int(multiplier)

    print(f"\nStarting augmentation for {len(df)} samples ({total_augmentations} total augmentations)...\n")

    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
    processed_samples = 0
    successful = 0
    failed = 0

    # change multipliers to fixed

    for idx, row in df.iterrows():
        label = row.get('depression_label', 0)
        multiplier = multipliers.get(label, 1)
        num_augmentations = int(multiplier)

        # Augment this sample multiple times if needed
        for aug_idx in range(num_augmentations):
            try:
                # Build prompt with context
                prompt = format_input(df, row, n_context=3)

                # Generate augmented answer
                full_output = generate_response(
                    model,
                    tokenizer,
                    prompt,
                    pipe,
                    max_new_tokens=100,
                    stopping_criteria=stopping_criteria
                )

                # Extract only the response
                augmented_answer = extract_response_only(full_output, prompt)
                augmented_answer = clean_augmented_answer(augmented_answer)

                # Only print for first augmentation of each sample to reduce output
                if aug_idx == 0:
                    print(f"Sample {idx + 1}/{len(df)} (label={label}, {num_augmentations}x): {augmented_answer[:100]}...")

                # Store results
                augmented_data.append({
                    'participant_id': row['participant_id'],
                    'question': row['question'],
                    'original_answer': row['answer'],
                    'augmented_answer': augmented_answer,
                    'depression_severity': row['depression_severity'],
                    'depression_label': row.get('depression_label', None),
                    'split': row.get('split', None),
                    'start_time': row['start_time'],
                    'augmentation_index': aug_idx + 1,  # Track which augmentation this is
                })

                successful += 1
                processed_samples += 1

                # Progress update
                if processed_samples % progress_interval == 0:
                    print(f"Progress: {processed_samples}/{total_augmentations} ({100*processed_samples/total_augmentations:.1f}%) - "
                          f"Successful: {successful}, Failed: {failed}")

            except Exception as e:
                print(f"Error processing sample {idx + 1}, augmentation {aug_idx + 1} (participant {row['participant_id']}): {e}")
                failed += 1
                processed_samples += 1
                # Still add the row with empty augmented answer
                augmented_data.append({
                    'participant_id': row['participant_id'],
                    'question': row['question'],
                    'original_answer': row['answer'],
                    'augmented_answer': '',  # Empty on error
                    'depression_severity': row['depression_severity'],
                    'depression_label': row.get('depression_label', None),
                    'split': row.get('split', None),
                    'start_time': row['start_time'],
                    'augmentation_index': aug_idx + 1,
                })

    print(f"\nAugmentation complete!")
    print(f"Total samples processed: {len(df)}")
    print(f"Total augmentations: {total_augmentations}")
    print(f"Successful: {successful}, Failed: {failed}")

    # Show final distribution
    augmented_df = pd.DataFrame(augmented_data)
    if 'depression_label' in augmented_df.columns:
        print(f"\nFinal class distribution:")
        final_counts = augmented_df['depression_label'].value_counts().sort_index()
        for label, count in final_counts.items():
            percentage = (count / len(augmented_df)) * 100
            label_name = "Depressed" if label == 1 else "Non-depressed"
            print(f"  {label_name} (label={label}): {count} samples ({percentage:.1f}%)")

    return augmented_df

# Configuration for balancing
BALANCE_CLASSES = True
TARGET_RATIO = 0.5
MIN_MULTIPLIER = 0
MAX_MULTIPLIER = 5

# Run augmentation with class balancing
train_qa_df = qa_df[qa_df['split'] == 'train'].copy()

print("=" * 80)
print("AUGMENTING DATASET WITH DYNAMIC CLASS BALANCING")
print("=" * 80)

augmented_df = augment_dataset_balanced(
    train_qa_df,
    model,
    tokenizer,
    progress_interval=50,
    balance_classes=BALANCE_CLASSES,
    target_ratio=TARGET_RATIO,
    min_multiplier=MIN_MULTIPLIER,
    max_multiplier=MAX_MULTIPLIER
)


AUGMENTING DATASET WITH DYNAMIC CLASS BALANCING

Current class distribution:
  Non-depressed (label=0): 4382 samples (71.3%)
  Depressed (label=1): 1767 samples (28.7%)

Augmentation multipliers:
  Non-depressed (label=0): 0.00x (no augmentation)
  Depressed (label=1): 1.48x

Expected final distribution:
  Non-depressed: 0 samples (0.0%)
  Depressed: 2615 samples (100.0%)

Starting augmentation for 6149 samples (1767 total augmentations)...



Device set to use cuda:0


Sample 1172/6149 (label=1, 1x): ### Response:
yes äterst ște  <iste> macht  >>>...
Sample 1173/6149 (label=1, 1x): ### Response:
um good thank you...
Sample 1174/6149 (label=1, 1x): ### Response:
i'm from ohio fahrenheit 69  july 1st  '75  thank you  '''...
Sample 1175/6149 (label=1, 1x): ### Response:
um the beach the mountains the beaches um um  allem uh the weather  'cause it's warm  ...
Sample 1176/6149 (label=1, 1x): ### Response:
<laughter> yeah but also i've lived in l_a for five years now so i get to see more of ...
Sample 1177/6149 (label=1, 1x): ### Response:
um i don't really like the traffic or the crowds  Bevölkerheit...
Sample 1178/6149 (label=1, 1x): ### Response:
oh it's not for me at least ▶...
Sample 1179/6149 (label=1, 1x): ### Response:
um something to do with writing ã Text ã't be able to write for a living ã't know ã't ...
Sample 1180/6149 (label=1, 1x): ### Response:
what do i do now i work at a bank ▶️<|assistant|>...
Sample 1181/6149 (label=1, 1x): ### Response

## Save Augmented Dataset


In [11]:
# Display statistics
import csv
print("Augmented Dataset Statistics:")
print(f"Total samples: {len(augmented_df)}")
print(f"Non-empty augmented answers: {(augmented_df['augmented_answer'] != '').sum()}")
print(f"Empty augmented answers: {(augmented_df['augmented_answer'] == '').sum()}")

print("\nSample augmented responses:")
sample_df = augmented_df[augmented_df['augmented_answer'] != ''].head(3)
for idx, row in sample_df.iterrows():
    print(f"\n--- Sample {idx + 1} ---")
    print(f"Question: {row['question'][:100]}...")
    print(f"Original: {row['original_answer'][:100]}...")
    print(f"Augmented: {row['augmented_answer'][:100]}...")

# Save to CSV
a = augmented_df.copy()
a['augmented_answer'] = a['augmented_answer'].replace({'"': "'", "\r": " ", "\n": " "}, regex=True)
print(f"\nSaving augmented dataset to {output_csv_path}...")
a.to_csv(output_csv_path, index=False,quoting=csv.QUOTE_ALL,escapechar='\\', encoding="utf-8-sig")
print("Dataset saved successfully!")

Augmented Dataset Statistics:
Total samples: 1767
Non-empty augmented answers: 1703
Empty augmented answers: 64

Sample augmented responses:

--- Sample 1 ---
Question: hi i'm ellie thanks for coming in today i was created to talk to people in a safe and secure environ...
Original: sure...
Augmented: ### Response:
yes äterst ște  <iste> macht  >>>...

--- Sample 2 ---
Question: okay so how are you doing today...
Original: mm okay...
Augmented: ### Response:
um good thank you...

--- Sample 3 ---
Question: that's good where are you from originally ...
Original: los angeles...
Augmented: ### Response:
i'm from ohio fahrenheit 69  july 1st  '75  thank you  '''...

Saving augmented dataset to ./augmented_dataset.csv...
Dataset saved successfully!
