In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import re

# Load merged dataset
df_merged = pd.read_csv("/content/merged_dataset.csv")

# Remove duplicate rows
num_duplicates = df_merged.duplicated().sum()
print(f"Number of duplicate rows found: {num_duplicates}")
df_merged = df_merged.drop_duplicates(keep='first').reset_index(drop=True)
print(f"Number of rows after removing duplicates: {df_merged.shape[0]}")

# Remove empty rows (where Context or Response is empty/whitespace)
empty_rows_mask = (df_merged['Context'].str.strip() == '') | (df_merged['Response'].str.strip() == '')
num_empty_rows = empty_rows_mask.sum()
print(f"Number of empty rows found: {num_empty_rows}")
df_merged = df_merged[~empty_rows_mask].reset_index(drop=True)
print(f"Number of rows after removing empty rows: {df_merged.shape[0]}")

# Remove rows with missing values
rows_with_missing = df_merged[df_merged['Context'].isnull() | df_merged['Response'].isnull()]
print(f"Number of rows with missing values found: {rows_with_missing.shape[0]}")
df_merged = df_merged.dropna(subset=['Context', 'Response']).reset_index(drop=True)
print(f"Number of rows after removing missing values: {df_merged.shape[0]}")
print("\nMissing values after removal:")
print(df_merged.isnull().sum())

# Save cleaned dataset for later
df_merged.to_csv("merged_dataset_cleaned.csv", index=False)

# EDA
print(f"Number of rows: {df_merged.shape[0]}")
print(f"Number of columns: {df_merged.shape[1]}")
print("\nData types:")
print(df_merged.dtypes)
print("\nMissing values per column:")
print(df_merged.isnull().sum())

# Visualize missing values
plt.figure(figsize=(10, 6))
sns.heatmap(df_merged.isnull(), cbar=False, cmap='viridis')
plt.title('Missing Values Heatmap')
plt.savefig("missing_values_heatmap.png")
plt.close()

# Word counts
def count_words(text):
    if isinstance(text, str):return len(text.split())
    return 0

df_merged['Context_word_count'] = df_merged['Context'].apply(count_words)
df_merged['Response_word_count'] = df_merged['Response'].apply(count_words)
print("\nDescriptive statistics for word counts:")
print(df_merged[['Context_word_count', 'Response_word_count']].describe())

# Most common words
def most_common_words(text_series, n=20):
    all_words = []
    for text in text_series.dropna():
        all_words.extend(re.findall(r'\w+', text.lower()))
    return Counter(all_words).most_common(n)

common_context_words = most_common_words(df_merged['Context'])
common_response_words = most_common_words(df_merged['Response'])
print("\nMost common words in 'Context':")
print(common_context_words)
print("\nMost common words in 'Response':")
print(common_response_words)

# Visualize the distribution of word counts
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
sns.histplot(df_merged['Context_word_count'], bins=50, kde=True)
plt.title('Distribution of Context Word Count')
plt.xlabel('Word Count')
plt.ylabel('Frequency')
plt.subplot(1, 2, 2)
sns.histplot(df_merged['Response_word_count'], bins=50, kde=True, color='orange')
plt.title('Distribution of Response Word Count')
plt.xlabel('Word Count')
plt.ylabel('Frequency')
plt.tight_layout()
plt.savefig("word_count_distribution.png")
plt.close()

Number of duplicate rows found: 4344
Number of rows after removing duplicates: 85847
Number of empty rows found: 0
Number of rows after removing empty rows: 85847
Number of rows with missing values found: 746
Number of rows after removing missing values: 85101

Missing values after removal:
Context     0
Response    0
dtype: int64
Number of rows: 85101
Number of columns: 2

Data types:
Context     object
Response    object
dtype: object

Missing values per column:
Context     0
Response    0
dtype: int64

Descriptive statistics for word counts:
       Context_word_count  Response_word_count
count        85101.000000         85101.000000
mean           175.250455           244.935124
std            104.287518           123.429608
min              1.000000             1.000000
25%             94.000000           148.000000
50%            157.000000           225.000000
75%            235.000000           327.000000
max           2664.000000          1865.000000

Most common words in 'Con

In [None]:
import os
from huggingface_hub import snapshot_download
from google.colab import userdata

# Set your Hugging Face token securely using Colab's secrets manager
os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")

# Model repo - choose the correct Llama 3 repo name from Hugging Face
model_repo = "meta-llama/Meta-Llama-3-8B"  # Change this if you want a different version

# Download the model weights and files
print("Starting download of Llama 3 model...")
snapshot_download(
    repo_id=model_repo,
    cache_dir="./llama3_model",
    token=os.environ["HF_TOKEN"],
    resume_download=True
)
print("Download complete! Files saved in ./llama3_model")

Starting download of Llama 3 model...




Fetching 17 files:   0%|          | 0/17 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/36.6k [00:00<?, ?B/s]

USE_POLICY.md:   0%|          | 0.00/4.70k [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

LICENSE:   0%|          | 0.00/7.80k [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

original/consolidated.00.pth:   0%|          | 0.00/16.1G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

original/tokenizer.model:   0%|          | 0.00/2.18M [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

params.json:   0%|          | 0.00/211 [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

Download complete! Files saved in ./llama3_model


In [None]:
!pip install transformers datasets torch peft bitsandbytes accelerate --upgrade

Collecting datasets
  Downloading datasets-4.3.0-py3-none-any.whl.metadata (18 kB)
Collecting torch
  Downloading torch-2.9.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.8.93 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cuda-runtime-cu12==12.8.90 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cuda-cupti-cu12==12.8.90 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cublas-cu12==12.8.4.1 (from torch)

In [None]:
import os
import gc
import sys
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig
from datasets import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from torch.amp import autocast
from google.colab import userdata # Import userdata

# Set environment variables
os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")  # Ensure HF token is set

def clean_cuda_memory(device_id):
    """Attempt to completely clean CUDA memory for the specified device."""
    print(f"Cleaning CUDA memory on GPU {device_id}...")
    torch.cuda.set_device(device_id)
    torch.cuda.empty_cache()
    gc.collect()
    torch.cuda.reset_peak_memory_stats()
    print(f"CUDA memory cleaning completed on GPU {device_id}. Free memory: {torch.cuda.mem_get_info(device_id)[0] / 1024**3:.2f} GB")

def finetune_on_gpu(device_id, model_load_path, output_path, df_path):
    print(f"Fine-tuning on GPU {device_id} ...")
    clean_cuda_memory(device_id)

    torch.cuda.set_device(device_id)
    gc.collect()
    torch.cuda.empty_cache()

    # Load and prepare dataset
    df = pd.read_csv(df_path)
    df = df.dropna(subset=['Context', 'Response'])
    df = df[(df['Context'].str.strip() != '') & (df['Response'].str.strip() != '')].reset_index(drop=True)
    df = df.head(85101)  # Use full dataset (adjust based on memory)
    df = df.rename(columns={"Context": "prompt", "Response": "response"})
    df['prompt'] = df['prompt'].astype(str)
    df['response'] = df['response'].astype(str)
    def format_conversation(example):
        text = f"<s>[INST] {example['prompt']} [/INST] {example['response']} </s>"
        return {"text": text}
    dataset = Dataset.from_pandas(df)
    dataset = dataset.map(format_conversation)

    # Debug: List directory contents
    print(f"Contents of {model_load_path}: {os.listdir(model_load_path)}")

    # Load tokenizer
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_load_path, local_files_only=True)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        print("Tokenizer loaded successfully.")
    except Exception as e:
        print(f"Error loading tokenizer: {e}")
        raise

    # Configure 4-bit quantization with BitsAndBytesConfig
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"  # Use 4-bit NormalFloat (nf4) for better accuracy
    )

    # Load model with 4-bit quantization (QLORA)
    model = AutoModelForCausalLM.from_pretrained(
        model_load_path,
        quantization_config=quantization_config,
        device_map="auto",
        local_files_only=True
    )
    model = prepare_model_for_kbit_training(model)  # Prepare for QLORA

    # Configure LoRA (PEFT)
    lora_config = LoraConfig(
        r=8,  # Rank of low-rank updates
        lora_alpha=16,  # Scaling factor
        target_modules=["q_proj", "v_proj"],  # Target attention layers (adjust based on model)
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM"
    )
    model = get_peft_model(model, lora_config)
    model.gradient_checkpointing_enable()

    # Preprocess dataset
    def preprocess_function(examples):
        tokenized = tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=256,  # Adjust based on memory
        )
        tokenized["labels"] = tokenized["input_ids"]
        return tokenized
    tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset.column_names)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_path,
        num_train_epochs=3,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=64,  # High accumulation for memory efficiency
        save_steps=500,  # Save checkpoint every 500 steps
        logging_steps=100,
        fp16=True,  # Use FP16 with quantization
        bf16=False,
        save_total_limit=2,  # Keep only the last 2 checkpoints
        load_best_model_at_end=False,
        report_to="none"
    )

    # Data collator and trainer
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    # Memory profiling
    print(f"Memory allocated: {torch.cuda.memory_allocated(device_id) / 1024**3:.2f} GB")
    print(f"Max memory allocated: {torch.cuda.max_memory_allocated(device_id) / 1024**3:.2f} GB")
    gc.collect()
    torch.cuda.empty_cache()

    # Train the model
    trainer.train()
    torch.cuda.empty_cache()

    # Save the final model
    trainer.save_model(output_path)
    tokenizer.save_pretrained(output_path)
    print(f"Fine-tuning complete. Model and tokenizer saved in {output_path} on GPU {device_id}")

def main():
    # Define paths
    model_load_path = "/content/llama3_model/models--meta-llama--Meta-Llama-3-8B/snapshots/8cde5ca8380496c9a6cc7ef3a8b46a0372a1d920"
    output_path = "./llama3_finetuned"
    df_path = "merged_dataset_cleaned.csv"

    # Ensure output directory exists
    os.makedirs(output_path, exist_ok=True)

    # Check for GPU
    if torch.cuda.is_available():
        device_id = 0
        print(f"GPU {device_id} is available. Starting fine-tuning...")
        finetune_on_gpu(device_id, model_load_path, output_path, df_path)
    else:
        print("No CUDA device found. Exiting script.")
        sys.exit(1)

if __name__ == "__main__":
    main()

GPU 0 is available. Starting fine-tuning...
Fine-tuning on GPU 0 ...
Cleaning CUDA memory on GPU 0...
CUDA memory cleaning completed on GPU 0. Free memory: 48.89 GB


Map:   0%|          | 0/85101 [00:00<?, ? examples/s]

Contents of /content/llama3_model/models--meta-llama--Meta-Llama-3-8B/snapshots/8cde5ca8380496c9a6cc7ef3a8b46a0372a1d920: ['model-00002-of-00004.safetensors', 'tokenizer.json', 'tokenizer_config.json', 'USE_POLICY.md', 'model-00001-of-00004.safetensors', 'LICENSE', 'special_tokens_map.json', '.gitattributes', 'model-00004-of-00004.safetensors', 'generation_config.json', 'model.safetensors.index.json', 'original', 'README.md', 'config.json', 'model-00003-of-00004.safetensors']
Tokenizer loaded successfully.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Map:   0%|          | 0/85101 [00:00<?, ? examples/s]

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128001}.


Memory allocated: 14.63 GB
Max memory allocated: 15.59 GB


Step,Training Loss
100,2.1873
200,2.0571
300,2.0228
400,1.9898
500,1.9605
600,1.9799
700,1.9495
800,1.9461
900,1.9626
1000,1.9571


Step,Training Loss
100,2.1873
200,2.0571
300,2.0228
400,1.9898
500,1.9605
600,1.9799
700,1.9495
800,1.9461
900,1.9626
1000,1.9571


In [None]:
import os
import gc
import sys
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig
from datasets import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from torch.amp import autocast
from google.colab import userdata

# Set environment variables
os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")  # Ensure HF token is set

def clean_cuda_memory(device_id):
    """Attempt to completely clean CUDA memory for the specified device."""
    print(f"Cleaning CUDA memory on GPU {device_id}...")
    torch.cuda.set_device(device_id)
    torch.cuda.empty_cache()
    gc.collect()
    torch.cuda.reset_peak_memory_stats()
    print(f"CUDA memory cleaning completed on GPU {device_id}. Free memory: {torch.cuda.mem_get_info(device_id)[0] / 1024**3:.2f} GB")

def finetune_on_gpu(device_id, model_load_path, output_path, df_path):
    print(f"Fine-tuning on GPU {device_id} ...")
    clean_cuda_memory(device_id)

    torch.cuda.set_device(device_id)
    gc.collect()
    torch.cuda.empty_cache()

    # Load and prepare dataset
    df = pd.read_csv(df_path)
    df = df.dropna(subset=['Context', 'Response'])
    df = df[(df['Context'].str.strip() != '') & (df['Response'].str.strip() != '')].reset_index(drop=True)
    df = df.head(10000)  # Reduced to 10,000 examples for faster testing
    df = df.rename(columns={"Context": "prompt", "Response": "response"})
    df['prompt'] = df['prompt'].astype(str)
    df['response'] = df['response'].astype(str)
    def format_conversation(example):
        text = f"<s>[INST] {example['prompt']} [/INST] {example['response']} </s>"
        return {"text": text}
    dataset = Dataset.from_pandas(df)
    dataset = dataset.map(format_conversation, num_proc=2)  # Parallelize tokenization

    # Debug: List directory contents
    print(f"Contents of {model_load_path}: {os.listdir(model_load_path)}")

    # Load tokenizer
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_load_path, local_files_only=True)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        print("Tokenizer loaded successfully.")
    except Exception as e:
        print(f"Error loading tokenizer: {e}")
        raise

    # Configure 4-bit quantization with BitsAndBytesConfig
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )

    # Load model with 4-bit quantization (QLoRA)
    model = AutoModelForCausalLM.from_pretrained(
        model_load_path,
        quantization_config=quantization_config,
        device_map="auto",
        local_files_only=True
    )
    model = prepare_model_for_kbit_training(model)

    # Configure LoRA (PEFT) with optimized settings
    lora_config = LoraConfig(
        r=4,  # Reduced rank for faster updates
        lora_alpha=8,  # Reduced scaling factor
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],  # Expanded target layers
        lora_dropout=0.05,  # Lowered dropout for speed
        bias="none",
        task_type="CAUSAL_LM"
    )
    model = get_peft_model(model, lora_config)
    # model.gradient_checkpointing_enable()  # Disabled for speed (re-enable if memory exceeds 67.32 GB)

    # Preprocess dataset
    def preprocess_function(examples):
        tokenized = tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=128,  # Reduced to 128 for faster processing
        )
        tokenized["labels"] = tokenized["input_ids"]
        return tokenized
    tokenized_dataset = dataset.map(preprocess_function, batched=True, num_proc=2, remove_columns=dataset.column_names)

    # Training arguments (optimized for speed)
    training_args = TrainingArguments(
        output_dir=output_path,
        num_train_epochs=1,  # Reduced to 1 for initial test
        per_device_train_batch_size=4,  # Increased to 4 (adjust if memory issues)
        gradient_accumulation_steps=4,  # Reduced to 4 for faster updates
        save_steps=500,
        logging_steps=50,  # More frequent feedback
        fp16=True,
        bf16=False,
        save_total_limit=2,
        load_best_model_at_end=False,
        report_to="none"
    )

    # Data collator and trainer
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    # Memory profiling
    print(f"Memory allocated: {torch.cuda.memory_allocated(device_id) / 1024**3:.2f} GB")
    print(f"Max memory allocated: {torch.cuda.max_memory_allocated(device_id) / 1024**3:.2f} GB")
    gc.collect()
    torch.cuda.empty_cache()

    # Train the model
    trainer.train()
    torch.cuda.empty_cache()

    # Save the final model
    trainer.save_model(output_path)
    tokenizer.save_pretrained(output_path)
    print(f"Fine-tuning complete. Model and tokenizer saved in {output_path} on GPU {device_id}")

def main():
    # Define paths
    model_load_path = "/content/llama3_model/models--meta-llama--Meta-Llama-3-8B/snapshots/8cde5ca8380496c9a6cc7ef3a8b46a0372a1d920"
    output_path = "./llama3_finetuned"
    df_path = "merged_dataset_cleaned.csv"

    # Ensure output directory exists
    os.makedirs(output_path, exist_ok=True)

    # Check for GPU
    if torch.cuda.is_available():
        device_id = 0
        print(f"GPU {device_id} is available. Starting fine-tuning...")
        finetune_on_gpu(device_id, model_load_path, output_path, df_path)
    else:
        print("No CUDA device found. Exiting script.")
        sys.exit(1)

if __name__ == "__main__":
    main()

GPU 0 is available. Starting fine-tuning...
Fine-tuning on GPU 0 ...
Cleaning CUDA memory on GPU 0...
CUDA memory cleaning completed on GPU 0. Free memory: 78.90 GB


Map (num_proc=2):   0%|          | 0/10000 [00:00<?, ? examples/s]

Contents of /content/llama3_model/models--meta-llama--Meta-Llama-3-8B/snapshots/8cde5ca8380496c9a6cc7ef3a8b46a0372a1d920: ['config.json', 'USE_POLICY.md', 'README.md', 'generation_config.json', 'original', '.gitattributes', 'model-00002-of-00004.safetensors', 'tokenizer_config.json', 'LICENSE', 'special_tokens_map.json', 'tokenizer.json', 'model-00001-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model.safetensors.index.json', 'model-00004-of-00004.safetensors']
Tokenizer loaded successfully.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Map (num_proc=2):   0%|          | 0/10000 [00:00<?, ? examples/s]

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128001}.


Memory allocated: 7.28 GB
Max memory allocated: 8.25 GB


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
50,2.0238
100,1.8136
150,1.7276
200,1.7002
250,1.7256
300,1.657
350,1.6186
400,1.5994
450,1.5656
500,1.5832


  return fn(*args, **kwargs)


Fine-tuning complete. Model and tokenizer saved in ./llama3_finetuned on GPU 0


In [None]:
import os
import gc
import sys
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig
from datasets import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from torch.amp import autocast
from google.colab import userdata

# Set environment variables
os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")  # Ensure HF token is set

def clean_cuda_memory(device_id):
    """Attempt to completely clean CUDA memory for the specified device."""
    print(f"Cleaning CUDA memory on GPU {device_id}...")
    torch.cuda.set_device(device_id)
    torch.cuda.empty_cache()
    gc.collect()
    torch.cuda.reset_peak_memory_stats()
    print(f"CUDA memory cleaning completed on GPU {device_id}. Free memory: {torch.cuda.mem_get_info(device_id)[0] / 1024**3:.2f} GB")

def finetune_on_gpu(device_id, model_load_path, output_path, df_path):
    print(f"Fine-tuning on GPU {device_id} ...")
    clean_cuda_memory(device_id)

    torch.cuda.set_device(device_id)
    gc.collect()
    torch.cuda.empty_cache()

    # Load and prepare dataset (full dataset)
    df = pd.read_csv(df_path)
    df = df.dropna(subset=['Context', 'Response'])
    df = df[(df['Context'].str.strip() != '') & (df['Response'].str.strip() != '')].reset_index(drop=True)
    df = df.head(85101)  # Use full dataset
    df = df.rename(columns={"Context": "prompt", "Response": "response"})
    df['prompt'] = df['prompt'].astype(str)
    df['response'] = df['response'].astype(str)
    def format_conversation(example):
        text = f"<s>[INST] {example['prompt']} [/INST] {example['response']} </s>"
        return {"text": text}
    dataset = Dataset.from_pandas(df)
    dataset = dataset.map(format_conversation, num_proc=2)  # Parallelize tokenization

    # Debug: List directory contents
    print(f"Contents of {model_load_path}: {os.listdir(model_load_path)}")

    # Load tokenizer
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_load_path, local_files_only=True)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        print("Tokenizer loaded successfully.")
    except Exception as e:
        print(f"Error loading tokenizer: {e}")
        raise

    # Configure 4-bit quantization with BitsAndBytesConfig
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )

    # Load model with 4-bit quantization (QLoRA)
    model = AutoModelForCausalLM.from_pretrained(
        model_load_path,
        quantization_config=quantization_config,
        device_map="auto",
        local_files_only=True
    )
    model = prepare_model_for_kbit_training(model)

    # Configure LoRA (PEFT) with optimized settings
    lora_config = LoraConfig(
        r=4,  # Reduced rank for faster updates
        lora_alpha=8,  # Reduced scaling factor
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],  # Expanded target layers
        lora_dropout=0.05,  # Lowered dropout for speed
        bias="none",
        task_type="CAUSAL_LM"
    )
    model = get_peft_model(model, lora_config)
    model.gradient_checkpointing_enable()  # Re-enabled for memory management with full dataset

    # Preprocess dataset
    def preprocess_function(examples):
        tokenized = tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=128,  # Reduced to 128 for faster processing
        )
        tokenized["labels"] = tokenized["input_ids"]
        return tokenized
    tokenized_dataset = dataset.map(preprocess_function, batched=True, num_proc=2, remove_columns=dataset.column_names)

    # Training arguments (optimized for speed with full dataset)
    training_args = TrainingArguments(
        output_dir=output_path,
        num_train_epochs=1,  # Start with 1 epoch, increase to 3 if time allows
        per_device_train_batch_size=4,  # Increased to 4 (adjust if memory issues)
        gradient_accumulation_steps=4,  # Reduced to 4 for faster updates
        save_steps=500,
        logging_steps=50,  # More frequent feedback
        fp16=True,
        bf16=False,
        save_total_limit=2,
        load_best_model_at_end=False,
        report_to="none"
    )

    # Data collator and trainer
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    # Memory profiling
    print(f"Memory allocated: {torch.cuda.memory_allocated(device_id) / 1024**3:.2f} GB")
    print(f"Max memory allocated: {torch.cuda.max_memory_allocated(device_id) / 1024**3:.2f} GB")
    gc.collect()
    torch.cuda.empty_cache()

    # Train the model
    trainer.train()
    torch.cuda.empty_cache()

    # Save the final model
    trainer.save_model(output_path)
    tokenizer.save_pretrained(output_path)
    print(f"Fine-tuning complete. Model and tokenizer saved in {output_path} on GPU {device_id}")

def main():
    # Define paths
    model_load_path = "/content/llama3_model/models--meta-llama--Meta-Llama-3-8B/snapshots/8cde5ca8380496c9a6cc7ef3a8b46a0372a1d920"
    output_path = "./llama3_finetuned"
    df_path = "merged_dataset_cleaned.csv"

    # Ensure output directory exists
    os.makedirs(output_path, exist_ok=True)

    # Check for GPU
    if torch.cuda.is_available():
        device_id = 0
        print(f"GPU {device_id} is available. Starting fine-tuning...")
        finetune_on_gpu(device_id, model_load_path, output_path, df_path)
    else:
        print("No CUDA device found. Exiting script.")
        sys.exit(1)

if __name__ == "__main__":
    main()

GPU 0 is available. Starting fine-tuning...
Fine-tuning on GPU 0 ...
Cleaning CUDA memory on GPU 0...
CUDA memory cleaning completed on GPU 0. Free memory: 67.75 GB


Map (num_proc=2):   0%|          | 0/85101 [00:00<?, ? examples/s]

Contents of /content/llama3_model/models--meta-llama--Meta-Llama-3-8B/snapshots/8cde5ca8380496c9a6cc7ef3a8b46a0372a1d920: ['config.json', 'USE_POLICY.md', 'README.md', 'generation_config.json', 'original', '.gitattributes', 'model-00002-of-00004.safetensors', 'tokenizer_config.json', 'LICENSE', 'special_tokens_map.json', 'tokenizer.json', 'model-00001-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model.safetensors.index.json', 'model-00004-of-00004.safetensors']
Tokenizer loaded successfully.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Map (num_proc=2):   0%|          | 0/85101 [00:00<?, ? examples/s]

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128001}.


Memory allocated: 7.30 GB
Max memory allocated: 8.27 GB


Step,Training Loss
50,2.4439
100,2.2461
150,2.225
200,2.1824
250,2.1122
300,2.1026
350,2.0737
400,2.083
450,2.0395
500,2.0607


Fine-tuning complete. Model and tokenizer saved in ./llama3_finetuned on GPU 0


In [None]:
import os
import gc
import sys
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig
from datasets import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from torch.amp import autocast
from google.colab import userdata

# Set environment variables
os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")  # Ensure HF token is set

def clean_cuda_memory(device_id):
    """Attempt to completely clean CUDA memory for the specified device."""
    print(f"Cleaning CUDA memory on GPU {device_id}...")
    torch.cuda.set_device(device_id)
    torch.cuda.empty_cache()
    gc.collect()
    torch.cuda.reset_peak_memory_stats()
    print(f"CUDA memory cleaning completed on GPU {device_id}. Free memory: {torch.cuda.mem_get_info(device_id)[0] / 1024**3:.2f} GB")

def finetune_on_gpu(device_id, model_load_path, output_path, df_path):
    print(f"Fine-tuning on GPU {device_id} ...")
    clean_cuda_memory(device_id)

    torch.cuda.set_device(device_id)
    gc.collect()
    torch.cuda.empty_cache()

    # Load and prepare dataset (full dataset)
    df = pd.read_csv(df_path)
    df = df.dropna(subset=['Context', 'Response'])
    df = df[(df['Context'].str.strip() != '') & (df['Response'].str.strip() != '')].reset_index(drop=True)
    df = df.head(85101)  # Use full dataset
    df = df.rename(columns={"Context": "prompt", "Response": "response"})
    df['prompt'] = df['prompt'].astype(str)
    df['response'] = df['response'].astype(str)
    def format_conversation(example):
        text = f"<s>[INST] {example['prompt']} [/INST] {example['response']} </s>"
        return {"text": text}
    dataset = Dataset.from_pandas(df)
    dataset = dataset.map(format_conversation, num_proc=2)  # Parallelize tokenization

    # Debug: List directory contents
    print(f"Contents of {model_load_path}: {os.listdir(model_load_path)}")

    # Load tokenizer
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_load_path, local_files_only=True)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        print("Tokenizer loaded successfully.")
    except Exception as e:
        print(f"Error loading tokenizer: {e}")
        raise

    # Configure 4-bit quantization with BitsAndBytesConfig
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )

    # Load model with 4-bit quantization (QLoRA)
    model = AutoModelForCausalLM.from_pretrained(
        model_load_path,
        quantization_config=quantization_config,
        device_map="auto",
        local_files_only=True
    )
    model = prepare_model_for_kbit_training(model)

    # Configure LoRA (PEFT) with optimized settings
    lora_config = LoraConfig(
        r=4,  # Reduced rank for faster updates
        lora_alpha=8,  # Reduced scaling factor
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],  # Expanded target layers
        lora_dropout=0.05,  # Lowered dropout for speed
        bias="none",
        task_type="CAUSAL_LM"
    )
    model = get_peft_model(model, lora_config)
    model.gradient_checkpointing_enable()  # Enabled for memory management with full dataset and 3 epochs

    # Preprocess dataset
    def preprocess_function(examples):
        tokenized = tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=128,  # Reduced to 128 for faster processing
        )
        tokenized["labels"] = tokenized["input_ids"]
        return tokenized
    tokenized_dataset = dataset.map(preprocess_function, batched=True, num_proc=2, remove_columns=dataset.column_names)

    # Training arguments (optimized for speed with 3 epochs)
    training_args = TrainingArguments(
        output_dir=output_path,
        num_train_epochs=3,  # Set to 3 epochs for meaningful fine-tuning
        per_device_train_batch_size=4,  # Increased to 4 (adjust if memory issues)
        gradient_accumulation_steps=4,  # Reduced to 4 for faster updates
        save_steps=500,
        logging_steps=50,  # More frequent feedback
        fp16=True,
        bf16=False,
        save_total_limit=2,
        load_best_model_at_end=False,
        report_to="none"
    )

    # Data collator and trainer
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    # Memory profiling
    print(f"Memory allocated: {torch.cuda.memory_allocated(device_id) / 1024**3:.2f} GB")
    print(f"Max memory allocated: {torch.cuda.max_memory_allocated(device_id) / 1024**3:.2f} GB")
    gc.collect()
    torch.cuda.empty_cache()

    # Train the model
    trainer.train()
    torch.cuda.empty_cache()

    # Save the final model
    trainer.save_model(output_path)
    tokenizer.save_pretrained(output_path)
    print(f"Fine-tuning complete. Model and tokenizer saved in {output_path} on GPU {device_id}")

def main():
    # Define paths
    model_load_path = "/content/llama3_model/models--meta-llama--Meta-Llama-3-8B/snapshots/8cde5ca8380496c9a6cc7ef3a8b46a0372a1d920"
    output_path = "./llama3_finetuned"
    df_path = "merged_dataset_cleaned.csv"

    # Ensure output directory exists
    os.makedirs(output_path, exist_ok=True)

    # Check for GPU
    if torch.cuda.is_available():
        device_id = 0
        print(f"GPU {device_id} is available. Starting fine-tuning...")
        finetune_on_gpu(device_id, model_load_path, output_path, df_path)
    else:
        print("No CUDA device found. Exiting script.")
        sys.exit(1)

if __name__ == "__main__":
    main()

GPU 0 is available. Starting fine-tuning...
Fine-tuning on GPU 0 ...
Cleaning CUDA memory on GPU 0...
CUDA memory cleaning completed on GPU 0. Free memory: 66.20 GB


Map (num_proc=2):   0%|          | 0/85101 [00:00<?, ? examples/s]

Contents of /content/llama3_model/models--meta-llama--Meta-Llama-3-8B/snapshots/8cde5ca8380496c9a6cc7ef3a8b46a0372a1d920: ['config.json', 'USE_POLICY.md', 'README.md', 'generation_config.json', 'original', '.gitattributes', 'model-00002-of-00004.safetensors', 'tokenizer_config.json', 'LICENSE', 'special_tokens_map.json', 'tokenizer.json', 'model-00001-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model.safetensors.index.json', 'model-00004-of-00004.safetensors']
Tokenizer loaded successfully.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Map (num_proc=2):   0%|          | 0/85101 [00:00<?, ? examples/s]

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128001}.


Memory allocated: 7.30 GB
Max memory allocated: 8.27 GB


Step,Training Loss
50,2.4436
100,2.2459
150,2.2245
200,2.1812
250,2.1083
300,2.1021
350,2.0732
400,2.0824
450,2.0388
500,2.0601


Step,Training Loss
50,2.4436
100,2.2459
150,2.2245
200,2.1812
250,2.1083
300,2.1021
350,2.0732
400,2.0824
450,2.0388
500,2.0601


Fine-tuning complete. Model and tokenizer saved in ./llama3_finetuned on GPU 0


In [None]:
import os
import torch
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
from google.colab import userdata

# Set environment variables
os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")  # Ensure HF token is set

# Paths
base_model_path = "/content/llama3_model/models--meta-llama--Meta-Llama-3-8B/snapshots/8cde5ca8380496c9a6cc7ef3a8b46a0372a1d920"
fine_tuned_path = "./llama3_finetuned"

# Configure 4-bit quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# Load base model
print("Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    quantization_config=quantization_config,
    device_map="auto",
    local_files_only=True
)
base_tokenizer = AutoTokenizer.from_pretrained(base_model_path, local_files_only=True)
if base_tokenizer.pad_token is None:
    base_tokenizer.pad_token = base_tokenizer.eos_token

# Load fine-tuned model (apply LoRA adapters)
print("Loading fine-tuned model...")
fine_tuned_model = PeftModel.from_pretrained(base_model, fine_tuned_path)
fine_tuned_tokenizer = AutoTokenizer.from_pretrained(fine_tuned_path)
if fine_tuned_tokenizer.pad_token is None:
    fine_tuned_tokenizer.pad_token = fine_tuned_tokenizer.eos_token

# Load dataset and extract test prompts
df = pd.read_csv("merged_dataset_cleaned.csv")
test_prompts = df[df['Context'].str.contains("depression|stress|sadness", case=False, na=False)]['Context'].head(3).tolist()
print("Test Prompts from Dataset:", test_prompts)

# Function to generate response
def generate_response(model, tokenizer, prompt, max_length=128):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=max_length).to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, temperature=0.7)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Compare performance
print("Comparing base model and fine-tuned model performance:")
for prompt in test_prompts:
    print(f"\nPrompt: {prompt}")
    print("Base Model Response:")
    base_response = generate_response(base_model, base_tokenizer, prompt)
    print(base_response)
    print("Fine-Tuned Model Response:")
    fine_tuned_response = generate_response(fine_tuned_model, fine_tuned_tokenizer, prompt)
    print(fine_tuned_response)


Loading base model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading fine-tuned model...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Test Prompts from Dataset: ["I've been dealing with feelings of sadness and hopelessness for the past few months. I've tried to distract myself with work and other activities, but nothing seems to help. I've been thinking about seeking professional help, but I'm unsure if therapy is the right choice for me. I've heard that therapy can be expensive and time-consuming, and I'm not sure if I can afford it or if I have the energy to commit to it.", "I've been feeling overwhelmed lately, with the added stress of managing my children's emotional needs and dealing with the grief of losing my spouse. It all feels like too much, and I'm struggling to keep up. I've been trying to focus on the positive aspects of my life, like my children and the support I've received from the hospice counselor, but it's hard to maintain that perspective when I'm feeling so drained.", "I understand, counselor. I didn't mean to upset him or cause any harm. I just wanted to share some memories and bring a smile to 

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


I've been dealing with feelings of sadness and hopelessness for the past few months. I've tried to distract myself with work and other activities, but nothing seems to help. I've been thinking about seeking professional help, but I'm unsure if therapy is the right choice for me. I've heard that therapy can be expensive and time-consuming, and I'm not sure if I can afford it or if I have the energy to commit to it. I'm also worried about the stigma associated with therapy and if people will judge me for seeking help. Can you please provide some guidance on how to navigate these concerns and decide if therapy is the right choice for me?
Fine-Tuned Model Response:


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


I've been dealing with feelings of sadness and hopelessness for the past few months. I've tried to distract myself with work and other activities, but nothing seems to help. I've been thinking about seeking professional help, but I'm unsure if therapy is the right choice for me. I've heard that therapy can be expensive and time-consuming, and I'm not sure if I can afford it or if I have the energy to commit to it. I've also heard that therapy can be beneficial, but I'm not sure how long it takes to see results or if it's worth the effort. I'm feeling overwhelmed and uncertain about what to do next.
I've been dealing with feelings of sadness

Prompt: I've been feeling overwhelmed lately, with the added stress of managing my children's emotional needs and dealing with the grief of losing my spouse. It all feels like too much, and I'm struggling to keep up. I've been trying to focus on the positive aspects of my life, like my children and the support I've received from the hospice counsel

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


I've been feeling overwhelmed lately, with the added stress of managing my children's emotional needs and dealing with the grief of losing my spouse. It all feels like too much, and I'm struggling to keep up. I've been trying to focus on the positive aspects of my life, like my children and the support I've received from the hospice counselor, but it's hard to maintain that perspective when I'm feeling so drained. I've been trying to keep up with my responsibilities at work, but it's been a challenge. I've been having trouble sleeping and eating properly, and I've been feeling anxious and irritable. I've been trying to take care of myself,
Fine-Tuned Model Response:


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


I've been feeling overwhelmed lately, with the added stress of managing my children's emotional needs and dealing with the grief of losing my spouse. It all feels like too much, and I'm struggling to keep up. I've been trying to focus on the positive aspects of my life, like my children and the support I've received from the hospice counselor, but it's hard to maintain that perspective when I'm feeling so drained. I've been thinking about my mother, who always seemed to find joy in the simplest things, and I'm trying to channel that positivity into my own life. But I can't help but feel like I'm failing, both as a mother and as

Prompt: I understand, counselor. I didn't mean to upset him or cause any harm. I just wanted to share some memories and bring a smile to his face. But I'll definitely be more mindful of his condition and avoid topics that might cause distress.
Base Model Response:


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


I understand, counselor. I didn't mean to upset him or cause any harm. I just wanted to share some memories and bring a smile to his face. But I'll definitely be more mindful of his condition and avoid topics that might cause distress. Thank you for your guidance.

I appreciate your understanding, counselor. I'll make sure to keep these things in mind in the future. I've learned a lot from this experience, and I hope to use this newfound awareness to help others in similar situations
Fine-Tuned Model Response:
I understand, counselor. I didn't mean to upset him or cause any harm. I just wanted to share some memories and bring a smile to his face. But I'll definitely be more mindful of his condition and avoid topics that might cause distress. Thank you for your understanding and support.

Yes, I'd like to continue our conversation. I've been feeling so alone and helpless since my father's diagnosis. It's been a long journey, and I've been struggling to find ways to cope.


In [None]:
import os
import torch
import pandas as pd
import time
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
from evaluate import load
from collections import Counter

# Set environment variables
os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")  # Ensure HF token is set

# Paths
base_model_path = "/content/llama3_model/models--meta-llama--Meta-Llama-3-8B/snapshots/8cde5ca8380496c9a6cc7ef3a8b46a0372a1d920"
fine_tuned_path = "./llama3_finetuned"

# Configure 4-bit quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# Load base model
print("Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    quantization_config=quantization_config,
    device_map="auto",
    local_files_only=True
)
base_tokenizer = AutoTokenizer.from_pretrained(base_model_path, local_files_only=True)
if base_tokenizer.pad_token is None:
    base_tokenizer.pad_token = base_tokenizer.eos_token

# Load fine-tuned model (apply LoRA adapters)
print("Loading fine-tuned model...")
fine_tuned_model = PeftModel.from_pretrained(base_model, fine_tuned_path)
fine_tuned_tokenizer = AutoTokenizer.from_pretrained(fine_tuned_path)
if fine_tuned_tokenizer.pad_token is None:
    fine_tuned_tokenizer.pad_token = fine_tuned_tokenizer.eos_token

# Load dataset and extract test prompts and references
df = pd.read_csv("merged_dataset_cleaned.csv")
test_df = df[df['Context'].str.contains("depression|stress|sadness", case=False, na=False)].head(10)  # Use 10 samples for metrics
test_prompts = test_df['Context'].tolist()
references = test_df['Response'].tolist()
print("Test Prompts from Dataset:", test_prompts)

# Function to generate response and measure inference time
def generate_response(model, tokenizer, prompt, max_length=128):
    start_time = time.time()
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=max_length).to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=150, do_sample=True, temperature=0.7)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    inference_time = time.time() - start_time
    return response, inference_time

# Generate responses for base and fine-tuned models
base_responses = []
fine_tuned_responses = []
base_inference_times = []
fine_tuned_inference_times = []
for prompt in test_prompts:
    base_response, base_time = generate_response(base_model, base_tokenizer, prompt)
    fine_tuned_response, fine_tuned_time = generate_response(fine_tuned_model, fine_tuned_tokenizer, prompt)
    base_responses.append(base_response)
    fine_tuned_responses.append(fine_tuned_response)
    base_inference_times.append(base_time)
    fine_tuned_inference_times.append(fine_tuned_time)

# Compute response lengths (average words)
def average_length(responses):
    return sum(len(response.split()) for response in responses) / len(responses)

base_avg_length = average_length(base_responses)
fine_tuned_avg_length = average_length(fine_tuned_responses)

# Install NLTK if needed (uncomment if required)
# !pip install nltk evaluate
import nltk
nltk.download('wordnet')
nltk.download('punkt_tab')

# Compute BLEU, ROUGE, METEOR
rouge = load("rouge")
meteor = load("meteor")
bleu_scores = []
rouge_scores = []
meteor_scores = []
distinct1_scores = []
distinct2_scores = []
for generated, reference in zip(fine_tuned_responses, references):
    gen_tokens = word_tokenize(generated.lower())
    ref_tokens = [word_tokenize(reference.lower())]  # BLEU expects list of references
    bleu = sentence_bleu(ref_tokens, gen_tokens, smoothing_function=SmoothingFunction().method1)
    bleu_scores.append(bleu)

    rouge_result = rouge.compute(predictions=[generated], references=[reference])
    rouge_scores.append(rouge_result['rougeL'])

    meteor_result = meteor.compute(predictions=[generated], references=[reference])
    meteor_scores.append(meteor_result['meteor'])

    # Distinct-1/2
    unigrams = set(gen_tokens)
    bigrams = set(nltk.bigrams(gen_tokens))
    distinct1 = len(unigrams) / len(gen_tokens) if gen_tokens else 0
    distinct2 = len(bigrams) / len(list(nltk.bigrams(gen_tokens))) if len(gen_tokens) > 1 else 0
    distinct1_scores.append(distinct1)
    distinct2_scores.append(distinct2)

avg_bleu = sum(bleu_scores) / len(bleu_scores)
avg_rouge = sum(rouge_scores) / len(rouge_scores)
avg_meteor = sum(meteor_scores) / len(meteor_scores)
avg_distinct1 = sum(distinct1_scores) / len(distinct1_scores)
avg_distinct2 = sum(distinct2_scores) / len(distinct2_scores)

# Compute perplexity
def compute_perplexity(model, tokenizer, texts):
    total_loss = 0.0
    total_tokens = 0
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to("cuda")
        labels = inputs.input_ids
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        total_loss += loss.item() * inputs.attention_mask.sum().item()
        total_tokens += inputs.attention_mask.sum().item()
    avg_loss = total_loss / total_tokens
    perplexity = torch.exp(torch.tensor(avg_loss))
    return perplexity.item()

fine_tuned_perplexity = compute_perplexity(fine_tuned_model, fine_tuned_tokenizer, references)  # Use references as test texts

# Average inference time
avg_base_inference_time = sum(base_inference_times) / len(base_inference_times)
avg_fine_tuned_inference_time = sum(fine_tuned_inference_times) / len(fine_tuned_inference_times)

# Print results
print("\nMetrics for Fine-Tuned Model:")
print(f"Average BLEU: {avg_bleu:.4f}")
print(f"Average ROUGE-L: {avg_rouge:.4f}")
print(f"Average METEOR: {avg_meteor:.4f}")
print(f"Perplexity: {fine_tuned_perplexity:.4f}")
print(f"Average Distinct-1: {avg_distinct1:.4f}")
print(f"Average Distinct-2: {avg_distinct2:.4f}")
print(f"Average Response Length (words): {fine_tuned_avg_length:.2f} (dataset avg ~239 words)")
print(f"Average Inference Time (base): {avg_base_inference_time:.2f} seconds")
print(f"Average Inference Time (fine-tuned): {avg_fine_tuned_inference_time:.2f} seconds")

Loading base model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading fine-tuned model...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Test Prompts from Dataset: ["I've been dealing with feelings of sadness and hopelessness for the past few months. I've tried to distract myself with work and other activities, but nothing seems to help. I've been thinking about seeking professional help, but I'm unsure if therapy is the right choice for me. I've heard that therapy can be expensive and time-consuming, and I'm not sure if I can afford it or if I have the energy to commit to it.", "I've been feeling overwhelmed lately, with the added stress of managing my children's emotional needs and dealing with the grief of losing my spouse. It all feels like too much, and I'm struggling to keep up. I've been trying to focus on the positive aspects of my life, like my children and the support I've received from the hospice counselor, but it's hard to maintain that perspective when I'm feeling so drained.", "I understand, counselor. I didn't mean to upset him or cause any harm. I just wanted to share some memories and bring a smile to 

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



Metrics for Fine-Tuned Model:
Average BLEU: 0.0308
Average ROUGE-L: 0.1542
Average METEOR: 0.2024
Perplexity: 3.2012
Average Distinct-1: 0.4499
Average Distinct-2: 0.7117
Average Response Length (words): 179.60 (dataset avg ~239 words)
Average Inference Time (base): 15.31 seconds
Average Inference Time (fine-tuned): 14.53 seconds


In [None]:
import os
import gc
import sys
import torch
import pandas as pd
import json
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from sentence_transformers import SentenceTransformer, util
from google.colab import userdata

# Set environment variables
os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")  # Ensure HF token is set

# Knowledge base path
knowledge_base_path = "/content/combine cbt.json"

# Load knowledge base
with open(knowledge_base_path, 'r') as f:
    knowledge_base = json.load(f)
knowledge_base_entries = [entry for entry in knowledge_base.values()]

# Embedding model for RAG
embedder = SentenceTransformer('all-MiniLM-L6-v2')  # Simple embedding model
kb_texts = [entry['Input.client_statement'] + " " + entry['Input.res_a'] + " " + entry['Input.res_b'] for entry in knowledge_base_entries]  # Combine relevant fields for embedding
kb_embeddings = embedder.encode(kb_texts, convert_to_tensor=True)

def retrieve_from_kb(prompt, top_k=3):
    prompt_embedding = embedder.encode(prompt, convert_to_tensor=True)
    hits = util.semantic_search(prompt_embedding, kb_embeddings, top_k=top_k)
    retrieved = [knowledge_base_entries[hit['corpus_id']] for hit in hits[0]]
    return retrieved

# Paths
base_model_path = "/content/llama3_model/models--meta-llama--Meta-Llama-3-8B/snapshots/8cde5ca8380496c9a6cc7ef3a8b46a0372a1d920"
fine_tuned_path = "./llama3_finetuned"

# Configure 4-bit quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# Load fine-tuned model
print("Loading fine-tuned model...")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    quantization_config=quantization_config,
    device_map="auto",
    local_files_only=True
)
fine_tuned_model = PeftModel.from_pretrained(base_model, fine_tuned_path)
fine_tuned_tokenizer = AutoTokenizer.from_pretrained(fine_tuned_path, local_files_only=True)
if fine_tuned_tokenizer.pad_token is None:
    fine_tuned_tokenizer.pad_token = fine_tuned_tokenizer.eos_token

# CoT and RAG integration for inference
def rag_cot_generate(model, tokenizer, prompt):
    # Retrieve from KB
    retrieved = retrieve_from_kb(prompt)
    rag_context = "\n".join([f"Client: {entry['Input.client_statement']}\nResponse A: {entry['Input.res_a']}\nResponse B: {entry['Input.res_b']}" for entry in retrieved])

    # CoT prompt with RAG
    cot_prompt = f"Knowledge Base: {rag_context}\n\nPrompt: {prompt}\n\nThink step by step:\n1. Analyze the prompt.\n2. Recall relevant knowledge.\n3. Generate response.\nResponse:"

    inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=256).to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=150, do_sample=True, temperature=0.7)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Load dataset and extract test prompts
df = pd.read_csv("merged_dataset_cleaned.csv")
test_prompts = df[df['Context'].str.contains("depression|stress|sadness", case=False, na=False)]['Context'].head(3).tolist()
print("Test Prompts from Dataset:", test_prompts)

# Generate responses using CoT + RAG on fine-tuned model
print("Generating responses with CoT + RAG on fine-tuned model:")
for prompt in test_prompts:
    print(f"\nPrompt: {prompt}")
    response = rag_cot_generate(fine_tuned_model, fine_tuned_tokenizer, prompt)
    print(f"CoT + RAG Response: {response}")



Loading fine-tuned model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Test Prompts from Dataset: ["I've been dealing with feelings of sadness and hopelessness for the past few months. I've tried to distract myself with work and other activities, but nothing seems to help. I've been thinking about seeking professional help, but I'm unsure if therapy is the right choice for me. I've heard that therapy can be expensive and time-consuming, and I'm not sure if I can afford it or if I have the energy to commit to it.", "I've been feeling overwhelmed lately, with the added stress of managing my children's emotional needs and dealing with the grief of losing my spouse. It all feels like too much, and I'm struggling to keep up. I've been trying to focus on the positive aspects of my life, like my children and the support I've received from the hospice counselor, but it's hard to maintain that perspective when I'm feeling so drained.", "I understand, counselor. I didn't mean to upset him or cause any harm. I just wanted to share some memories and bring a smile to 

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


CoT + RAG Response: Knowledge Base: Client: [Dismissive] I’m not sure this is the right thing for me.
Response A: I completely understand your reservations. Cognitive Behavioral Therapy (CBT) may seem unfamiliar, but I'd like to explain how it can help. CBT is a problem-focused approach that helps you identify and change negative thought patterns and behaviors that contribute to your distress. By understanding how your thoughts, feelings, and actions interact, we can work together to develop more constructive ways of thinking and coping. Would you like me to elaborate on how this process works?
Response B: I think that’s really important to acknowledge. I’d like to better understand what’s behind your concerns. I can say it’s pretty common for people to be uncertain about whether therapy is the right option for them, whether therapy in general or of a particular type. Based on what I’ve heard thus far, I believe that therapy could help you, but I want to help you make an informed choic

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


CoT + RAG Response: Knowledge Base: Client: [Weeping] I just can’t imagine anything changing. I feel so hopeless.
Response A: It's completely understandable to feel overwhelmed and hopeless when you're going through a tough time. Let's start by identifying small, achievable goals that can help you feel more in control. Change often begins with small steps. Can we talk about one thing, no matter how small, that would make your day a little better? By setting and working towards these small goals, we can start to build a path towards positive change.
Response B: I can see how it’s hard to picture how things might be different, and at this point, there might be a part of you that’s overwhelmed by the work that’s ahead. And there is another part that brought you here today, and that reality is not trivial and gives me hope for you. Perhaps this feeling of hopelessness is itself a problem worth working on (i.e., to reduce or overcome) as a goal in here. Do you have thoughts on that?
Client:

In [None]:
import os
import gc
import sys
import torch
import pandas as pd
import json
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from sentence_transformers import SentenceTransformer, util
from google.colab import userdata
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
from evaluate import load
from collections import Counter
import time
import nltk

nltk.download('wordnet')
nltk.download('punkt_tab')

# Set environment variables
os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")  # Ensure HF token is set

# Knowledge base path
knowledge_base_path = "/content/combine cbt.json"

# Load knowledge base
with open(knowledge_base_path, 'r') as f:
    knowledge_base = json.load(f)
knowledge_base_entries = [entry for entry in knowledge_base.values()]

# Embedding model for RAG
embedder = SentenceTransformer('all-MiniLM-L6-v2')  # Simple embedding model
kb_texts = [entry['Input.client_statement'] + " " + entry['Input.res_a'] + " " + entry['Input.res_b'] for entry in knowledge_base_entries]  # Combine relevant fields for embedding
kb_embeddings = embedder.encode(kb_texts, convert_to_tensor=True)

def retrieve_from_kb(prompt, top_k=3):
    prompt_embedding = embedder.encode(prompt, convert_to_tensor=True)
    hits = util.semantic_search(prompt_embedding, kb_embeddings, top_k=top_k)
    retrieved = [knowledge_base_entries[hit['corpus_id']] for hit in hits[0]]
    return retrieved

# Paths
base_model_path = "/content/llama3_model/models--meta-llama--Meta-Llama-3-8B/snapshots/8cde5ca8380496c9a6cc7ef3a8b46a0372a1d920"
fine_tuned_path = "./llama3_finetuned"

# Configure 4-bit quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# Load fine-tuned model
print("Loading fine-tuned model...")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    quantization_config=quantization_config,
    device_map="auto",
    local_files_only=True
)
fine_tuned_model = PeftModel.from_pretrained(base_model, fine_tuned_path)
fine_tuned_tokenizer = AutoTokenizer.from_pretrained(fine_tuned_path, local_files_only=True)
if fine_tuned_tokenizer.pad_token is None:
    fine_tuned_tokenizer.pad_token = fine_tuned_tokenizer.eos_token

# Standard generation function
def generate_standard(model, tokenizer, prompt):
    start_time = time.time()
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=256).to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=150, do_sample=True, temperature=0.7)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    inference_time = time.time() - start_time
    return response, inference_time

# CoT and RAG integration for inference
def rag_cot_generate(model, tokenizer, prompt):
    # Retrieve from KB
    retrieved = retrieve_from_kb(prompt)
    rag_context = "\n".join([f"Client: {entry['Input.client_statement']}\nResponse A: {entry['Input.res_a']}\nResponse B: {entry['Input.res_b']}" for entry in retrieved])

    # CoT prompt with RAG
    cot_prompt = f"Knowledge Base: {rag_context}\n\nPrompt: {prompt}\n\nThink step by step:\n1. Analyze the prompt.\n2. Recall relevant knowledge.\n3. Generate response.\nResponse:"

    start_time = time.time()
    inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=256).to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=150, do_sample=True, temperature=0.7)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    inference_time = time.time() - start_time
    return response, inference_time

# Load dataset and extract test prompts and references
df = pd.read_csv("merged_dataset_cleaned.csv")
test_df = df[df['Context'].str.contains("depression|stress|sadness", case=False, na=False)].head(3)
test_prompts = test_df['Context'].tolist()
references = test_df['Response'].tolist()
print("Test Prompts from Dataset:", test_prompts)

# Generate responses for each prompt using both methods
standard_responses = []
cot_rag_responses = []
standard_times = []
cot_rag_times = []
for prompt in test_prompts:
    print(f"\nPrompt: {prompt}")

    # Standard fine-tuned response
    standard_res, standard_time = generate_standard(fine_tuned_model, fine_tuned_tokenizer, prompt)
    print(f"Standard Fine-Tuned Response: {standard_res}")

    # CoT + RAG response
    cot_rag_res, cot_rag_time = rag_cot_generate(fine_tuned_model, fine_tuned_tokenizer, prompt)
    print(f"CoT + RAG Response: {cot_rag_res}")

    standard_responses.append(standard_res)
    cot_rag_responses.append(cot_rag_res)
    standard_times.append(standard_time)
    cot_rag_times.append(cot_rag_time)

# Compute metrics for both
def compute_metrics(generated_responses, references):
    bleu_scores = [sentence_bleu([word_tokenize(ref.lower())], word_tokenize(gen.lower()), smoothing_function=SmoothingFunction().method1) for gen, ref in zip(generated_responses, references)]
    rouge_scores = [rouge.compute(predictions=[gen], references=[ref])['rougeL'] for gen, ref in zip(generated_responses, references)]
    meteor_scores = [meteor.compute(predictions=[gen], references=[ref])['meteor'] for gen, ref in zip(generated_responses, references)]
    distinct1_scores = [len(set(word_tokenize(gen.lower()))) / len(word_tokenize(gen.lower())) if word_tokenize(gen.lower()) else 0 for gen in generated_responses]
    distinct2_scores = [len(set(nltk.bigrams(word_tokenize(gen.lower())))) / len(list(nltk.bigrams(word_tokenize(gen.lower())))) if len(word_tokenize(gen.lower())) > 1 else 0 for gen in generated_responses]
    perplexity = compute_perplexity(fine_tuned_model, fine_tuned_tokenizer, generated_responses)  # Use generated as test texts for perplexity

    avg_bleu = sum(bleu_scores) / len(bleu_scores)
    avg_rouge = sum(rouge_scores) / len(rouge_scores)
    avg_meteor = sum(meteor_scores) / len(meteor_scores)
    avg_distinct1 = sum(distinct1_scores) / len(distinct1_scores)
    avg_distinct2 = sum(distinct2_scores) / len(distinct2_scores)

    return avg_bleu, avg_rouge, avg_meteor, perplexity, avg_distinct1, avg_distinct2

# Load evaluate metrics
rouge = load("rouge")
meteor = load("meteor")

# Compute for standard fine-tuned
avg_bleu_std, avg_rouge_std, avg_meteor_std, perplexity_std, avg_dist1_std, avg_dist2_std = compute_metrics(standard_responses, references)

# Compute for CoT + RAG
avg_bleu_cot, avg_rouge_cot, avg_meteor_cot, perplexity_cot, avg_dist1_cot, avg_dist2_cot = compute_metrics(cot_rag_responses, references)

# Average response length and inference time
def average_length(responses):
    return sum(len(response.split()) for response in responses) / len(responses)

avg_length_std = average_length(standard_responses)
avg_length_cot = average_length(cot_rag_responses)
avg_time_std = sum(standard_times) / len(standard_times)
avg_time_cot = sum(cot_rag_times) / len(cot_rag_times)

# Print comparison
print("\nMetrics Comparison:")
print("Standard Fine-Tuned Model:")
print(f"Average BLEU: {avg_bleu_std:.4f}")
print(f"Average ROUGE-L: {avg_rouge_std:.4f}")
print(f"Average METEOR: {avg_meteor_std:.4f}")
print(f"Perplexity: {perplexity_std:.4f}")
print(f"Average Distinct-1: {avg_dist1_std:.4f}")
print(f"Average Distinct-2: {avg_dist2_std:.4f}")
print(f"Average Response Length: {avg_length_std:.2f} words")
print(f"Average Inference Time: {avg_time_std:.2f} seconds")

print("\nCoT + RAG on Fine-Tuned Model:")
print(f"Average BLEU: {avg_bleu_cot:.4f}")
print(f"Average ROUGE-L: {avg_rouge_cot:.4f}")
print(f"Average METEOR: {avg_meteor_cot:.4f}")
print(f"Perplexity: {perplexity_cot:.4f}")
print(f"Average Distinct-1: {avg_dist1_cot:.4f}")
print(f"Average Distinct-2: {avg_dist2_cot:.4f}")
print(f"Average Response Length: {avg_length_cot:.2f} words")
print(f"Average Inference Time: {avg_time_cot:.2f} seconds")

# Helper for perplexity
def compute_perplexity(model, tokenizer, texts):
    total_loss = 0.0
    total_tokens = 0
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to("cuda")
        labels = inputs.input_ids
        with torch.no_grad():
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss
            total_loss += loss.item() * inputs.attention_mask.sum().item()
            total_tokens += inputs.attention_mask.sum().item()
    avg_loss = total_loss / total_tokens
    return torch.exp(torch.tensor(avg_loss)).item()


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loading fine-tuned model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Test Prompts from Dataset: ["I've been dealing with feelings of sadness and hopelessness for the past few months. I've tried to distract myself with work and other activities, but nothing seems to help. I've been thinking about seeking professional help, but I'm unsure if therapy is the right choice for me. I've heard that therapy can be expensive and time-consuming, and I'm not sure if I can afford it or if I have the energy to commit to it.", "I've been feeling overwhelmed lately, with the added stress of managing my children's emotional needs and dealing with the grief of losing my spouse. It all feels like too much, and I'm struggling to keep up. I've been trying to focus on the positive aspects of my life, like my children and the support I've received from the hospice counselor, but it's hard to maintain that perspective when I'm feeling so drained.", "I understand, counselor. I didn't mean to upset him or cause any harm. I just wanted to share some memories and bring a smile to 

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Standard Fine-Tuned Response: I've been dealing with feelings of sadness and hopelessness for the past few months. I've tried to distract myself with work and other activities, but nothing seems to help. I've been thinking about seeking professional help, but I'm unsure if therapy is the right choice for me. I've heard that therapy can be expensive and time-consuming, and I'm not sure if I can afford it or if I have the energy to commit to it. I've also heard that therapy can be helpful, but I'm not sure if it's worth it for me. I'm feeling lost and unsure of what to do next. I'd appreciate any guidance or resources you could provide to help me make an informed decision about therapy.
I've been dealing with feelings of sadness and hopelessness for the past few months. I've tried to distract myself with work and other activities, but nothing seems to help. I've been thinking about seeking professional help, but I'm unsure if therapy is the right choice for me. I've heard that therapy ca

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


CoT + RAG Response: Knowledge Base: Client: [Dismissive] I’m not sure this is the right thing for me.
Response A: I completely understand your reservations. Cognitive Behavioral Therapy (CBT) may seem unfamiliar, but I'd like to explain how it can help. CBT is a problem-focused approach that helps you identify and change negative thought patterns and behaviors that contribute to your distress. By understanding how your thoughts, feelings, and actions interact, we can work together to develop more constructive ways of thinking and coping. Would you like me to elaborate on how this process works?
Response B: I think that’s really important to acknowledge. I’d like to better understand what’s behind your concerns. I can say it’s pretty common for people to be uncertain about whether therapy is the right option for them, whether therapy in general or of a particular type. Based on what I’ve heard thus far, I believe that therapy could help you, but I want to help you make an informed choic

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Standard Fine-Tuned Response: I've been feeling overwhelmed lately, with the added stress of managing my children's emotional needs and dealing with the grief of losing my spouse. It all feels like too much, and I'm struggling to keep up. I've been trying to focus on the positive aspects of my life, like my children and the support I've received from the hospice counselor, but it's hard to maintain that perspective when I'm feeling so drained. I've been trying to make a list of things I need to do, but I can't seem to get past the first item. I've been feeling disconnected from my friends and family, and I've been having trouble sleeping. I've been trying to stay positive, but it's been a challenge. I've been feeling a bit down, and I've been having trouble focusing on my work. I've been trying to take care of myself, but it's been a struggle. I've been feeling a bit down, and I've been having trouble focusing on my work. I've been trying to take care of myself, but it's been a struggl

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


CoT + RAG Response: Knowledge Base: Client: [Weeping] I just can’t imagine anything changing. I feel so hopeless.
Response A: It's completely understandable to feel overwhelmed and hopeless when you're going through a tough time. Let's start by identifying small, achievable goals that can help you feel more in control. Change often begins with small steps. Can we talk about one thing, no matter how small, that would make your day a little better? By setting and working towards these small goals, we can start to build a path towards positive change.
Response B: I can see how it’s hard to picture how things might be different, and at this point, there might be a part of you that’s overwhelmed by the work that’s ahead. And there is another part that brought you here today, and that reality is not trivial and gives me hope for you. Perhaps this feeling of hopelessness is itself a problem worth working on (i.e., to reduce or overcome) as a goal in here. Do you have thoughts on that?
Client:

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Standard Fine-Tuned Response: I understand, counselor. I didn't mean to upset him or cause any harm. I just wanted to share some memories and bring a smile to his face. But I'll definitely be more mindful of his condition and avoid topics that might cause distress. I appreciate your guidance and support. I feel a bit better now. I'll try to focus on the positive aspects of our relationship and the memories we've shared. I'll also make sure to express my gratitude for all he's done for me and our family. I'll keep practicing the techniques you've taught me and try to be more mindful and compassionate in all my interactions. Thank you again for your help. I feel a bit lighter now. I'm looking forward to our next session.
CoT + RAG Response: Knowledge Base: Client: [Nervous] I’ve never been in therapy before. What do we talk about here?
Response A: Welcome to our session! I'm glad you're taking this first step. In cognitive behavioral therapy, we'll work together to identify and understan

Downloading builder script: 0.00B [00:00, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



Metrics Comparison:
Standard Fine-Tuned Model:
Average BLEU: 0.0480
Average ROUGE-L: 0.1703
Average METEOR: 0.2143
Perplexity: 2.1483
Average Distinct-1: 0.4200
Average Distinct-2: 0.6437
Average Response Length: 172.67 words
Average Inference Time: 14.28 seconds

CoT + RAG on Fine-Tuned Model:
Average BLEU: 0.0317
Average ROUGE-L: 0.1519
Average METEOR: 0.2696
Perplexity: 4.2014
Average Distinct-1: 0.3607
Average Distinct-2: 0.6460
Average Response Length: 323.67 words
Average Inference Time: 15.76 seconds


In [None]:
import os
import gc
import sys
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig
from datasets import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from torch.amp import autocast
from google.colab import userdata

# Set environment variables
os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")

def clean_cuda_memory(device_id):
    """Attempt to completely clean CUDA memory for the specified device."""
    torch.cuda.set_device(device_id)
    torch.cuda.empty_cache()
    gc.collect()
    torch.cuda.reset_peak_memory_stats()

def finetune_on_gpu(device_id, model_load_path, output_path, df_path):
    clean_cuda_memory(device_id)
    torch.cuda.set_device(device_id)
    gc.collect()

    # Load and prepare full dataset
    df = pd.read_csv(df_path)
    df = df.dropna(subset=['Context', 'Response'])
    df = df[(df['Context'].str.strip() != '') & (df['Response'].str.strip() != '')].reset_index(drop=True)
    df = df.rename(columns={"Context": "prompt", "Response": "response"})
    df['prompt'] = df['prompt'].astype(str)
    df['response'] = df['response'].astype(str)
    def format_conversation(example):
        return {"text": f"<s>[INST] {example['prompt']} [/INST] {example['response']} </s>"}
    dataset = Dataset.from_pandas(df)
    dataset = dataset.map(format_conversation, num_proc=2)

    # Debug: List directory contents
    print(f"Contents of {model_load_path}: {os.listdir(model_load_path)}")

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_load_path, local_files_only=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Configure 4-bit quantization
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )

    # Load model with 4-bit quantization (QLoRA)
    model = AutoModelForCausalLM.from_pretrained(
        model_load_path,
        quantization_config=quantization_config,
        device_map="auto",
        local_files_only=True
    )
    model = prepare_model_for_kbit_training(model)

    # Configure LoRA with adjusted settings for deeper learning
    lora_config = LoraConfig(
        r=8,  # Increased rank for more parameter updates
        lora_alpha=16,  # Increased scaling for stronger adaptation
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "down_proj", "up_proj"],  # Expanded layers
        lora_dropout=0.1,  # Increased dropout for regularization
        bias="none",
        task_type="CAUSAL_LM"
    )
    model = get_peft_model(model, lora_config)
    model.gradient_checkpointing_enable()

    # Preprocess dataset with maximum length
    def preprocess_function(examples):
        tokenized = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)  # Set to 512
        tokenized["labels"] = tokenized["input_ids"]
        return tokenized
    tokenized_dataset = dataset.map(preprocess_function, batched=True, num_proc=4, remove_columns=dataset.column_names)

    # Training arguments for deeper fine-tuning
    training_args = TrainingArguments(
        output_dir=output_path,
        num_train_epochs=2,  # Set to 2 epochs
        per_device_train_batch_size=4,
        gradient_accumulation_steps=2,
        save_steps=1000,
        logging_steps=100,
        fp16=True,
        bf16=False,
        save_total_limit=3,
        load_best_model_at_end=False,
        report_to="none"
    )

    # Data collator and trainer
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    # Memory profiling
    print(f"Memory allocated: {torch.cuda.memory_allocated(device_id) / 1024**3:.2f} GB")
    print(f"Max memory allocated: {torch.cuda.max_memory_allocated(device_id) / 1024**3:.2f} GB")
    gc.collect()
    torch.cuda.empty_cache()

    # Train the model
    trainer.train()
    torch.cuda.empty_cache()

    # Save the final model
    trainer.save_model(output_path)
    tokenizer.save_pretrained(output_path)
    print(f"Fine-tuning complete. Model and tokenizer saved in {output_path} on GPU {device_id}")

def main():
    model_load_path = "/content/llama3_model/models--meta-llama--Meta-Llama-3-8B/snapshots/8cde5ca8380496c9a6cc7ef3a8b46a0372a1d920"
    output_path = "./llama3_finetuned"
    df_path = "merged_dataset_cleaned.csv"

    os.makedirs(output_path, exist_ok=True)

    if torch.cuda.is_available():
        device_id = 0
        print(f"GPU {device_id} is available. Starting fine-tuning...")
        finetune_on_gpu(device_id, model_load_path, output_path, df_path)
    else:
        print("No CUDA device found. Exiting script.")
        sys.exit(1)

if __name__ == "__main__":
    main()

GPU 0 is available. Starting fine-tuning...


Map (num_proc=2):   0%|          | 0/85101 [00:00<?, ? examples/s]

Contents of /content/llama3_model/models--meta-llama--Meta-Llama-3-8B/snapshots/8cde5ca8380496c9a6cc7ef3a8b46a0372a1d920: ['generation_config.json', 'config.json', 'original', 'special_tokens_map.json', 'model-00004-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'tokenizer.json', '.gitattributes', 'model-00002-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'LICENSE', 'model.safetensors.index.json', 'USE_POLICY.md', 'README.md', 'tokenizer_config.json']


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Map (num_proc=4):   0%|          | 0/85101 [00:00<?, ? examples/s]

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128001}.


Memory allocated: 7.33 GB
Max memory allocated: 8.25 GB


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss


KeyboardInterrupt: 

In [None]:
import os
import gc
import sys
import time
import json
import torch
import pandas as pd
import nltk
from collections import Counter

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from sentence_transformers import SentenceTransformer, util

from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import evaluate
import nltk

nltk.download('wordnet')
nltk.download('punkt')

# ---------- Config ----------
knowledge_base_path = "/content/combine cbt.json"
base_model_path = "/content/llama3_model/models--meta-llama--Meta-Llama-3-8B/snapshots/8cde5ca8380496c9a6cc7ef3a8b46a0372a1d920"
fine_tuned_path = "./llama3_finetuned"
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# ---------- Load Knowledge Base ----------
with open(knowledge_base_path, 'r', encoding='utf-8') as f:
    knowledge_base = json.load(f)
knowledge_base_entries = [entry for entry in knowledge_base.values()]

def _kb_to_text(entry):
    parts = []
    for k in ['Input.client_statement', 'Input.res_a', 'Input.res_b', 'ori_text', 'situation']:
        if k in entry and entry[k]:
            parts.append(str(entry[k]))
    return " ".join(parts)

kb_texts = [_kb_to_text(e) for e in knowledge_base_entries]

print("Loading embedding model...")
embedder = SentenceTransformer('all-MiniLM-L6-v2')
kb_embeddings = embedder.encode(kb_texts, convert_to_tensor=True)

def retrieve_from_kb(prompt, top_k=3):
    prompt_embedding = embedder.encode(prompt, convert_to_tensor=True)
    hits = util.semantic_search(prompt_embedding, kb_embeddings, top_k=top_k)
    retrieved = []
    for hit in hits[0]:
        idx = hit['corpus_id']
        if 0 <= idx < len(knowledge_base_entries):
            retrieved.append(knowledge_base_entries[idx])
    return retrieved

# ---------- Load Fine-tuned Model ----------
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

print("Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    quantization_config=quantization_config,
    device_map="auto",
    local_files_only=True
)

print("Loading PEFT fine-tuned weights...")
fine_tuned_model = PeftModel.from_pretrained(base_model, fine_tuned_path)
fine_tuned_model.eval()

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(fine_tuned_path, local_files_only=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# ---------- Safety Check ----------
SELF_HARM_KEYWORDS = [
    "suicide", "i want to die", "kill myself", "end my life", "hang myself", "hurt myself",
    "i can't go on", "want to die", "i'm going to die", "i'll kill myself"
]

def contains_self_harm(text: str) -> bool:
    t = text.lower()
    return any(k in t for k in SELF_HARM_KEYWORDS)

def crisis_response():
    return ("I'm really sorry you're feeling so overwhelmed. I can't provide medical care, "
            "but if you're thinking about hurting yourself, please contact your local emergency services "
            "or a crisis line right now. If you'd like, I can help you find resources or steps to stay safe.")

# ---------- CoT Template ----------
COT_TEMPLATE = """
You are a compassionate, friendly listener (not a diagnostician). Do NOT show these steps to the user. Instead, only provide the final supportive response.

Steps (INTERNAL ONLY, not to be shown):
1) Empathic opening
2) Validate feelings
3) If a KB example is relevant, briefly reflect it
4) Ask one open question
5) Offer 1-2 coping suggestions
6) If self-harm is detected, switch to crisis response
7) Close supportively
"""

def build_cot_prompt(user_prompt: str, retrieved_entries: list):
    if retrieved_entries:
        rag_context = "\n\n".join(
            [
                f"KB example: Client: {entry.get('Input.client_statement','')}\n"
                f"Response A: {entry.get('Input.res_a','')}\nResponse B: {entry.get('Input.res_b','')}"
                for entry in retrieved_entries
            ]
        )
    else:
        rag_context = "No direct KB examples."

    return (
        f"{COT_TEMPLATE}\n\n"
        f"KnowledgeBaseContext:\n{rag_context}\n\n"
        f"User: {user_prompt}\n\n"
        "Now provide the final user-facing response only, without showing steps or template. Response:"
    )

# ---------- Generation Helpers ----------
def _generate(model, tokenizer, prompt, max_new_tokens=200, temperature=0.7):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=2048)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            pad_token_id=tokenizer.pad_token_id
        )
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return text

def _clean_response(full_text, original_prompt):
    if "Response:" in full_text:
        return full_text.split("Response:", 1)[-1].strip()
    return full_text.replace(original_prompt, "").strip()

def generate_standard_clean(model, tokenizer, prompt):
    raw = _generate(model, tokenizer, prompt)
    cleaned = _clean_response(raw, prompt)
    return cleaned

def rag_cot_generate_clean(model, tokenizer, prompt, top_k=3):
    if contains_self_harm(prompt):
        return crisis_response()
    retrieved = retrieve_from_kb(prompt, top_k)
    cot_prompt = build_cot_prompt(prompt, retrieved)
    raw = _generate(model, tokenizer, cot_prompt)
    cleaned = _clean_response(raw, cot_prompt)
    return cleaned

# ---------- Load test prompts ----------
df = pd.read_csv("merged_dataset_cleaned.csv")
test_df = df[df['Context'].str.contains("depression|stress|sadness", case=False, na=False)].head(3)
test_prompts = test_df['Context'].tolist()
references = test_df['Response'].tolist()

standard_responses = []
cot_rag_responses = []

for prompt in test_prompts:
    print(f"\nPrompt:\n{prompt}\n")
    std_resp = generate_standard_clean(fine_tuned_model, tokenizer, prompt)
    print(f"Standard Model Response:\n{std_resp}\n")
    cot_resp = rag_cot_generate_clean(fine_tuned_model, tokenizer, prompt)
    print(f"CoT + RAG Response:\n{cot_resp}\n")
    standard_responses.append(std_resp)
    cot_rag_responses.append(cot_resp)

# ---------- Metrics ----------
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")

def compute_metrics(generated, references):
    bleu_scores = []
    rouge_scores = []
    meteor_scores = []
    distinct1_scores = []
    distinct2_scores = []

    for gen, ref in zip(generated, references):
        try:
            bleu = sentence_bleu(
                [word_tokenize(ref.lower())],
                word_tokenize(gen.lower()),
                smoothing_function=SmoothingFunction().method1
            )
        except:
            bleu = 0.0
        bleu_scores.append(bleu)

        try:
            r = rouge.compute(predictions=[gen], references=[ref])['rougeL']
        except:
            r = 0.0
        rouge_scores.append(r)

        try:
            m = meteor.compute(predictions=[gen], references=[ref])['meteor']
        except:
            m = 0.0
        meteor_scores.append(m)

        toks = word_tokenize(gen.lower())
        if len(toks) > 0:
            distinct1_scores.append(len(set(toks)) / len(toks))
        else:
            distinct1_scores.append(0.0)

        bigrams = list(nltk.bigrams(toks))
        if len(bigrams) > 0:
            distinct2_scores.append(len(set(bigrams)) / len(bigrams))
        else:
            distinct2_scores.append(0.0)

    def compute_perplexity(texts):
        total_loss = 0.0
        total_tokens = 0
        for text in texts:
            inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
            labels = inputs.input_ids
            with torch.no_grad():
                outputs = fine_tuned_model(**inputs, labels=labels)
                loss = outputs.loss
                total_loss += loss.item() * inputs.attention_mask.sum().item()
                total_tokens += inputs.attention_mask.sum().item()
        avg_loss = total_loss / total_tokens if total_tokens else 1e6
        return torch.exp(torch.tensor(avg_loss)).item()

    perplexity = compute_perplexity(generated)

    return {
        "BLEU": sum(bleu_scores) / len(bleu_scores),
        "ROUGE-L": sum(rouge_scores) / len(rouge_scores),
        "METEOR": sum(meteor_scores) / len(meteor_scores),
        "Distinct-1": sum(distinct1_scores) / len(distinct1_scores),
        "Distinct-2": sum(distinct2_scores) / len(distinct2_scores),
        "Perplexity": perplexity
    }

metrics_std = compute_metrics(standard_responses, references)
metrics_cot = compute_metrics(cot_rag_responses, references)

print("\n===== METRICS FOR FINE-TUNED MODEL =====")
for k, v in metrics_std.items():
    print(f"{k}: {v:.4f}")

print("\n===== METRICS FOR FINE-TUNED + CoT + RAG =====")
for k, v in metrics_cot.items():
    print(f"{k}: {v:.4f}")

gc.collect()
torch.cuda.empty_cache()


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Using device: cuda
Loading embedding model...
Loading base model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading PEFT fine-tuned weights...
Loading tokenizer...

Prompt:
I've been dealing with feelings of sadness and hopelessness for the past few months. I've tried to distract myself with work and other activities, but nothing seems to help. I've been thinking about seeking professional help, but I'm unsure if therapy is the right choice for me. I've heard that therapy can be expensive and time-consuming, and I'm not sure if I can afford it or if I have the energy to commit to it.

Standard Model Response:
I'm also worried about what my friends and family might think if I seek help. I've always been the strong one, and I don't want to be a burden to anyone. Can you help me understand if therapy is the right choice for me and if it's something I can afford and have the energy for? What should I expect from therapy, and how can I make the most out of it? I've never been to therapy before, and I'm feeling a bit anxious about the whole process.
I can understand that you're feeling uncertain a

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



===== METRICS FOR FINE-TUNED MODEL =====
BLEU: 0.0580
ROUGE-L: 0.1694
METEOR: 0.2472
Distinct-1: 0.4255
Distinct-2: 0.7211
Perplexity: 2.7439

===== METRICS FOR FINE-TUNED + CoT + RAG =====
BLEU: 0.0608
ROUGE-L: 0.2033
METEOR: 0.2539
Distinct-1: 0.6163
Distinct-2: 0.9174
Perplexity: 4.6047


In [None]:
import os
import gc
import sys
import time
import json
import torch
import pandas as pd
import nltk
from collections import Counter

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from sentence_transformers import SentenceTransformer, util

from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import evaluate
import nltk

nltk.download('wordnet')
nltk.download('punkt')

# ---------- Config ----------
knowledge_base_path = "/content/combine cbt.json"
base_model_path = "/content/llama3_model/models--meta-llama--Meta-Llama-3-8B/snapshots/8cde5ca8380496c9a6cc7ef3a8b46a0372a1d920"
fine_tuned_path = "./llama3_finetuned"
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# ---------- Load Knowledge Base ----------
with open(knowledge_base_path, 'r', encoding='utf-8') as f:
    knowledge_base = json.load(f)
knowledge_base_entries = [entry for entry in knowledge_base.values()]

def _kb_to_text(entry):
    parts = []
    for k in ['Input.client_statement', 'Input.res_a', 'Input.res_b', 'ori_text', 'situation']:
        if k in entry and entry[k]:
            parts.append(str(entry[k]))
    return " ".join(parts)

kb_texts = [_kb_to_text(e) for e in knowledge_base_entries]

print("Loading embedding model...")
embedder = SentenceTransformer('all-MiniLM-L6-v2')
kb_embeddings = embedder.encode(kb_texts, convert_to_tensor=True)

def retrieve_from_kb(prompt, top_k=3):
    prompt_embedding = embedder.encode(prompt, convert_to_tensor=True)
    hits = util.semantic_search(prompt_embedding, kb_embeddings, top_k=top_k)
    retrieved = []
    for hit in hits[0]:
        idx = hit['corpus_id']
        if 0 <= idx < len(knowledge_base_entries):
            retrieved.append(knowledge_base_entries[idx])
    return retrieved

# ---------- Load Models ----------
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# Load base model
print("Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    quantization_config=quantization_config,
    device_map="auto",
    local_files_only=True
)
base_model.eval()

# Load fine-tuned model
print("Loading PEFT fine-tuned weights...")
fine_tuned_model = PeftModel.from_pretrained(base_model, fine_tuned_path)
fine_tuned_model.eval()

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(fine_tuned_path if os.path.exists(os.path.join(fine_tuned_path, 'adapter_config.json')) else base_model_path, local_files_only=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# ---------- Safety Check ----------
SELF_HARM_KEYWORDS = [
    "suicide", "i want to die", "kill myself", "end my life", "hang myself", "hurt myself",
    "i can't go on", "want to die", "i'm going to die", "i'll kill myself"
]

def contains_self_harm(text: str) -> bool:
    t = text.lower()
    return any(k in t for k in SELF_HARM_KEYWORDS)

def crisis_response():
    return ("I'm really sorry you're feeling so overwhelmed. I can't provide medical care, "
            "but if you're thinking about hurting yourself, please contact your local emergency services "
            "or a crisis line right now. If you'd like, I can help you find resources or steps to stay safe.")

# ---------- CoT Template ----------
COT_TEMPLATE = """
You are a compassionate, friendly listener (not a diagnostician). Do NOT show these steps to the user. Instead, only provide the final supportive response.

Steps (INTERNAL ONLY, not to be shown):
1) Empathic opening
2) Validate feelings
3) If a KB example is relevant, briefly reflect it
4) Ask one open question
5) Offer 1-2 coping suggestions
6) If self-harm is detected, switch to crisis response
7) Close supportively
"""

def build_cot_prompt(user_prompt: str, retrieved_entries: list):
    if retrieved_entries:
        rag_context = "\n\n".join(
            [
                f"KB example: Client: {entry.get('Input.client_statement','')}\n"
                f"Response A: {entry.get('Input.res_a','')}\nResponse B: {entry.get('Input.res_b','')}"
                for entry in retrieved_entries
            ]
        )
    else:
        rag_context = "No direct KB examples."

    return (
        f"{COT_TEMPLATE}\n\n"
        f"KnowledgeBaseContext:\n{rag_context}\n\n"
        f"User: {user_prompt}\n\n"
        "Now provide the final user-facing response only, without showing steps or template. Response:"
    )

# ---------- Generation Helpers ----------
def _generate(model, tokenizer, prompt, max_new_tokens=200, temperature=0.7):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=2048)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            pad_token_id=tokenizer.pad_token_id
        )
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return text

def _clean_response(full_text, original_prompt):
    if "Response:" in full_text:
        return full_text.split("Response:", 1)[-1].strip()
    return full_text.replace(original_prompt, "").strip()

def generate_standard_clean(model, tokenizer, prompt):
    raw = _generate(model, tokenizer, prompt)
    cleaned = _clean_response(raw, prompt)
    return cleaned

def rag_cot_generate_clean(model, tokenizer, prompt, top_k=3):
    if contains_self_harm(prompt):
        return crisis_response()
    retrieved = retrieve_from_kb(prompt, top_k)
    cot_prompt = build_cot_prompt(prompt, retrieved)
    raw = _generate(model, tokenizer, cot_prompt)
    cleaned = _clean_response(raw, cot_prompt)
    return cleaned

# ---------- Load test prompts ----------
df = pd.read_csv("merged_dataset_cleaned.csv")
test_df = df[df['Context'].str.contains("depression|stress|sadness", case=False, na=False)].head(3)
test_prompts = test_df['Context'].tolist()
references = test_df['Response'].tolist()

# Generate responses for all models
base_responses = []
fine_tuned_responses = []
cot_rag_responses = []

for prompt in test_prompts:
    print(f"\nPrompt:\n{prompt}\n")

    # Base model response
    base_resp = generate_standard_clean(base_model, tokenizer, prompt)
    print(f"Base Model Response:\n{base_resp}\n")
    base_responses.append(base_resp)

    # Fine-tuned model response
    fine_tuned_resp = generate_standard_clean(fine_tuned_model, tokenizer, prompt)
    print(f"Fine-Tuned Model Response:\n{fine_tuned_resp}\n")
    fine_tuned_responses.append(fine_tuned_resp)

    # CoT + RAG response
    cot_resp = rag_cot_generate_clean(fine_tuned_model, tokenizer, prompt)
    print(f"CoT + RAG Response:\n{cot_resp}\n")
    cot_rag_responses.append(cot_resp)

# ---------- Metrics ----------
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")

def compute_metrics(generated, references):
    bleu_scores = []
    rouge_scores = []
    meteor_scores = []
    distinct1_scores = []
    distinct2_scores = []

    for gen, ref in zip(generated, references):
        try:
            bleu = sentence_bleu(
                [word_tokenize(ref.lower())],
                word_tokenize(gen.lower()),
                smoothing_function=SmoothingFunction().method1
            )
        except:
            bleu = 0.0
        bleu_scores.append(bleu)

        try:
            r = rouge.compute(predictions=[gen], references=[ref])['rougeL']
        except:
            r = 0.0
        rouge_scores.append(r)

        try:
            m = meteor.compute(predictions=[gen], references=[ref])['meteor']
        except:
            m = 0.0
        meteor_scores.append(m)

        toks = word_tokenize(gen.lower())
        if len(toks) > 0:
            distinct1_scores.append(len(set(toks)) / len(toks))
        else:
            distinct1_scores.append(0.0)

        bigrams = list(nltk.bigrams(toks))
        if len(bigrams) > 0:
            distinct2_scores.append(len(set(bigrams)) / len(bigrams))
        else:
            distinct2_scores.append(0.0)

    def compute_perplexity(model, texts):
        total_loss = 0.0
        total_tokens = 0
        for text in texts:
            inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
            labels = inputs.input_ids
            with torch.no_grad():
                outputs = model(**inputs, labels=labels)
                loss = outputs.loss
                total_loss += loss.item() * inputs.attention_mask.sum().item()
                total_tokens += inputs.attention_mask.sum().item()
        avg_loss = total_loss / total_tokens if total_tokens else 1e6
        return torch.exp(torch.tensor(avg_loss)).item()

    perplexity = compute_perplexity(fine_tuned_model, generated)  # Using fine-tuned model for perplexity as a reference

    return {
        "BLEU": sum(bleu_scores) / len(bleu_scores),
        "ROUGE-L": sum(rouge_scores) / len(rouge_scores),
        "METEOR": sum(meteor_scores) / len(meteor_scores),
        "Distinct-1": sum(distinct1_scores) / len(distinct1_scores),
        "Distinct-2": sum(distinct2_scores) / len(distinct2_scores),
        "Perplexity": perplexity
    }

# Compute metrics for all models
metrics_base = compute_metrics(base_responses, references)
metrics_finetuned = compute_metrics(fine_tuned_responses, references)
metrics_cot = compute_metrics(cot_rag_responses, references)

# Print comparison
print("\n===== METRICS COMPARISON =====")
print("Base Model:")
for k, v in metrics_base.items():
    print(f"{k}: {v:.4f}")

print("\nFine-Tuned Model:")
for k, v in metrics_finetuned.items():
    print(f"{k}: {v:.4f}")

print("\nFine-Tuned + CoT + RAG Model:")
for k, v in metrics_cot.items():
    print(f"{k}: {v:.4f}")

gc.collect()
torch.cuda.empty_cache()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Using device: cuda
Loading embedding model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Loading base model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading PEFT fine-tuned weights...
Loading tokenizer...

Prompt:
I've been dealing with feelings of sadness and hopelessness for the past few months. I've tried to distract myself with work and other activities, but nothing seems to help. I've been thinking about seeking professional help, but I'm unsure if therapy is the right choice for me. I've heard that therapy can be expensive and time-consuming, and I'm not sure if I can afford it or if I have the energy to commit to it.

Base Model Response:
I've also heard that therapy can be helpful, but I'm not sure if it's worth the investment. I've been struggling with feelings of self-doubt and insecurity, and I'm not sure if therapy can help me with that. I've been dealing with these feelings for as long as I can remember, and I've never felt like I could talk to anyone about them. I've always felt like I was alone in my struggles, and I've never felt like I could trust anyone enough to share my feelings. I've been feeling this way for a

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



===== METRICS COMPARISON =====
Base Model:
BLEU: 0.0122
ROUGE-L: 0.1289
METEOR: 0.1906
Distinct-1: 0.4256
Distinct-2: 0.6399
Perplexity: 2.4306

Fine-Tuned Model:
BLEU: 0.0321
ROUGE-L: 0.1266
METEOR: 0.1477
Distinct-1: 0.5365
Distinct-2: 0.7582
Perplexity: 2.6226

Fine-Tuned + CoT + RAG Model:
BLEU: 0.0654
ROUGE-L: 0.1841
METEOR: 0.2564
Distinct-1: 0.6329
Distinct-2: 0.9295
Perplexity: 4.8467


In [None]:
!pip install streamlit -q
!pip install evaluate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m106.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m44.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:
!wget -q -O - ipv4.icanhazip.com  # Displays your public IP


34.90.39.109


In [None]:
!streamlit run app.py & npx localtunnel --port 8501

[1G[0K⠙
Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠹[1G[0K⠸[1G[0K⠼[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.90.39.109:8501[0m
[0m
[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0Kyour url is: https://yellow-jars-wash.loca.lt
terminate called after throwing an instance of 'std::bad_alloc'
  what():  std::bad_alloc
^C


In [None]:
from flask import Flask, request, render_template_string
import torch
import json
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from sentence_transformers import SentenceTransformer, util
import nltk

# Set up Flask app
app = Flask(__name__)

# Download NLTK data
nltk.download('wordnet', quiet=True)
nltk.download('punkt_tab', quiet=True)

# ---------- Config ----------
def load_model():
    base_model_path = "/content/llama3_model"
    fine_tuned_path = "/content/llama3_finetuned"

    if not os.path.exists(base_model_path):
        raise FileNotFoundError(f"Base model path not found: {base_model_path}")
    if not os.path.exists(fine_tuned_path):
        raise FileNotFoundError(f"Fine-tuned path not found: {fine_tuned_path}")

    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )

    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        quantization_config=quantization_config,
        device_map="auto",
        local_files_only=True
    )
    fine_tuned_model = PeftModel.from_pretrained(base_model, fine_tuned_path)
    tokenizer = AutoTokenizer.from_pretrained(fine_tuned_path, local_files_only=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    return fine_tuned_model, tokenizer

# ---------- Load Knowledge Base ----------
def load_kb():
    kb_path = "/content/combine cbt.json"
    if not os.path.exists(kb_path):
        raise FileNotFoundError(f"Knowledge base path not found: {kb_path}")

    with open(kb_path, 'r', encoding='utf-8') as f:
        kb = json.load(f)
    entries = [entry for entry in kb.values()]
    embedder = SentenceTransformer('all-MiniLM-L6-v2')
    kb_texts = [f"{e.get('Input.client_statement','')} {e.get('Input.res_a','')} {e.get('Input.res_b','')}" for e in entries]
    kb_embeddings = embedder.encode(kb_texts, convert_to_tensor=True)
    return entries, embedder, kb_embeddings, kb_texts

def retrieve_from_kb(prompt, embedder, kb_embeddings, kb_texts, top_k=3):
    prompt_embedding = embedder.encode(prompt, convert_to_tensor=True)
    hits = util.semantic_search(prompt_embedding, kb_embeddings, top_k=top_k)
    retrieved = [knowledge_base_entries[hit['corpus_id']] for hit in hits[0]]
    return retrieved

# ---------- Safety Check ----------
SELF_HARM_KEYWORDS = ["suicide", "i want to die", "kill myself", "end my life", "hurt myself", "i'll kill myself"]
def contains_self_harm(text: str) -> bool:
    return any(k in text.lower() for k in SELF_HARM_KEYWORDS)

def crisis_response():
    return "I'm really sorry you're feeling so overwhelmed. If you're in crisis, please contact emergency services or a hotline like the National Suicide Prevention Lifeline at 988 (US). I'm here to listen, but I can't provide medical advice."

# ---------- CoT Template ----------
COT_TEMPLATE = """
You are a compassionate, friendly listener (not a diagnostician). Do NOT show these steps to the user. Instead, only provide the final supportive response.

Steps (INTERNAL ONLY, not to be shown):
1) Empathic opening
2) Validate feelings
3) If a KB example is relevant, briefly reflect it
4) Ask one open question
5) Offer 1-2 coping suggestions
6) If self-harm is detected, switch to crisis response
7) Close supportively
"""

def build_cot_prompt(user_prompt: str, retrieved_entries: list):
    if retrieved_entries:
        rag_context = "\n\n".join(
            [f"KB example: Client: {entry.get('Input.client_statement','')}\nResponse A: {entry.get('Input.res_a','')}\nResponse B: {entry.get('Input.res_b','')}" for entry in retrieved_entries]
        )
    else:
        rag_context = "No direct KB examples."

    return f"{COT_TEMPLATE}\n\nKnowledgeBaseContext:\n{rag_context}\n\nUser: {user_prompt}\n\nNow provide the final user-facing response only, without showing steps or template. Response:"

# ---------- Generation ----------
def generate_response(model, tokenizer, prompt, max_new_tokens=200, temperature=0.7):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=2048).to(device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=True, temperature=temperature, pad_token_id=tokenizer.pad_token_id)
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return text

def clean_response(full_text, original_prompt):
    if "Response:" in full_text:
        return full_text.split("Response:", 1)[-1].strip()
    return full_text.replace(original_prompt, "").strip()

def generate_standard(model, tokenizer, prompt):
    raw = generate_response(model, tokenizer, prompt)
    cleaned = clean_response(raw, prompt)
    return cleaned

def rag_cot_generate(model, tokenizer, prompt, top_k=3):
    if contains_self_harm(prompt):
        return crisis_response()
    retrieved = retrieve_from_kb(prompt, top_k)
    cot_prompt = build_cot_prompt(prompt, retrieved)
    raw = generate_response(model, tokenizer, cot_prompt)
    cleaned = clean_response(raw, cot_prompt)
    return cleaned

# Load models and KB globally (to avoid reloading per request)
try:
    model, tokenizer = load_model()
    knowledge_base_entries, embedder, kb_embeddings, kb_texts = load_kb()
except Exception as e:
    print(f"Error during initialization: {e}")
    raise

# HTML template for the web interface
HTML_TEMPLATE = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Mental Health AI</title>
    <style>
        body {
            background-color: black;
            color: white;
            font-family: Arial, sans-serif;
            text-align: center;
            margin: 0;
            padding: 20px;
        }
        h1 {
            color: #fff;
        }
        .input-container {
            margin: 20px auto;
            width: 60%;
        }
        textarea {
            background-color: #333;
            color: white;
            width: 100%;
            height: 100px;
            padding: 10px;
            border: 1px solid white;
            border-radius: 5px;
            resize: vertical;
        }
        button {
            background-color: #111;
            color: white;
            border: 1px solid white;
            padding: 10px 20px;
            margin-top: 10px;
            cursor: pointer;
        }
        button:hover {
            background-color: #222;
        }
        .output-container {
            display: flex;
            justify-content: space-between;
            margin-top: 20px;
            width: 80%;
            margin-left: auto;
            margin-right: auto;
        }
        .output-box {
            background-color: #111;
            color: white;
            padding: 15px;
            border: 1px solid white;
            border-radius: 5px;
            width: 48%;
            text-align: left;
            min-height: 200px;
        }
        .footer {
            margin-top: 20px;
            font-size: 0.9em;
            color: #ccc;
        }
    </style>
</head>
<body>
    <h1>🧠 Mental Health Conversational AI</h1>
    <hr>
    <div class="input-container">
        <form method="post">
            <textarea name="prompt" placeholder="How are you feeling today? Share your thoughts on stress, sadness, or depression..."></textarea>
            <button type="submit">Generate Response</button>
        </form>
    </div>
    {% if fine_tuned_response and cot_rag_response %}
        <div class="output-container">
            <div class="output-box"><strong>Fine-Tuned Model:</strong><br>{{ fine_tuned_response }}</div>
            <div class="output-box"><strong>Fine-Tuned + RAG + CoT:</strong><br>{{ cot_rag_response }}</div>
        </div>
    {% elif warning %}
        <p style="color: yellow;">{{ warning }}</p>
    {% endif %}
    <div class="footer">
        <hr>
        <p>**Powered by Fine-Tuned LLaMA-3-8B with CBT Knowledge Base** | For educational purposes only. Seek professional help for mental health concerns.</p>
    </div>
</body>
</html>
"""

# Routes
@app.route('/', methods=['GET', 'POST'])
def index():
    fine_tuned_response = None
    cot_rag_response = None
    warning = None

    if request.method == 'POST':
        prompt = request.form['prompt']
        if prompt:
            with st.spinner("Generating responses..."):  # Using st.spinner for progress (optional, remove if not desired)
                fine_tuned_response = generate_standard(model, tokenizer, prompt)
                cot_rag_response = rag_cot_generate(model, tokenizer, prompt, top_k=3)
                if contains_self_harm(prompt):
                    warning = "Self-harm detected. Please seek immediate professional help."
        else:
            warning = "Please enter a prompt."

    return render_template_string(HTML_TEMPLATE, fine_tuned_response=fine_tuned_response, cot_rag_response=cot_rag_response, warning=warning)

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000, debug=True)

In [None]:
!pip install pyngrok -q


In [None]:
!pip install pyngrok -q
from pyngrok import ngrok
import os

# Set ngrok authtoken
ngrok.set_auth_token('348BHN8hYkCABquOpe7Nc1b7mem_5gvX7PFuZ9Bjz3LdSRsxd')  # Your provided token

# Check if app.py exists and start Flask app
if os.path.exists('/content/app.py'):
    # Use `nohup` to ensure the process continues after the Colab cell finishes
    # Redirect stdout/stderr to files to avoid cluttering cell output
    get_ipython().system_raw('nohup python3 /content/app.py > app.log 2>&1 &')
    print("Flask app started in background.")
else:
    print("Error: /content/app.py not found. Please upload the file.")

# Create ngrok tunnel with explicit HTTP protocol
try:
    public_url = ngrok.connect(
        addr='5000',  # Flask default port
        proto='http'  # Explicitly set protocol to http
    )
    print('Public URL:', public_url)

    # Display app logs to help with debugging
    print("\n--- Flask App Logs ---")
    if os.path.exists('app.log'):
        with open('app.log', 'r') as f:
            print(f.read())
    else:
        print("app.log not found.")

except Exception as e:
    print(f"Ngrok error: {e}")

Flask app started in background.
Public URL: NgrokTunnel: "https://camelia-superimproved-joane.ngrok-free.dev" -> "http://localhost:5000"

--- Flask App Logs ---



In [None]:
import os
import gc
import sys
import time
import json
import torch
import pandas as pd
import nltk

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from sentence_transformers import SentenceTransformer, util

from nltk.tokenize import word_tokenize

nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)

# ---------- Config ----------
knowledge_base_path = "/content/combine cbt.json"
base_model_path = "/content/llama3_model/models--meta-llama--Meta-Llama-3-8B/snapshots/8cde5ca8380496c9a6cc7ef3a8b46a0372a1d920"
fine_tuned_path = "./llama3_finetuned"
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# ---------- Load Knowledge Base ----------
with open(knowledge_base_path, 'r', encoding='utf-8') as f:
    knowledge_base = json.load(f)
knowledge_base_entries = [entry for entry in knowledge_base.values()]

def _kb_to_text(entry):
    parts = []
    for k in ['Input.client_statement', 'Input.res_a', 'Input.res_b', 'ori_text', 'situation']:
        if k in entry and entry[k]:
            parts.append(str(entry[k]))
    return " ".join(parts)

kb_texts = [_kb_to_text(e) for e in knowledge_base_entries]

print("Loading embedding model...")
embedder = SentenceTransformer('all-MiniLM-L6-v2')
kb_embeddings = embedder.encode(kb_texts, convert_to_tensor=True)

def retrieve_from_kb(prompt, top_k=3):
    prompt_embedding = embedder.encode(prompt, convert_to_tensor=True)
    hits = util.semantic_search(prompt_embedding, kb_embeddings, top_k=top_k)
    retrieved = []
    for hit in hits[0]:
        idx = hit['corpus_id']
        if 0 <= idx < len(knowledge_base_entries):
            retrieved.append(knowledge_base_entries[idx])
    return retrieved

# ---------- Load Models ----------
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# Load base model
print("Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    quantization_config=quantization_config,
    device_map="auto",
    local_files_only=True
)
base_model.eval()

# Load fine-tuned model
print("Loading PEFT fine-tuned weights...")
fine_tuned_model = PeftModel.from_pretrained(base_model, fine_tuned_path)
fine_tuned_model.eval()

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(fine_tuned_path if os.path.exists(os.path.join(fine_tuned_path, 'adapter_config.json')) else base_model_path, local_files_only=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# ---------- Safety Check ----------
SELF_HARM_KEYWORDS = [
    "suicide", "i want to die", "kill myself", "end my life", "hang myself", "hurt myself",
    "i can't go on", "want to die", "i'm going to die", "i'll kill myself"
]

def contains_self_harm(text: str) -> bool:
    t = text.lower()
    return any(k in t for k in SELF_HARM_KEYWORDS)

def crisis_response():
    return ("I'm really sorry you're feeling so overwhelmed. I can't provide medical care, "
            "but if you're thinking about hurting yourself, please contact your local emergency services "
            "or a crisis line right now. If you'd like, I can help you find resources or steps to stay safe.")

# ---------- CoT Template ----------
COT_TEMPLATE = """
You are a compassionate, friendly listener (not a diagnostician). Do NOT show these steps to the user. Instead, only provide the final supportive response.

Steps (INTERNAL ONLY, not to be shown):
1) Empathic opening
2) Validate feelings
3) If a KB example is relevant, briefly reflect it
4) Ask one open question
5) Offer 1-2 coping suggestions
6) If self-harm is detected, switch to crisis response
7) Close supportively
"""

def build_cot_prompt(user_prompt: str, retrieved_entries: list):
    if retrieved_entries:
        rag_context = "\n\n".join(
            [
                f"KB example: Client: {entry.get('Input.client_statement','')}\n"
                f"Response A: {entry.get('Input.res_a','')}\nResponse B: {entry.get('Input.res_b','')}"
                for entry in retrieved_entries
            ]
        )
    else:
        rag_context = "No direct KB examples."

    return (
        f"{COT_TEMPLATE}\n\n"
        f"KnowledgeBaseContext:\n{rag_context}\n\n"
        f"User: {user_prompt}\n\n"
        "Now provide the final user-facing response only, without showing steps or template. Response:"
    )

# ---------- Generation Helpers ----------
def _generate(model, tokenizer, prompt, max_new_tokens=200, temperature=0.7):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=2048)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            pad_token_id=tokenizer.pad_token_id
        )
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return text

def _clean_response(full_text, original_prompt):
    if "Response:" in full_text:
        return full_text.split("Response:", 1)[-1].strip()
    return full_text.replace(original_prompt, "").strip()

def generate_standard_clean(model, tokenizer, prompt):
    raw = _generate(model, tokenizer, prompt)
    cleaned = _clean_response(raw, prompt)
    return cleaned

def rag_cot_generate_clean(model, tokenizer, prompt, top_k=3):
    if contains_self_harm(prompt):
        return crisis_response()
    retrieved = retrieve_from_kb(prompt, top_k)
    cot_prompt = build_cot_prompt(prompt, retrieved)
    raw = _generate(model, tokenizer, cot_prompt)
    cleaned = _clean_response(raw, cot_prompt)
    return cleaned

# ---------- Chat Loop ----------
print("Welcome to the Mental Health Conversational AI! Type 'exit' to end the chat.")
while True:
    prompt = input("\nYour message: ")
    if prompt.lower() == 'exit':
        print("Thank you for chatting. Take care!")
        break

    print("\nGenerating responses...")

    # Base model response
    base_resp = generate_standard_clean(base_model, tokenizer, prompt)
    print("Base Model Response:")
    print(base_resp)

    # Fine-tuned model response
    fine_tuned_resp = generate_standard_clean(fine_tuned_model, tokenizer, prompt)
    print("Fine-Tuned Model Response:")
    print(fine_tuned_resp)

    # CoT + RAG response
    cot_resp = rag_cot_generate_clean(fine_tuned_model, tokenizer, prompt)
    print("Fine-Tuned + CoT + RAG Response:")
    print(cot_resp)

    print()  # Add a newline for readability

# Cleanup
gc.collect()
torch.cuda.empty_cache()

In [None]:
import os
import gc
import sys
import time
import torch
import nltk
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)

# ---------- Config ----------
base_model_path = "/content/llama3_model/models--meta-llama--Meta-Llama-3-8B/snapshots/8cde5ca8380496c9a6cc7ef3a8b46a0372a1d920"
fine_tuned_path = "/content/llama3_finetuned"
device = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Using device: {device}")

# Memory optimization
torch.cuda.empty_cache()

# ---------- Load Models ----------
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    low_cpu_mem_usage=True
)

try:
    logger.info("Loading base model...")
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        quantization_config=quantization_config,
        device_map="auto",
        local_files_only=True
    )
    base_model.eval()
    logger.info("Base model loaded successfully.")
except Exception as e:
    logger.error(f"Error loading base model: {e}")
    sys.exit(1)

try:
    logger.info("Loading PEFT fine-tuned weights...")
    fine_tuned_model = PeftModel.from_pretrained(base_model, fine_tuned_path)
    fine_tuned_model.eval()
    logger.info("Fine-tuned model loaded successfully.")
except Exception as e:
    logger.error(f"Error loading fine-tuned model: {e}")
    sys.exit(1)

try:
    logger.info("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(fine_tuned_path if os.path.exists(os.path.join(fine_tuned_path, 'adapter_config.json')) else base_model_path, local_files_only=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    logger.info("Tokenizer loaded successfully.")
except Exception as e:
    logger.error(f"Error loading tokenizer: {e}")
    sys.exit(1)

# ---------- Safety Check ----------
SELF_HARM_KEYWORDS = [
    "suicide", "i want to die", "kill myself", "end my life", "hang myself", "hurt myself",
    "i can't go on", "want to die", "i'm going to die", "i'll kill myself"
]

def contains_self_harm(text: str) -> bool:
    t = text.lower()
    return any(k in t for k in SELF_HARM_KEYWORDS)

def crisis_response():
    return ("I'm really sorry you're feeling so overwhelmed. I can't provide medical care, "
            "but if you're thinking about hurting yourself, please contact your local emergency services "
            "or a crisis line right now. If you'd like, I can help you find resources or steps to stay safe.")

# ---------- Generation Helpers ----------
def _generate(model, tokenizer, prompt, max_new_tokens=100, temperature=0.7):
    try:
        inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=2048)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=temperature,
                pad_token_id=tokenizer.pad_token_id
            )
        text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return text
    except Exception as e:
        logger.error(f"Generation error: {e}")
        return "Error generating response. Please try again."

def _clean_response(full_text, original_prompt):
    if "Response:" in full_text:
        return full_text.split("Response:", 1)[-1].strip()
    return full_text.replace(original_prompt, "").strip()

def generate_standard_clean(model, tokenizer, prompt):
    raw = _generate(model, tokenizer, prompt)
    cleaned = _clean_response(raw, prompt)
    if contains_self_harm(prompt):
        return crisis_response()
    return cleaned

# ---------- Chat Loop ----------
logger.info("Welcome to the Fine-Tuned Mental Health Conversational AI! Type 'exit' to end the chat.")
while True:
    prompt = input("\nYour message: ")
    if prompt.lower() == 'exit':
        logger.info("Thank you for chatting. Take care!")
        break

    logger.info("Generating response...")
    try:
        response = generate_standard_clean(fine_tuned_model, tokenizer, prompt)
        print("Fine-Tuned Model Response:")
        print(response)
    except Exception as e:
        logger.error(f"Error during chat: {e}")

    print()  # Add a newline for readability

# Cleanup
gc.collect()
torch.cuda.empty_cache()

In [None]:
import os
import gc
import sys
import time
import json
import torch
import nltk

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from sentence_transformers import SentenceTransformer, util

from nltk.tokenize import word_tokenize

nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)

# ---------- Config ----------
knowledge_base_path = "/content/combine cbt.json"
base_model_path = "/content/llama3_model/models--meta-llama--Meta-Llama-3-8B/snapshots/8cde5ca8380496c9a6cc7ef3a8b46a0372a1d920"
fine_tuned_path = "./llama3_finetuned"
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# ---------- Load Knowledge Base ----------
with open(knowledge_base_path, 'r', encoding='utf-8') as f:
    knowledge_base = json.load(f)
knowledge_base_entries = [entry for entry in knowledge_base.values()]

def _kb_to_text(entry):
    parts = []
    for k in ['Input.client_statement', 'Input.res_a', 'Input.res_b', 'ori_text', 'situation']:
        if k in entry and entry[k]:
            parts.append(str(entry[k]))
    return " ".join(parts)

kb_texts = [_kb_to_text(e) for e in knowledge_base_entries]

print("Loading embedding model...")
embedder = SentenceTransformer('all-MiniLM-L6-v2')
kb_embeddings = embedder.encode(kb_texts, convert_to_tensor=True)

def retrieve_from_kb(prompt, top_k=3):
    prompt_embedding = embedder.encode(prompt, convert_to_tensor=True)
    hits = util.semantic_search(prompt_embedding, kb_embeddings, top_k=top_k)
    retrieved = []
    for hit in hits[0]:
        idx = hit['corpus_id']
        if 0 <= idx < len(knowledge_base_entries):
            retrieved.append(knowledge_base_entries[idx])
    return retrieved

# ---------- Load Models ----------
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# Load base model
print("Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    quantization_config=quantization_config,
    device_map="auto",
    local_files_only=True
)
base_model.eval()

# Load fine-tuned model
print("Loading PEFT fine-tuned weights...")
fine_tuned_model = PeftModel.from_pretrained(base_model, fine_tuned_path)
fine_tuned_model.eval()

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(fine_tuned_path if os.path.exists(os.path.join(fine_tuned_path, 'adapter_config.json')) else base_model_path, local_files_only=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# ---------- Safety Check ----------
SELF_HARM_KEYWORDS = [
    "suicide", "i want to die", "kill myself", "end my life", "hang myself", "hurt myself",
    "i can't go on", "want to die", "i'm going to die", "i'll kill myself"
]

def contains_self_harm(text: str) -> bool:
    t = text.lower()
    return any(k in t for k in SELF_HARM_KEYWORDS)

def crisis_response():
    return ("I'm really sorry you're feeling so overwhelmed. I can't provide medical care, "
            "but if you're thinking about hurting yourself, please contact your local emergency services "
            "or a crisis line right now. If you'd like, I can help you find resources or steps to stay safe.")

# ---------- CoT Template ----------
COT_TEMPLATE = """
You are a compassionate, friendly listener (not a diagnostician). Do NOT show these steps to the user. Instead, only provide the final supportive response.

Steps (INTERNAL ONLY, not to be shown):
1) Empathic opening
2) Validate feelings
3) If a KB example is relevant, briefly reflect it
4) Ask one open question
5) Offer 1-2 coping suggestions
6) If self-harm is detected, switch to crisis response
7) Close supportively
"""

def build_cot_prompt(user_prompt: str, retrieved_entries: list):
    if retrieved_entries:
        rag_context = "\n\n".join(
            [
                f"KB example: Client: {entry.get('Input.client_statement','')}\n"
                f"Response A: {entry.get('Input.res_a','')}\nResponse B: {entry.get('Input.res_b','')}"
                for entry in retrieved_entries
            ]
        )
    else:
        rag_context = "No direct KB examples."

    return (
        f"{COT_TEMPLATE}\n\n"
        f"KnowledgeBaseContext:\n{rag_context}\n\n"
        f"User: {user_prompt}\n\n"
        "Now provide the final user-facing response only, without showing steps or template. Response:"
    )

# ---------- Generation Helpers ----------
def _generate(model, tokenizer, prompt, max_new_tokens=200, temperature=0.7):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=2048)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            pad_token_id=tokenizer.pad_token_id
        )
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return text

def _clean_response(full_text, original_prompt):
    if "Response:" in full_text:
        return full_text.split("Response:", 1)[-1].strip()
    return full_text.replace(original_prompt, "").strip()

def rag_cot_generate_clean(model, tokenizer, prompt, top_k=3):
    if contains_self_harm(prompt):
        return crisis_response()
    retrieved = retrieve_from_kb(prompt, top_k)
    cot_prompt = build_cot_prompt(prompt, retrieved)
    raw = _generate(model, tokenizer, cot_prompt)
    cleaned = _clean_response(raw, cot_prompt)
    return cleaned

# ---------- Chat Loop ----------
print("Welcome to the Fine-Tuned Mental Health Conversational AI with CoT + RAG! Type 'exit' to end the chat.")
while True:
    prompt = input("\nYour message: ")
    if prompt.lower() == 'exit':
        print("Thank you for chatting. Take care!")
        break

    print("\nGenerating response...")
    response = rag_cot_generate_clean(fine_tuned_model, tokenizer, prompt)
    print("Fine-Tuned + CoT + RAG Response:")
    print(response)

    print()  # Add a newline for readability

# Cleanup
gc.collect()
torch.cuda.empty_cache()