In [None]:
!pip install -q \
    torch==2.4.1 \
    transformers==4.37.2 \
    peft==0.5.0 \
    bitsandbytes==0.41.3 \
    sentence-transformers==2.2.2 \
    huggingface_hub==0.19.4 \
    accelerate==0.27.2

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.4/129.4 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m797.0/797.0 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m139.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.6/85.6 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.7/311.7 kB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m29.9 MB/s[0m eta 

In [None]:
import os
from huggingface_hub import snapshot_download
from google.colab import userdata

# Set your Hugging Face token securely using Colab's secrets manager
os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")

# Model repo - choose the correct Llama 3 repo name from Hugging Face
model_repo = "meta-llama/Meta-Llama-3-8B"  # Change this if you want a different version

# Download the model weights and files
print("Starting download of Llama 3 model...")
snapshot_download(
    repo_id=model_repo,
    cache_dir="./llama3_model",
    token=os.environ["HF_TOKEN"],
    resume_download=True
)
print("Download complete! Files saved in ./llama3_model")

Starting download of Llama 3 model...


Fetching 17 files:   0%|          | 0/17 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/36.6k [00:00<?, ?B/s]

LICENSE:   0%|          | 0.00/7.80k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

USE_POLICY.md:   0%|          | 0.00/4.70k [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

consolidated.00.pth:   0%|          | 0.00/16.1G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

params.json:   0%|          | 0.00/211 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/2.18M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

Download complete! Files saved in ./llama3_model


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import re

# Load merged dataset
df_merged = pd.read_csv("/content/merged_dataset.csv")

# Remove duplicate rows
num_duplicates = df_merged.duplicated().sum()
print(f"Number of duplicate rows found: {num_duplicates}")
df_merged = df_merged.drop_duplicates(keep='first').reset_index(drop=True)
print(f"Number of rows after removing duplicates: {df_merged.shape[0]}")

# Remove empty rows (where Context or Response is empty/whitespace)
empty_rows_mask = (df_merged['Context'].str.strip() == '') | (df_merged['Response'].str.strip() == '')
num_empty_rows = empty_rows_mask.sum()
print(f"Number of empty rows found: {num_empty_rows}")
df_merged = df_merged[~empty_rows_mask].reset_index(drop=True)
print(f"Number of rows after removing empty rows: {df_merged.shape[0]}")

# Remove rows with missing values
rows_with_missing = df_merged[df_merged['Context'].isnull() | df_merged['Response'].isnull()]
print(f"Number of rows with missing values found: {rows_with_missing.shape[0]}")
df_merged = df_merged.dropna(subset=['Context', 'Response']).reset_index(drop=True)
print(f"Number of rows after removing missing values: {df_merged.shape[0]}")
print("\nMissing values after removal:")
print(df_merged.isnull().sum())

# Save cleaned dataset for later
df_merged.to_csv("merged_dataset_cleaned.csv", index=False)

# EDA
print(f"Number of rows: {df_merged.shape[0]}")
print(f"Number of columns: {df_merged.shape[1]}")
print("\nData types:")
print(df_merged.dtypes)
print("\nMissing values per column:")
print(df_merged.isnull().sum())

# Visualize missing values
plt.figure(figsize=(10, 6))
sns.heatmap(df_merged.isnull(), cbar=False, cmap='viridis')
plt.title('Missing Values Heatmap')
plt.savefig("missing_values_heatmap.png")
plt.close()

# Word counts
def count_words(text):
    if isinstance(text, str):return len(text.split())
    return 0

df_merged['Context_word_count'] = df_merged['Context'].apply(count_words)
df_merged['Response_word_count'] = df_merged['Response'].apply(count_words)
print("\nDescriptive statistics for word counts:")
print(df_merged[['Context_word_count', 'Response_word_count']].describe())

# Most common words
def most_common_words(text_series, n=20):
    all_words = []
    for text in text_series.dropna():
        all_words.extend(re.findall(r'\w+', text.lower()))
    return Counter(all_words).most_common(n)

common_context_words = most_common_words(df_merged['Context'])
common_response_words = most_common_words(df_merged['Response'])
print("\nMost common words in 'Context':")
print(common_context_words)
print("\nMost common words in 'Response':")
print(common_response_words)

# Visualize the distribution of word counts
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
sns.histplot(df_merged['Context_word_count'], bins=50, kde=True)
plt.title('Distribution of Context Word Count')
plt.xlabel('Word Count')
plt.ylabel('Frequency')
plt.subplot(1, 2, 2)
sns.histplot(df_merged['Response_word_count'], bins=50, kde=True, color='orange')
plt.title('Distribution of Response Word Count')
plt.xlabel('Word Count')
plt.ylabel('Frequency')
plt.tight_layout()
plt.savefig("word_count_distribution.png")
plt.close()

Number of duplicate rows found: 4344
Number of rows after removing duplicates: 85847
Number of empty rows found: 0
Number of rows after removing empty rows: 85847
Number of rows with missing values found: 746
Number of rows after removing missing values: 85101

Missing values after removal:
Context     0
Response    0
dtype: int64
Number of rows: 85101
Number of columns: 2

Data types:
Context     object
Response    object
dtype: object

Missing values per column:
Context     0
Response    0
dtype: int64

Descriptive statistics for word counts:
       Context_word_count  Response_word_count
count        85101.000000         85101.000000
mean           175.250455           244.935124
std            104.287518           123.429608
min              1.000000             1.000000
25%             94.000000           148.000000
50%            157.000000           225.000000
75%            235.000000           327.000000
max           2664.000000          1865.000000

Most common words in 'Con

In [None]:
# Run this in a Colab cell BEFORE importing sentence_transformers
!pip install huggingface_hub==0.19.4 --upgrade --force-reinstall

# After pip finishes, restart the kernel to clear imports/ABI:
import IPython
IPython.Application.instance().kernel.do_shutdown(True)


[0mCollecting huggingface_hub==0.19.4
  Using cached huggingface_hub-0.19.4-py3-none-any.whl.metadata (14 kB)
Collecting filelock (from huggingface_hub==0.19.4)
  Using cached filelock-3.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting fsspec>=2023.5.0 (from huggingface_hub==0.19.4)
  Using cached fsspec-2025.9.0-py3-none-any.whl.metadata (10 kB)
Collecting requests (from huggingface_hub==0.19.4)
  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting tqdm>=4.42.1 (from huggingface_hub==0.19.4)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting pyyaml>=5.1 (from huggingface_hub==0.19.4)
  Using cached pyyaml-6.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (2.4 kB)
Collecting typing-extensions>=3.7.4.3 (from huggingface_hub==0.19.4)
  Using cached typing_extensions-4.15.0-py3-none-any.whl.metadata (3.3 kB)
Collecting packaging>=20.9 (from huggingface_hub==0.19.4)
  Using cached packaging-25.0-py

{'status': 'ok', 'restart': True}

In [None]:
import importlib, sys, torch
print("Python:", sys.version.splitlines()[0])
print("Torch:", getattr(torch, "__version__", None))
print("CUDA available:", torch.cuda.is_available())
print("Torch CUDA reported:", getattr(torch, "version", None) and torch.version.cuda)

for pkg in ("triton","bitsandbytes","peft","transformers","safetensors"):
    try:
        m = importlib.import_module(pkg)
        print(pkg, "OK", getattr(m, "__version__", None))
    except Exception as e:
        print(pkg, "MISSING/FAILED ->", e)

Python: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
Torch: 2.4.1+cu121
CUDA available: True
Torch CUDA reported: 12.1
triton OK 3.0.0
bitsandbytes OK None
peft OK 0.5.0
transformers OK 4.37.2
safetensors OK 0.6.2


In [None]:
import os, gc, json, time, torch
# Install compatible versions of peft and transformers
!pip install peft==0.5.0 transformers==4.37.2 -q
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from sentence_transformers import SentenceTransformer, util

print("torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
!nvidia-smi

# paths (adjust if needed)
KB_PATH = "/content/combine cbt.json"
BASE_MODEL_PATH = "/content/llama3_model/models--meta-llama--Meta-Llama-3-8B/snapshots/8cde5ca8380496c9a6cc7ef3a8b46a0372a1d920"
FINE_TUNED_PATH = "/content/llama3_finetuned"

# ---------- Load KB (on CPU only) ----------
with open(KB_PATH, 'r', encoding='utf-8') as f:
    kb = json.load(f)
# Reduce KB size for testing
kb_entries = [v for v in kb.values()][:100] # Using only the first 100 entries

def kb_entry_to_text(e):
    parts = []
    for k in ['Input.client_statement','Input.res_a','Input.res_b','ori_text','situation']:
        if k in e and e[k]:
            parts.append(str(e[k]))
    return " ".join(parts)

kb_texts = [kb_entry_to_text(e) for e in kb_entries]
print("KB size:", len(kb_texts))

print("Loading SentenceTransformer ON CPU (to avoid GPU peak)...")
embedder = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
# compute embeddings on CPU (convert_to_tensor True returns CPU tensor)
kb_embeddings = embedder.encode(kb_texts, convert_to_tensor=True, device='cpu')
print("KB embeddings shape:", kb_embeddings.shape)

# free some memory, just in case
gc.collect()
torch.cuda.empty_cache()
time.sleep(1)

# ---------- Bits & Bytes config (allow CPU offload to avoid OOM) ----------
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    llm_int8_enable_fp32_cpu_offload=True
)

# free again, then load model
gc.collect()
torch.cuda.empty_cache()
time.sleep(1)
print("Loading base model (quantized, device_map='auto')...")

try:
    base = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_PATH,
        quantization_config=bnb_config,
        device_map="auto",
        local_files_only=True,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True
    )
    print("Base model loaded.")
except Exception as e:
    print("Failed loading base model:", e)
    raise

# attach PEFT weights
print("Attaching PEFT weights...")
try:
    model = PeftModel.from_pretrained(base, FINE_TUNED_PATH, local_files_only=True)
    print("PEFT loaded.")
except Exception as e:
    print("Failed loading PEFT:", e)
    raise

# tokenizer
print("Loading tokenizer...")
try:
    tokenizer = AutoTokenizer.from_pretrained(FINE_TUNED_PATH, local_files_only=True)
except Exception:
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH, local_files_only=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# show model device distribution summary
dev_counts = {}
for n, p in list(model.named_parameters())[:200]:
    d = str(p.device)
    dev_counts[d] = dev_counts.get(d, 0) + 1
print("Sample parameter device distribution:", dev_counts)

print("Model and tokenizer ready. VRAM usage now:")
!nvidia-smi --query-gpu=memory.used,memory.total --format=csv,noheader,nounits

# Helper: generate wrapper
def generate_with_model(prompt, max_new_tokens=200, temperature=0.7):
    device = next(model.parameters()).device
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=2048)
    if torch.cuda.is_available() and any('cuda' in str(p.device) for p in model.parameters()):
        inputs = {k:v.to('cuda') for k,v in inputs.items()}
    else:
        pass
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=True, temperature=temperature, pad_token_id=tokenizer.pad_token_id)
    txt = tokenizer.decode(outputs[0].cpu(), skip_special_tokens=True)
    return txt

# ---------- Safety Check ----------
SELF_HARM_KEYWORDS = [
    "suicide", "i want to die", "kill myself", "end my life", "hang myself", "hurt myself",
    "i can't go on", "want to die", "i'm going to die", "i'll kill myself"
]

def contains_self_harm(text: str) -> bool:
    t = text.lower()
    return any(k in t for k in SELF_HARM_KEYWORDS)

def crisis_response():
    return ("I'm really sorry you're feeling so overwhelmed. I can't provide medical care, "
            "but if you're thinking about hurting yourself, please contact your local emergency services "
            "or a crisis line right now. If you'd like, I can help you find resources or steps to stay safe.")

# ---------- Chat Loop ----------
print("Welcome to the Fine-Tuned Mental Health Conversational AI! Type 'exit' to end the chat.")
while True:
    prompt = input("\nYour message: ")
    if prompt.lower() == 'exit':
        print("Thank you for chatting. Take care!")
        break

    print("\nGenerating response...")
    try:
        response = generate_with_model(prompt)
        if contains_self_harm(prompt):
            response = crisis_response()
        print("Fine-Tuned Model Response:")
        print(response)
    except Exception as e:
        print(f"Error generating response: {e}")

    print()  # Add a newline for readability

# Cleanup
gc.collect()
torch.cuda.empty_cache()

torch: 2.4.1+cu121
CUDA available: True
Sat Nov  1 01:08:00 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          Off |   00000000:00:05.0 Off |                    0 |
| N/A   33C    P0             54W /  400W |       5MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
        

.gitattributes: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/90.4M [00:00<?, ?B/s]

model_O1.onnx:   0%|          | 0.00/90.4M [00:00<?, ?B/s]

model_O2.onnx:   0%|          | 0.00/90.3M [00:00<?, ?B/s]

model_O3.onnx:   0%|          | 0.00/90.3M [00:00<?, ?B/s]

model_O4.onnx:   0%|          | 0.00/45.2M [00:00<?, ?B/s]

model_qint8_arm64.onnx:   0%|          | 0.00/23.0M [00:00<?, ?B/s]

model_qint8_avx512.onnx:   0%|          | 0.00/23.0M [00:00<?, ?B/s]

model_qint8_avx512_vnni.onnx:   0%|          | 0.00/23.0M [00:00<?, ?B/s]

model_quint8_avx2.onnx:   0%|          | 0.00/23.0M [00:00<?, ?B/s]

openvino_model.bin:   0%|          | 0.00/90.3M [00:00<?, ?B/s]

openvino_model.xml: 0.00B [00:00, ?B/s]

openvino_model_qint8_quantized.bin:   0%|          | 0.00/22.9M [00:00<?, ?B/s]

openvino_model_qint8_quantized.xml: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

KB embeddings shape: torch.Size([100, 384])
Loading base model (quantized, device_map='auto')...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Base model loaded.
Attaching PEFT weights...
PEFT loaded.
Loading tokenizer...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Sample parameter device distribution: {'cuda:0': 200}
Model and tokenizer ready. VRAM usage now:
6105, 81920
Welcome to the Fine-Tuned Mental Health Conversational AI! Type 'exit' to end the chat.

Your message: exit
Thank you for chatting. Take care!


In [None]:
# ====================================================
# ✅ Environment & CUDA Cleanup
# ====================================================
import os
import gc
import torch

# Clean up GPU memory before loading big models
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("Using device:", "cuda" if torch.cuda.is_available() else "cpu")

# ====================================================
# ✅ Imports
# ====================================================
import sys
import time
import json
import nltk
import pandas as pd

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from sentence_transformers import SentenceTransformer, util
from nltk.tokenize import word_tokenize

nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)

# ====================================================
# ✅ Paths & Config
# ====================================================
knowledge_base_path = "/content/combine cbt.json"
base_model_path = "/content/llama3_model/models--meta-llama--Meta-Llama-3-8B/snapshots/8cde5ca8380496c9a6cc7ef3a8b46a0372a1d920"
fine_tuned_path = "/content/llama3_finetuned"  # Adjusted to match Colab path
device = "cuda" if torch.cuda.is_available() else "cpu"

# ====================================================
# ✅ Load Knowledge Base
# ====================================================
with open(knowledge_base_path, 'r', encoding='utf-8') as f:
    knowledge_base = json.load(f)
knowledge_base_entries = [entry for entry in knowledge_base.values()]

def _kb_to_text(entry):
    parts = []
    for k in ['Input.client_statement', 'Input.res_a', 'Input.res_b', 'ori_text', 'situation']:
        if k in entry and entry[k]:
            parts.append(str(entry[k]))
    return " ".join(parts)

kb_texts = [_kb_to_text(e) for e in knowledge_base_entries]

print("Loading embedding model...")
embedder = SentenceTransformer('all-MiniLM-L6-v2')
kb_embeddings = embedder.encode(kb_texts, convert_to_tensor=True)

def retrieve_from_kb(prompt, top_k=3):
    prompt_embedding = embedder.encode(prompt, convert_to_tensor=True)
    hits = util.semantic_search(prompt_embedding, kb_embeddings, top_k=top_k)
    retrieved = []
    for hit in hits[0]:
        idx = hit['corpus_id']
        if 0 <= idx < len(knowledge_base_entries):
            retrieved.append(knowledge_base_entries[idx])
    return retrieved

# ====================================================
# ✅ Load Base Model (4-bit Quantized)
# ====================================================
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

print("Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    quantization_config=quantization_config,
    device_map="auto",
    local_files_only=True
)
base_model.eval()

# ====================================================
# ✅ Load PEFT Fine-Tuned Weights
# ====================================================
print("Loading PEFT fine-tuned weights...")
fine_tuned_model = PeftModel.from_pretrained(base_model, fine_tuned_path)
fine_tuned_model.eval()

# ====================================================
# ✅ Load Tokenizer (Always from Base Model)
# ====================================================
print("Loading tokenizer from base model path...")
tokenizer = AutoTokenizer.from_pretrained(base_model_path, local_files_only=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# ====================================================
# ✅ Safety Check (Self-Harm Detection)
# ====================================================
SELF_HARM_KEYWORDS = [
    "suicide", "i want to die", "kill myself", "end my life", "hang myself", "hurt myself",
    "i can't go on", "want to die", "i'm going to die", "i'll kill myself"
]

def contains_self_harm(text: str) -> bool:
    t = text.lower()
    return any(k in t for k in SELF_HARM_KEYWORDS)

def crisis_response():
    return ("I'm really sorry you're feeling so overwhelmed. I can't provide medical care, "
            "but if you're thinking about hurting yourself, please contact your local emergency services "
            "or a crisis line right now. If you'd like, I can help you find resources or steps to stay safe.")

# ====================================================
# ✅ CoT + RAG Prompt Template
# ====================================================
COT_TEMPLATE = """
You are a compassionate, friendly listener (not a diagnostician). Do NOT show these steps to the user. Instead, only provide the final supportive response.

Steps (INTERNAL ONLY, not to be shown):
1) Empathic opening
2) Validate feelings
3) If a KB example is relevant, briefly reflect it
4) Ask one open question
5) Offer 1-2 coping suggestions
6) If self-harm is detected, switch to crisis response
7) Close supportively
"""

def build_cot_prompt(user_prompt: str, retrieved_entries: list):
    if retrieved_entries:
        rag_context = "\n\n".join(
            [
                f"KB example: Client: {entry.get('Input.client_statement','')}\n"
                f"Response A: {entry.get('Input.res_a','')}\nResponse B: {entry.get('Input.res_b','')}"
                for entry in retrieved_entries
            ]
        )
    else:
        rag_context = "No direct KB examples."

    return (
        f"{COT_TEMPLATE}\n\n"
        f"KnowledgeBaseContext:\n{rag_context}\n\n"
        f"User: {user_prompt}\n\n"
        "Now provide the final user-facing response only, without showing steps or template. Response:"
    )

# ====================================================
# ✅ Generation Functions
# ====================================================
def _generate(model, tokenizer, prompt, max_new_tokens=200, temperature=0.7):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=2048)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            pad_token_id=tokenizer.pad_token_id
        )
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return text

def _clean_response(full_text, original_prompt):
    if "Response:" in full_text:
        return full_text.split("Response:", 1)[-1].strip()
    return full_text.replace(original_prompt, "").strip()

def generate_standard_clean(model, tokenizer, prompt):
    raw = _generate(model, tokenizer, prompt)
    return _clean_response(raw, prompt)

def rag_cot_generate_clean(model, tokenizer, prompt, top_k=3):
    if contains_self_harm(prompt):
        return crisis_response()
    retrieved = retrieve_from_kb(prompt, top_k)
    cot_prompt = build_cot_prompt(prompt, retrieved)
    raw = _generate(model, tokenizer, cot_prompt)
    return _clean_response(raw, cot_prompt)

# ====================================================
# ✅ Interactive Chat Loop
# ====================================================
print("\n✅ Welcome to the Mental Health Conversational AI with CoT + RAG!")
print("Type 'exit' to end the chat.")

while True:
    try:
        prompt = input("\nYour message: ")
        if prompt.lower().strip() == 'exit':
            print("Thank you for chatting. Take care! ❤️")
            break

        print("\nGenerating responses...\n")

        cot_resp = rag_cot_generate_clean(fine_tuned_model, tokenizer, prompt)
        print("📚 Fine-Tuned + CoT + RAG Response:\n", cot_resp)

    except KeyboardInterrupt:
        print("\nExiting...")
        break
    except Exception as e:
        print(f"⚠️ Error: {e}")
        break

# Cleanup at the end
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

Torch version: 2.4.1+cu121
CUDA available: True
Using device: cuda
Loading embedding model...
Loading base model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading PEFT fine-tuned weights...
Loading tokenizer from base model path...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.



✅ Welcome to the Mental Health Conversational AI with CoT + RAG!
Type 'exit' to end the chat.

Your message: exit
Thank you for chatting. Take care! ❤️


In [None]:
# ====================================================
# 🔧 1. Clean up CUDA memory
# ====================================================
import gc, torch, os

gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()


import torch, gc, os

gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

print("✅ CUDA cleaned")
print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())


# ====================================================
# ✅ Imports
# ====================================================
import sys
import time
import json
import nltk

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from sentence_transformers import SentenceTransformer, util
from nltk.tokenize import word_tokenize

nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)

# ====================================================
# ✅ Paths & Config
# ====================================================
knowledge_base_path = "/content/combine cbt.json"
base_model_path = "/content/llama3_model/models--meta-llama--Meta-Llama-3-8B/snapshots/8cde5ca8380496c9a6cc7ef3a8b46a0372a1d920"
fine_tuned_path = "/content/llama3_finetuned"  # Adjusted to match Colab path
device = "cuda" if torch.cuda.is_available() else "cpu"

# ====================================================
# ✅ Load Knowledge Base
# ====================================================
with open(knowledge_base_path, 'r', encoding='utf-8') as f:
    knowledge_base = json.load(f)
knowledge_base_entries = [entry for entry in knowledge_base.values()]

def _kb_to_text(entry):
    parts = []
    for k in ['Input.client_statement', 'Input.res_a', 'Input.res_b', 'ori_text', 'situation']:
        if k in entry and entry[k]:
            parts.append(str(entry[k]))
    return " ".join(parts)

kb_texts = [_kb_to_text(e) for e in knowledge_base_entries]

print("Loading embedding model...")
embedder = SentenceTransformer('all-MiniLM-L6-v2')
kb_embeddings = embedder.encode(kb_texts, convert_to_tensor=True)

# ====================================================
# ✅ Load Base Model (4-bit Quantized)
# ====================================================
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

print("Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    quantization_config=quantization_config,
    device_map="auto",
    local_files_only=True
)
base_model.eval()

# ====================================================
# ✅ Load PEFT Fine-Tuned Weights
# ====================================================
print("Loading PEFT fine-tuned weights...")
fine_tuned_model = PeftModel.from_pretrained(base_model, fine_tuned_path)
fine_tuned_model.eval()

# ====================================================
# ✅ Load Tokenizer (Always from Base Model)
# ====================================================
print("Loading tokenizer from base model path...")
tokenizer = AutoTokenizer.from_pretrained(base_model_path, local_files_only=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# ====================================================
# ✅ Safety Check (Self-Harm Detection)
# ====================================================
SELF_HARM_KEYWORDS = [
    "suicide", "i want to die", "kill myself", "end my life", "hang myself", "hurt myself",
    "i can't go on", "want to die", "i'm going to die", "i'll kill myself"
]

def contains_self_harm(text: str) -> bool:
    t = text.lower()
    return any(k in t for k in SELF_HARM_KEYWORDS)

def crisis_response():
    return ("I'm really sorry you're feeling so overwhelmed. I can't provide medical care, "
            "but if you're thinking about hurting yourself, please contact your local emergency services "
            "or a crisis line right now. If you'd like, I can help you find resources or steps to stay safe.")

# ====================================================
# ✅ Generation Functions
# ====================================================
def _generate(model, tokenizer, prompt, max_new_tokens=200, temperature=0.7):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=2048)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            pad_token_id=tokenizer.pad_token_id
        )
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return text

def _clean_response(full_text, original_prompt):
    if "Response:" in full_text:
        return full_text.split("Response:", 1)[-1].strip()
    return full_text.replace(original_prompt, "").strip()

def generate_standard_clean(model, tokenizer, prompt):
    raw = _generate(model, tokenizer, prompt)
    return _clean_response(raw, prompt)

# ====================================================
# ✅ Interactive Chat Loop
# ====================================================
print("\n✅ Welcome to the Fine-Tuned Mental Health Conversational AI!")
print("Type 'exit' to end the chat.")

while True:
    try:
        prompt = input("\nYour message: ")
        if prompt.lower().strip() == 'exit':
            print("Thank you for chatting. Take care! ❤️")
            break

        print("\nGenerating responses...\n")

        fine_tuned_resp = generate_standard_clean(fine_tuned_model, tokenizer, prompt)
        print("🎯 Fine-Tuned Model Response:\n", fine_tuned_resp)

    except KeyboardInterrupt:
        print("\nExiting...")
        break
    except Exception as e:
        print(f"⚠️ Error: {e}")
        break

# Cleanup at the end
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

✅ CUDA cleaned
Torch: 2.4.1+cu121
CUDA available: True
Loading embedding model...
Loading base model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading PEFT fine-tuned weights...
Loading tokenizer from base model path...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.



✅ Welcome to the Fine-Tuned Mental Health Conversational AI!
Type 'exit' to end the chat.

Your message: exit
Thank you for chatting. Take care! ❤️


In [8]:
# ====================================================
# ✅ Environment & CUDA Cleanup
# ====================================================
import os
import gc
import torch

# Clean up GPU memory before loading big models
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("Using device:", "cuda" if torch.cuda.is_available() else "cpu")


# ====================================================
# ✅ Imports
# ====================================================
import sys
import time
import json
import nltk

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from sentence_transformers import SentenceTransformer, util
from nltk.tokenize import word_tokenize

nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)

# ====================================================
# ✅ Paths & Config
# ====================================================
knowledge_base_path = "/content/combine cbt.json"
base_model_path = "/content/llama3_model/models--meta-llama--Meta-Llama-3-8B/snapshots/8cde5ca8380496c9a6cc7ef3a8b46a0372a1d920"
fine_tuned_path = "/content/llama3_finetuned"  # Adjusted to match Colab path
device = "cuda" if torch.cuda.is_available() else "cpu"

# ====================================================
# ✅ Load Knowledge Base
# ====================================================
with open(knowledge_base_path, 'r', encoding='utf-8') as f:
    knowledge_base = json.load(f)
knowledge_base_entries = [entry for entry in knowledge_base.values()]

def _kb_to_text(entry):
    parts = []
    for k in ['Input.client_statement', 'Input.res_a', 'Input.res_b', 'ori_text', 'situation']:
        if k in entry and entry[k]:
            parts.append(str(entry[k]))
    return " ".join(parts)

kb_texts = [_kb_to_text(e) for e in knowledge_base_entries]

print("Loading embedding model...")
embedder = SentenceTransformer('all-MiniLM-L6-v2')
kb_embeddings = embedder.encode(kb_texts, convert_to_tensor=True)

def retrieve_from_kb(prompt, top_k=3):
    prompt_embedding = embedder.encode(prompt, convert_to_tensor=True)
    hits = util.semantic_search(prompt_embedding, kb_embeddings, top_k=top_k)
    retrieved = []
    for hit in hits[0]:
        idx = hit['corpus_id']
        if 0 <= idx < len(knowledge_base_entries):
            retrieved.append(knowledge_base_entries[idx])
    return retrieved

# ====================================================
# ✅ Load Base Model (4-bit Quantized)
# ====================================================
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

print("Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    quantization_config=quantization_config,
    device_map="auto",
    local_files_only=True
)
base_model.eval()

# ====================================================
# ✅ Load PEFT Fine-Tuned Weights
# ====================================================
print("Loading PEFT fine-tuned weights...")
fine_tuned_model = PeftModel.from_pretrained(base_model, fine_tuned_path)
fine_tuned_model.eval()

# ====================================================
# ✅ Load Tokenizer (Always from Base Model)
# ====================================================
print("Loading tokenizer from base model path...")
tokenizer = AutoTokenizer.from_pretrained(base_model_path, local_files_only=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# ====================================================
# ✅ Safety Check (Self-Harm Detection)
# ====================================================
SELF_HARM_KEYWORDS = [
    "suicide", "i want to die", "kill myself", "end my life", "hang myself", "hurt myself",
    "i can't go on", "want to die", "i'm going to die", "i'll kill myself"
]

def contains_self_harm(text: str) -> bool:
    t = text.lower()
    return any(k in t for k in SELF_HARM_KEYWORDS)

def crisis_response():
    return ("I'm really sorry you're feeling so overwhelmed. I can't provide medical care, "
            "but if you're thinking about hurting yourself, please contact your local emergency services "
            "or a crisis line right now. If you'd like, I can help you find resources or steps to stay safe.")

# ====================================================
# ✅ CoT + RAG Prompt Template
# ====================================================
COT_TEMPLATE = """
You are a compassionate, friendly listener (not a diagnostician). Do NOT show these steps to the user. Instead, only provide the final supportive response.

Steps (INTERNAL ONLY, not to be shown):
1) Empathic opening
2) Validate feelings
3) If a KB example is relevant, briefly reflect it
4) Ask one open question
5) Offer 1-2 coping suggestions
6) If self-harm is detected, switch to crisis response
7) Close supportively
"""

def build_cot_prompt(user_prompt: str, retrieved_entries: list):
    if retrieved_entries:
        rag_context = "\n\n".join(
            [
                f"KB example: Client: {entry.get('Input.client_statement','')}\n"
                f"Response A: {entry.get('Input.res_a','')}\nResponse B: {entry.get('Input.res_b','')}"
                for entry in retrieved_entries
            ]
        )
    else:
        rag_context = "No direct KB examples."

    return (
        f"{COT_TEMPLATE}\n\n"
        f"KnowledgeBaseContext:\n{rag_context}\n\n"
        f"User: {user_prompt}\n\n"
        "Now provide the final user-facing response only, without showing steps or template. Response:"
    )

# ====================================================
# ✅ Generation Functions
# ====================================================
def _generate(model, tokenizer, prompt, max_new_tokens=200, temperature=0.7):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=2048)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            pad_token_id=tokenizer.pad_token_id
        )
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return text

def _clean_response(full_text, original_prompt):
    if "Response:" in full_text:
        return full_text.split("Response:", 1)[-1].strip()
    return full_text.replace(original_prompt, "").strip()

def generate_standard_clean(model, tokenizer, prompt):
    raw = _generate(model, tokenizer, prompt)
    return _clean_response(raw, prompt)

def rag_cot_generate_clean(model, tokenizer, prompt, top_k=3):
    if contains_self_harm(prompt):
        return crisis_response()
    retrieved = retrieve_from_kb(prompt, top_k)
    cot_prompt = build_cot_prompt(prompt, retrieved)
    raw = _generate(model, tokenizer, cot_prompt)
    return _clean_response(raw, cot_prompt)

# ====================================================
# ✅ Interactive Chat Loop
# ====================================================
print("\n✅ Welcome to the Mental Health Conversational AI with CoT + RAG!")
print("Type 'exit' to end the chat.")

while True:
    try:
        prompt = input("\nYour message: ")
        if prompt.lower().strip() == 'exit':
            print("Thank you for chatting. Take care! ❤️")
            break

        print("\nGenerating responses...\n")

        cot_resp = rag_cot_generate_clean(fine_tuned_model, tokenizer, prompt)
        print("📚 Fine-Tuned + CoT + RAG Response:\n", cot_resp)

    except KeyboardInterrupt:
        print("\nExiting...")
        break
    except Exception as e:
        print(f"⚠️ Error: {e}")
        break

# Cleanup at the end
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

Torch version: 2.4.1+cu121
CUDA available: True
Using device: cuda
Loading embedding model...
Loading base model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading PEFT fine-tuned weights...
Loading tokenizer from base model path...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.



✅ Welcome to the Mental Health Conversational AI with CoT + RAG!
Type 'exit' to end the chat.

Your message: exit
Thank you for chatting. Take care! ❤️
