In [None]:
!pip install -U bitsandbytes
!pip install -U transformers
!pip install -U accelerate datasets peft trl
!pip install -U huggingface_hub
!pip install -q evaluate bert-score
!pip install rouge_score
!pip install -q streamlit pyngrok
!pip install -q flask flask-ngrok flask-cors

In [None]:
# Mount Google Drive at /content/drive; later cells read and write files under this path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Interactive Hugging Face login for this notebook session.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
!pip install python-dotenv

from getpass import getpass
from pathlib import Path

# Secrets file on the mounted Drive; the next cell loads it with load_dotenv().
env_path = Path("/content/drive/MyDrive/secrets/.env")
env_path.parent.mkdir(parents=True, exist_ok=True)

# Do not paste a real token into the notebook source: prompt for it instead, and only
# when the file is missing, so re-running this cell never clobbers a stored secret.
# NOTE: the key is named HF_TOKEN, but the next cell passes its value to
# ngrok.set_auth_token(), so it must hold the ngrok auth token. The key name is kept
# unchanged because that cell reads it.
if env_path.exists():
    print(f".env file already present at {env_path}")
else:
    ngrok_token = getpass("ngrok auth token: ").strip()
    if not ngrok_token:
        raise ValueError("No token entered; .env file was not created.")
    env_content = f"HF_TOKEN={ngrok_token}\n"
    env_path.write_text(env_content)
    print(f".env file created at {env_path}")


In [None]:
from dotenv import load_dotenv
import os

# Load the secrets file; env_path is defined by the cell that creates the .env file.
load_dotenv(dotenv_path=env_path)

from pyngrok import ngrok

# NOTE: despite its name, HF_TOKEN is handed to ngrok here, so it must contain the
# ngrok auth token.
ngrok_auth_token = os.getenv("HF_TOKEN")
if not ngrok_auth_token:
    # Fail early with a clear message instead of passing None on to ngrok.
    raise RuntimeError(f"HF_TOKEN is not set; check that {env_path} exists and defines it.")
ngrok.set_auth_token(ngrok_auth_token)

# Dataset Counsel Chat

In [None]:
import pandas as pd

# Make sure Google Drive is mounted so the dataset path below resolves
from google.colab import drive
drive.mount('/content/drive')

# Read the raw CounselChat CSV export
file_path = "/content/drive/MyDrive/chatbot/counsel/datasets/20200325_counsel_chat.csv"
df = pd.read_csv(file_path)

# Schema and the first few records
print("🧾 Columns:", df.columns)
print("\n📌 Sample rows:")
display(df.head(3))

# Number of missing cells in each column
missing_per_column = df.isna().sum()
print("\n🧼 Missing values per column:")
print(missing_per_column)


# Step 2: Extract Best Q&A Pairs to JSONL (for LoRA Fine-Tuning)

In [None]:
import json

# Keep only rows that have both a question and an answer. Missing cells are NaN floats,
# so calling .strip() on them further down would raise AttributeError.
answered_df = df.dropna(subset=["questionText", "answerText"])

# Select the single most-upvoted answer ROW per question.
# drop_duplicates keeps whole rows, whereas groupby(...).first() returns the first
# non-null value of each column independently and can therefore combine fields that
# belong to different answers. The final sort restores the questionID ordering that
# the groupby-based version produced.
top_answers_df = (
    answered_df.sort_values("upvotes", ascending=False, kind="stable")
      .drop_duplicates(subset="questionID", keep="first")
      .sort_values("questionID", kind="stable")
      .reset_index(drop=True)
)

# Format into JSONL records for instruction-output fine-tuning
jsonl_data = []
for question, answer in zip(top_answers_df["questionText"], top_answers_df["answerText"]):
    instruction = str(question).strip()
    output = str(answer).strip()

    # Skip pairs where either side has fewer than 4 whitespace-separated words
    if len(instruction.split()) < 4 or len(output.split()) < 4:
        continue

    jsonl_data.append({"instruction": instruction, "output": output})

# Save to JSONL file (one JSON object per line)
output_path = "/content/drive/MyDrive/chatbot/counsel/datasets/counselchat_cleaned.jsonl"
with open(output_path, "w", encoding="utf-8") as f:
    for entry in jsonl_data:
        json.dump(entry, f)
        f.write("\n")

print(f"✅ Saved {len(jsonl_data)} cleaned Q&A pairs to: {output_path}")


# Testing with Mistral + Counsel Chat

In [None]:
import json
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load base model (or your fine-tuned model path)
model_id = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")

# Load the cleaned CounselChat JSONL (one JSON object per line; blank lines are skipped)
jsonl_path = "/content/drive/MyDrive/chatbot/counsel/datasets/counselchat_cleaned.jsonl"
with open(jsonl_path, "r", encoding="utf-8") as file:
    samples = [json.loads(line) for line in file if line.strip()]

# Test the first 3 prompts (slicing avoids an IndexError when fewer samples exist)
for sample in samples[:3]:
    user_question = sample['instruction']
    expected_answer = sample['output']

    # Prompt style to guide model
    prompt = f"""<s>[INST] You are a supportive mental health assistant. Provide thoughtful guidance.

User: {user_question}

Respond with empathy and practical steps. [/INST]"""

    # Generate
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
    prompt_len = inputs["input_ids"].shape[1]
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=200)
    # generate() returns prompt + continuation for decoder-only models; decode only
    # the newly generated tokens so the printed answer does not repeat the prompt.
    response = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True).strip()

    print(f"\n📌 Prompt: {user_question}\n🔹 Expected: {expected_answer}\n🤖 Model: {response}\n" + "="*80)


# Tuning - Trial 1

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
import torch

# Training data: the cleaned CounselChat JSONL
dataset = load_dataset(
    "json",
    data_files="/content/drive/MyDrive/chatbot/counsel/datasets/counselchat_cleaned.jsonl",
    split="train",
)

# Tokenizer for the base model; EOS doubles as the padding token
# (presumably because the tokenizer defines no pad token of its own — TODO confirm)
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

#Tokenization function
def tokenize(example, tok=None, max_length=512):
    """Turn one instruction/output record into a causal-LM training example.

    The prompt and the answer are encoded together as a single sequence, and
    ``labels`` has exactly the same length as ``input_ids``. Prompt and padding
    positions are set to -100 so the loss is computed on answer tokens only.
    (Previously the labels came from ``text_target`` and did not line up
    position-for-position with ``input_ids``, which a causal LM requires.)

    Args:
        example: mapping with string fields ``"instruction"`` and ``"output"``.
        tok: tokenizer to use; defaults to the notebook-level ``tokenizer``.
        max_length: fixed sequence length used for truncation and padding.

    Returns:
        The tokenizer output with an added ``"labels"`` list.
    """
    tok = tokenizer if tok is None else tok
    prompt = f"<s>[INST] You are a professional mental health assistant providing thoughtful, multi-step advice to help users with anxiety, stress, and emotional struggles.\n{example['instruction']} [/INST]"
    answer = f" {example['output']}</s>"

    encoded = tok(prompt + answer, truncation=True, padding="max_length", max_length=max_length)
    # Token count of the prompt alone tells us where the answer starts.
    prompt_len = len(tok(prompt, truncation=True, max_length=max_length)["input_ids"])

    mask = encoded["attention_mask"]
    # Works for both padding sides: skip any leading padding before the prompt.
    first_real = mask.index(1) if 1 in mask else len(mask)
    answer_start = first_real + prompt_len

    # The attention mask (not the token id) identifies padding, because pad == EOS here
    # and the real EOS that closes the answer must stay in the labels.
    encoded["labels"] = [
        token_id if (pos >= answer_start and keep) else -100
        for pos, (token_id, keep) in enumerate(zip(encoded["input_ids"], mask))
    ]
    return encoded

# Tokenize every record; the raw text columns are dropped afterwards
tokenized_dataset = dataset.map(tokenize, remove_columns=["instruction", "output"])

# Half-precision base model, placed automatically on the available device(s)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)

# LoRA adapters on the query/value projections only
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)
model = get_peft_model(base_model, lora_config)

# Checkpoints and the final adapter/tokenizer all live in the same Drive folder
trial1_dir = "/content/drive/MyDrive/chatbot/counsel/checkpoints/trial1"

# Effective batch size per device = 2 * 2 accumulation steps
training_args = TrainingArguments(
    output_dir=trial1_dir,
    num_train_epochs=3,
    learning_rate=2e-5,
    warmup_steps=10,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    optim="paged_adamw_8bit",
    fp16=True,
    logging_steps=20,
    save_strategy="epoch",
    report_to="none",
)

trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_dataset)

# Run fine-tuning
trainer.train()

# Persist the adapter and then the tokenizer
model.save_pretrained(trial1_dir)
tokenizer.save_pretrained(trial1_dir)

print("✅ Trial 1 fine-tuning complete and saved to Drive!")


# Trial 1 - Evaluation

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import evaluate
from tqdm import tqdm
from peft import PeftModel, PeftConfig


# Load the evaluation dataset (first 300 records).
# NOTE: this is the same file Trial 1 was trained on, so these scores measure fit to
# seen data rather than generalization; a held-out split is needed for the latter.
eval_dataset = load_dataset(
    "json",
    data_files="/content/drive/MyDrive/chatbot/counsel/datasets/counselchat_cleaned.jsonl",
    split="train[:300]"
)

# Load the base model named in the adapter config, then attach the LoRA adapter
model_path = "/content/drive/MyDrive/chatbot/counsel/checkpoints/trial1"
config = PeftConfig.from_pretrained(model_path)

base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    torch_dtype=torch.float16,
    device_map="auto",
    local_files_only=True
)
model = PeftModel.from_pretrained(base_model, model_path, local_files_only=True)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path, local_files_only=True)


# Load evaluation metrics
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

# Inference loop
predictions = []
references = []

for example in tqdm(eval_dataset, desc="🔍 Evaluating Trial 1"):
    prompt = f"<s>[INST] You are a professional mental health assistant providing thoughtful, multi-step advice to help users with anxiety, stress, and emotional struggles.\n{example['instruction']} [/INST]"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=100, pad_token_id=tokenizer.eos_token_id)
    # generate() returns prompt + continuation; score only the continuation, otherwise
    # the system prompt and the question are counted as part of the model's answer.
    pred = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

    predictions.append(pred)
    references.append(example["output"])

# Compute metrics
bleu_result = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
rouge_result = rouge.compute(predictions=predictions, references=references)
bertscore_result = bertscore.compute(predictions=predictions, references=references, lang="en")

# Print results (the sample count is taken from the data instead of being hard-coded)
print(f"\n📊 Trial 1 Evaluation Results ({len(predictions)} samples):")
print(f"BLEU: {round(bleu_result['bleu'], 4)}")
print(f"ROUGE-1: {round(rouge_result['rouge1'], 4)}")
print(f"ROUGE-2: {round(rouge_result['rouge2'], 4)}")
print(f"ROUGE-L: {round(rouge_result['rougeL'], 4)}")
print(f"BERTScore (F1): {round(sum(bertscore_result['f1']) / len(bertscore_result['f1']), 4)}")


# Tuning trial 2

In [None]:
!pip install -q transformers datasets peft accelerate bitsandbytes

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType
import gc

# Release unreferenced Python objects and cached CUDA blocks before loading a new model
gc.collect()
torch.cuda.empty_cache()

# Training data (~890 samples according to the original note)
dataset = load_dataset(
    "json",
    data_files="/content/drive/MyDrive/chatbot/counsel/datasets/counselchat_cleaned.jsonl",
    split="train",
)

model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# Tokenizer, with EOS reused as the padding token
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Base model in half precision with automatic device placement
load_kwargs = {"torch_dtype": torch.float16, "device_map": "auto"}
model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

#Tokenization function
def tokenize(example, tok=None, max_length=384):
    """Turn one instruction/output record into a causal-LM training example.

    ``labels`` is built from the SAME token sequence as ``input_ids`` (prompt +
    answer), so the two line up position-for-position. Prompt and padding
    positions are set to -100 so the loss covers answer tokens only.
    (Previously the labels were the separately tokenized and padded answer,
    which is shifted relative to ``input_ids`` and also trains on padding.)

    Args:
        example: mapping with string fields ``"instruction"`` and ``"output"``.
        tok: tokenizer to use; defaults to the notebook-level ``tokenizer``.
        max_length: fixed sequence length used for truncation and padding.

    Returns:
        The tokenizer output with an added ``"labels"`` list.
    """
    tok = tokenizer if tok is None else tok
    prompt = f"<s>[INST] You are a professional mental health assistant providing thoughtful, multi-step advice to help users with anxiety, stress, and emotional struggles.\n{example['instruction']} [/INST]"
    output = f" {example['output']}</s>"

    result = tok(prompt + output, padding="max_length", truncation=True, max_length=max_length)
    # Token count of the prompt alone tells us where the answer starts.
    prompt_len = len(tok(prompt, truncation=True, max_length=max_length)["input_ids"])

    mask = result["attention_mask"]
    # Works for both padding sides: skip any leading padding before the prompt.
    first_real = mask.index(1) if 1 in mask else len(mask)
    answer_start = first_real + prompt_len

    # The attention mask (not the token id) identifies padding, because pad == EOS here
    # and the real EOS that closes the answer must stay in the labels.
    result["labels"] = [
        token_id if (pos >= answer_start and keep) else -100
        for pos, (token_id, keep) in enumerate(zip(result["input_ids"], mask))
    ]
    return result

# Tokenize every record and drop the raw text columns
tokenized_dataset = dataset.map(tokenize, remove_columns=["instruction", "output"])

# LoRA adapter settings (no explicit target modules; PEFT picks them for the architecture)
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model and report how many parameters are trainable
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Checkpoints and the final adapter/tokenizer share one Drive folder
trial2_dir = "/content/drive/MyDrive/chatbot/counsel/checkpoints/trial2"

# Training args (original note: tested in Trial 9)
training_args = TrainingArguments(
    output_dir=trial2_dir,
    num_train_epochs=4,
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    fp16=True,
    logging_steps=10,
    save_strategy="epoch",
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# Fine-tune
trainer.train()

# Persist the adapter and then the tokenizer
model.save_pretrained(trial2_dir)
tokenizer.save_pretrained(trial2_dir)

print("✅ Trial 2 fine-tuning complete and saved to Drive!")


# Trial 2 - evaluation

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig
import evaluate
from tqdm import tqdm

# Load evaluation dataset (first 300 records).
# NOTE: this is the same file Trial 2 was trained on, so these scores measure fit to
# seen data rather than generalization; a held-out split is needed for the latter.
eval_dataset = load_dataset(
    "json",
    data_files="/content/drive/MyDrive/chatbot/counsel/datasets/counselchat_cleaned.jsonl",
    split="train[:300]"
)

# Load the PEFT config + base model
model_path = "/content/drive/MyDrive/chatbot/counsel/checkpoints/trial2"  # 👈 Change here
config = PeftConfig.from_pretrained(model_path)

# Load base model
base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    torch_dtype=torch.float16,
    device_map="auto",
    local_files_only=True
)

# Load LoRA adapter
model = PeftModel.from_pretrained(base_model, model_path, local_files_only=True)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path, local_files_only=True)

# Load metrics
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

# Inference loop
predictions = []
references = []

for example in tqdm(eval_dataset, desc="🔍 Evaluating Trial 2"):
    prompt = f"<s>[INST] You are a professional mental health assistant providing thoughtful, multi-step advice to help users with anxiety, stress, and emotional struggles.\n{example['instruction']} [/INST]"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=200, pad_token_id=tokenizer.eos_token_id)
    # generate() returns prompt + continuation; score only the continuation, otherwise
    # the system prompt and the question are counted as part of the model's answer.
    pred = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

    predictions.append(pred)
    references.append(example["output"])

# Evaluation
bleu_result = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
rouge_result = rouge.compute(predictions=predictions, references=references)
bertscore_result = bertscore.compute(predictions=predictions, references=references, lang="en")

# Print results (the sample count is taken from the data instead of being hard-coded)
print(f"\n📊 Trial 2 Evaluation Results ({len(predictions)} samples):")
print(f"BLEU: {round(bleu_result['bleu'], 4)}")
print(f"ROUGE-1: {round(rouge_result['rouge1'], 4)}")
print(f"ROUGE-2: {round(rouge_result['rouge2'], 4)}")
print(f"ROUGE-L: {round(rouge_result['rougeL'], 4)}")
print(f"BERTScore (F1): {round(sum(bertscore_result['f1']) / len(bertscore_result['f1']), 4)}")


# creating synthetic GPT samples

🔁 Synthetic Sample Generation – CounselChat Augmentation (Trial 3)
To enhance the performance of our Mistral 7B fine-tuning for the mental health chatbot, we generate 200 high-quality synthetic Q&A samples. These samples are designed to:

- Match the tone, format, and structure of the original CounselChat dataset
- Improve generalization and bigram-level coherence (ROUGE-2)
- Stay lightweight to avoid exhausting Colab Pro A100 resources

Each synthetic entry consists of:

- `instruction`: a realistic user query about anxiety, depression, or emotional distress
- `output`: a thoughtful, structured, empathetic multi-step response based on known therapeutic practices (e.g., CBT, DBT)

These 200 samples will be merged with ~890 real samples to create a new training set (counselchat_augmented_1090.jsonl) for Trial 3 fine-tuning.

In [None]:
import json
import random

# 58 varied emotional/mental-health user prompts; each synthetic sample draws one at random
user_issues = [
    "I feel anxious about everyday situations, even simple ones like sending emails.",
    "I can't stop overanalyzing conversations after they happen.",
    "I'm always expecting the worst-case scenario, even when things seem fine.",
    "My heart races whenever I have to speak in front of others.",
    "I feel guilty when I try to take time for myself.",
    "I’m scared to share how I feel with my family because they might judge me.",
    "Sometimes I just feel numb, like I’m not really here.",
    "I constantly compare myself to others and feel like I’m falling behind.",
    "I feel overwhelmed by my responsibilities and don't know where to start.",
    "I'm always trying to please others and end up neglecting myself.",
    "Even when I achieve something, I feel like it's not good enough.",
    "I can’t stop thinking about things I did wrong years ago.",
    "I’m scared I’ll never feel truly happy again.",
    "Sometimes I cry for no reason and I don't know how to explain it.",
    "I get upset with myself when I’m not productive enough.",
    "I feel like a burden to the people around me.",
    "I hate asking for help because I don't want to seem weak.",
    "I isolate myself even though I don’t want to be alone.",
    "It feels like my mind is always racing and I can't shut it off.",
    "I’ve been eating less lately because I feel stressed and anxious.",
    "I feel like my life lacks purpose or direction.",
    "I try to sleep but my brain keeps bringing up bad memories.",
    "I always feel like I’m being judged, even when no one is around.",
    "Sometimes I fake being okay because I don’t want others to worry.",
    "I struggle to get out of bed in the mornings lately.",
    "I don’t know how to express my emotions without feeling ashamed.",
    "I keep doubting my self-worth, even when others praise me.",
    "I feel stuck in a loop of negative thinking.",
    "It’s hard for me to enjoy things I used to love.",
    "I feel afraid of failing, so I avoid trying new things.",
    "I’ve lost interest in socializing with others.",
    "I get nervous even before small meetings or group calls.",
    "I overthink everything I say and do.",
    "I feel like I’m pretending to be okay all the time.",
    "It feels like no one really understands me.",
    "I panic when plans change unexpectedly.",
    "I feel like I'm falling apart on the inside.",
    "I worry a lot about things I can’t control.",
    "I feel disconnected from my own life.",
    "I’m trying to heal, but progress feels painfully slow.",
    "I question if therapy is even helping me.",
    "I want to trust people but I'm afraid of being hurt.",
    "I avoid eye contact because I feel ashamed.",
    "I feel anxious when someone compliments me.",
    "I'm afraid people are pretending to like me.",
    "I get angry at myself for feeling this way.",
    "I often feel like an imposter, even with my accomplishments.",
    "I’m overwhelmed and don’t know where to begin.",
    "I worry that people secretly dislike me.",
    "I sometimes wish I could just disappear for a while.",
    "I get drained by social events, even short ones.",
    "I feel like I can't be myself around others.",
    "I don’t feel motivated to do anything lately.",
    "I try to look strong, but inside I’m falling apart.",
    "I’m scared of being vulnerable with people.",
    "I feel emotionally exhausted by my own thoughts.",
    "I want to feel better, but I don’t know how.",
    "I get scared of being alone but also fear getting close to people."
]

# Response building blocks: every synthetic answer is one intro + one body + one closing sentence
responses_intro = [
    "Thank you for being open and sharing this. You're not alone in feeling this way.",
    "I hear you, and I want you to know that your feelings are valid.",
    "What you're experiencing is difficult, and it's good that you're reaching out.",
    "It takes courage to speak up about this — you're already taking a positive step.",
    "I'm really glad you asked this. Many people go through similar emotions."
]

responses_body = [
    "Anxiety can often lead to overthinking and self-doubt. It helps to practice grounding techniques like focused breathing or mindful journaling.",
    "Cognitive Behavioral Therapy (CBT) has proven effective in managing negative thought loops. You might consider exploring CBT worksheets or speaking with a therapist trained in it.",
    "Try to notice when these thoughts come up and gently challenge them by asking yourself if they're based on facts or assumptions.",
    "Consider keeping a self-compassion journal where you write down moments you were kind to yourself or others.",
    "Sometimes, talking to a counselor or even joining a peer support group can provide relief and perspective."
]

responses_close = [
    "Remember, healing is not linear. Small steps matter.",
    "Be patient with yourself. You deserve support and care.",
    "You are worthy of kindness — from others and from yourself.",
    "Don’t hesitate to seek professional help when needed. You’re not alone.",
    "Keep going. You're doing better than you think."
]

# Generate and save 200 synthetic Q&A pairs.
# A dedicated, seeded generator makes the sampled dataset reproducible across runs
# without touching the global `random` state used by other cells.
# NOTE: prompts are drawn with replacement from a list shorter than 200 entries, so
# repeated instructions are expected.
synthetic_rng = random.Random(42)
synthetic_samples = [
    {
        "instruction": synthetic_rng.choice(user_issues),
        "output": "\n\n".join([
            synthetic_rng.choice(responses_intro),
            synthetic_rng.choice(responses_body),
            synthetic_rng.choice(responses_close),
        ]),
    }
    for _ in range(200)
]

# Save as JSONL (one JSON object per line)
output_path = "/content/drive/MyDrive/chatbot/counsel/datasets/counselchat_synthetic_200.jsonl"
with open(output_path, "w", encoding="utf-8") as f:
    for item in synthetic_samples:
        f.write(json.dumps(item) + "\n")

# Report the real number of written samples rather than a hard-coded figure
print(f"✅ {len(synthetic_samples)} synthetic samples saved to: {output_path}")


# Creating new augmented dataset - synthetic200 + counsel chat dataset

In [None]:
import json
import random

# File paths.
# The synthetic and output paths were interleaved by a bad paste, which left a syntax
# error; they are restored to the synthetic file written by the generation cell and
# the augmented file that the Trial 3 training cell loads.
real_path = "/content/drive/MyDrive/chatbot/counsel/datasets/counselchat_cleaned.jsonl"
synthetic_path = "/content/drive/MyDrive/chatbot/counsel/datasets/counselchat_synthetic_200.jsonl"
output_path = "/content/drive/MyDrive/chatbot/counsel/datasets/counselchat_augmented_1090.jsonl"

# Load real CounselChat data (blank lines are skipped so they cannot break json.loads)
with open(real_path, "r", encoding="utf-8") as f:
    real_data = [json.loads(line) for line in f if line.strip()]

# Load synthetic samples
with open(synthetic_path, "r", encoding="utf-8") as f:
    synthetic_data = [json.loads(line) for line in f if line.strip()]

# Combine and shuffle so synthetic samples are interleaved with real ones
combined_data = real_data + synthetic_data
random.shuffle(combined_data)

# Save merged dataset as JSONL
with open(output_path, "w", encoding="utf-8") as f:
    for item in combined_data:
        f.write(json.dumps(item) + "\n")

print(f"✅ Merged dataset saved to: {output_path}")
print(f"📊 Total samples: {len(combined_data)}")


# Trial 3 with augmented dataset

In [None]:
!pip install -q transformers datasets peft accelerate bitsandbytes

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType
import gc

# Release unreferenced Python objects and cached CUDA blocks before loading a new model
gc.collect()
torch.cuda.empty_cache()

# Merged training set (CounselChat + 200 synthetic samples)
augmented_file = "/content/drive/MyDrive/chatbot/counsel/datasets/counselchat_augmented_1090.jsonl"
dataset = load_dataset("json", data_files=augmented_file, split="train")

model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# Tokenizer, with EOS reused as the padding token
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Base model in half precision with automatic device placement
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
)

#Tokenize
def tokenize(example, tok=None, max_length=384):
    """Turn one instruction/output record into a causal-LM training example.

    ``labels`` is built from the SAME token sequence as ``input_ids`` (prompt +
    answer), so the two line up position-for-position. Prompt and padding
    positions are set to -100 so the loss covers answer tokens only.
    (Previously the labels were the separately tokenized and padded answer,
    which is shifted relative to ``input_ids`` and also trains on padding.)

    Args:
        example: mapping with string fields ``"instruction"`` and ``"output"``.
        tok: tokenizer to use; defaults to the notebook-level ``tokenizer``.
        max_length: fixed sequence length used for truncation and padding.

    Returns:
        The tokenizer output with an added ``"labels"`` list.
    """
    tok = tokenizer if tok is None else tok
    prompt = f"<s>[INST] You are a professional mental health assistant providing thoughtful, multi-step advice to help users with anxiety, stress, and emotional struggles.\n{example['instruction']} [/INST]"
    output = f" {example['output']}</s>"

    result = tok(prompt + output, padding="max_length", truncation=True, max_length=max_length)
    # Token count of the prompt alone tells us where the answer starts.
    prompt_len = len(tok(prompt, truncation=True, max_length=max_length)["input_ids"])

    mask = result["attention_mask"]
    # Works for both padding sides: skip any leading padding before the prompt.
    first_real = mask.index(1) if 1 in mask else len(mask)
    answer_start = first_real + prompt_len

    # The attention mask (not the token id) identifies padding, because pad == EOS here
    # and the real EOS that closes the answer must stay in the labels.
    result["labels"] = [
        token_id if (pos >= answer_start and keep) else -100
        for pos, (token_id, keep) in enumerate(zip(result["input_ids"], mask))
    ]
    return result

# Tokenize every record and drop the raw text columns
tokenized_dataset = dataset.map(tokenize, remove_columns=["instruction", "output"])

# LoRA adapter settings
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model and report how many parameters are trainable
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Checkpoints and the final adapter/tokenizer share one Drive folder
trial3_dir = "/content/drive/MyDrive/chatbot/counsel/checkpoints/trial3"

training_args = TrainingArguments(
    output_dir=trial3_dir,
    num_train_epochs=4,
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    fp16=True,
    logging_steps=10,
    save_strategy="epoch",
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# Fine-tune
trainer.train()

# Persist the adapter and then the tokenizer
model.save_pretrained(trial3_dir)
tokenizer.save_pretrained(trial3_dir)

print("✅ Trial 3 fine-tuning complete and saved to Drive!")


# Trial 3 - Evaluation

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig
import evaluate
from tqdm import tqdm

# Load evaluation dataset (first 300 records of the cleaned CounselChat file).
# NOTE: the merge cell builds the Trial 3 training file from this same file plus the
# synthetic samples, so these records were seen in training; the scores measure fit,
# not generalization.
eval_dataset = load_dataset(
    "json",
    data_files="/content/drive/MyDrive/chatbot/counsel/datasets/counselchat_cleaned.jsonl",
    split="train[:300]"  # Adjust to 1000 if needed
)

# Load the PEFT model and config
model_path = "/content/drive/MyDrive/chatbot/counsel/checkpoints/trial3"
config = PeftConfig.from_pretrained(model_path)

# Load base model and LoRA adapter
base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    torch_dtype=torch.float16,
    device_map="auto",
    local_files_only=True
)

model = PeftModel.from_pretrained(base_model, model_path, local_files_only=True)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path, local_files_only=True)

# Load evaluation metrics
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

# Run generation
predictions = []
references = []

for example in tqdm(eval_dataset, desc="🔍 Evaluating Trial 3"):
    prompt = f"<s>[INST] You are a professional mental health assistant providing thoughtful, multi-step advice to help users with anxiety, stress, and emotional struggles.\n{example['instruction']} [/INST]"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=200, pad_token_id=tokenizer.eos_token_id)
    # generate() returns prompt + continuation; score only the continuation, otherwise
    # the system prompt and the question are counted as part of the model's answer.
    pred = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

    predictions.append(pred)
    references.append(example["output"])

# Compute metrics
bleu_result = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
rouge_result = rouge.compute(predictions=predictions, references=references)
bertscore_result = bertscore.compute(predictions=predictions, references=references, lang="en")

# Print results (the sample count is taken from the data instead of being hard-coded)
print(f"\n📊 Trial 3 Evaluation Results ({len(predictions)} samples):")
print(f"BLEU: {round(bleu_result['bleu'], 4)}")
print(f"ROUGE-1: {round(rouge_result['rouge1'], 4)}")
print(f"ROUGE-2: {round(rouge_result['rouge2'], 4)}")
print(f"ROUGE-L: {round(rouge_result['rougeL'], 4)}")
print(f"BERTScore (F1): {round(sum(bertscore_result['f1']) / len(bertscore_result['f1']), 4)}")

# Dataset Cleaning Updated

## 🧹 Smart Filtering of CounselChat Dataset (Explained)

This dataset cleaning script was applied to the original CSV file `20200325_counsel_chat.csv` from CounselChat.

---

### ✅ Step-by-Step Actions

#### 1. **Loaded the original raw CSV**
- File: `20200325_counsel_chat.csv`
- Contains therapist responses to mental health questions.
- Columns included: `questionText`, `answerText`, `topic`, `therapistInfo`, etc.

#### 2. **Extracted Key Fields**
Only two fields were kept to train the model:

| CSV Field       | Transformed Field |
|----------------|-------------------|
| `questionText` | `instruction`     |
| `answerText`   | `output`          |

Example format:
```json
{
  "instruction": "I’ve been feeling anxious a lot lately. How can I manage it?",
  "output": "Anxiety is common. You can begin with deep breathing and journaling..."
}
```

---

#### 3. **Applied Smart Filters**
The goal was to remove low-quality or misleading samples.

| Filter Rule                                      | Reason                                              |
|--------------------------------------------------|-----------------------------------------------------|
| `instruction` and `output` must not be empty     | Skips blank entries                                 |
| `instruction` must have ≥ 5 words                | Too-short questions aren't useful                   |
| `output` must have ≥ 15 words                    | Short answers don’t help the model learn enough     |
| `output` must not contain the exact phrases `I am an AI` or `I don't understand` (case-sensitive match) | Removes hallucinated or chatbot-like answers        |
| `instruction` must not appear inside the `output`| Avoid parroting                                     |

This ensures cleaner and more meaningful learning data.

---

#### 4. **Saved Cleaned Data**
- Saved as JSONL to:
  `/content/drive/MyDrive/chatbot/counsel/datasets/counselchat_smartfiltered.jsonl`
- Each line contains one valid instruction-output pair

---

### 🧠 Why This Matters
- Improves dataset quality for fine-tuning
- Reduces junk samples and repetitive patterns
- Leads to **better BLEU, ROUGE, and BERTScore results**

In [None]:
import pandas as pd
import json

# Raw CounselChat export on Drive (update the path if the file lives elsewhere)
csv_path = "/content/drive/MyDrive/chatbot/counsel/datasets/20200325_counsel_chat.csv"
df = pd.read_csv(csv_path)

# Schema and row count of the unfiltered data
total_rows = df.shape[0]
print("Columns:", df.columns)
print("Total rows:", total_rows)

def format_row(row):
    """Map one CSV row to an ``{"instruction", "output"}`` record.

    Args:
        row: any object with a dict-style ``.get`` (e.g. a pandas row or a dict)
            holding ``questionText`` and ``answerText``.

    Returns:
        dict with the stripped question as ``"instruction"`` and the stripped answer
        as ``"output"``. A missing or non-string cell (pandas represents empty cells
        as NaN floats, which have no ``.strip()``) becomes ``""``.
    """
    def _clean(value):
        # Only real strings can be stripped; anything else counts as empty text.
        return value.strip() if isinstance(value, str) else ""

    return {
        "instruction": _clean(row.get("questionText", "")),
        "output": _clean(row.get("answerText", "")),
    }

def is_valid(example):
    """Decide whether an instruction/output pair is kept for training.

    A pair is accepted only if both texts are non-empty, the instruction has at
    least 5 words, the output has at least 15 words, the output contains neither
    "I am an AI" nor "I don't understand" (exact, case-sensitive), and the
    instruction does not appear (case-insensitively) inside the output.

    Args:
        example: dict with string fields ``"instruction"`` and ``"output"``.

    Returns:
        True when the pair passes every check, otherwise False.
    """
    question, answer = example["instruction"], example["output"]

    # Empty text on either side disqualifies the pair immediately
    if not (question and answer):
        return False

    long_enough = len(question.split()) >= 5 and len(answer.split()) >= 15
    has_banned_phrase = any(
        phrase in answer for phrase in ("I am an AI", "I don't understand")
    )
    parrots_question = question.lower() in answer.lower()

    return long_enough and not has_banned_phrase and not parrots_question

# Format every CSV row and keep only the records that pass is_valid()
filtered = [
    candidate
    for candidate in (format_row(row) for _, row in df.iterrows())
    if is_valid(candidate)
]

print(f"Total valid instruction-output pairs: {len(filtered)}")

# Write the surviving pairs as JSONL, one record per line
output_path = "/content/drive/MyDrive/chatbot/counsel/datasets/counselchat_smartfiltered.jsonl"
with open(output_path, "w") as f:
    f.writelines(json.dumps(item) + "\n" for item in filtered)

print("Cleaned JSONL saved to:", output_path)


# Tuning - Trial 4

In [None]:
#!pip install -q transformers datasets peft accelerate bitsandbytes

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType
import gc

# Release unreferenced Python objects and cached CUDA blocks before loading a new model
gc.collect()
torch.cuda.empty_cache()

# Smart-filtered training set (2,116 samples according to the original note)
filtered_file = "/content/drive/MyDrive/chatbot/counsel/datasets/counselchat_smartfiltered.jsonl"
dataset = load_dataset("json", data_files=filtered_file, split="train")

model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# Tokenizer, with EOS reused as the padding token
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Base model in half precision with automatic device placement
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
)

#Tokenization function
def tokenize(example, tok=None, max_length=384):
    """Turn one instruction/output record into a causal-LM training example.

    ``labels`` is built from the SAME token sequence as ``input_ids`` (prompt +
    answer), so the two line up position-for-position. Prompt and padding
    positions are set to -100 so the loss covers answer tokens only.
    (Previously the labels were the separately tokenized and padded answer,
    which is shifted relative to ``input_ids`` and also trains on padding.)

    Args:
        example: mapping with string fields ``"instruction"`` and ``"output"``.
        tok: tokenizer to use; defaults to the notebook-level ``tokenizer``.
        max_length: fixed sequence length used for truncation and padding.

    Returns:
        The tokenizer output with an added ``"labels"`` list.
    """
    tok = tokenizer if tok is None else tok
    prompt = f"<s>[INST] You are a professional mental health assistant providing thoughtful, multi-step advice to help users with anxiety, stress, and emotional struggles.\n{example['instruction']} [/INST]"
    output = f" {example['output']}</s>"

    result = tok(prompt + output, padding="max_length", truncation=True, max_length=max_length)
    # Token count of the prompt alone tells us where the answer starts.
    prompt_len = len(tok(prompt, truncation=True, max_length=max_length)["input_ids"])

    mask = result["attention_mask"]
    # Works for both padding sides: skip any leading padding before the prompt.
    first_real = mask.index(1) if 1 in mask else len(mask)
    answer_start = first_real + prompt_len

    # The attention mask (not the token id) identifies padding, because pad == EOS here
    # and the real EOS that closes the answer must stay in the labels.
    result["labels"] = [
        token_id if (pos >= answer_start and keep) else -100
        for pos, (token_id, keep) in enumerate(zip(result["input_ids"], mask))
    ]
    return result

# Tokenize every record and drop the raw text columns
tokenized_dataset = dataset.map(tokenize, remove_columns=["instruction", "output"])

# LoRA adapter settings
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model and report how many parameters are trainable
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Checkpoints and the final adapter/tokenizer share one Drive folder
trial4_dir = "/content/drive/MyDrive/chatbot/counsel/checkpoints/trial4"

training_args = TrainingArguments(
    output_dir=trial4_dir,
    num_train_epochs=5,
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    fp16=True,
    logging_steps=10,
    save_strategy="epoch",
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# Fine-tune
trainer.train()

# Persist the adapter and then the tokenizer
model.save_pretrained(trial4_dir)
tokenizer.save_pretrained(trial4_dir)

print("✅ Trial 4 complete! Model saved to Drive.")


# Trial 4 - Evaluation

In [None]:
import torch
import time
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig
import evaluate
from tqdm import tqdm

#Enable TF32 for faster matrix operations
torch.backends.cuda.matmul.allow_tf32 = True

#Load a small eval slice (the first 300 records) for a quick test.
#NOTE(review): this looks like the same file the model was fine-tuned on, so these
#scores would measure fit to training data rather than generalization — confirm,
#and prefer a held-out split.
eval_dataset = load_dataset(
    "json",
    data_files="/content/drive/MyDrive/chatbot/counsel/datasets/counselchat_smartfiltered.jsonl",
    split="train[:300]"
)
num_eval_samples = len(eval_dataset)  # used in labels so they cannot drift from the split

#Load PEFT config and model
model_path = "/content/drive/MyDrive/chatbot/counsel/checkpoints/trial4"
config = PeftConfig.from_pretrained(model_path)

base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    torch_dtype=torch.float16,
    device_map="auto",
    local_files_only=True
)

model = PeftModel.from_pretrained(base_model, model_path, local_files_only=True)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path, local_files_only=True)

#Load evaluation metrics
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

#Inference loop with timing
predictions = []
references = []
token_lengths = []
start_time = time.time()

for example in tqdm(eval_dataset, desc=f"🔍 Evaluating Trial 4 ({num_eval_samples} samples, fast mode)"):
    prompt = f"<s>[INST] You are a professional mental health assistant providing thoughtful, multi-step advice to help users with anxiety, stress, and emotional struggles.\n{example['instruction']} [/INST]"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,  # ✅ Reduced from 200
            pad_token_id=tokenizer.eos_token_id
        )

    # generate() returns prompt + continuation for decoder-only models. Keep only
    # the continuation, otherwise the metrics compare "prompt + answer" against
    # the reference answer and the token count includes the prompt.
    pred = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

    predictions.append(pred)
    references.append(example["output"])
    token_lengths.append(len(tokenizer.tokenize(pred)))

end_time = time.time()

#Evaluation metrics
bleu_result = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
rouge_result = rouge.compute(predictions=predictions, references=references)
bertscore_result = bertscore.compute(predictions=predictions, references=references, lang="en")

#Performance logs
avg_gen_time = round((end_time - start_time) / num_eval_samples, 2)
avg_token_len = round(sum(token_lengths) / len(token_lengths), 2)

print(f"\n📊 Trial 4 Evaluation Results ({num_eval_samples} samples, Fast Mode):")
print(f"BLEU: {round(bleu_result['bleu'], 4)}")
print(f"ROUGE-1: {round(rouge_result['rouge1'], 4)}")
print(f"ROUGE-2: {round(rouge_result['rouge2'], 4)}")
print(f"ROUGE-L: {round(rouge_result['rougeL'], 4)}")
print(f"BERTScore (F1): {round(sum(bertscore_result['f1']) / len(bertscore_result['f1']), 4)}")

print(f"\n⏱️ Avg generation time per sample: {avg_gen_time} seconds")
print(f"🧠 Avg token length of output: {avg_token_len} tokens")


## 📊 Mistral-7B + CounselChat Hyperparameter Tuning Tracker

### ✅ Trial Summary Table

| Trial | Dataset Size                  | LoRA `r` | Alpha | Dropout | Epochs | LR     | Prompt Style                                    | BLEU   | ROUGE-1 | ROUGE-2 | ROUGE-L | BERTScore | Notes            |
|-------|-------------------------------|----------|--------|----------|--------|--------|------------------------------------------------|--------|----------|----------|----------|------------|------------------|
| 1     | 890 real                      | 8        | 16     | 0.05     | 3      | 2e-5   | Friendly therapist tone                         | 0.0203 | 0.2888   | 0.0429   | 0.1349   | 0.8311     | Baseline         |
| 2     | 890 real                      | 16       | 32     | 0.05     | 4      | 2e-5   | Same prompt as Trial 1                          | 0.0173 | 0.2315   | 0.0334   | 0.1220   | 0.8076     | More stable LoRA |
| 3     | 1,015 (real + 200 synthetic)  | 16       | 32     | 0.05     | 4      | 2e-5   | Same prompt as Trial 1                          | 0.0160 | 0.2163   | 0.0291   | 0.1134   | 0.8058     | With GPT samples |
| ✅ 4  | 2,116 smart-filtered (real)   | 16       | 32     | 0.05     | 5      | 2e-5   | Same prompt, full cleaned dataset               | **0.0222** | 0.2384   | **0.0387** | 0.1225   | **0.8144** | Best so far 🔥   |

---

### 🔧 Notes:
- **BLEU** = n-gram overlap
- **ROUGE-1/2/L** = word, bigram, sequence overlap
- **BERTScore** = semantic similarity using RoBERTa
- All trials used the same system prompt:  
  `"You are a professional mental health assistant providing thoughtful, multi-step advice to help users with anxiety, stress, and emotional struggles."`

---

# Trial 4 Model (Chatbot Testing)

Bulk message testing

In [None]:
import json

import pandas as pd

#Test prompts for Mistral-7B + CounselChat model
test_prompts = [
    "I feel like I'm constantly overthinking every little thing. How do I make it stop?",
    "Why do I get so anxious when I have to talk to people, even my friends?",
    "I wake up with a knot in my stomach every day. What can I do about it?",
    "Lately I’ve lost interest in everything. Nothing makes me happy anymore.",
    "Is it normal to feel tired all the time even when I’m not physically active?",
    "I feel numb, like nothing really matters. What should I do?",
    "I had a panic attack at work and now I’m afraid it’ll happen again. How do I cope?",
    "My heart races for no reason and I feel like I can't breathe. Am I losing control?",
    "How can I calm down quickly when I feel overwhelmed in public?",
    "No matter what I do, I always feel like I’m not good enough.",
    "I compare myself to everyone and feel like I’m constantly failing.",
    "Why do I always blame myself when something goes wrong?",
    "I always say yes to people even when I’m exhausted. How do I set boundaries?",
    "My partner doesn’t understand my anxiety. It’s hurting our relationship. What should I do?",
    "I feel guilty for putting myself first. Is that selfish?"
]

#Save to TXT (explicit UTF-8: several prompts contain typographic apostrophes)
with open("/content/drive/MyDrive/chatbot/counsel/datasets/test_prompts_counselchat.txt", "w", encoding="utf-8") as txt_file:
    txt_file.write("\n".join(test_prompts))

#Save to JSONL
#json.dumps escapes quotes/backslashes/newlines, so every line stays valid JSON
#whatever the prompt contains. ensure_ascii=False keeps the text human-readable.
with open("/content/drive/MyDrive/chatbot/counsel/datasets/test_prompts_counselchat.jsonl", "w", encoding="utf-8") as jsonl_file:
    for prompt in test_prompts:
        jsonl_file.write(json.dumps({"instruction": prompt}, ensure_ascii=False) + "\n")

#Save to CSV
#A dedicated name avoids overwriting the `df` holding the CounselChat data loaded earlier.
test_prompts_df = pd.DataFrame({"user_prompt": test_prompts})
test_prompts_df.to_csv("/content/drive/MyDrive/chatbot/counsel/datasets/test_prompts_counselchat.csv", index=False)

print("✅ Files saved to Colab:")
print("- test_prompts_counselchat.txt")
print("- test_prompts_counselchat.jsonl")
print("- test_prompts_counselchat.csv")


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig
import json

# Trial 4 adapter directory; its PEFT config records which base model to load.
model_path = "/content/drive/MyDrive/chatbot/counsel/checkpoints/trial4"
config = PeftConfig.from_pretrained(model_path)

# Base weights come from the local cache only (no hub download), in half precision.
base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    device_map="auto",
    torch_dtype=torch.float16,
    local_files_only=True,
)

# Attach the LoRA adapter; the tokenizer is the base model's.
model = PeftModel.from_pretrained(base_model, model_path, local_files_only=True)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path, local_files_only=True)

# Read the saved test prompts: one JSON object per line, text under "instruction".
test_file_path = "/content/drive/MyDrive/chatbot/counsel/datasets/test_prompts_counselchat.jsonl"
with open(test_file_path, "r") as prompt_file:
    prompts = [json.loads(raw_line)["instruction"] for raw_line in prompt_file]

#Define the response function
def generate_response(prompt, max_tokens=150):
    """Return the model's greedy reply to a single user message.

    Args:
        prompt: Raw user message; it is wrapped in the Trial 4 system prompt
            and ``[INST] ... [/INST]`` tags before generation.
        max_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The generated reply only, stripped of surrounding whitespace.
        Relies on the notebook-level ``model`` and ``tokenizer``.
    """
    full_prompt = f"<s>[INST] You are a professional mental health assistant providing thoughtful, multi-step advice to help users with anxiety, stress, and emotional struggles.\n{prompt} [/INST]"
    inputs = tokenizer(full_prompt, return_tensors="pt", truncation=True).to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=False
        )

    # generate() returns prompt + continuation; drop the prompt tokens so the
    # system prompt and user message are not echoed back in the reply.
    return tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

#Generate and print responses
for i, prompt in enumerate(prompts, 1):
    print(f"\n🧠 Prompt {i}: {prompt}")
    print("🤖 Response:\n", generate_response(prompt))


# Tuning - Trial 5

In [None]:
#!pip install -q transformers datasets peft accelerate bitsandbytes

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType
import gc

# Release Python garbage and cached CUDA blocks left over from earlier cells.
gc.collect()
torch.cuda.empty_cache()

# Base checkpoint fine-tuned in this trial.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# Full smart-filtered CounselChat file (2,116 samples), loaded as one "train" split.
smartfiltered_file = "/content/drive/MyDrive/chatbot/counsel/datasets/counselchat_smartfiltered.jsonl"
dataset = load_dataset("json", data_files=smartfiltered_file, split="train")

# Padding reuses the EOS token as the pad token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Half-precision weights, placed automatically across available devices.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
)

#Trial 5 Prompt Style: Updated for variety and tone
def tokenize(example):
    """Encode one instruction/output pair for causal-LM fine-tuning.

    Args:
        example: Mapping with string fields ``"instruction"`` and ``"output"``.

    Returns:
        The tokenizer's encoding of ``prompt + output`` (padded/truncated to 512
        tokens) with an added ``"labels"`` list of the same length. Labels equal
        ``input_ids`` on the answer tokens and are -100 (ignored by the loss) on
        padding and on the prompt.

    Labels are derived from the combined encoding so they stay position-aligned
    with ``input_ids``. Encoding ``output`` separately and padding it on its own
    puts answer tokens at different positions than in the inputs. Padding is
    detected through ``attention_mask`` rather than the token id, because the
    pad token is set to EOS and the real end-of-answer EOS must stay supervised.
    Relies on the notebook-level ``tokenizer``.
    """
    prompt = f"<s>[INST] As a helpful mental health assistant, how would you support someone who says:\n\"{example['instruction']}\" [/INST]"
    output = f" {example['output']}</s>"
    result = tokenizer(prompt + output, padding="max_length", truncation=True, max_length=512)

    input_ids = list(result["input_ids"])
    attention_mask = list(result["attention_mask"])

    # Start from a copy of the inputs, ignoring every padded position.
    labels = [token if keep else -100 for token, keep in zip(input_ids, attention_mask)]

    # Mask the prompt so only the answer contributes to the loss. The first real
    # token is located via the mask, so this works for left or right padding.
    # NOTE(review): assumes the prompt tokenizes to a prefix of the combined
    # text's tokens; a boundary merge could shift this by a token — confirm.
    prompt_len = len(tokenizer(prompt, truncation=True, max_length=512)["input_ids"])
    first_real = attention_mask.index(1) if 1 in attention_mask else len(attention_mask)
    for pos in range(first_real, min(first_real + prompt_len, len(labels))):
        labels[pos] = -100

    result["labels"] = labels
    return result

# Encode every example with `tokenize`; the raw text columns are dropped afterwards.
tokenized_dataset = dataset.map(
    tokenize,
    remove_columns=["instruction", "output"],
)

# LoRA adapter settings — same values as Trial 4.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the adapter and report how many weights will train.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Epoch checkpoints and the final adapter/tokenizer all land in this Drive folder.
trial5_output_dir = "/content/drive/MyDrive/chatbot/counsel/checkpoints/trial5"

training_args = TrainingArguments(
    output_dir=trial5_output_dir,
    num_train_epochs=5,
    learning_rate=1e-5,  # Trial 5 change: half of Trial 4's 2e-5
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # 2 x 4 = 8 examples per optimizer step per device
    fp16=True,
    logging_steps=10,
    save_strategy="epoch",
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# Run fine-tuning.
trainer.train()

# Persist the adapter first, then the tokenizer, next to the checkpoints.
for artifact in (model, tokenizer):
    artifact.save_pretrained(trial5_output_dir)

print("✅ Trial 5 complete! Model saved to Drive.")


# Trial 5 - Evaluation

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig
import evaluate
from tqdm import tqdm
import time

#Load 300-sample evaluation set
#NOTE(review): this slice comes from the same file Trial 5 is fine-tuned on, so the
#scores reflect fit to training data — prefer a held-out split.
eval_dataset = load_dataset(
    "json",
    data_files="/content/drive/MyDrive/chatbot/counsel/datasets/counselchat_smartfiltered.jsonl",
    split="train[:300]"
)

#Load fine-tuned Trial 5 model + PEFT config
model_path = "/content/drive/MyDrive/chatbot/counsel/checkpoints/trial5"
config = PeftConfig.from_pretrained(model_path)

base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    torch_dtype=torch.float16,
    device_map="auto",
    local_files_only=True
)

model = PeftModel.from_pretrained(base_model, model_path, local_files_only=True)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path, local_files_only=True)

#Load evaluation metrics
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

#Inference loop
predictions = []
references = []

# Sampling is enabled below; seed it so the reported metrics can be reproduced.
torch.manual_seed(42)

start = time.time()

for example in tqdm(eval_dataset, desc=f"🔍 Evaluating Trial 5 ({len(eval_dataset)} samples)"):
    prompt = f"<s>[INST] As a helpful mental health assistant, how would you support someone who says:\n\"{example['instruction']}\" [/INST]"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            temperature=0.7,
            top_p=0.95,
            repetition_penalty=1.2
        )

    # generate() returns prompt + continuation; score only the continuation so
    # the metrics compare the answer (not "prompt + answer") with the reference.
    pred = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

    predictions.append(pred)
    references.append(example["output"])

# Stop the clock before metric computation so the average covers generation only.
end = time.time()
avg_gen_time = round((end - start) / len(predictions), 2)

#Compute metrics
bleu_result = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
rouge_result = rouge.compute(predictions=predictions, references=references)
bertscore_result = bertscore.compute(predictions=predictions, references=references, lang="en")

#Print results
print(f"\n📊 Trial 5 Evaluation Results ({len(predictions)} samples):")
print(f"BLEU: {round(bleu_result['bleu'], 4)}")
print(f"ROUGE-1: {round(rouge_result['rouge1'], 4)}")
print(f"ROUGE-2: {round(rouge_result['rouge2'], 4)}")
print(f"ROUGE-L: {round(rouge_result['rougeL'], 4)}")
print(f"BERTScore (F1): {round(sum(bertscore_result['f1']) / len(bertscore_result['f1']), 4)}")
print(f"⏱️ Avg generation time/sample: {avg_gen_time}s")


# Merging - CounselChat + Augmented EmpatheticDialogues For Trial 6

In [None]:
import json

# Read both JSONL sources. Blank lines (e.g. a trailing newline) are skipped
# rather than crashing json.loads.
with open("/content/drive/MyDrive/chatbot/counsel/datasets/counselchat_smartfiltered.jsonl", "r", encoding="utf-8") as f:
    counsel_data = [json.loads(line) for line in f if line.strip()]

with open("/content/drive/MyDrive/SIT782/datasets/empdiag_augmented_4100.jsonl", "r", encoding="utf-8") as f:
    empathetic_data = [json.loads(line) for line in f if line.strip()]

# Validate every record (not just the first) and fail clearly on an empty file.
assert counsel_data and all(
    "instruction" in entry and "output" in entry for entry in counsel_data
), "❌ CounselChat format issue"
assert empathetic_data and all(
    "instruction" in entry and "output" in entry for entry in empathetic_data
), "❌ Empathetic format issue"

merged_data = counsel_data + empathetic_data

# De-duplicate on the canonical JSON text while keeping first-seen order.
# A plain set would order records by string hash, which changes between kernel
# sessions and makes any later "first N rows" slice of this file unrepeatable.
# The file is therefore ordered CounselChat first, then EmpatheticDialogues;
# shuffle with a fixed seed downstream if a mixed slice is needed.
unique_jsons = list(dict.fromkeys(json.dumps(entry, sort_keys=True) for entry in merged_data))
merged_cleaned = [json.loads(j) for j in unique_jsons]

output_path = "/content/drive/MyDrive/chatbot/counsel/datasets/merged_counsel_empathetic.jsonl"
with open(output_path, "w", encoding="utf-8") as f:
    for sample in merged_cleaned:
        json.dump(sample, f)
        f.write("\n")

#final report
print(f"Merged dataset saved to: {output_path}")
print(f"Total samples after deduplication: {len(merged_cleaned)}")


dataset analysis

In [None]:
import json

# Read the merged JSONL back into memory, one record per line.
with open("/content/drive/MyDrive/chatbot/counsel/datasets/merged_counsel_empathetic.jsonl", "r") as f:
    merged_data = [json.loads(line) for line in f]

total_samples = len(merged_data)

# Lengths are whitespace-separated word counts, not tokenizer tokens.
instruction_lengths = []
output_lengths = []
for entry in merged_data:
    instruction_lengths.append(len(entry["instruction"].split()))
    output_lengths.append(len(entry["output"].split()))

# Number of replies shorter than 20 words.
short_outputs = len([n_words for n_words in output_lengths if n_words < 20])

# Summary table; left as the cell's last expression so the notebook displays it.
dataset_stats = {
    "Total Samples": total_samples,
    "Avg Instruction Length (words)": round(sum(instruction_lengths) / total_samples, 2),
    "Avg Output Length (words)": round(sum(output_lengths) / total_samples, 2),
    "Min Instruction Length": min(instruction_lengths),
    "Max Instruction Length": max(instruction_lengths),
    "Min Output Length": min(output_lengths),
    "Max Output Length": max(output_lengths),
    "Samples with Output < 20 words": short_outputs,
}

dataset_stats


Why We Removed Outputs with Fewer Than 20 Words (Trial 6 Dataset)

In our merged dataset (CounselChat + augmented EmpatheticDialogues), many samples had very short responses, often under 20 words. While these samples may be realistic, they:

- Lack informative content or emotional depth
- Hurt phrase-overlap metrics like ROUGE-2 and BLEU
- Introduce noise or overly generic replies into training

To improve model quality, especially for long-form conversational replies, we created a cleaned dataset where each response (output) has at least 20 words.

This refined dataset (Trial 6) is intended to:

- Boost evaluation scores (ROUGE, BLEU, BERTScore)
- Encourage more complete, helpful responses
- Train the model to match empathetic, multi-step advice

In [None]:
import json

# Source: the merged, de-duplicated JSONL produced earlier.
merged_path = "/content/drive/MyDrive/chatbot/counsel/datasets/merged_counsel_empathetic.jsonl"
with open(merged_path, "r") as f:
    merged_data = list(map(json.loads, f))

# Keep a record only when its reply has at least 20 whitespace-separated words.
filtered_data = []
for sample in merged_data:
    if len(sample["output"].split()) >= 20:
        filtered_data.append(sample)

# Write the survivors back out, one JSON object per line.
trial6_path = "/content/drive/MyDrive/chatbot/counsel/datasets/merged_counsel_empathetic_cleaned_trial6.jsonl"
with open(trial6_path, "w") as f:
    f.writelines(json.dumps(sample) + "\n" for sample in filtered_data)

removed_count = len(merged_data) - len(filtered_data)
print("Trial 6 dataset saved!")
print(f"Total samples (after filter): {len(filtered_data)}")
print(f"Samples removed (< 20 words): {removed_count}")


# Trial 6 Tuning

🔁 Trial 6 Fine-Tuning Script (merged_counsel_empathetic_cleaned_trial6.jsonl)

In [None]:
import torch
import gc
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType

# Release Python garbage and cached CUDA blocks left over from earlier cells.
gc.collect()
torch.cuda.empty_cache()

# Base checkpoint fine-tuned in this trial.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# Trial 6 data: merged CounselChat + EmpatheticDialogues after the 20-word filter.
trial6_data_file = "/content/drive/MyDrive/chatbot/counsel/datasets/merged_counsel_empathetic_cleaned_trial6.jsonl"
dataset = load_dataset("json", data_files=trial6_data_file, split="train")

# Padding reuses the EOS token as the pad token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Half-precision weights, placed automatically across available devices.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
)
# KV cache is switched off for training (original note: "Prevent rotary crash").
model.config.use_cache = False

#Preprocessing function (batched-safe)
def preprocess(batch):
    """Encode a batch of instruction/output pairs into fixed 512-token examples.

    Args:
        batch: Mapping with parallel lists under ``"instruction"`` and ``"output"``
            (the shape ``Dataset.map(..., batched=True)`` passes in).

    Returns:
        Dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``, each a
        list of 512-long lists. A row is laid out as
        ``[prompt tokens][answer tokens][EOS][padding]``. Labels are -100 on the
        prompt and on padding, so the loss covers the answer and its closing EOS.

    Compared with padding the prompt and answer separately and concatenating:
    padding goes at the end instead of between the two halves, padded positions
    are never used as targets, the answer gets no second BOS, and an EOS is
    appended so the model sees where a reply ends. Relies on the notebook-level
    ``tokenizer``.

    NOTE(review): the prompt here has no ``[INST] ... [/INST]`` wrapper, while the
    Trial 6 inference cells in this notebook add one — confirm which template is
    intended and align the two.
    """
    max_prompt_len = 256
    max_total_len = 512
    # Reserve one slot for the EOS appended after the answer.
    max_answer_len = max_total_len - max_prompt_len - 1

    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id

    input_ids_list = []
    attention_mask_list = []
    labels_list = []

    for instruction, output in zip(batch["instruction"], batch["output"]):
        prompt = f"You are a helpful mental health assistant. Respond empathetically and with clarity:\n{instruction}"

        prompt_ids = list(tokenizer(prompt, truncation=True, max_length=max_prompt_len)["input_ids"])
        # add_special_tokens=False: the prompt already carries any sequence-start token.
        answer_ids = list(
            tokenizer(output, truncation=True, max_length=max_answer_len, add_special_tokens=False)["input_ids"]
        )
        answer_ids.append(tokenizer.eos_token_id)

        input_ids = prompt_ids + answer_ids
        attention_mask = [1] * len(input_ids)
        labels = [-100] * len(prompt_ids) + answer_ids

        # Pad at the end; mask and labels mark the padding as ignored.
        pad_len = max_total_len - len(input_ids)
        input_ids_list.append(input_ids + [pad_id] * pad_len)
        attention_mask_list.append(attention_mask + [0] * pad_len)
        labels_list.append(labels + [-100] * pad_len)

    return {
        "input_ids": input_ids_list,
        "attention_mask": attention_mask_list,
        "labels": labels_list
    }

# Encode the whole dataset in batches; every original column is replaced by the
# tensors `preprocess` returns.
tokenized_dataset = dataset.map(
    preprocess,
    remove_columns=dataset.column_names,
    batched=True,
)

# LoRA adapter settings for Trial 6: lower rank than Trials 4/5, and adapters
# attached to the MLP projections only (original note: "rotary-safe").
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["gate_proj", "up_proj", "down_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the adapter and report how many weights will train.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Epoch checkpoints and the final adapter/tokenizer all land in this Drive folder.
trial6_output_dir = "/content/drive/MyDrive/chatbot/counsel/checkpoints/trial6"

training_args = TrainingArguments(
    output_dir=trial6_output_dir,
    run_name="mistral_trial6",
    num_train_epochs=4,
    learning_rate=1e-5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # 2 x 4 = 8 examples per optimizer step per device
    fp16=True,
    logging_steps=20,
    save_strategy="epoch",
    report_to=[],
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# Run fine-tuning.
trainer.train()

# Persist the adapter first, then the tokenizer, next to the checkpoints.
for artifact in (model, tokenizer):
    artifact.save_pretrained(trial6_output_dir)

print("✅ Trial 6 complete and saved!")

# Trial 6 - Evaluation

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import evaluate
from tqdm import tqdm

# Load the Trial 6 tokenizer and model straight from the checkpoint folder.
model_path = "/content/drive/MyDrive/chatbot/counsel/checkpoints/trial6"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map="auto")

#Load evaluation dataset (300 samples for speed)
#NOTE(review): this slice comes from the same file Trial 6 is fine-tuned on, so the
#scores reflect fit to training data — prefer a held-out split.
eval_dataset = load_dataset(
    "json",
    data_files="/content/drive/MyDrive/chatbot/counsel/datasets/merged_counsel_empathetic_cleaned_trial6.jsonl",
    split="train[:300]"
)

bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")


predictions = []
references = []

for example in tqdm(eval_dataset, desc=f"🔍 Evaluating Trial 6 ({len(eval_dataset)} samples)"):
    input_text = f"<s>[INST] You are a helpful mental health assistant. Respond empathetically and with clarity:\n{example['instruction']} [/INST]"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True).to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=200, pad_token_id=tokenizer.eos_token_id)

    # generate() returns prompt + continuation; score only the continuation so
    # the metrics compare the answer (not "prompt + answer") with the reference.
    pred = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

    predictions.append(pred)
    references.append(example["output"])

bleu_result = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
rouge_result = rouge.compute(predictions=predictions, references=references)
bertscore_result = bertscore.compute(predictions=predictions, references=references, lang="en")

# Left as the cell's last expression so the notebook displays the scores.
{
    "BLEU": round(bleu_result["bleu"], 4),
    "ROUGE-1": round(rouge_result["rouge1"], 4),
    "ROUGE-2": round(rouge_result["rouge2"], 4),
    "ROUGE-L": round(rouge_result["rougeL"], 4),
    "BERTScore (F1)": round(sum(bertscore_result["f1"]) / len(bertscore_result["f1"]), 4)
}


📊 Trial 6 Evaluation Comparison: Mistral + CounselChat Trials

| Trial   | Dataset                        | Samples | LoRA Params (r/α) | Epochs | BLEU   | ROUGE-1 | ROUGE-2 | ROUGE-L | BERTScore (F1) |
|---------|--------------------------------|---------|-------------------|--------|--------|----------|----------|----------|----------------|
| Trial 1 | counselchat_cleaned.jsonl    | ~890    | 8 / 16            | 3      | 0.0203 | 0.2888   | 0.0429   | 0.1349   | 0.8311         |
| Trial 2 | Same as Trial 1                | ~890    | 16 / 32           | 4      | 0.0173 | 0.2315   | 0.0334   | 0.1220   | 0.8076         |
| Trial 3 | Real + 200 Synthetic           | ~1,015  | 16 / 32           | 4      | 0.0160 | 0.2163   | 0.0291   | 0.1134   | 0.8058         |
| Trial 4 | Smart-filtered CounselChat     | ~2,116  | 16 / 32           | 5      | 0.0222 | 0.2384   | 0.0387   | 0.1225   | 0.8144         |
| Trial 5 | Same dataset, prompt tweaks    | ~2,116  | 16 / 32           | 5      | 0.0243 | 0.3071   | 0.0416   | 0.1307   | 0.8344         |
| **Trial 6** | Merged (filtered counsel) + Empathetic (augmented) | 2,277 | 8 / 16            | 4      | **0.0280** | **0.2967** | **0.0540** | **0.1442** | **0.8328**       |




In [None]:
#!pip install -q transformers datasets evaluate bert_score tqdm

import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig
import evaluate
from tqdm import tqdm

# Draw a fixed random sample of 500 rows from MentalChat16K (seeded shuffle, so
# the same rows are selected on every run).
eval_data = load_dataset("ShenLab/MentalChat16K", split="train")
shuffled_eval_data = eval_data.shuffle(seed=42)
eval_subset = shuffled_eval_data.select(range(500))

# Load the fine-tuned Trial 6 tokenizer and model from the checkpoint folder.
model_path = "/content/drive/MyDrive/chatbot/counsel/checkpoints/trial6"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype=torch.float16,
)

#Response generator
def generate_response(prompt):
    """Sample one reply for an already-formatted prompt.

    Args:
        prompt: Full prompt text, including any ``[INST]`` wrapper; it is
            truncated to 350 tokens.

    Returns:
        The generated reply only, stripped of surrounding whitespace.
        Relies on the notebook-level ``model`` and ``tokenizer``.
    """
    # Use the model's own device instead of assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=350).to(model.device)
    prompt_len = inputs["input_ids"].shape[1]
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=180,
        temperature=0.6,
        top_p=0.9,
        do_sample=True,
        repetition_penalty=1.1,
        pad_token_id=tokenizer.eos_token_id
    )
    # generate() returns prompt + continuation; drop the prompt tokens so the
    # caller gets (and scores) only the model's answer.
    return tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

#Generate predictions with live progress
preds, refs = [], []

# generate_response samples; seed so the reported metrics can be reproduced.
torch.manual_seed(42)

print("🧠 Generating responses on 500 MentalChat16K samples...")
for sample in tqdm(eval_subset, desc="Evaluating"):
    # NOTE(review): in MentalChat16K the patient's message appears to live in the
    # "input" column, with "instruction" holding a fixed system prompt — verify
    # against the dataset card. Falls back to "instruction" when "input" is
    # missing or empty.
    user_msg = sample.get("input") or sample["instruction"]
    ref_response = sample["output"]
    prompt = f"<s>[INST] You are a kind and supportive mental health assistant who responds empathetically to this user message:\n{user_msg} [/INST]"
    gen_response = generate_response(prompt)
    preds.append(gen_response)
    refs.append(ref_response)

#Run evaluation metrics
print("\n📊 Running BLEU, ROUGE, and BERTScore evaluations...\n")

bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

# Scores are printed as percentages (x100), unlike the 0-1 values in the trial tables.
print("🔵 BLEU:", bleu.compute(predictions=preds, references=refs)["bleu"] * 100)

rouge_scores = rouge.compute(predictions=preds, references=refs)
print("🟥 ROUGE-1:", rouge_scores["rouge1"] * 100)
print("🟥 ROUGE-2:", rouge_scores["rouge2"] * 100)
print("🟥 ROUGE-L:", rouge_scores["rougeL"] * 100)

bert_result = bertscore.compute(predictions=preds, references=refs, lang="en")
print("🟩 BERTScore (F1):", sum(bert_result["f1"]) / len(bert_result["f1"]) * 100)


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# 🔹 Load your fine-tuned model from a specific trial
model_path = "/content/drive/MyDrive/chatbot/counsel/checkpoints/trial6"  # change as needed
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map="auto")

# 🔹 List of saved user questions (manually added or loaded from file)
# Every entry ends with a comma so appending a prompt cannot silently merge it
# with the previous string literal.
prompts = [
    "I feel like I'm constantly overthinking every little thing. How do I make it stop?",
    "Why do I get so anxious when I have to talk to people, even my friends?",
    "I wake up with a knot in my stomach every day. What can I do about it?",
    "im being bullied in my college no one to talk",
    "i like to dance and sing",
    # ...add more prompts as needed
]

# 🔹 Generate and print responses (no saving)
for i, question in enumerate(prompts, 1):
    full_prompt = f"<s>[INST] You are a kind and supportive mental health assistant who responds empathetically to this user message:\n{question} [/INST]"
    # Use the model's own device instead of assuming "cuda".
    inputs = tokenizer(full_prompt, return_tensors="pt", truncation=True, padding=True).to(model.device)
    prompt_len = inputs["input_ids"].shape[1]
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=200, temperature=0.6, top_p=0.9, do_sample=True, pad_token_id=tokenizer.eos_token_id)
    # generate() returns prompt + continuation; print only the model's reply.
    response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
    print(f"🧠 Prompt {i}: {question}\n🤖 Response: {response}\n{'-'*80}")


Trial 6 Model (Chatbot Testing)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig
import json
from tqdm import tqdm

#Trial 6 model path
model_path = "/content/drive/MyDrive/chatbot/counsel/checkpoints/trial6"

#Load model + tokenizer
config = PeftConfig.from_pretrained(model_path)
base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    torch_dtype=torch.float16,
    device_map="auto"
)
model = PeftModel.from_pretrained(base_model, model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

#Load test prompts
test_file_path = "/content/drive/MyDrive/chatbot/counsel/datasets/test_prompts_counselchat.jsonl"
with open(test_file_path, "r") as f:
    test_data = [json.loads(line) for line in f]

#Run inference
results = []
for sample in tqdm(test_data, desc="🧠 Running Trial 6 Inference"):
    prompt = f"<s>[INST] You are a helpful mental health assistant. Respond empathetically and with clarity:\n{sample['instruction']} [/INST]"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=200, pad_token_id=tokenizer.eos_token_id)

    # generate() returns prompt + continuation; keep only the reply so the
    # "model_response" column does not repeat the system prompt and question.
    decoded = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)

    results.append({
        "user_prompt": sample["instruction"],
        "model_response": decoded.strip()
    })

#Display responses
for idx, r in enumerate(results, 1):
    print(f"\n🧠 Prompt {idx}: {r['user_prompt']}\n🤖 Response: {r['model_response']}")

#Save to CSV
#NOTE: /content is the Colab VM's local disk, not Drive, so this file does not
#survive a runtime reset.
import pandas as pd
df = pd.DataFrame(results)
df.to_csv("/content/trial6_test_outputs.csv", index=False)
print("\n✅ Responses saved to trial6_test_outputs.csv")


# FINAL ANDROID APP BACKEND

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
from flask import Flask, request, jsonify
from flask_cors import CORS
from pyngrok import ngrok
import os, threading

# Trial 6 adapter directory served by the Android backend.
model_path = "/content/drive/MyDrive/chatbot/counsel/checkpoints/trial6"

# 4-bit NF4 quantization with double quantization; matmuls are computed in fp16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# The adapter's PEFT config names the base model; load it quantized, then
# attach the LoRA adapter on top. The tokenizer comes from the checkpoint folder.
config = PeftConfig.from_pretrained(model_path)
base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    device_map="auto",
    quantization_config=bnb_config,
)
model = PeftModel.from_pretrained(base_model, model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Lower-case phrases that short-circuit generation and return SAFETY_RESPONSE.
SAFETY_KEYWORDS = [
    "kill myself",
    "end my life",
    "suicide",
    "hurt myself",
    "self-harm",
    "i want to die",
    "not worth living",
    "give up on life",
]

# Fixed crisis message pointing the user to Lifeline Australia.
SAFETY_RESPONSE = (
    "💙 I'm really sorry you're feeling this way. "
    "You're not alone. "
    "Please seek help from a professional. "
    "You can contact Lifeline Australia at 13 11 14, available 24/7 for support."
)

# Flask app with CORS enabled.
app = Flask(__name__)
CORS(app)

@app.route("/chat", methods=["POST"])
def chat():
    """Handle one chat turn for the mobile client.

    Expects a JSON object body with:
        message: The user's text (required, non-empty string).
        mode: ``"short"`` or ``"long"`` (default ``"long"``); any other value
            uses a generic instruction and the short token budget.

    Returns:
        ``{"response": <text>}`` on success. ``{"error": <text>}`` with HTTP 400
        when the body is not a JSON object or ``message`` is missing, empty or
        not a string. Messages matching a safety keyword get the fixed
        ``SAFETY_RESPONSE`` without calling the model.
    """
    # silent=True turns a missing or malformed JSON body into None so we can
    # answer with a clear 400 instead of an unhandled exception.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    user_input = data.get("message", "")
    if not isinstance(user_input, str) or not user_input.strip():
        return jsonify({"error": "Field 'message' must be a non-empty string."}), 400
    user_input = user_input.strip()

    # Coerce to str so an unhashable value (e.g. a list) cannot break the lookup below.
    mode = str(data.get("mode", "long"))  # 'short' or 'long'

    if any(keyword in user_input.lower() for keyword in SAFETY_KEYWORDS):
        return jsonify({"response": SAFETY_RESPONSE})

    style_instruction = {
        "short": "Keep it brief, supportive, and to the point (under 100 words).",
        "long": "Give a detailed and compassionate response with examples and encouragement (around 200 words)."
    }.get(mode, "Give a helpful and supportive response.")

    prompt = (
        f"<s>[INST] You are a kind and supportive mental health assistant.\n"
        f"{style_instruction}\nUser: {user_input}\n[/INST]"
    )

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=512).to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    max_tokens = 400 if mode == "long" else 120

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=0.6,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # Keep only the newly generated tokens. Splitting the decoded text on
    # "[/INST]" breaks when truncation cuts the closing tag off a long message
    # (the whole prompt is then echoed back) or when the user types the tag.
    response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

    return jsonify({"response": response})


# Authenticate ngrok with the value stored under the HF_TOKEN environment variable.
ngrok_token = os.getenv("HF_TOKEN")
ngrok.set_auth_token(ngrok_token)


def run_flask():
    """Start the Flask app with its default host and port (blocks until it exits)."""
    app.run()


# Serve Flask from a background thread so the cell returns, then open an ngrok
# tunnel to port 5000 and print its public address.
flask_thread = threading.Thread(target=run_flask)
flask_thread.start()

public_url = ngrok.connect(5000)
print(f"Public URL to access chatbot API: {public_url}")


In [None]:
# Cleanup: stop leftover server processes and close ngrok before starting the next UI.
# `pkill -f` matches against the full command line of running processes.
# NOTE(review): the Flask backend in this notebook is started in a thread inside
# the kernel, not as a separate uvicorn process, so these pkill calls presumably
# do not stop it — confirm whether a kernel restart is needed.
!pkill -f uvicorn
!pkill -f streamlit
# Shut down the ngrok process via pyngrok.
ngrok.kill()

# Final Web UI

In [None]:
# Source of the Streamlit app, written to app.py by the statement that follows.
# The literal is a regular (non-raw) string, so the `\n` escapes inside it are
# expanded when this cell runs; the prompt pieces therefore use triple-quoted
# f-strings, which are allowed to span lines in the generated file.
code = '''

import html

import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel, PeftConfig

# --- Page Config ---
st.set_page_config(page_title="🧠 CounselChat", page_icon="💬", layout="centered")

# --- Model Path ---
model_path = "/content/drive/MyDrive/chatbot/counsel/checkpoints/trial6"

# --- Bits & Bytes Config ---
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4"
)

# --- Load Model & Tokenizer ---
@st.cache_resource(show_spinner=True)
def load_model():
    """Load the quantized base model, attach the PEFT adapter, and return (model, tokenizer)."""
    config = PeftConfig.from_pretrained(model_path)
    base_model = AutoModelForCausalLM.from_pretrained(
        config.base_model_name_or_path,
        quantization_config=bnb_config,
        device_map="auto"
    )
    model = PeftModel.from_pretrained(base_model, model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    return model, tokenizer

with st.spinner("🔄 Loading model..."):
    model, tokenizer = load_model()

# --- Safety Filter ---
SAFETY_KEYWORDS = [
    "kill myself", "end my life", "suicide", "hurt myself", "self-harm",
    "i want to die", "not worth living", "give up on life"
]

SAFETY_RESPONSE = (
    "💙 I'm really sorry you're feeling this way. You're not alone. "
    "Please seek help from a professional. You can contact Lifeline Australia at 13 11 14, "
    "available 24/7 for support."
)

# --- Chat History Container (Scrollable & Styled) ---
st.markdown("""
    <style>
        .chat-container {
            max-height: 500px;
            overflow-y: auto;
            padding-right: 10px;
            margin-bottom: 15px;
        }
        .user-bubble {
            background-color: #DCF8C6;
            color: #000000;
            padding: 10px;
            border-radius: 15px;
            text-align: right;
            width: fit-content;
            margin-left: auto;
            margin-bottom: 10px;
        }
        .bot-bubble {
            background-color: #F0F0F0;
            color: #000000;
            padding: 10px;
            border-radius: 15px;
            text-align: left;
            width: fit-content;
            margin-right: auto;
            margin-bottom: 10px;
        }
    </style>
""", unsafe_allow_html=True)

# --- Header ---
st.markdown("""
    <div style='text-align: center;'>
        <h1>🧠 CounselChat</h1>
        <p>Your empathetic mental health assistant</p>
    </div>
""", unsafe_allow_html=True)

# --- Chat Form ---
mode = st.radio("Select Response Style:", options=["short", "long"], horizontal=True)
user_input = st.text_input("💬 Type your message:")

# --- Trigger Chat ---
if st.button("Send") and user_input:
    with st.container():
        st.markdown("<div class='chat-container'>", unsafe_allow_html=True)

        # Safety filter: answer with the fixed crisis message, never the model
        if any(keyword in user_input.lower() for keyword in SAFETY_KEYWORDS):
            st.markdown(f"<div class='bot-bubble'>{SAFETY_RESPONSE}</div>", unsafe_allow_html=True)
        else:
            with st.spinner("🤖 Thinking..."):
                style_instruction = {
                    "short": "Keep it brief, supportive, and to the point (under 100 words).",
                    "long": "Give a detailed and compassionate response with examples and encouragement (around 200 words)."
                }.get(mode, "Give a helpful and supportive response.")

                prompt = (
                    f"""<s>[INST] You are a kind and supportive mental health assistant.\n"""
                    f"""{style_instruction}\nUser: {user_input}\n[/INST]"""
                )

                inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=512).to(model.device)
                max_tokens = 400 if mode == "long" else 120

                with torch.no_grad():
                    outputs = model.generate(
                        **inputs,
                        max_new_tokens=max_tokens,
                        temperature=0.6,
                        top_p=0.9,
                        do_sample=True,
                        pad_token_id=tokenizer.eos_token_id,
                        eos_token_id=tokenizer.eos_token_id
                    )

                # generate() returns the prompt followed by the continuation for a
                # causal LM; keep only the new tokens instead of splitting on the
                # instruction marker, which breaks if that marker shows up in the text.
                prompt_len = inputs["input_ids"].shape[-1]
                response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

                # Show conversation. Both strings are untrusted and are rendered with
                # unsafe_allow_html=True, so escape them before embedding in markup.
                st.markdown(f"<div class='user-bubble'>{html.escape(user_input)}</div>", unsafe_allow_html=True)
                st.markdown(f"<div class='bot-bubble'>🤖 {html.escape(response)}</div>", unsafe_allow_html=True)

        st.markdown("</div>", unsafe_allow_html=True)

'''
# Save to app.py. The app source contains emoji and other non-ASCII text, so
# write it as UTF-8 explicitly rather than relying on the platform default.
with open("app.py", "w", encoding="utf-8") as f:
    f.write(code)

In [None]:
# Start the Streamlit app in the background (`&`) and open a localtunnel to port 8501.
# NOTE(review): assumes Streamlit is listening on 8501 — confirm, or pass the port explicitly.
!streamlit run app.py & npx localtunnel --port 8501


In [None]:
# Kill any process whose command line matches "streamlit".
!pkill -f streamlit

In [None]:
# NOTE(review): looks like a leftover scratch/sanity-check cell — consider removing.
print("hello")

# Evaluation Summary for Counsel Chat + Mistral 7B Model Trials


### Evaluation Summary

| Trial | Dataset Used                            | Synthetic Data | Prompt Style                                                | LoRA Config (r/α) | Epochs | BLEU   | ROUGE-1 | ROUGE-2 | ROUGE-L | BERTScore (F1) | Notes                          |
|-------|------------------------------------------|----------------|--------------------------------------------------------------|-------------------|--------|--------|----------|----------|----------|----------------|---------------------------------|
| 1     | CounselChat (890)                        | ❌ No          | "You are a professional mental health assistant..."         | r=8 / α=16        | 3      | 0.0203 | 0.2888   | 0.0429   | 0.1349   | 0.8311         | Baseline trial                 |
| 2     | CounselChat (same)                       | ❌ No          | Same as Trial 1                                              | r=16 / α=32       | 4      | 0.0173 | 0.2315   | 0.0334   | 0.122    | 0.8076         | Slight drop in scores          |
| 3     | CounselChat + 200 synthetic              | ✅ Yes         | Same as Trial 1                                              | r=16 / α=32       | 4      | 0.0228 | 0.2984   | 0.0461   | 0.1387   | 0.8331         | Better than Trial 1 & 2        |
| 4     | Smart-filtered CounselChat (2,116)       | ❌ No          | Same as Trial 1                                              | r=16 / α=32       | 5      | 0.0222 | 0.2384   | 0.0387   | 0.1225   | 0.8144         | Fast mode, improved token len  |
| 5     | Same as Trial 4                          | ❌ No          | "How would you support someone who says..." variation        | r=16 / α=32       | 5      | 0.0243 | 0.3071   | 0.0416   | 0.1307   | 0.8344         | Most human-like tone           |
| 6     | Merged CounselChat + Empathetic (cleaned)| ✅ Yes (merged)| "You are a helpful mental health assistant..." (rotary-safe) | r=8 / α=16        | 4      | 0.028  | 0.2967   | 0.054    | 0.1442   | 0.8328         | ✅ Best overall on clean eval   |
| 6★    | Trial 6 on MentalChat16K (general test)  | ✅ Yes         | Same as above                                                | r=8 / α=16        | —      | 1.91   | 31.90    | 5.00     | 13.57    | 83.33%         | ✅ Best generalization ability (scores in this row are on a 0–100 scale; other rows use 0–1) |


In [None]:
from nbformat import read, write
from nbformat import NO_CONVERT

# Input notebook and the cleaned copy written alongside it on Drive.
SRC_NOTEBOOK = "/content/drive/MyDrive/Colab Notebooks/Copy-of-counsel-mistral7b.ipynb"
CLEANED_NOTEBOOK = "/content/drive/MyDrive/Colab Notebooks/counsel-mistral7b-no_output-cleaned_notebook.ipynb"

# Notebook files are JSON text; open them as UTF-8 explicitly instead of
# depending on the platform's default encoding.
with open(SRC_NOTEBOOK, encoding="utf-8") as f:
    nb = read(f, as_version=NO_CONVERT)

# Remove invalid widget metadata: a "widgets" entry that has no "state" key.
# NOTE(review): cell outputs are not cleared here even though the output
# filename says "no_output" — confirm that happens elsewhere.
if "widgets" in nb.metadata and "state" not in nb.metadata["widgets"]:
    del nb.metadata["widgets"]

with open(CLEANED_NOTEBOOK, "w", encoding="utf-8") as f:
    write(nb, f)
