In [None]:
!pip install -U bitsandbytes
!pip install -U transformers
!pip install -U transformers accelerate datasets peft bitsandbytes trl
!pip install -U huggingface_hub
pip install rouge-score bert-score nltk

In [None]:
from huggingface_hub import notebook_login
# Authenticate this notebook session with the Hugging Face Hub.
# NOTE(review): presumably required to download the base checkpoint used
# below — confirm whether it is gated for your account.
notebook_login()

In [None]:
import os
import torch
import gc
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset

# Base checkpoint to fine-tune and the local JSONL file with the training pairs.
base_model = "mistralai/Mistral-7B-Instruct-v0.2"
data_path = "datasets/empathetic_dialogues.jsonl"

# Only the first 300 records of the "train" split are used for these trials.
raw_splits = load_dataset("json", data_files=data_path)
dataset = raw_splits["train"].select(range(300))

# The tokenizer reuses its end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.pad_token = tokenizer.eos_token

def tokenize(example):
    """Tokenize instruction/response pairs for causal-LM fine-tuning.

    A decoder-only model is trained to predict each token of a *single*
    sequence, so the prompt and the response are joined into one text and
    ``labels`` mirrors ``input_ids``. (Passing the response via
    ``text_target`` instead yields labels that are a separate sequence,
    unrelated position-by-position to ``input_ids``.)

    Padding positions are set to ``-100`` in ``labels`` so the loss ignores
    them. The attention mask is used to find them because the pad token is
    the same id as EOS and cannot be recognised by id alone.

    Args:
        example: Mapping with ``"instruction"`` and ``"output"``. Each is
            either a list of strings (``batched=True``) or a single string.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``, every
        sequence padded/truncated to 512 tokens. Batched input gives lists
        of sequences; a single example gives flat sequences.
    """
    prompts, replies = example["instruction"], example["output"]
    is_single = isinstance(prompts, str)
    if is_single:
        prompts, replies = [prompts], [replies]

    # Terminate every response with EOS so the model learns where to stop.
    # NOTE(review): a single space separates prompt and response — adjust if
    # the dataset's instructions already end with their own delimiter.
    eos = tokenizer.eos_token or ""
    texts = [f"{prompt} {reply}{eos}" for prompt, reply in zip(prompts, replies)]

    encoded = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=512,
    )

    labels = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    features = {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
        "labels": labels,
    }

    if is_single:
        return {key: rows[0] for key, rows in features.items()}
    return features

# Run `tokenize` over the dataset in batches; the result replaces `dataset`,
# which is later handed to the Trainer as `train_dataset`.
dataset = dataset.map(tokenize, batched=True)

# Hyperparameter combinations to sweep, one dict per trial:
#   lr - learning rate
#   r  - LoRA rank
#   bs - used as gradient_accumulation_steps (per-device batch size stays 1)
trials = [
    dict(lr=2e-4, r=4, bs=2),
    dict(lr=3e-4, r=4, bs=2),
]

# Train one LoRA adapter per entry in `trials`. A trial whose "final"
# directory already exists is skipped, so the cell can be re-run after an
# interruption. A failing trial is reported and the sweep moves on.
for idx, trial in enumerate(trials):
    print(f"\nStarting Trial #{idx+1} — LR: {trial['lr']}, LoRA r: {trial['r']}, BS: {trial['bs']}")
    out_dir = f"hparam2_trials/trial_{idx+1}_lr{trial['lr']}_r{trial['r']}_bs{trial['bs']}"
    final_dir = os.path.join(out_dir, "final")

    if os.path.exists(final_dir):
        print("Skipping: Already exists")
        continue

    # Bind both names *before* the try block. If loading the model fails,
    # the cleanup in `finally` would otherwise hit an unbound `model`
    # (NameError), hiding the real error and aborting the remaining trials.
    model = None
    trainer = None

    try:
        # Load the base checkpoint in half precision, placed automatically
        # across the available devices.
        model = AutoModelForCausalLM.from_pretrained(
            base_model,
            torch_dtype=torch.float16,
            device_map="auto",
            low_cpu_mem_usage=True,
        )

        # Enabled before wrapping with PEFT; trades compute for activation memory.
        model.gradient_checkpointing_enable()

        # LoRA alpha is tied to the rank (alpha = 2 * r).
        peft_config = LoraConfig(
            r=trial['r'],
            lora_alpha=trial['r'] * 2,
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        )
        model = get_peft_model(model, peft_config)

        # Effective batch size = 1 (per device) x trial['bs'] accumulation steps.
        # No intermediate checkpoints are written (save_strategy="no").
        training_args = TrainingArguments(
            output_dir=out_dir,
            num_train_epochs=3,
            per_device_train_batch_size=1,
            gradient_accumulation_steps=trial['bs'],
            learning_rate=trial['lr'],
            logging_dir=os.path.join(out_dir, "logs"),
            save_strategy="no",
            report_to="none",
            fp16=True,
            logging_steps=10,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=dataset,
            tokenizer=tokenizer,
        )

        trainer.train()

        # Save the trained PEFT model and the tokenizer side by side so the
        # directory can be loaded on its own later.
        model.save_pretrained(final_dir)
        tokenizer.save_pretrained(final_dir)
        print("Trial complete.")

    except Exception as e:
        # Best effort: report the failure and continue with the next trial.
        print(f"Trial #{idx+1} failed: {str(e)}")

    finally:
        # Drop the Python references first, then collect, then ask the CUDA
        # caching allocator to release the freed blocks before the next trial.
        del model, trainer
        gc.collect()
        torch.cuda.empty_cache()

The loop above trains and saves two trial models, one per entry in `trials`.

Evaluation

In [None]:
import os
import torch
from tqdm import tqdm
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score as bert_score
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import json

# --- Configuration ---------------------------------------------------------
model_path = "hparam2_trials/trial_1_lr0.0002_r4_bs2/final"
data_path = "datasets/empathetic_dialogues.jsonl"
eval_output = "hparam2_trials/eval2_trial1_300sample.json"

# --- Model and data --------------------------------------------------------
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16).cuda()
tokenizer = AutoTokenizer.from_pretrained(model_path)

# NOTE(review): these are the same first 300 rows the trials were trained on,
# so the scores measure fit to the training data, not generalization.
dataset = load_dataset("json", data_files=data_path)["train"].select(range(300))

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
smooth_fn = SmoothingFunction().method1

all_bleu, all_rouge1, all_rouge2, all_rougeL = [], [], [], []
predictions, references = [], []

# --- Generation and per-example lexical metrics -----------------------------
print("Generating predictions...")
for item in tqdm(dataset):
    prompt, ref = item["instruction"], item["output"]

    # Keep the attention mask with the ids so generate() does not have to
    # guess it (pad and EOS share an id).
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
    with torch.no_grad():
        outputs = model.generate(**encoded, max_new_tokens=100, pad_token_id=tokenizer.eos_token_id)

    # For a decoder-only model the returned sequence is prompt + continuation.
    # Score only the continuation; otherwise every metric compares the prompt
    # text against a reference that contains just the response.
    prompt_len = encoded["input_ids"].shape[1]
    pred = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

    predictions.append(pred)
    references.append(ref)

    all_bleu.append(sentence_bleu([ref.split()], pred.split(), smoothing_function=smooth_fn))
    rouge_scores = scorer.score(ref, pred)
    all_rouge1.append(rouge_scores["rouge1"].fmeasure)
    all_rouge2.append(rouge_scores["rouge2"].fmeasure)
    all_rougeL.append(rouge_scores["rougeL"].fmeasure)

# --- Corpus-level BERTScore --------------------------------------------------
print("Calculating BERTScore...")
P, R, F1 = bert_score(predictions, references, lang="en", verbose=True)

# Every reported number is a plain mean over the evaluated examples.
results = {
    "BLEU": sum(all_bleu) / len(all_bleu),
    "ROUGE-1": sum(all_rouge1) / len(all_rouge1),
    "ROUGE-2": sum(all_rouge2) / len(all_rouge2),
    "ROUGE-L": sum(all_rougeL) / len(all_rougeL),
    "BERTScore_P": P.mean().item(),
    "BERTScore_R": R.mean().item(),
    "BERTScore_F1": F1.mean().item(),
}

with open(eval_output, "w") as f:
    json.dump(results, f, indent=2)

print("Evaluation complete. Scores saved to:", eval_output)


In [None]:
import os
import torch
from tqdm import tqdm
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score as bert_score
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import json

# --- Configuration ---------------------------------------------------------
model_path = "hparam2_trials/trial_2_lr0.0003_r4_bs2/final"
data_path = "datasets/empathetic_dialogues.jsonl"
eval_output = "hparam2_trials/eval2_trial2_300sample.json"

# --- Model and data --------------------------------------------------------
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16).cuda()
tokenizer = AutoTokenizer.from_pretrained(model_path)

# NOTE(review): these are the same first 300 rows the trials were trained on,
# so the scores measure fit to the training data, not generalization.
dataset = load_dataset("json", data_files=data_path)["train"].select(range(300))

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
smooth_fn = SmoothingFunction().method1

all_bleu, all_rouge1, all_rouge2, all_rougeL = [], [], [], []
predictions, references = [], []

# --- Generation and per-example lexical metrics -----------------------------
print("Generating predictions...")
for item in tqdm(dataset):
    prompt, ref = item["instruction"], item["output"]

    # Keep the attention mask with the ids so generate() does not have to
    # guess it (pad and EOS share an id).
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
    with torch.no_grad():
        outputs = model.generate(**encoded, max_new_tokens=100, pad_token_id=tokenizer.eos_token_id)

    # For a decoder-only model the returned sequence is prompt + continuation.
    # Score only the continuation; otherwise every metric compares the prompt
    # text against a reference that contains just the response.
    prompt_len = encoded["input_ids"].shape[1]
    pred = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

    predictions.append(pred)
    references.append(ref)

    all_bleu.append(sentence_bleu([ref.split()], pred.split(), smoothing_function=smooth_fn))
    rouge_scores = scorer.score(ref, pred)
    all_rouge1.append(rouge_scores["rouge1"].fmeasure)
    all_rouge2.append(rouge_scores["rouge2"].fmeasure)
    all_rougeL.append(rouge_scores["rougeL"].fmeasure)

# --- Corpus-level BERTScore --------------------------------------------------
print("Calculating BERTScore...")
P, R, F1 = bert_score(predictions, references, lang="en", verbose=True)

# Every reported number is a plain mean over the evaluated examples.
results = {
    "BLEU": sum(all_bleu) / len(all_bleu),
    "ROUGE-1": sum(all_rouge1) / len(all_rouge1),
    "ROUGE-2": sum(all_rouge2) / len(all_rouge2),
    "ROUGE-L": sum(all_rougeL) / len(all_rougeL),
    "BERTScore_P": P.mean().item(),
    "BERTScore_R": R.mean().item(),
    "BERTScore_F1": F1.mean().item(),
}

with open(eval_output, "w") as f:
    json.dump(results, f, indent=2)

print("Evaluation complete. Scores saved to:", eval_output)


Testing the best model's responses

In [None]:
import torch
import re
from transformers import AutoModelForCausalLM, AutoTokenizer

# Directory written by the trial loop in this notebook ("hparam2_trials/...").
# The earlier "hparam_trials/..." path is not produced anywhere here, so a
# fresh Restart & Run All could not find it.
# NOTE(review): if the intended model comes from a different, earlier sweep,
# point this at that directory explicitly.
model_path = "hparam2_trials/trial_1_lr0.0002_r4_bs2/final"
use_memory = True   # include earlier turns of the conversation in each prompt
temperature = 0.7   # sampling temperature passed to generate()
max_tokens = 150    # upper bound on newly generated tokens per reply

model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16).cuda()
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Reuse EOS as the padding token and pad on the left.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

def clean_response(text):
    """Normalize a generated reply for display.

    Collapses every run of whitespace into a single space, replaces curly
    double/single quotes with their ASCII equivalents, and trims the ends.
    """
    substitutions = (
        (r"\s+", " "),
        (r"[“”]", '"'),
        (r"[‘’]", "'"),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    # The quote substitutions never touch whitespace, so trimming last is
    # equivalent to trimming right after the whitespace collapse.
    return cleaned.strip()


SYSTEM_PROMPT = "You are a kind and supportive mental health assistant."
MAX_PROMPT_TOKENS = 512  # token budget for the prompt fed to the model


def _build_prompt(turns):
    """Wrap the system prompt and the given turn strings in the [INST] template."""
    return f"<s>[INST] {SYSTEM_PROMPT}{''.join(turns)} [/INST]"


def _prompt_token_count(text):
    """Number of tokens `text` encodes to, without auto-added special tokens."""
    return len(tokenizer(text, add_special_tokens=False).input_ids)


print("\nWelcome to the Mental Health Chatbot (Mistral-7B)")
print("Type 'exit' to quit.\n")

# Conversation so far as individual turn strings ("\nUser: ..." and
# "\nMistral: ..."), oldest first. A list lets whole turns be dropped when
# the prompt grows past the token budget.
history = []

while True:
    user_input = input("You: ").strip()
    if user_input.lower() in ['exit', 'quit']:
        print("Goodbye! Take care.")
        break
    if not user_input:
        # Nothing to answer; ask again instead of sending an empty turn.
        continue

    if use_memory:
        history.append(f"\nUser: {user_input}")
        # Drop the *oldest* turns until the prompt fits. Truncating the
        # tokenized prompt on the right would cut off the newest message and
        # the closing [/INST] marker instead.
        while len(history) > 1 and _prompt_token_count(_build_prompt(history)) > MAX_PROMPT_TOKENS:
            history.pop(0)
        prompt = _build_prompt(history)
    else:
        prompt = _build_prompt([f"\n{user_input}"])

    # The template already starts with "<s>", so stop the tokenizer from
    # prepending a second BOS. Truncation stays only as a last-resort guard
    # for a single message that exceeds the budget on its own.
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        add_special_tokens=False,
        truncation=True,
        max_length=MAX_PROMPT_TOKENS,
    ).to(model.device)

    with torch.no_grad():
        output_ids = model.generate(
            **encoded,
            max_new_tokens=max_tokens,
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # The output is prompt + continuation. Take the reply by token position
    # rather than by splitting on "[/INST]", which fails if that text appears
    # in the user's message or in the generated reply.
    prompt_len = encoded["input_ids"].shape[1]
    reply = tokenizer.decode(output_ids[0][prompt_len:], skip_special_tokens=True)
    reply = clean_response(reply)

    print(f"Mistral: {reply}\n")

    if use_memory:
        history.append(f"\nMistral: {reply}")
