In [10]:
# Install required packages
!pip install transformers accelerate datasets gradio huggingface_hub --quiet

# Only show transformers log messages at ERROR level or above (optional)
import transformers
transformers.logging.set_verbosity_error()

import torch
import gc
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from datasets import load_dataset
from itertools import chain
import pandas as pd

# ✅ Load the public open-access FLAN-T5 checkpoint (tokenizer first, then weights)
print("Loading tokenizer and model (Flan-T5)...")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
model = AutoModelForSeq2SeqLM.from_pretrained(
    "google/flan-t5-large",
    device_map="auto",
)

# Bundle the model and tokenizer into a single text2text-generation pipeline;
# the generation helper further down calls this `pipe` object directly.
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

# Run a garbage-collection pass once loading has finished.
gc.collect()

# Preprocessing empathetic dataset
def process_empathetic_dataset(batch_size=200, max_examples=2000):
    """Build prompt/response pairs from the empathetic dialogues dataset.

    Loads the ``train`` split of ``Estwld/empathetic_dialogues_llm`` and turns
    each record into a dict with an ``input_text`` prompt (built from the
    record's ``emotion`` and ``situation`` fields) and a ``target_text`` (the
    ``content`` of the first conversation turn whose ``role`` is
    ``'assistant'``).

    Args:
        batch_size: Number of records handled between progress messages and
            garbage-collection passes. Must be at least 1.
        max_examples: Number of leading records to load from the split.
            ``None`` loads the whole split. Defaults to 2000.

    Returns:
        A list of ``{"input_text": ..., "target_text": ...}`` dicts, in
        dataset order.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    # A negative step would make range() empty and silently return no data,
    # so reject bad batch sizes before any download starts.
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    print("Loading empathetic dialogues dataset...")
    split = "train" if max_examples is None else f"train[:{max_examples}]"
    # Materialise as a list so that slicing below yields per-record items.
    dataset = list(load_dataset("Estwld/empathetic_dialogues_llm", split=split))
    total_batches = (len(dataset) + batch_size - 1) // batch_size
    processed_data = []

    for batch_number, start in enumerate(range(0, len(dataset), batch_size), start=1):
        print(f"Processing batch {batch_number}/{total_batches}...")

        for example in dataset[start:start + batch_size]:
            # Records that are not dicts are skipped.
            if not isinstance(example, dict):
                continue
            input_text = f"Emotion: {example['emotion']}. Situation: {example['situation']}. How should a therapist respond empathetically?"
            # First assistant turn, or a placeholder when the conversation
            # contains no assistant turn at all.
            target_text = next(
                (turn['content'] for turn in example['conversations'] if turn['role'] == 'assistant'),
                "[No response available]",
            )
            processed_data.append({"input_text": input_text, "target_text": target_text})

        gc.collect()

    return processed_data

# Preprocessing mental health dataset
def process_mental_health_dataset(batch_size=200, max_examples=2000):
    """Build prompt/response pairs from the mental health counseling dataset.

    Loads the ``train`` split of
    ``Amod/mental_health_counseling_conversations`` and turns each record into
    a dict with an ``input_text`` prompt (built from the record's ``Context``
    field) and a ``target_text`` (the record's ``Response`` field).

    Args:
        batch_size: Number of records handled between progress messages and
            garbage-collection passes. Must be at least 1.
        max_examples: Number of leading records to load from the split.
            ``None`` loads the whole split. Defaults to 2000.

    Returns:
        A list of ``{"input_text": ..., "target_text": ...}`` dicts, in
        dataset order.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    # A negative step would make range() empty and silently return no data,
    # so reject bad batch sizes before any download starts.
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    print("Loading mental health counseling dataset...")
    split = "train" if max_examples is None else f"train[:{max_examples}]"
    # Materialise as a list so that slicing below yields per-record items.
    dataset = list(load_dataset("Amod/mental_health_counseling_conversations", split=split))
    total_batches = (len(dataset) + batch_size - 1) // batch_size
    processed_data = []

    for batch_number, start in enumerate(range(0, len(dataset), batch_size), start=1):
        print(f"Processing batch {batch_number}/{total_batches}...")

        for example in dataset[start:start + batch_size]:
            # Records that are not dicts are skipped.
            if not isinstance(example, dict):
                continue
            input_text = f"User: {example['Context']}. Respond empathetically as a therapist."
            target_text = example['Response']
            processed_data.append({"input_text": input_text, "target_text": target_text})

        gc.collect()

    return processed_data

# Build the prompt/response pairs for each source dataset
print("Processing empathetic dialogues dataset...")
empathetic_data = process_empathetic_dataset()
print(f"✅ Loaded {len(empathetic_data)} examples from empathetic dataset")

print("Processing mental health dataset...")
mental_health_data = process_mental_health_dataset()
print(f"✅ Loaded {len(mental_health_data)} examples from mental health dataset")

# Merge both lists (empathetic examples first), then drop the per-dataset
# names so only the combined list keeps the examples alive.
# NOTE(review): combined_dataset is not referenced again in the visible code —
# confirm where it is meant to be consumed.
combined_dataset = [*empathetic_data, *mental_health_data]
del empathetic_data, mental_health_data
gc.collect()

print(f"🧠 Combined dataset has {len(combined_dataset)} examples")

# Test function with enhanced prompts
def test_virtual_psychologist(input_query, max_length=200):
    """Generate one sampled reply to ``input_query`` with the module-level ``pipe``.

    Args:
        input_query: Text inserted into the prompt after ``User:``.
        max_length: Forwarded unchanged to the pipeline as ``max_length``.

    Returns:
        The ``generated_text`` of the first pipeline result, with leading and
        trailing whitespace stripped.
    """
    prompt = f"User: {input_query}. Respond empathetically as a therapist."

    # Sampling settings passed through to the pipeline call.
    generation_options = {
        "max_length": max_length,
        "do_sample": True,
        "temperature": 0.7,
        "top_p": 0.9,
        "num_return_sequences": 1,
    }

    results = pipe(prompt, **generation_options)
    first_result = results[0]
    return first_result['generated_text'].strip()

# ✅ Gradio Interface
def chat_with_therapist(user_input):
    """Gradio callback: return the generated reply for ``user_input``.

    Delegates to ``test_virtual_psychologist`` with its default ``max_length``.
    """
    reply = test_virtual_psychologist(user_input)
    return reply

# Text-in / text-out UI around chat_with_therapist.
# The visible code serves the stock google/flan-t5-large checkpoint and never
# fine-tunes it on the loaded dialogue datasets, so the description must not
# claim the model was trained on them.
interface = gr.Interface(
    fn=chat_with_therapist,
    inputs="text",
    outputs="text",
    title="🧠 Virtual Therapist",
    description="Chat with an AI therapist powered by FLAN-T5 and prompted to respond empathetically.",
)

# 🚀 Launch the Gradio app (debug=True keeps the cell attached to the server)
interface.launch(debug=True)

Loading tokenizer and model (Flan-T5)...
Processing empathetic dialogues dataset...
Loading empathetic dialogues dataset...
Processing batch 1/10...
Processing batch 2/10...
Processing batch 3/10...
Processing batch 4/10...
Processing batch 5/10...
Processing batch 6/10...
Processing batch 7/10...
Processing batch 8/10...
Processing batch 9/10...
Processing batch 10/10...
✅ Loaded 2000 examples from empathetic dataset
Processing mental health dataset...
Loading mental health counseling dataset...
Processing batch 1/10...
Processing batch 2/10...
Processing batch 3/10...
Processing batch 4/10...
Processing batch 5/10...
Processing batch 6/10...
Processing batch 7/10...
Processing batch 8/10...
Processing batch 9/10...
Processing batch 10/10...
✅ Loaded 2000 examples from mental health dataset
🧠 Combined dataset has 4000 examples
Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` ex

Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://63b14eec310dd22923.gradio.live


