In [145]:
import json
import logging
import os
import re
from collections import Counter

import google.generativeai as genai
import nltk
import sacrebleu
from datasets import load_dataset
from google.oauth2 import service_account
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms.base import LLM
from langchain.prompts import PromptTemplate
from langchain.vectorstores import FAISS
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from pydantic import BaseModel, Field
from rouge_score import rouge_scorer

In [146]:
# Set up logging: configure the root logger at INFO level and create the
# module-level `logger` that the later cells use for progress and error messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [147]:
# Fetch the NLTK resources the notebook relies on: the "punkt" tokenizer data,
# the "wordnet" lemma database and the "stopwords" word lists.
for nltk_resource in ("punkt", "wordnet", "stopwords"):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\farha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\farha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\farha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [148]:
# Path to the service account's JSON key file.
# The GOOGLE_APPLICATION_CREDENTIALS environment variable takes precedence so
# the notebook can run on another machine without editing this cell; the
# original file name is kept as the fallback when the variable is unset/empty.
service_account_path = (
    os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
    or "adv-nlp-uts-faa7595a22eb.json"
)

# OAuth scope requested for the credentials used with the Gemini client.
gemini_scopes = ["https://www.googleapis.com/auth/generative-language"]

# Create credentials using the service account JSON file. Failures are logged
# and re-raised: nothing below can work without valid credentials.
try:
    credentials = service_account.Credentials.from_service_account_file(
        service_account_path,
        scopes=gemini_scopes,
    )
except FileNotFoundError:
    logger.error(f"Service account file not found at {service_account_path}.")
    raise
except Exception as e:
    logger.error(f"Error creating credentials from the service account file: {e}")
    raise

# Configure the Gemini API client with the credentials
genai.configure(credentials=credentials)

In [149]:
# Initialize the shared text-normalization helpers used by preprocess_text:
# a WordNet lemmatizer and the English stopword list, stored as a set so that
# membership tests are fast.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

In [150]:
# Define text preprocessing function with lemmatization
def preprocess_text(text):
    """Normalize raw text into a space-separated string of lemmas.

    The text is stripped of surrounding whitespace and tokenized with
    ``nltk.word_tokenize``. Each token is lower-cased; tokens present in the
    module-level ``stop_words`` set are dropped and the remaining ones are
    passed through the module-level ``lemmatizer``.

    Args:
        text: The raw input string.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    kept_lemmas = []
    for raw_token in nltk.word_tokenize(text.strip()):
        lowered = raw_token.lower()
        if lowered in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(lowered))
    return " ".join(kept_lemmas)

In [151]:
# Load the mental health counseling conversations dataset from the Hugging Face Hub.
# (An alternative considered earlier: "RafaelMPereira/HealthCareMagic-100k-Chat-Format-en".)
dataset_name = "Amod/mental_health_counseling_conversations"
dataset = load_dataset(dataset_name)

In [152]:
# Combine 'Context' and 'Response' into 'text' field and preprocess
def process_entry(entry):
    """Merge one record's 'Context' and 'Response' into a preprocessed 'text' field.

    Args:
        entry: A mapping with 'Context' and 'Response' keys.

    Returns:
        A dict with a single 'text' key holding the preprocessed combination.
    """
    context, response = entry["Context"], entry["Response"]
    merged = f"Context: {context}\nResponse: {response}"
    return {"text": preprocess_text(merged)}


# Preprocess every record of the training split.
processed_data = list(map(process_entry, dataset["train"]))

In [153]:
# Split documents into chunks if necessary (e.g., max 512 tokens)
def chunk_text(text, max_length=512):
    """Split text into consecutive chunks of at most ``max_length`` tokens.

    Args:
        text: The string to split; it is tokenized with ``nltk.word_tokenize``.
        max_length: Maximum number of tokens per chunk.

    Returns:
        A list of strings, each the space-joined tokens of one chunk, in
        their original order. Empty input yields an empty list.
    """
    words = nltk.word_tokenize(text)
    pieces = []
    for start in range(0, len(words), max_length):
        window = words[start : start + max_length]
        pieces.append(" ".join(window))
    return pieces

In [154]:
# Flatten the per-entry chunk lists into one list of documents for embedding
documents = [
    chunk
    for item in processed_data
    for chunk in chunk_text(item["text"])
]

logger.info(f"Total number of documents after chunking: {len(documents)}")

INFO:__main__:Total number of documents after chunking: 3793


In [155]:
# Embedding model used to vectorize every document chunk
embedding_model_name = "all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Build the FAISS vector store from the chunked documents and persist it to disk
faiss_index_dir = "faiss_mental_health_index"
vector_store = FAISS.from_texts(documents, embeddings)
vector_store.save_local(faiss_index_dir)

# To reuse the saved index in a later session instead of rebuilding it:
# vector_store = FAISS.load_local("faiss_mental_health_index", embeddings)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


In [156]:
# Implement the Gemini LLM class
class GeminiLLM(LLM, BaseModel):
    """LangChain LLM wrapper that answers prompts with the Google Gemini API.

    Each call creates a ``genai.GenerativeModel`` for ``model_name`` and sends
    the prompt through ``generate_content`` using the configured
    ``temperature``. API failures are logged and turned into a fixed apology
    string instead of propagating, so a chain using this LLM keeps running.
    """

    # Gemini model identifier passed to genai.GenerativeModel
    model_name: str = Field(default="gemini-1.5-flash")
    # Sampling temperature forwarded to the API on every call
    temperature: float = Field(default=0.7)

    @property
    def _llm_type(self):
        """Short identifier for this LLM implementation."""
        return "gemini"

    def _call(self, prompt: str, stop: list[str] = None, run_manager=None, **kwargs) -> str:
        """Generate a completion for ``prompt``.

        Args:
            prompt: The full text prompt to send to the model.
            stop: Optional stop sequences; the generated text is cut at the
                first occurrence of each one, applied in order.
            run_manager: Accepted for compatibility with LangChain's ``_call``
                signature; not used here.
            **kwargs: Accepted for the same reason; ignored.

        Returns:
            The generated text with surrounding whitespace removed, or a fixed
            apology message if the API call (or reading its text) fails.
        """
        try:
            # Initialize the model
            model = genai.GenerativeModel(model_name=self.model_name)

            # Generate content using the Gemini API. The temperature must be
            # passed via generation_config; without it the `temperature`
            # field of this class would be silently ignored.
            response = model.generate_content(
                prompt,
                generation_config={"temperature": self.temperature},
            )

            # Extract generated text from the response
            generated_text = response.text

            # Handle stop tokens if provided
            if stop:
                for token in stop:
                    generated_text = generated_text.split(token)[0]

            return generated_text.strip()

        except Exception as e:
            # Deliberate best-effort: report the failure and return a
            # user-facing fallback instead of breaking the calling chain.
            logger.error(f"Gemini API error: {e}")
            return "I'm sorry, but I couldn't process your request at this time."


# Prompt text for the QA chain; the {context} and {question} placeholders are
# declared as the template's input variables below.
rag_prompt_text = """
You are a helpful mental health assistant.

Use the following context to answer the question.

Context:
{context}

Question:
{question}

Answer:"""

# Define a prompt template
prompt_template = PromptTemplate(
    template=rag_prompt_text,
    input_variables=["context", "question"],
)

# Initialize the Gemini LLM client
llm = GeminiLLM(model_name="gemini-1.5-flash", temperature=0.7)

In [157]:
# Retriever over the FAISS store, configured with k=5 results per query
retriever = vector_store.as_retriever(search_kwargs={"k": 5})

# RetrievalQA chain that uses the custom prompt and also returns the source
# documents. chain_type "stuff" is used here; 'refine' or 'map_reduce' are
# alternatives to experiment with.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt_template},
    return_source_documents=True,
)

In [158]:
# Function to handle user queries
def answer_query(query):
    """Run a query through the RetrievalQA chain, print the result, and return it.

    The answer and every retrieved source document are printed. Any exception
    raised while querying or printing is logged rather than propagated.

    Args:
        query: The user's question as a string.

    Returns:
        The generated answer string, or None if processing failed. Returning
        the answer lets callers such as the evaluation loop score it; the
        printed output is unchanged.
    """
    try:
        response = qa_chain({"query": query})
        answer = response["result"]
        source_docs = response["source_documents"]
        print("Response:")
        print(answer)
        print("\nRelevant Source Documents:")
        for doc in source_docs:
            print(doc.metadata.get("source", "Unknown Source"))
            print(doc.page_content)
            print("-" * 80)
        return answer
    except Exception as e:
        logger.error(f"Error during query processing: {e}")
        return None

In [159]:
# Smoke-test the RAG system end to end with a sample question; the answer and
# the retrieved source documents are printed by answer_query.
query = "How can someone deal with anxiety effectively?"
answer_query(query)

Response:
Dealing with anxiety effectively requires a multi-pronged approach that combines self-help strategies with professional support. Here's what the provided context suggests:

**1. Seek Professional Help:**

* **Therapy:**  It's highly recommended to see a therapist specializing in anxiety disorders. They can help you identify underlying causes, develop personalized coping strategies, and guide you through exposure therapy.
* **Finding the Right Therapist:** It's crucial to feel comfortable with your therapist as a trusting relationship is essential for effective treatment. 

**2. Mindfulness and Self-Care:**

* **Daily Mindfulness Practice:**  Engage in activities like guided meditation, deep breathing exercises, or yoga to cultivate a sense of calmness and present-moment awareness.
* **Positive Self-Talk:**  Challenge negative thoughts and replace them with more optimistic and reassuring self-statements.
* **Exposure Therapy:**  Gradually confront anxiety-provoking situations 

In [None]:
# Evaluation functions
def evaluate_rouge(predicted, reference):
    """Score a prediction against a reference with ROUGE-1 and ROUGE-L.

    Args:
        predicted: The generated answer text.
        reference: The ground-truth answer text.

    Returns:
        The result of ``RougeScorer.score`` for the "rouge1" and "rougeL"
        metrics, computed with stemming enabled.
    """
    metric_names = ["rouge1", "rougeL"]
    rouge = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)
    # The scorer is called with the reference first, then the prediction.
    result = rouge.score(reference, predicted)
    return result


def evaluate_bleu(predicted, reference):
    """Compute the sacreBLEU corpus BLEU score for one prediction/reference pair.

    Args:
        predicted: The generated answer text.
        reference: The ground-truth answer text.

    Returns:
        The ``score`` attribute of the object returned by
        ``sacrebleu.corpus_bleu``.
    """
    hypotheses = [predicted]
    # One reference stream containing the single reference text.
    reference_streams = [[reference]]
    return sacrebleu.corpus_bleu(hypotheses, reference_streams).score


def evaluate_f1(predicted, reference):
    """Compute the token-level F1 score between a prediction and a reference.

    Both texts are tokenized with ``nltk.word_tokenize``. The overlap is the
    multiset intersection of the two token lists, so a token repeated in both
    texts counts as many times as it is matched. Counting overlap on
    de-duplicated sets while dividing by the full token counts would
    under-score texts with repeated words (identical texts could score < 1).

    Args:
        predicted: The generated answer text.
        reference: The ground-truth answer text.

    Returns:
        The F1 score as a float in [0.0, 1.0]; 0.0 when either text has no
        tokens or the texts share no token.
    """
    predicted_tokens = nltk.word_tokenize(predicted)
    reference_tokens = nltk.word_tokenize(reference)
    if not predicted_tokens or not reference_tokens:
        return 0.0

    # Counter & Counter keeps, per token, the minimum of the two counts.
    overlap = sum((Counter(predicted_tokens) & Counter(reference_tokens)).values())
    if overlap == 0:
        return 0.0

    precision = overlap / len(predicted_tokens)
    recall = overlap / len(reference_tokens)
    return 2 * (precision * recall) / (precision + recall)


def run_evaluation(file_path):
    """Evaluate the RAG pipeline on a JSON file of question/answer pairs.

    The file must contain a JSON list of objects with "question" and "answer"
    keys. Each question is answered with ``answer_query`` and the result is
    compared to the reference answer with ROUGE-L (F-measure), BLEU and
    token-level F1. Per-question scores and the averages are printed.

    Args:
        file_path: Path to the JSON evaluation file.

    Returns:
        None. All results are reported via ``print``.
    """
    # Read as UTF-8 explicitly rather than relying on the platform default
    # encoding, which may not be UTF-8.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    rouge_scores, bleu_scores, f1_scores = [], [], []

    for entry in data:
        question = entry["question"]
        reference_answer = entry["answer"]
        predicted_answer = answer_query(question)
        if predicted_answer is None:
            # No answer was produced; score it as an empty prediction
            # instead of crashing inside the metric functions.
            logger.warning(f"No answer returned for question {question!r}; scoring it as empty.")
            predicted_answer = ""

        rouge = evaluate_rouge(predicted_answer, reference_answer)
        rouge_l = rouge["rougeL"].fmeasure
        bleu = evaluate_bleu(predicted_answer, reference_answer)
        f1 = evaluate_f1(predicted_answer, reference_answer)

        rouge_scores.append(rouge_l)
        bleu_scores.append(bleu)
        f1_scores.append(f1)

        print(f"Q: {question}")
        print(f"Predicted: {predicted_answer}")
        print(f"Reference: {reference_answer}")
        # Report the ROUGE-L F-measure that is averaged below, not the whole
        # score mapping (which also contains ROUGE-1).
        print(f"ROUGE-L: {rouge_l:.4f}")
        print(f"BLEU: {bleu}")
        print(f"F1: {f1}")
        print("-" * 80)

    print("\n=== Evaluation Summary ===")
    if not rouge_scores:
        # Avoid dividing by zero when the file holds no entries.
        print("No evaluation entries found.")
        return

    print(f"Average ROUGE-L: {sum(rouge_scores) / len(rouge_scores):.4f}")
    print(f"Average BLEU: {sum(bleu_scores) / len(bleu_scores):.2f}")
    print(f"Average F1: {sum(f1_scores) / len(f1_scores):.4f}")

In [None]:
# Score the RAG pipeline on the question/answer pairs in questions_answers.json
run_evaluation("questions_answers.json")