# The RAG Flow

In [1]:
import os
import json
import math
import time
import random
import re
from typing import List, Tuple, Any

import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.spatial.distance import cosine
from dotenv import load_dotenv
import psycopg2
from psycopg2.extras import Json, RealDictCursor
import ast

# LangChain imports
from langchain.schema import BaseRetriever, Document
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

from pydantic import Field

In [2]:
## 1. Setup Environment and Configure Connections

# Load environment variables (the settings read below are expected to live in a .env file).
load_dotenv()

# OpenAI API key; os.getenv returns None when the variable is not set.
openai_api_key = os.getenv("OPENAI_API_KEY")
# Chat model used to generate the final answer (gpt-4o-mini, temperature 0.5).
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0.5, openai_api_key=openai_api_key)

# Embeddings client used to embed user queries; no model is named here, so the
# library default applies.
# NOTE(review): it must match the model that produced the stored vectors — confirm.
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

# PostgreSQL connection details, each read from an environment variable.
username = os.getenv('PG_ADMIN_USERNAME')  # Server admin login name
password = os.getenv('PG_ADMIN_PASSWORD')  # Server admin password
host = os.getenv('PG_SERVER_NAME')         # Server host name
port = os.getenv('POSTGRES_PORT')          # PostgreSQL port
database = os.getenv('PG_DATABASE')        # Database name

# PostgreSQL connection URI assembled from the values above.
# NOTE(review): any variable that is unset is interpolated as the literal text "None",
# and the username/password are inserted without URL-escaping, so values containing
# characters such as '@', ':' or '/' will yield a malformed URI — verify the .env contents.
POSTGRESQL_CONNECTION = f"postgresql://{username}:{password}@{host}:{port}/{database}"

In [3]:
## 2. Define a Custom Document Retrieval Function (PostgresRetriever)  # Help from chatgpt!
# Vector-based retrieval: This uses vector similarity search using the pgvector extension in PostgreSQL.
 
# Custom Retriever class that extends LangChain's BaseRetriever
from scipy.spatial.distance import cosine

class PostgresRetriever(BaseRetriever):
    """Retrieve and rank stored answers from a pgvector-backed PostgreSQL table.

    For each query the retriever:

    1. embeds the query text with ``embedding_function.embed_query``;
    2. fetches the ``candidate_limit`` rows of ``collection_name`` whose
       ``question_vector`` is closest to that embedding;
    3. scores every answer attached to those rows as
       ``0.4 * question relevance + 0.4 * answers relevance + 0.2 * source trust``,
       where relevance is ``1 - distance``;
    4. returns the ``top_k`` highest-scoring answers as ``Document`` objects.

    Attributes:
        connection_string: Connection string/URI handed to ``psycopg2.connect``.
        collection_name: Table to query. It is interpolated into the SQL text,
            so it must be a trusted identifier and never user input.
        embedding_function: Object exposing ``embed_query(text)``.
        candidate_limit: Number of nearest question rows fetched (default 20).
        top_k: Number of scored answers returned (default 10).

    NOTE(review): ``<->`` is pgvector's Euclidean (L2) distance operator, so
    ``1 - distance`` is not a cosine similarity and can be negative. If cosine
    similarity was intended, the operator should be ``<=>``; it is left
    unchanged here because switching operators also affects which index the
    query can use — confirm against the table's index definition.
    """

    connection_string: str = Field(...)
    collection_name: str = Field(...)
    embedding_function: Any = Field(...)
    # Previously hard-coded as LIMIT 20 / top 10; the defaults keep that behaviour.
    candidate_limit: int = 20
    top_k: int = 10

    class Config:
        arbitrary_types_allowed = True

    @staticmethod
    def _trust_score(source: Any) -> float:
        """Return the trust weight for an answer source (unlisted sources get 1.0)."""
        if source == 'AskTheraRAGBuddy':
            return 0.5
        if source == 'Mental Health Dataset':
            return 0.75
        return 1.0

    @staticmethod
    def _relevance(distance: Any) -> float:
        """Convert a distance into a relevance value; a NULL distance counts as 0.0."""
        if distance is None:
            return 0.0
        return 1 - distance

    def _fetch_candidates(self, query_embedding: Any) -> List[Any]:
        """Run the nearest-neighbour query and return the rows as dict-like records."""
        sql_text = f"""
            SELECT question_id, question_full, answers, metadata,
                   question_vector <-> %(embedding)s::vector AS document_distance,
                   answers_vector <-> %(embedding)s::vector AS answers_distance,
                   question_vector::text AS question_vector_text
            FROM {self.collection_name}
            ORDER BY question_vector <-> %(embedding)s::vector
            LIMIT %(limit)s
        """
        params = {
            'embedding': json.dumps(query_embedding),
            'limit': self.candidate_limit,
        }

        conn = psycopg2.connect(self.connection_string)
        try:
            # `with conn` only commits/rolls back the transaction; psycopg2 does
            # NOT close the connection on exit, hence the explicit close below.
            with conn:
                with conn.cursor(cursor_factory=RealDictCursor) as cur:
                    cur.execute(sql_text, params)
                    return cur.fetchall()
        finally:
            conn.close()

    def _get_relevant_documents(self, query: str) -> List[Document]:
        """Return up to ``top_k`` answers for ``query``, best score first.

        Each Document's ``page_content`` is ``"Question: ...\\nAnswer: ..."`` and
        its metadata carries the question id, source, combined score, topic,
        question title and the stored question vector.
        """
        query_embedding = self.embedding_function.embed_query(query)
        rows = self._fetch_candidates(query_embedding)

        scored_answers = []
        for row in rows:
            # NULL columns come back as None; treat them as "nothing there"
            # instead of failing the whole retrieval.
            metadata = row['metadata'] or {}
            document_relevance = self._relevance(row['document_distance'])
            answers_relevance = self._relevance(row['answers_distance'])
            vector_text = row['question_vector_text']
            question_vector = json.loads(vector_text) if vector_text is not None else None

            for answer in row['answers'] or []:
                source = answer['source']
                combined_score = (
                    document_relevance * 0.4 +          # Question relevance
                    answers_relevance * 0.4 +           # Answer relevance
                    self._trust_score(source) * 0.2     # Source trust
                )
                scored_answers.append({
                    'question_id': row['question_id'],
                    'question_full': row['question_full'],
                    'answer_text': answer['answer'],
                    'source': source,
                    'score': combined_score,
                    'metadata': metadata,
                    'question_vector': question_vector,
                })

        # Highest combined score first, then keep the best `top_k`.
        top_answers = sorted(
            scored_answers, key=lambda item: item['score'], reverse=True
        )[:self.top_k]

        return [
            Document(
                page_content=f"Question: {item['question_full']}\nAnswer: {item['answer_text']}",
                metadata={
                    'question_id': item['question_id'],
                    'source': item['source'],
                    'score': item['score'],
                    'topic': item['metadata'].get('topic', ''),
                    'question_title': item['metadata'].get('question_title', ''),
                    'question_vector': item['question_vector'],
                },
            )
            for item in top_answers
        ]

    async def _aget_relevant_documents(self, query: str) -> List[Document]:
        """Async entry point; delegates to the synchronous implementation.

        The database and embedding calls run synchronously inside this
        coroutine, so it does not yield while they are in progress.
        """
        return self._get_relevant_documents(query)

# Notebook-wide retriever instance: queries the "talk" table over the configured
# PostgreSQL connection and embeds incoming queries with the OpenAI embeddings object.
postgres_retriever = PostgresRetriever(
    embedding_function=embeddings,
    collection_name="talk",
    connection_string=POSTGRESQL_CONNECTION,
)


In [4]:
## 3. Custom Prompt Template
# Prompt used for the final answer. It declares two template variables:
#   {context} - the retrieved information the answer must be based on
#   {input}   - the user's question
# The template asks for a direct initial response followed by a markdown summary
# (overview, key points, expert consensus, recommendations) restricted to the
# retrieved information, with inapplicable sections omitted.

CUSTOM_PROMPT = PromptTemplate(
    input_variables=["context", "input"],
    template="""
You are an AI assistant tasked with summarizing insights from mental health professionals about mental health related topics and issues.

## Provide an initial response that directly addresses the user's question or concern.
Use insights from the retrieved expert opinions to inform this response.

## After your initial response, provide a comprehensive summary, based only on the retrieved information, that:

## Based on what experts have said on topics relating to [summarized topic]
- [Provide a brief overview of the topic, including its definition and primary characteristics using only the retrieved documents]
- [List key aspects or identifiable features, if applicable. Only include information that is present in the retrieved documents]

## Key points regarding [summarized topic]
- Nature of the concept/issue: [Summarize the fundamental nature]
- Key aspects: [List main aspects or components]
- Related factors: [Summarize related factors or influences, if mentioned]
- Importance/Impact: [Outline the significance or effects]
- Professional perspectives: [Summarize expert views or approaches]

## Consensus among experts
- [Highlight any points of agreement among the experts]

## Specific insights or recommendations mentioned
- [List specific insights, advice, or recommendations given by experts, using bullet points]

Only include information that is present in the retrieved documents. Do not add any information from your own knowledge. 
If any section is not applicable based on the retrieved information, omit it from the summary. Ensure that bullet points are used consistently throughout the response.


Retrieved information:
{context}

User's question: {input}

Your summarized response:
"""
)

# Create the document chain: pairs the chat model with CUSTOM_PROMPT.
# NOTE(review): presumably the chain fills {context} with the retrieved documents'
# text — confirm against the LangChain version in use.
document_chain = create_stuff_documents_chain(llm, CUSTOM_PROMPT)

In [5]:
## 4. Setup and Execute RAG Pipeline with Helper Functions
# Setup the RAG chain to combine document retrieval and language model responses

def post_process_rag_output(rag_output):
    """Turn numbered / bulleted list items in a RAG answer into markdown sections.

    The answer is split at lines that start with a list marker (``1.``, ``2.``,
    ... or ``•``). Each list item becomes ``## <first line>`` followed by the
    rest of the item. Text that precedes the first list marker is kept as-is
    instead of being promoted to a heading, so an answer without any such
    markers (for example one that is already markdown) is returned unchanged
    apart from surrounding whitespace and a trailing newline.

    Args:
        rag_output: Mapping with an ``'answer'`` key holding the generated text.

    Returns:
        The formatted answer as a single string (empty if the answer is empty).

    Raises:
        KeyError: If ``rag_output`` has no ``'answer'`` key.
    """
    answer = rag_output['answer'] or ""

    # A leading newline lets a marker on the very first line be recognised too.
    # The look-ahead keeps decimals such as "3.5" from being read as item "3.".
    parts = re.split(r'\n(?:\d+\.(?=\s|$)|•)', "\n" + answer)
    preamble, items = parts[0].strip(), parts[1:]

    formatted_sections = []
    if preamble:
        # Introductory text is body copy, not a section title.
        formatted_sections.append(f"{preamble}\n")

    for item in items:
        item = item.strip()
        if not item:
            continue
        # First line of the item is its title; everything after is its content.
        first_line, _, content = item.partition('\n')
        title = first_line.strip().strip(':').strip()
        # Do not stack a second heading marker on a title that already has one.
        heading = title if title.startswith('#') else f"## {title}"
        formatted_sections.append(f"{heading}\n{content}\n")

    return "\n".join(formatted_sections)

# Create the retrieval chain: combines the Postgres retriever with the document chain.
# It is invoked with {"input": <query>} and its result is read through the 'answer' key.
rag_chain = create_retrieval_chain(postgres_retriever, document_chain)

# Define topics
# Small-talk intents and the phrases that trigger them. Topics are checked in
# dict order and the first hit wins, so phrases listed under more than one topic
# ('ciao', 'good night', 'night') always resolve to "greeting".
topics = {
    "greeting": [
        'afternoon', 'bonjour', 'ciao', 'evening', 'good afternoon', 'good day', 'good evening',
        'good night', 'greetings', 'guten tag', 'hello', 'hello afternoon', 'hello evening', 'hey',
        'hey afternoon', 'hey evening', 'hey there', 'hi', 'hi there', 'hola', 'howdy',
        'is anyone there?', 'konnichiwa', 'namaste', 'night', 'ola', 'salut', 'sawubona'
    ],
    "farewell": [
        'adios', 'au revoir', 'bye', 'bye then', 'catch you later', 'ciao', 'fare thee well',
        'farewell', 'good night', 'goodbye', 'goodnight', 'hello night', 'hey night', 'later',
        'night', 'ok bye', 'sayonara', 'see you', 'see you later', 'so long', 'take care',
        'until next time'
    ],
    "thanks": [
        'thanks', 'thank you', "that's helpful", 'thanks for the help', 'thank you very much',
        'appreciate it', 'cheers', 'gracias', 'much obliged', "you're the best", 'thanks a bunch',
        'you rock'
    ],
    "about": [
        'who are you?', 'what are you?', 'who you are?', 'tell me more about yourself.',
        'what is your name?', 'what should i call you?', "what's your name?", 'tell me about yourself',
        'introduce yourself', 'what can you do?', "what's your purpose?", 'explain yourself',
        'what do you do?', "what's your function?", 'who created you?'
    ]
}

# Default reply of get_single_response when no stored answer matches the topic.
_NO_ANSWER_MESSAGE = "Sorry, I couldn't find a specific answer for that topic."


def _normalize_phrase(text):
    """Lower-case, collapse whitespace and trim surrounding spaces/punctuation."""
    return " ".join(text.lower().split()).strip(" .,!?;:")


def is_special_topic(query, whole_query=False):
    """Return the special topic (greeting, farewell, thanks, about) matched by ``query``.

    Args:
        query: Text to classify. Non-string input yields ``None``.
        whole_query: When ``False`` (default) a topic matches if any of its
            phrases occurs anywhere in the text as a whole word/phrase. When
            ``True`` the entire text, ignoring case, extra whitespace and
            surrounding punctuation, must equal one of the phrases — use this
            for user queries so that a real question that merely contains a
            word such as "night" or "later" is not treated as small talk.

    Returns:
        The topic name, or ``None`` when nothing matches.
    """
    if not isinstance(query, str):
        return None

    if whole_query:
        normalized = _normalize_phrase(query)
        if not normalized:
            return None
        for topic, phrases in topics.items():
            if any(_normalize_phrase(phrase) == normalized for phrase in phrases):
                return topic
        return None

    text = query.lower().strip()
    for topic, phrases in topics.items():
        for phrase in phrases:
            # Look-arounds instead of \b: a trailing \b can never match after
            # '?' or '.', which made phrases like 'who are you?' unmatchable.
            if re.search(r'(?<!\w)' + re.escape(phrase) + r'(?!\w)', text):
                return topic
    return None


def get_single_response(documents, topic, fallback=_NO_ANSWER_MESSAGE):
    """Pick one stored answer for ``topic`` from the retrieved documents.

    A document counts as relevant when ``is_special_topic`` (phrase search)
    classifies its ``page_content`` as ``topic``. One relevant document is
    chosen at random and the first line following ``Answer:`` is returned.

    Args:
        documents: Iterable of objects with a ``page_content`` string.
        topic: Topic name as returned by ``is_special_topic``.
        fallback: Value returned when no answer can be extracted. Defaults to
            the apology message; pass ``None`` to let the caller detect a miss.

    Returns:
        The extracted answer text, or ``fallback``.
    """
    relevant_docs = [
        doc for doc in (documents or [])
        if is_special_topic(doc.page_content) == topic
    ]

    if relevant_docs:
        content = random.choice(relevant_docs).page_content
        # Look for a section marked as "Answer:" (captures up to the first newline).
        match = re.search(r'Answer:\s*(.*?)(?:\n|$)', content, re.DOTALL)
        if match:
            return match.group(1).strip()

    return fallback


def run_rag(query):
    """Answer ``query`` with a canned small-talk reply or the full RAG chain.

    If the whole query is a known small-talk phrase, a stored reply for that
    topic is looked up through the retriever and returned. In every other case,
    including when no stored reply is found, the query goes through the
    retrieval chain and the result is post-processed.

    Args:
        query: The user's message.

    Returns:
        The response text.
    """
    topic = is_special_topic(query, whole_query=True)

    if topic:
        documents = postgres_retriever.invoke(query)
        # fallback=None so a miss falls through to the RAG chain instead of
        # returning the apology string (which is truthy and hid the fallback).
        response = get_single_response(documents, topic, fallback=None)
        if response:
            return response

    # Standard RAG response generation
    response = rag_chain.invoke({"input": query})
    return post_process_rag_output(response)

In [6]:
# Demo: a general question; it contains no phrase from `topics`, so it is answered by the RAG chain.
query = "What is mental health"
rag_response = run_rag(query)
print(rag_response)

## Mental health refers to a state of well-being that encompasses emotional, psychological, and social aspects of an individual. It influences how people think, feel, and act, and plays a crucial role in how they handle stress, relate to others, and make choices. Essentially, mental health is about realizing one’s own abilities, coping with life’s normal stresses, working productively, and contributing to the community.

## Based on what experts have said on topics relating to mental health
- **Overview**: Mental health is defined as a state of well-being where individuals can realize their abilities, cope with normal life stresses, work productively, and contribute to their communities. It includes emotional, psychological, and social well-being, affecting thoughts, feelings, and actions.
  
- **Key aspects**:
  - Emotional well-being
  - Psychological well-being
  - Social well-being

## Key points regarding mental health
- **Nature of the concept/issue**: Mental health is fundamenta

In [7]:
# Demo: another general question answered by the RAG chain.
query = "What is depression"
rag_response = run_rag(query)
print(rag_response)

## Depression is a mental health disorder characterized by a persistently depressed mood or loss of interest in activities, leading to significant impairment in daily life. It can manifest in various ways, including feelings of sadness, irritability, and a lack of energy. If you or someone you know is experiencing symptoms of depression, it's important to seek help from a mental health professional for proper evaluation and support.

## Based on what experts have said on topics relating to Depression
- **Definition and Primary Characteristics**: Depression is a mental illness that affects mood, self-perception, and interpersonal relationships. It can be referred to as clinical depression, major depressive disorder, or major depression. The disorder typically lasts longer than two weeks and does not usually resolve on its own.
  
- **Key Aspects or Identifiable Features**:
  - Depressed mood or irritability
  - Decreased interest or pleasure in previously enjoyed activities
  - Signific

In [7]:
# Demo: "hi" is listed under the "greeting" topic, so run_rag takes the special-topic path.
query = "hi"
rag_response = run_rag(query)
print(rag_response)

Hi there. How are you feeling today?


In [8]:
# Demo: a question ending in punctuation; it matches no phrase in `topics` and goes through the RAG chain.
query = "What is PTSD?"
rag_response = run_rag(query)
print(rag_response)

## Post-Traumatic Stress Disorder (PTSD) is a mental health condition that can develop after an individual has experienced or witnessed a traumatic event, where there was a real or perceived threat of harm or loss of life. Symptoms can include hyper-vigilance, anxiety, nightmares, and intrusive memories related to the trauma. It is important to recognize that PTSD can be triggered or exacerbated by subsequent traumatic experiences, such as a car accident, especially for those with a prior history of trauma, such as military service.

### Comprehensive Summary

#### Based on what experts have said on topics relating to PTSD:
- **Overview of PTSD**: PTSD is a mental health disorder that can occur following exposure to traumatic events. It is characterized by symptoms such as anxiety, hyper-vigilance, and intrusive memories.
- **Key aspects**:
  - Symptoms may include heightened anxiety, avoidance of reminders, and emotional distress.
  - The cumulative effect of multiple traumas can lead