In [17]:
from langchain_openai import OpenAIEmbeddings
from langchain_groq import ChatGroq
from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langgraph.graph import START, END, StateGraph
from typing import List, TypedDict
from CONFIG import OPENAI_EMBEDDED_MODEL, GROQ_MODEL
from langchain_community.vectorstores import FAISS
import re
from pydantic import BaseModel
from langchain_core.documents import Document

In [18]:
# Load variables from a local .env file into the process environment
# (presumably the API keys the two clients below need — confirm).
load_dotenv()
# Groq-hosted chat model; the model name comes from CONFIG.GROQ_MODEL.
llm = ChatGroq(model=GROQ_MODEL)
# OpenAI embedding model (name from CONFIG.OPENAI_EMBEDDED_MODEL); used to embed
# the PDF chunks when the FAISS vector store is built.
embedded_model = OpenAIEmbeddings(model=OPENAI_EMBEDDED_MODEL)

In [19]:
# Raw string literal: the Windows path contains backslash sequences (\A, \C, \T)
# that are invalid escapes in a normal string and trigger a SyntaxWarning on
# Python 3.12+ (scheduled to become an error). The resulting path is unchanged.
# NOTE(review): absolute, machine-specific path — consider moving it to CONFIG.
pdf_loading = PyPDFLoader(file_path=r'A:\AI_Projects_Practice\CRAG\The_Evolution_of_AI_in_Dubai.pdf')
# load() returns a list of documents for the PDF; its length is shown as the cell output.
pdf = pdf_loading.load()
len(pdf)

8

In [20]:
# Split the loaded PDF documents into overlapping chunks
# (chunk_size=400, chunk_overlap=100) ready for embedding.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=100)
splitting = text_splitter.split_documents(pdf)
len(splitting)

56

In [21]:
# Build a FAISS vector store from the chunks, using `embedded_model` to embed them.
vector_storage = FAISS.from_documents(splitting, embedded_model)

In [22]:
# Similarity-search retriever over the FAISS store, returning the top 2 chunks.
retrieved = vector_storage.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 2},
)

# Smoke test: print each retrieved chunk followed by a separator line.
for hit in retrieved.invoke('dubai'):
    print(hit.page_content, '*' * 100, sep='\n')

boosting the economy, and improving quality of life for residents and visitors.
Key components of this strategy include:
Infrastructural Development: Dubai is investing in tech infrastructure to support AI-driven
services. This entails deploying high-speed internet and cloud computing resources across the
city.
****************************************************************************************************
Dubai AI Strategy: Introduced in 2017, this strategy aims to make Dubai a global hub for AI by
2031. It focuses on utilizing AI to optimize government operations, improve city services, and
advance economic development. It involves collaboration between governmental bodies and
private sectors to create an ecosystem that fosters AI innovation.
****************************************************************************************************


In [23]:
# Relevance-score cut-offs used by the per-chunk evaluator (the evaluator prompt
# asks for scores in [0.0, 1.0]).
# Any chunk scoring strictly above UPPER_THD makes the retrieval verdict "CORRECT".
UPPER_THD = 0.7
# Chunks scoring strictly above LOWER_THD are kept as "good" docs; if every chunk
# scores strictly below it, the verdict is "INCORRECT".
LOWER_THD = 0.3

In [24]:
class state(TypedDict):
    """Shared state passed between the graph nodes of this notebook."""

    # The user question; read by the retrieval and evaluation nodes.
    question: str
    # Chunks returned by the retriever; written by `retrieved_node`.
    doc: List[Document]

    # Outputs of the evaluation node: the chunks kept as relevant, the overall
    # verdict ("CORRECT" / "INCORRECT" / "AMBIGUOUS") and a textual explanation.
    good_docs: List[Document]
    verdict: str
    reason: str

    # NOTE(review): the next three fields are not written by any node visible in
    # this part of the notebook — presumably used by a later refinement step; confirm.
    strip: List[str]
    refined_strip: List[str]
    refined_text: str
    
    # NOTE(review): presumably the final generated answer; not set in the visible cells.
    answer: str

In [25]:
def retrieved_node(state):
    """Graph node: run the retriever on the question in the state.

    Reads ``state['question']``, passes it to the module-level ``retrieved``
    retriever, and returns a partial state update holding the result under
    the ``'doc'`` key.
    """
    question = state['question']
    documents = retrieved.invoke(question)
    return {'doc': documents}

In [None]:
# EVALUATOR

# Structured-output schema for the evaluation of ONE retrieved chunk; it is
# handed to `llm.with_structured_output(...)` when the evaluation chain is built.
class DocEvaluator(BaseModel):
    # Relevance score. The evaluator prompt asks for a value in [0.0, 1.0];
    # that range is not enforced by this model.
    score: float
    # Short justification for the score, as requested by the prompt.
    reason: str

# System instructions for the per-chunk relevance evaluator.
_DOC_EVAL_SYSTEM_MSG = (
    "You are a strict retrieval evaluator for RAG.\n"
    "You will be given ONE retrieved chunk and a question.\n"
    "Return a relevance score in [0.0, 1.0].\n"
    "- 1.0: chunk alone is sufficient to answer fully/mostly\n"
    "- 0.0: chunk is irrelevant\n"
    "Be conservative with high scores.\n"
    "Also return a short reason.\n"
    "Output JSON only."
)

# Human turn: filled with the question and the text of a single chunk.
_DOC_EVAL_HUMAN_MSG = "Question is: {question}\n\nChunk is: {chunk}"

doc_eval_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", _DOC_EVAL_SYSTEM_MSG),
        ("human", _DOC_EVAL_HUMAN_MSG),
    ]
)

# Pipe the prompt into the LLM, constrained to return a DocEvaluator instance.
_structured_llm = llm.with_structured_output(DocEvaluator)
doc_eval_chain = doc_eval_prompt | _structured_llm

def eval_each_doc_node(state: state) -> state:
    """Graph node: score every retrieved chunk and derive a retrieval verdict.

    Each chunk stored under ``state["doc"]`` (the key declared in the state
    schema and written by ``retrieved_node``) is scored against the question
    with ``doc_eval_chain``. Chunks scoring above ``LOWER_THD`` are collected
    as "good" documents.

    Verdict rules:
      * ``"CORRECT"``   - at least one chunk scored above ``UPPER_THD``.
      * ``"INCORRECT"`` - there is at least one chunk and all of them scored
        below ``LOWER_THD``; ``good_docs`` is returned empty.
      * ``"AMBIGUOUS"`` - everything else (including an empty retrieval result).

    Args:
        state: Graph state; the keys ``question`` and ``doc`` are read.

    Returns:
        A partial state update with ``good_docs``, ``verdict`` and ``reason``.
    """
    q = state["question"]

    scores: List[float] = []
    good: List[Document] = []

    # The retrieved chunks live under "doc" (not "docs"): that is the key the
    # state schema declares and retrieved_node returns.
    for d in state["doc"]:
        out = doc_eval_chain.invoke({"question": q, "chunk": d.page_content})
        scores.append(out.score)

        # Keep every chunk that is at least weakly relevant (score > LOWER_THD).
        if out.score > LOWER_THD:
            good.append(d)

    # CORRECT if at least one chunk scored > UPPER_THD.
    if any(s > UPPER_THD for s in scores):
        return {
            "good_docs": good,
            "verdict": "CORRECT",
            "reason": f"At least one retrieved chunk scored > {UPPER_THD}.",
        }

    # INCORRECT if there are chunks and all of them scored < LOWER_THD.
    if len(scores) > 0 and all(s < LOWER_THD for s in scores):
        why = "No chunk was sufficient."
        return {
            "good_docs": [],
            "verdict": "INCORRECT",
            "reason": f"All retrieved chunks scored < {LOWER_THD}. {why}",
        }

    # Anything in between => AMBIGUOUS.
    why = "Mixed relevance signals."
    return {
        "good_docs": good,
        "verdict": "AMBIGUOUS",
        "reason": f"No chunk scored > {UPPER_THD}, but not all were < {LOWER_THD}. {why}",
    }