## **Data Preprocessing**

In [7]:
import os
import markdown as md
from bs4 import BeautifulSoup
import frontmatter
from langchain_text_splitters.markdown import MarkdownHeaderTextSplitter
import dotenv
dotenv.load_dotenv()

True

In [106]:
# Collect the markdown sources that sit in the notebook's working directory.
# os.listdir() returns entries in arbitrary order, so the list is sorted
# case-insensitively (ties broken by the exact name) to keep positional lookups
# such as markdown_files[5] reproducible across machines and runs.
directory_path = os.getcwd()
markdown_files = sorted(
    (f for f in os.listdir(directory_path) if f.endswith('.md')),
    key=lambda name: (name.casefold(), name),
)
markdown_files

['about.md',
 'amazon-scraper.md',
 'audiobook.md',
 'com-sent-pros.md',
 'curl-counter.md',
 'dialexus-chat.md',
 'Finetune-RoBERTa-Sentiment.md',
 'fraud-detect.md',
 'IELTS-Evaluator.md',
 'imdb-analysis.md',
 'lab-gene-guard.md',
 'learning-journey.md',
 'music-player.md',
 'my-thoughts.md',
 'personal-AI-assistant.md',
 'recipe-finder.md',
 'tweet-scraper.md',
 'youtube-AI-Analyzer.md',
 'youtube-recommender.md']

In [107]:
def load_markdown(file_path):
    """Parse a markdown file with front matter.

    Args:
        file_path: Path handed straight to ``frontmatter.load``.

    Returns:
        A ``(metadata, content)`` tuple taken from the parsed post's
        ``metadata`` and ``content`` attributes.
    """
    parsed = frontmatter.load(file_path)
    return parsed.metadata, parsed.content

In [110]:
headers_to_split_on = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
header_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# Number of trailing characters of an h3 chunk repeated in front of its next sibling.
H3_OVERLAP_CHARS = 150


def _section_key(chunk_metadata):
    """Return the (h1, h2) pair identifying the section a chunk belongs to."""
    return (chunk_metadata.get("Header 1"), chunk_metadata.get("Header 2"))


def build_chunk_records(meta, split_chunks, fallback_id=None):
    """Turn header-split chunks into (chunk, text, metadata, id) records.

    Rules:
      * ``###`` chunks get their header chain, the body text of their own
        parent ``##`` section (if that section produced a chunk), an overlap
        with the previous ``###`` sibling of the same section, then their own
        header and content.
      * ``##`` chunks are emitted on their own only when the next chunk is not
        a ``###`` child of the same section; otherwise their text is only
        carried into the children.
      * ``#`` chunks are emitted as-is and reset the section state.
      * Chunks without any header metadata are not emitted.

    The parent text and the overlap are matched on the full (h1, h2) pair.
    Previously the buffered ``##`` text was prepended unconditionally, so a
    ``###`` chunk whose own ``##`` never produced a chunk inherited text from
    an earlier, unrelated section.

    Args:
        meta: Front-matter metadata of the document (a mapping).
        split_chunks: Sequence of objects exposing ``metadata`` (dict) and
            ``page_content`` (str), in document order.
        fallback_id: Id prefix used when ``meta`` has no ``id`` entry.

    Returns:
        List of ``(chunk, full_text, merged_metadata, chunk_id)`` tuples.
        ``chunk_id`` is ``"<doc id>-<1-based position in split_chunks>"``.
    """
    doc_id = meta.get("id")
    if doc_id is None:
        doc_id = fallback_id

    records = []
    h2_buffer = ""        # body text of the most recent ## chunk
    h2_buffer_key = None  # (h1, h2) section that h2_buffer belongs to
    prev_h3_overlap = ""  # tail of the previous ### chunk
    prev_h3_key = None    # (h1, h2) section of the previous ### chunk

    def emit(chunk, full_text, chunk_id):
        records.append((chunk, full_text, {**meta, **chunk.metadata}, chunk_id))

    for i, c in enumerate(split_chunks):
        chunk_id = f"{doc_id}-{i+1}"
        key = _section_key(c.metadata)

        # Compose header chain
        header_context = ""
        if "Header 1" in c.metadata:
            header_context += f"{c.metadata['Header 1']}\n"
        if "Header 2" in c.metadata:
            header_context += f"{c.metadata['Header 2']}\n"

        if "Header 3" in c.metadata:
            # Only reuse the buffered ## text when it belongs to this very section.
            parent_text = h2_buffer if h2_buffer_key == key else ""
            full_text = header_context + parent_text + "\n"
            # Add overlap only between ### siblings of the same section.
            if prev_h3_key == key and prev_h3_overlap:
                full_text += prev_h3_overlap + "\n"
            full_text += f"{c.metadata['Header 3']}\n" + c.page_content

            prev_h3_overlap = '...' + c.page_content[-H3_OVERLAP_CHARS:]
            prev_h3_key = key
            emit(c, full_text, chunk_id)

        elif "Header 2" in c.metadata:
            prev_h3_overlap = ""
            h2_buffer = c.page_content
            h2_buffer_key = key

            # Peek ahead: a ## chunk followed by a ### child of the same section is
            # not emitted on its own, because its text is prepended to each child.
            next_chunk = split_chunks[i+1] if i+1 < len(split_chunks) else None
            has_h3_child = (
                next_chunk is not None
                and "Header 3" in next_chunk.metadata
                and _section_key(next_chunk.metadata) == key
            )
            if not has_h3_child:
                emit(c, header_context + "\n" + c.page_content, chunk_id)

        elif "Header 1" in c.metadata:
            prev_h3_overlap = ""
            h2_buffer = ""
            h2_buffer_key = None
            emit(c, header_context + "\n" + c.page_content, chunk_id)

        # NOTE(review): chunks with no header metadata (text before the first
        # heading) are skipped, as before — confirm that this is intended.

    return records


chunks = []
ids = []
texts = []
metadatas = []

# Only the sixth markdown file is processed here, as in the original cell.
for file_path in [markdown_files[5]]:
    meta, body = load_markdown(file_path)
    split_chunks = header_splitter.split_text(body)

    # Avoid "None-<n>" ids when the front matter carries no `id`.
    fallback_id = os.path.splitext(os.path.basename(file_path))[0]

    for chunk, full_text, chunk_metadata, chunk_id in build_chunk_records(
        meta, split_chunks, fallback_id
    ):
        chunks.append(chunk)
        texts.append(full_text)
        metadatas.append(chunk_metadata)
        ids.append(chunk_id)

In [111]:
texts

['Freelancing: Multi-Tenant Real-Time Chat Application\n**Overview**\n\nThis is a real-time chat application designed for organizations that require complete separation of communication environments. It supports a hierarchical access system — with a global Super Admin overseeing multiple isolated admin-managed tenants. Each admin has full control over their own users, chat groups, and messages, without ever seeing or accessing data from other tenants. The system provides secure authentication, tenant-aware message handling, group messaging, and user management — all with full data isolation, real-time delivery, and an intuitive interface.',
 'Freelancing: Multi-Tenant Real-Time Chat Application\n**Features**\nThe system follows a strict hierarchical model with well-defined capabilities for each role: **Super Admin**, **Admin**, and **User**. Each tenant operates in complete isolation from others, with a focus on privacy, real-time communication, and role-specific control.\n**Super Admi

In [112]:
def markdown_to_text(markdown_str: str) -> str:
    """Render markdown to HTML, then flatten the HTML to plain text.

    Args:
        markdown_str: Markdown source.

    Returns:
        The text of the rendered HTML, with text nodes joined by a single space.
    """
    soup = BeautifulSoup(md.markdown(markdown_str), "html.parser")
    return soup.get_text(separator=" ")

In [113]:
# Plain-text version of every chunk; each entry ends with ' \n\n'.
plain_texts = [
    f"{markdown_to_text(chunk_markdown)} \n\n"
    for chunk_markdown in texts
]

## **Data Embedding** & **Retrieving**

In [1]:
import os
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.retrievers import PineconeHybridSearchRetriever
from pinecone import Pinecone
from pinecone_text.sparse import BM25Encoder
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_community.document_compressors import FlashrankRerank
from flashrank import Ranker
from typing import Optional, List
from pydantic import Field
from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
from langchain_core.documents import Document

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# API keys and the embedding model name are read from environment variables;
# each value is None when its variable is not set.
GOOGLE_API_KEY, PINECONE_API_KEY, EMBED_MODEL = (
    os.environ.get(variable_name)
    for variable_name in ('GOOGLE_API_KEY', 'PINECONE_API_KEY', 'EMBEDDING_MODEL')
)

In [None]:
# Dense Vector embedding
class FixedDimensionGoogleGenerativeAIEmbeddings(GoogleGenerativeAIEmbeddings):
    """
    Embeddings wrapper that forwards a configured ``output_dimensionality``
    to every ``embed_documents`` / ``embed_query`` call.

    When ``output_dimensionality`` is None the calls are passed through
    unchanged; otherwise the configured value replaces any value supplied
    by the caller.
    """
    # Declared as a pydantic Field so the value can be given to the constructor.
    output_dimensionality: Optional[int] = Field(
        default=None, description="The fixed output dimension for embeddings."
    )

    def __init__(self, **kwargs):
        # Nothing custom happens here: all keyword arguments go to the base class.
        super().__init__(**kwargs)

    def embed_documents(self, texts, **kwargs):
        """Embed ``texts``, forcing the configured output dimension when set."""
        fixed_dim = self.output_dimensionality
        if fixed_dim is not None:
            kwargs = {**kwargs, 'output_dimensionality': fixed_dim}
        return super().embed_documents(texts, **kwargs)

    def embed_query(self, text, **kwargs):
        """Embed ``text``, forcing the configured output dimension when set."""
        fixed_dim = self.output_dimensionality
        if fixed_dim is not None:
            kwargs = {**kwargs, 'output_dimensionality': fixed_dim}
        return super().embed_query(text, **kwargs)

# The output dimension is supplied directly to the constructor.
embeddings = FixedDimensionGoogleGenerativeAIEmbeddings(
    model=EMBED_MODEL,
    google_api_key=GOOGLE_API_KEY,
    output_dimensionality=768,
)

In [11]:
# Sparse vector embedding
# NOTE(review): default() presumably returns an encoder with pinecone-text's
# pre-fitted parameters — confirm. Later cells re-bind bm25_encoder (fit / load).
bm25_encoder = BM25Encoder().default()

In [118]:
# Fit the sparse encoder on this notebook's plain-text chunks and write the fitted
# values to disk so a later cell can load them from "bm25_values.json".
# NOTE(review): plain_texts only covers the file(s) selected in the chunking cell.
bm25_encoder.fit(plain_texts)
bm25_encoder.dump("bm25_values.json")

100%|██████████| 10/10 [00:00<00:00, 361.96it/s]


In [66]:

# Re-bind bm25_encoder to an encoder loaded from the values dumped to "bm25_values.json".
bm25_encoder = BM25Encoder().load("bm25_values.json")

In [12]:
# Connect to Pinecone with the API key from the environment and open the
# index named "personal-assistant"; `index` is what the retriever queries.
index_name = "personal-assistant"
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(index_name)

In [13]:
# Hybrid Retriever configuration
namespace = "portfolio"


class SafeHybridSearchRetriever(PineconeHybridSearchRetriever):
    """Hybrid (dense + sparse) retriever with a dense-only fallback.

    The parent class performs the hybrid query. When that query fails with
    "Sparse vector must contain at least one value", the same query is
    repeated against the index with the dense embedding only.
    """

    def _get_relevant_documents(
        self, query: str, *, run_manager: Optional[CallbackManagerForRetrieverRun] = None
    ) -> List[Document]:
        """Get documents relevant to the query using hybrid search with fallback to dense-only.

        Args:
            query: The search query.
            run_manager: Callback manager forwarded to the parent implementation.

        Returns:
            Documents from the hybrid search, or from the dense-only search
            when the sparse vector is empty.

        Raises:
            Exception: Any error other than the empty-sparse-vector error is
                re-raised unchanged.
        """
        try:
            # Try hybrid search first
            return super()._get_relevant_documents(query, run_manager=run_manager)
        except Exception as e:
            if "Sparse vector must contain at least one value" not in str(e):
                # Unrelated failure: bare `raise` keeps the original traceback.
                raise

            print("Falling back to dense-only search for query:", query)
            embedding = self.embeddings.embed_query(query)
            results = self.index.query(
                vector=embedding,
                top_k=self.top_k,
                include_metadata=True,
                namespace=self.namespace,
            )
            return self._process_pinecone_results(results)

    def _process_pinecone_results(self, results):
        """Convert Pinecone query matches into Document objects.

        PineconeHybridSearchRetriever stores each passage under the "context"
        metadata key and pops that key when it builds documents, so the
        fallback must do the same. Reading "text" (the previous behaviour)
        produced documents with empty page_content. "text" is still accepted
        as a secondary key for records written by other tooling.
        """
        docs = []
        for result in results.matches:
            # Copy so that popping keys does not mutate the Pinecone response.
            metadata = dict(result.metadata or {})
            page_content = metadata.pop("context", None)
            if page_content is None:
                page_content = metadata.pop("text", "")
            # Mirror the hybrid path, which exposes the match score in metadata.
            score = getattr(result, "score", None)
            if "score" not in metadata and score is not None:
                metadata["score"] = score
            docs.append(Document(page_content=page_content, metadata=metadata))
        return docs


hybrid_retriever = SafeHybridSearchRetriever(
    embeddings=embeddings,
    sparse_encoder=bm25_encoder,
    index=index,
    top_k=20,
    namespace=namespace,
)

In [None]:
# Reranker configuration
# The hybrid retriever returns its top_k candidates; the Flashrank reranker is
# configured with top_n=5 before the results reach the LLM.
reranker_compressor = FlashrankRerank(
    model="ms-marco-TinyBERT-L-2-v2",
    top_n=5
)

# `retriever` is the object used by every chain below: hybrid search + rerank.
retriever = ContextualCompressionRetriever(
    base_compressor=reranker_compressor, 
    base_retriever=hybrid_retriever
)

In [69]:
# Upserting to Pinecone
# hybrid_retriever.add_texts(plain_texts, metadatas=metadatas, ids=ids, namespace=namespace)

In [70]:
# # Optionally delete records: either everything in the index, or only specific ID prefixes
# # 1. Delete ALL records in the index
# def delete_all_records():
#     index.delete(delete_all=True)
#     print("✅ All records deleted from index.")

# # 2. Delete records by ID prefix (e.g., 'chat-app')
# def delete_records_by_prefix(prefix: str):
#     ids_to_delete = [f"{prefix}-{i}" for i in range(1, 12)]  # adjust range if needed
#     index.delete(ids=ids_to_delete, namespace="portfolio")
#     print(f"✅ Deleted records with prefix '{prefix}-*'")

In [71]:
# Querying the retriever
# Smoke test of the full retrieval path (hybrid search followed by reranking).
result = retriever.invoke("has amaan did any freelancing?")
result

[Document(metadata={'id': 'curl-cnt', 'relevance_score': np.float32(0.0049223006), 'Header 1': 'Curl Counter', 'Header 2': 'Links:', 'duration': 'Jan 2025 - Feb 2025', 'title': 'Curl Counter', 'type': 'project', 'score': 0.131391078}, page_content='Curl Counter\nLinks: \n \n Github Repository: https://github.com/AmaanP314/curl-counter \n Live App: Unfortunately, there is no live demonstration available for this app at the moment. The memory requirements exceed the limits of the free tier on the hosting platform I use, Render. \n \n\n'),
 Document(metadata={'id': 'lab-guard', 'relevance_score': np.float32(0.0021973418), 'Header 1': 'Freelancing Work: Lab Gene Guard', 'duration': 'Feb 2025', 'title': 'Lab Gene Guard', 'type': 'freelancing', 'score': 0.136480749}, page_content='Freelancing Work: Lab Gene Guard \n One of my notable freelancing projects is Lab Gene Guard, a health-tech platform focused on genetic testing. I was responsible for developing and deploying the entire website, wh

## **LLM Integration**:

In [15]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

In [None]:
# Chat model name comes from the LLM_MODEL environment variable (None when unset).
llm_model = os.getenv('LLM_MODEL')
llm_model

In [17]:
# Chat model shared by every chain in this notebook.
# NOTE(review): temperature=1.0 is fairly high for answers that should stay
# grounded in retrieved context — confirm this is intended.
llm = ChatGoogleGenerativeAI(
    model=llm_model,
    google_api_key=GOOGLE_API_KEY,
    temperature=1.0,
)

In [None]:
# Prompt text is read from the CUSTOM_PROMPT environment variable; it is declared
# with the input variables "context" and "question".
# NOTE(review): os.getenv returns None when the variable is unset.
custom_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=os.getenv('CUSTOM_PROMPT')
)

In [19]:
# Single-turn question answering: retrieved documents are "stuffed" into
# custom_prompt and the source documents are returned alongside the answer.
# NOTE(review): the name `qa_chain` is re-bound to a different chain in the
# Memory Integration section further down.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,              
    chain_type="stuff",
    return_source_documents=True,
    chain_type_kwargs={"prompt": custom_prompt},
)

In [23]:
# Ask one question through the single-turn chain and print only the answer text.
query = "what's your role?"
response = qa_chain.invoke({"query": query})
print(response["result"])

I am Portfolio AI Assistant, I am here to answer questions about Amaan Poonawala's professional background.


## **Memory Integration**:

In [35]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.chains import create_retrieval_chain
from langchain_core.messages import HumanMessage, AIMessage, BaseMessage
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain.chains import create_history_aware_retriever
from langchain.chains.combine_documents import create_stuff_documents_chain
from typing import Any, Dict

In [36]:
store = {} # Simple in-memory store (replace with persistent storage for production)

def get_full_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the complete chat history for ``session_id``, creating it on first use.

    A new, empty ChatMessageHistory is registered in ``store`` (and an INFO
    line is printed) the first time a session id is seen; later calls return
    the same object.
    """
    try:
        return store[session_id]
    except KeyError:
        print(f"INFO: Creating new chat history for session: {session_id}")
        new_history = ChatMessageHistory()
        store[session_id] = new_history
        return new_history

# History window handed to the RAG chain. The full history stays in `store`;
# only the most recent MAX_HISTORY_MESSAGES messages are passed on.
MAX_HISTORY_TURNS = 3 # Number of recent turns (1 turn = 1 human + 1 AI message)
MAX_HISTORY_MESSAGES = MAX_HISTORY_TURNS * 2


def limit_history_for_rag_chain(input_dict: Dict[str, Any]) -> Dict[str, Any]:
    """
    Trim 'chat_history' to the most recent MAX_HISTORY_MESSAGES messages.

    Args:
        input_dict: Chain input, possibly containing the full history under
            the 'chat_history' key.

    Returns:
        A shallow copy of ``input_dict``. When 'chat_history' is a list of
        BaseMessage objects it is replaced by its last MAX_HISTORY_MESSAGES
        entries (an empty list when the cap is zero or negative). Any other
        value is passed through unchanged after printing a warning. The
        caller's dict and history list are never modified.
    """
    modified_input = input_dict.copy()

    if "chat_history" in modified_input:
        history = modified_input["chat_history"]
        if isinstance(history, list) and all(isinstance(m, BaseMessage) for m in history):
            # history[-0:] is the WHOLE list, so a cap of 0 must be handled
            # explicitly or it would disable trimming instead of dropping history.
            if MAX_HISTORY_MESSAGES > 0:
                modified_input["chat_history"] = history[-MAX_HISTORY_MESSAGES:]
            else:
                modified_input["chat_history"] = []
        else:
            # This case shouldn't happen with standard history objects, but good to check
            print("WARN: 'chat_history' in input_dict is not a list of BaseMessages. Passing as is.")

    return modified_input

In [None]:
# The question-rewriting prompt is read from the RETRIEVER_PROMPT environment
# variable and its {max_turns} placeholder is filled in here.
# NOTE(review): os.getenv returns None when the variable is unset, in which case
# .format() raises AttributeError.
retriever_prompt_template = os.getenv("RETRIEVER_PROMPT").format(max_turns=MAX_HISTORY_TURNS)

contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", retriever_prompt_template),
        MessagesPlaceholder(variable_name="chat_history"), # This will receive the limited history
        ("human", "{input}"),
     ]
)
# NOTE(review): history_aware_retriever is not referenced by the later cells shown
# in this notebook, which use reformulate_question_chain instead.
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)

In [None]:
# The answering prompt is read from the SYSTEM_PROMPT environment variable and its
# {max_turns} placeholder is filled in here.
# NOTE(review): os.getenv returns None when the variable is unset, in which case
# .format() raises AttributeError.
qa_system_prompt = os.getenv("SYSTEM_PROMPT").format(max_turns=MAX_HISTORY_TURNS)


qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        # MessagesPlaceholder(variable_name="chat_history"), # History is included in the system prompt now
        ("human", "{input}"), # The original user input for the current turn
    ]
)

In [65]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [66]:
# Prompt -> LLM -> plain string. The string output is stored as
# "standalone_question" by the retrieval chain defined below.
reformulate_question_chain = (
    contextualize_q_prompt # Your existing prompt for this
    | llm
    | StrOutputParser()
)

In [None]:
# Answering step built from the LLM and qa_prompt.
# NOTE(review): this re-binds `qa_chain`, which earlier in the notebook named the
# single-turn RetrievalQA chain.
qa_chain = create_stuff_documents_chain(llm, qa_prompt)

In [None]:
# Two sequential assign steps: the second one reads "standalone_question", so it
# must run after the first and the two cannot be merged into a single assign().
retrieval_chain_that_exposes_question = RunnablePassthrough.assign(
    standalone_question=reformulate_question_chain,
).assign(
    context=lambda x: retriever.invoke(x["standalone_question"]),
)

# Final chain output keeps the input keys plus standalone_question, context and answer.
rag_chain = retrieval_chain_that_exposes_question.assign(
    answer=qa_chain 
)

In [69]:
# Session-aware entry point: history is looked up per session id via
# get_full_session_history, then trimmed by limit_history_for_rag_chain before
# rag_chain sees it.
conversational_rag_chain = RunnableWithMessageHistory(
    runnable=RunnableLambda(limit_history_for_rag_chain) | rag_chain, # Wrap rag_chain
    get_session_history=get_full_session_history, # Use the function returning the ACTUAL history object
    input_messages_key="input",
    history_messages_key="chat_history", # Key used by the wrapper and prompts
    output_messages_key="answer",
)

In [77]:
# Manual check: invoke the chain directly; the session id is passed through
# config["configurable"]["session_id"].
conversational_rag_chain.invoke(
        {"input": "nice, now tell me about Amaan"},
        config={"configurable": {"session_id": "session_id"}}
    )

{'input': 'nice, now tell me about Amaan',
 'chat_history': [HumanMessage(content='nice, now tell me about yourself', additional_kwargs={}, response_metadata={}),
  AIMessage(content="I am the Portfolio AI Assistant, an agentic Retrieval-Augmented Generation (RAG) chatbot designed to answer questions about Amaan's professional background.", additional_kwargs={}, response_metadata={}),
  HumanMessage(content='how did he manage to build you?', additional_kwargs={}, response_metadata={}),
  AIMessage(content='The Portfolio AI Assistant was built by Amaan using a combination of hybrid retrieval (dense gemini-embedding-001 + sparse BM25), history-aware query rewriting, semantic, header-aware chunking with contextual overlap, and a dense-only fallback.', additional_kwargs={}, response_metadata={}),
  HumanMessage(content='nice, now tell me about Amaan', additional_kwargs={}, response_metadata={}),
  AIMessage(content="I can provide information about Amaan's professional background.", additio

In [None]:
def formatter(session_id: str, query: str):
    """Run one query in a session and print a readable trace of the result.

    Prints the user query, the reformulated question, the LLM answer, a
    200-character snippet plus metadata for each retrieved document, and the
    number of messages stored for the session.

    Args:
        session_id: Session whose history the chain should use.
        query: The user's question.
    """
    print(f"\n--- Running Query for Session '{session_id}' ---")
    print(f"User Query: {query}")

    run_config = {"configurable": {"session_id": session_id}}
    response = conversational_rag_chain.invoke({"input": query}, config=run_config)

    print("\nReformulated Query for Retrieval:")
    standalone_question = response.get('standalone_question', 'N/A')
    print(f"{standalone_question}")

    print("\nLLM Answer:")
    answer = response.get("answer", "No answer found.")
    print(answer)

    print("\nRetrieved Documents (Context):")
    retrieved_docs = response.get("context", [])
    if not retrieved_docs:
        print("  No documents were retrieved.")
    else:
        for position, doc in enumerate(retrieved_docs, start=1):
            snippet = doc.page_content[:200]
            print(f"  Document {position}:")
            print(f"    Page Content (snippet): {snippet}...")
            print(f"    Metadata: {doc.metadata}")

    # Show how many messages the store holds for this session after the call.
    print(f"\nCurrent Store State for '{session_id}':")
    if session_id not in store:
        print("  No history found in store for this session ID.")
    else:
        message_count = len(store[session_id].messages)
        print(f"  History contains {message_count} messages.")

In [63]:
# Reset the in-memory session store so the next run starts with no chat history.
store = {}

In [79]:
# Session id reused across calls so follow-up questions share one history.
session_1 = "314159"

# Run multiple queries in the same session
formatter(session_1, "tell me about yourself")


--- Running Query for Session '314159' ---
User Query: tell me about yourself
INFO: Creating new chat history for session: 314159

Reformulated Query for Retrieval:
What are the capabilities of Portfolio AI Assistant?

LLM Answer:
I am the Portfolio AI Assistant, an agent designed to answer questions about Amaan Poonawala's professional background. I provide information about his projects, skills, certifications, and learning journey with high precision.

Retrieved Documents (Context):
  Document 1:
    Page Content (snippet): Portfolio AI Assistant
Key capabilities (what it actually does well) 
 
 Context-aware answers  grounded in retrieved chunks (no hallucinated CV claims). 
 Resilient retrieval : hybrid search + dense ...
    Metadata: {'id': 'portfolio-ai-assistant', 'relevance_score': np.float32(0.999516), 'Header 1': 'Portfolio AI Assistant', 'Header 2': 'Key capabilities (what it actually does well)', 'duration': 'Apr 2025 – May 2025', 'tags': ['RAG', 'Agentic RAG', 'Pinecone

In [70]:
def chat(query: str, session_id: str):
    """Send one user message through the conversational chain.

    Args:
        query: The user's message.
        session_id: Session whose history the chain should use.

    Returns:
        The "answer" entry of the chain response, or "No answer found." when
        the response has no such entry.
    """
    session_config = {"configurable": {"session_id": session_id}}
    result = conversational_rag_chain.invoke({"input": query}, config=session_config)
    return result.get("answer", "No answer found.")