In [None]:
import os
import dotenv
from pathlib import Path

# LangChain and related imports
from langchain_core.messages import AIMessage, HumanMessage
from langchain_community.document_loaders import (
    TextLoader,
    WebBaseLoader,
    PyPDFLoader,
    Docx2txtLoader,
)
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Load environment variables
from dotenv import load_dotenv
load_dotenv()



In [None]:
# Local files to ingest; the loader is chosen from each file's extension.
doc_path = [
    "docs/test_rag.pdf",
    "docs/test_rag.docx",
    "docs/Document 4.pdf"
]

# Web pages to ingest alongside the local files.
web_urls = [
    "https://docs.streamlit.io/develop/quick-reference/release-notes",
]

# File suffix -> loader class able to read that kind of file.
_LOADERS_BY_SUFFIX = {
    ".pdf": PyPDFLoader,
    ".docx": Docx2txtLoader,
    ".txt": TextLoader,
    ".md": TextLoader,
}

# Container for all loaded documents (local files first, then URLs).
docs = []

# Load local files. Each file is loaded exactly once; a failure on one file is
# reported and does not stop the remaining files from loading.
for doc_file in doc_path:
    file_path = Path(doc_file)
    loader_cls = _LOADERS_BY_SUFFIX.get(file_path.suffix)
    if loader_cls is None:
        print(f"Document type {file_path.suffix} not supported.")
        continue
    try:
        # Pass a plain string path: every loader version accepts str.
        docs.extend(loader_cls(str(file_path)).load())
    except Exception as e:
        print(f"Error loading the document {doc_file}: {e}")

# Load URLs. Same best-effort policy as for local files.
for url in web_urls:
    try:
        docs.extend(WebBaseLoader(url).load())
    except Exception as e:
        print(f"Error loading document from {url}: {e}")

# Optional: view result
print(f"Total documents loaded: {len(docs)}")


In [None]:
# Show the loaded documents as this cell's output so the load step can be inspected.
docs

In [None]:
# Split the loaded documents into overlapping chunks.
# With its default separators, the recursive splitter prefers to cut at:
#   1. paragraph breaks (blank lines)
#   2. single newlines
#   3. spaces
#   4. individual characters, as a last resort
# chunk_size and chunk_overlap are measured in characters by default.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=1000)
documents_chunks = text_splitter.split_documents(docs)

In [None]:
#  Tokenization vs. Embedding — Explained
# 🟦 Tokenization
# What it is: Splitting text into smaller units called tokens (e.g., words, subwords, or even characters).

# Why it’s used: Language models like GPT or BERT don't read raw text — they process tokens.

# Example:
#
#   Input:  "I love pizza"
#   Tokens: ["I", " love", " pizza"]
#
# ✅ Tokenization is like turning sentences into ID numbers the model can understand.

# 🟩 Embedding
# What it is: Mapping each token (or whole sentence) to a vector of numbers that captures its meaning.

# Why it’s used: Embeddings let models compare meanings, search for similarity, and reason.

# Types:

# Word embeddings (like Word2Vec, GloVe)

# Sentence/document embeddings (like OpenAI’s text-embedding-3-small)

# Example:
#
#   Token: "pizza" → Embedding: [0.27, -0.41, 0.89, ...]
#
# 🔄 Workflow Relationship
# In a typical NLP pipeline:
#
#   Text → Tokenization → Tokens → Embedding → Vector(s)
#
# Tokenization: turns text into symbols (tokens)
#
# Embedding: turns symbols into meaning (vectors)

# 🧩 In LangChain or RAG:
# Tokenization affects chunking and cost estimation (e.g., 4096 token limits).

# Embedding is used for vector search and similarity (e.g., searching documents by meaning).

# ✅ Summary Table:
# Feature  | Tokenization                     | Embedding
# ---------|----------------------------------|------------------------------------------
# Purpose  | Break text into tokens           | Map tokens/text into numeric vectors
# Output   | List of tokens (strings or IDs)  | Vectors (arrays of floats)
# Used for | Input to models                  | Semantic similarity, search
# Tools    | tiktoken, HuggingFace Tokenizers | OpenAI Embedding API, SentenceTransformers


In [None]:
# After splitting the documents into chunks,
# a tokenizer converts each chunk into tokens.
# Tokenizing keeps the words themselves,
# but represents them as tokens.
# One token can be a whole word, part of a word, or occasionally more than one word.

#  Rough Estimate (for English & GPT models)
# 1 token ≈ 3 to 4 characters (letters)

# 1 token ≈ 0.75 words

# 100 tokens ≈ 75 words ≈ 300–400 characters



# Embed every chunk with the OpenAI embedding model and index the vectors in Chroma.
embedding_model = OpenAIEmbeddings()
vector_db = Chroma.from_documents(documents=documents_chunks, embedding=embedding_model)

# 🟢 1. Chroma.from_documents(...)
# This is a class method that:

# Takes a list of documents (your split chunks)

# Converts them into vector embeddings using the provided embedding model

# Stores them in a ChromaDB instance (either in-memory or on disk)

# This allows you to later search for similar documents using vector similarity (like cosine similarity).

In [None]:
# Show the chunks produced by the text splitter as this cell's output for inspection.
documents_chunks


In [None]:
# Build the retriever chain that finds documents relevant to the chat
def _get_context_retriever_chain (vector_db, llm):
    """Build a history-aware retriever chain on top of the vector store.

    Args:
        vector_db: Vector store exposing ``as_retriever()``; the notebook
            passes the Chroma instance built from the document chunks.
        llm: Chat model used to turn the conversation into a search query.

    Returns:
        The runnable created by ``create_history_aware_retriever``. It yields
        retrieved documents, not a string.
    """
    # Turn the vector store into a retriever for semantic search over the embeddings.
    retriever = vector_db.as_retriever()

    # Prompt the LLM sees when it generates the search query: the prior
    # conversation, the latest user input, then the instruction itself.
    # The instruction is built from adjacent string literals so that no stray
    # characters or indentation whitespace end up inside the prompt text.
    prompt = ChatPromptTemplate.from_messages([
        MessagesPlaceholder(variable_name="messages"),
        ("user", "{input}"),
        (
            "user",
            "Given the above conversation, generate a search query to look up in order to "
            "get information relevant to the conversation, focusing on the most recent message.",
        ),
    ])

    # NOTE(review): create_history_aware_retriever appears to apply this prompt
    # only when the invocation input contains a non-empty "chat_history" key.
    # This notebook passes the history under "messages", so the raw input may be
    # sent straight to the retriever — confirm against the LangChain API docs.
    retriever_chain = create_history_aware_retriever(llm, retriever, prompt)

    return retriever_chain

In [None]:
# this for complete RAG chain 
def get_conversational_rag_chain(llm):
    """Assemble the complete conversational RAG chain.

    Combines the history-aware retriever with a "stuff documents" answer chain
    whose system prompt receives the retrieved documents via ``{context}``.

    Reads the module-level ``vector_db``, so the cell that builds the vector
    store must have run before this function is called.

    Args:
        llm: Chat model used both for retrieval-query generation and for
            answering.

    Returns:
        The runnable created by ``create_retrieval_chain``; the caller reads
        the generated text from its ``"answer"`` key.
    """
    # Retrieval step: looks up the documents relevant to the conversation.
    retriever_chain = _get_context_retriever_chain (vector_db, llm)

    # Answer prompt: the {context} placeholder is filled with the retrieved docs.
    prompt = ChatPromptTemplate.from_messages([
        ("system" ,
        """You are a helpful assistant. You will have to answer to user's queries.
        You will have some context to help with your answer, but not always would be completely related or helpful.
        You can also use your knowledge to assist answering the user's queries.\n
        {context}"""),
        MessagesPlaceholder(variable_name="messages"),
        ("user","{input}")
    ])

    # Answer step: stuffs the retrieved documents into the prompt and calls the LLM.
    stuff_documents_chain = create_stuff_documents_chain(llm, prompt)

    return create_retrieval_chain(retriever_chain, stuff_documents_chain)
    #Date 07/27/2025

In [None]:
# Augmented generation: answer the latest user message with the RAG chain.

llm_stream_openai = ChatOpenAI(
    model="gpt-4o",
    temperature=0.3,
    streaming=True,
)

# NOTE(review): "claude-.-5-sonnet" does not look like a valid Anthropic model
# id — confirm the intended model before switching llm_stream to this client.
# The client is constructed here but not used below.
llm_stream_anthropic = ChatAnthropic(
    model="claude-.-5-sonnet",
    temperature=0.3,
    streaming=True,
)

# Model actually used for the conversation.
llm_stream = llm_stream_openai

# Conversation so far as role/content records; the last entry is the question to answer.
raw_messages = [
    {"role" : "user","content":"Hi"},
    {"role" : "assistant", "content": "Hi there! How can I assist you today ? "},
    {"role" : "user", "content" : "what the company name Im trying to apply to" },
]

# Convert the records into LangChain message objects.
messages = [
    HumanMessage(content=m["content"]) if m["role"] == "user" else AIMessage(content=m["content"])
    for m in raw_messages
]

conversation_rag_chain = get_conversational_rag_chain(llm_stream)

# Stream the answer: everything before the last message is history, and the
# last message's text is the input.
response_message = "*(RAG Response)*\n"
for chunk in conversation_rag_chain.pick("answer").stream({"messages": messages[:-1], "input": messages[-1].content}):
    response_message += chunk
    print(chunk, end="", flush=True)

# Record the finished answer once, after streaming completes, as a message
# object so the history stays a homogeneous list of LangChain messages.
messages.append(AIMessage(content=response_message))

