In [38]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [39]:
# Extract text from PDF files
def load_pdf_files(data):
    """
    Load the PDF files found in a directory.

    Args:
        data: Path of the directory to scan. Only entries matching the
            ``*.pdf`` glob are passed on to ``PyPDFLoader``.

    Returns:
        The documents produced by ``DirectoryLoader.load()`` for the
        matched files.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [40]:
import os

# Folder holding the source PDFs. The default is the original local path, so
# behaviour is unchanged on the author's machine; set CHATBOT_DATA_DIR to run
# the notebook elsewhere without editing this cell.
DATA_DIR = os.environ.get(
    "CHATBOT_DATA_DIR",
    "E:/7. ML practise Daily/7. GEN AI/11. Chatbot_Github_end to end/Business_Chatbot/data",
)
extracted_documents = load_pdf_files(DATA_DIR)

In [41]:
# Count the Document objects returned by the loader. The preview output in this
# notebook shows per-page metadata, so this is presumably a page count rather
# than a file count -- confirm against the loader in use.
length = len(extracted_documents)
print(f"Total number of documents extracted: {length}")

Total number of documents extracted: 176


In [42]:
# Check which container type the loader returned.
type(extracted_documents)

list

In [43]:
# Preview the first two loaded documents (metadata plus page text).
extracted_documents[:2]


[Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 14.0 (Windows)', 'creationdate': '2020-08-03T12:26:17+01:00', 'moddate': '2020-08-03T12:26:41+01:00', 'source': 'E:\\7. ML practise Daily\\7. GEN AI\\11. Chatbot_Github_end to end\\Business_Chatbot\\data\\Data.pdf', 'total_pages': 40, 'page': 0, 'page_label': '1'}, page_content='Three Short\nNovels\nEdited by\nAngela Esterhammer\nThe Edinburgh Edition of  the Works of\nJohn Galt\nThe three novels collected in this volume  \nreveal the diversity of  Galt’s creative abilities. \nGlenfell (1820) is his first publication in the style \nof  Scottish fiction for which he would become \nbest known; Andrew of  Padua, the Improvisatore \n(1820) is a unique synthesis of  his experiences \nwith theatre, educational writing, and travel; \nThe Omen (1825) is a haunting gothic tale. With \ntheir easily readable scope and their vivid \nthemes, each of  the stories has a distinct charm. \nThey cast light on significan

In [44]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Build slimmed-down copies of the given documents.

    Each returned Document carries the original ``page_content`` unchanged and
    a metadata dict with a single ``'source'`` key. The input documents are
    not modified.

    Args:
        docs: Documents whose metadata should be reduced.

    Returns:
        A new list in the same order as ``docs``. When an input document has
        no ``'source'`` entry, the copy's ``'source'`` value is ``None``.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

# This function filters documents to minimal metadata - keeps only source and content
# Useful for reducing document size and removing unnecessary metadata before processing

In [45]:
# Reduce every loaded document to its text plus a single 'source' metadata field.
minimal_documents = filter_to_minimal_docs(extracted_documents)

In [46]:
# Preview the first two slimmed-down documents to confirm only 'source' remains.
minimal_documents[:2]

[Document(metadata={'source': 'E:\\7. ML practise Daily\\7. GEN AI\\11. Chatbot_Github_end to end\\Business_Chatbot\\data\\Data.pdf'}, page_content='Three Short\nNovels\nEdited by\nAngela Esterhammer\nThe Edinburgh Edition of  the Works of\nJohn Galt\nThe three novels collected in this volume  \nreveal the diversity of  Galt’s creative abilities. \nGlenfell (1820) is his first publication in the style \nof  Scottish fiction for which he would become \nbest known; Andrew of  Padua, the Improvisatore \n(1820) is a unique synthesis of  his experiences \nwith theatre, educational writing, and travel; \nThe Omen (1825) is a haunting gothic tale. With \ntheir easily readable scope and their vivid \nthemes, each of  the stories has a distinct charm. \nThey cast light on significant phases of  Galt’s \ncareer as a writer and show his versatility in \nexperimenting with themes, genres, and styles.\nThis volume reproduces Galt’s original editions, \nmaking these virtually unknown works available

In [47]:
def text_splitter(
    minimal_documents: List[Document],
    chunk_size: int = 500,
    chunk_overlap: int = 50,
) -> List[Document]:
    """
    Split documents into overlapping chunks with RecursiveCharacterTextSplitter.

    Args:
        minimal_documents: Documents to split.
        chunk_size: Target maximum chunk length, measured with ``len``.
            Defaults to 500, the value this notebook has always used.
        chunk_overlap: Length shared between neighbouring chunks, measured
            the same way. Defaults to 50.

    Returns:
        The chunk documents produced by ``split_documents``.
    """
    # Named `splitter` so the local no longer shadows this function's own name.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return splitter.split_documents(minimal_documents)

In [48]:
# Chunk the slimmed-down documents and report how many chunks were produced.
texts_chunk = text_splitter(minimal_documents)
print(f"Total number of text chunks: {len(texts_chunk)}")

Total number of text chunks: 623


In [49]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings(
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
) -> HuggingFaceEmbeddings:
    """
    Build the HuggingFace embeddings wrapper used to vectorise text.

    Args:
        model_name: Name of the sentence-transformers model to load. Defaults
            to the model this notebook has always used.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with ``model_name``.

    Note:
        The Pinecone index in this notebook is created with ``dimension=384``.
        A model with a different output size needs an index of matching
        dimension.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

# Instantiate the embeddings model once; later cells reuse this object.
embedding = download_embeddings()

In [50]:
# vector = embedding.embed_query("Hello world")
# vector

In [51]:
# Display the embeddings object to inspect its configuration.
embedding

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [52]:
# from dotenv import load_dotenv
# import os
# load_dotenv()

# PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
# # OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


# os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
# OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "phi3:mini")  # fallback if not set

# # os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [53]:
from dotenv import load_dotenv
import os

# Read key/value pairs from a .env file into the process environment.
load_dotenv()

# Get environment variables
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
HUGGINGFACE_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "phi3:mini")  # fallback if not set

# os.environ only accepts strings: assigning the None that os.getenv returns
# for a missing variable raises an opaque TypeError. Fail with an actionable
# message for the key the Pinecone cells require.
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set. Add it to your .env file or export it "
        "before running this notebook."
    )
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

# In this notebook the Hugging Face token is only read by get_llm_response,
# so a missing token should not stop the rest of the pipeline from running.
if HUGGINGFACE_API_TOKEN:
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACE_API_TOKEN

# Now you can use Hugging Face Inference API
from huggingface_hub import InferenceClient

def get_llm_response(prompt):
    """
    Generate text for ``prompt`` through the Hugging Face Inference API.

    Authenticates with the module-level ``HUGGINGFACE_API_TOKEN`` and queries
    the ``microsoft/Phi-3-mini-4k-instruct`` model.

    Args:
        prompt: Text passed to ``InferenceClient.text_generation``.

    Returns:
        Whatever ``text_generation`` returns for the prompt.
    """
    generation_options = {
        "model": "microsoft/Phi-3-mini-4k-instruct",
        "max_new_tokens": 500,
        "temperature": 0.8,
    }
    hf_client = InferenceClient(token=HUGGINGFACE_API_TOKEN)
    return hf_client.text_generation(prompt, **generation_options)

In [54]:
from pinecone import Pinecone 
# Create the Pinecone client with the key read from the environment.
pinecone_api_key = PINECONE_API_KEY

pc = Pinecone(api_key=pinecone_api_key)

In [55]:
# Display the client object to confirm it was constructed.
pc

<pinecone.pinecone.Pinecone at 0x1715710af20>

In [56]:
from pinecone import ServerlessSpec 

index_name = "business-chatbot"

# Create the serverless index only when it does not exist yet, so re-running
# this cell is safe. 384 matches the 'word_embedding_dimension' reported for
# the embeddings model used in this notebook.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,  # Dimension of the embeddings
        metric="cosine",  # Cosine similarity
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Handle to the index, whether it was just created or already existed.
index = pc.Index(index_name)

In [57]:
from langchain_pinecone import PineconeVectorStore

import hashlib
from collections import Counter


def _stable_chunk_ids(chunks):
    """Return one deterministic vector ID per chunk, derived from its source and text."""
    occurrences = Counter()
    ids = []
    for chunk in chunks:
        payload = f"{chunk.metadata.get('source')}\x00{chunk.page_content}"
        digest = hashlib.sha256(payload.encode("utf-8", errors="replace")).hexdigest()
        # Identical chunks (e.g. repeated page headers) get a running suffix so
        # every ID within the batch stays unique.
        ids.append(f"{digest}-{occurrences[digest]}")
        occurrences[digest] += 1
    return ids


# Embed each chunk and upsert it into the Pinecone index.
# Without explicit ids every run stores the chunks under newly generated IDs,
# so re-running this cell piles duplicate vectors into the index. Deterministic
# IDs make the upsert idempotent: the same chunk always overwrites itself.
docsearch = PineconeVectorStore.from_documents(
    documents=texts_chunk,
    embedding=embedding,
    index_name=index_name,
    ids=_stable_chunk_ids(texts_chunk),
)

In [58]:
# # Load Existing index 

# from langchain_pinecone import PineconeVectorStore
# # Embed each chunk and upsert the embeddings into your Pinecone index.
# docsearch = PineconeVectorStore.from_existing_index(
#     index_name=index_name,
#     embedding=embedding
# )

In [59]:
# Add one hand-written document to the existing vector store.
# NOTE(review): no explicit id is passed, so each run of this cell presumably
# stores another copy under a new id -- confirm and pass `ids=` if unwanted.
dswith = Document(
    page_content="Mahendra is so awesome person.",
    metadata={"source": "Blog"}
)

docsearch.add_documents(documents=[dswith])

['ba297ede-43fd-4d25-bb7f-e7656283cabf']

In [60]:
# Similarity-search retriever over the vector store; k=3 asks for three results per query.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [61]:
# Spot-check the retriever with a sample question; the bare name on the last
# line displays the retrieved documents.
retrieved_docs = retriever.invoke("What is Acne?")
retrieved_docs

[Document(id='d18194e0-09eb-481e-a439-fe880780b355', metadata={'source': 'E:\\7. ML practise Daily\\7. GEN AI\\11. Chatbot_Github_end to end\\Business_Chatbot\\data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='078a3376-dd34-421f-b283-594a72fad395', metadata={'source': 'E:\\7. ML practise Daily\\7. GEN AI\\11. Chatbot_Github_end to end\\Business_Chatbot\\data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed. (Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25'),
 Document(id='58e419ac-25c2-49a5-a5f5-eadb469cf2bf', metadata={'source': 'E:\\7. ML practise Daily\\7. GEN AI\\11. Chatbot_Github_end to end\\Business_Chatbot\\data\\Medical_book.pd

In [62]:
from langchain.llms import Ollama

# Chat model served through Ollama; the model name comes from OLLAMA_MODEL
# (environment variable, falling back to "phi3:mini").
chatModel = Ollama(model=OLLAMA_MODEL)

In [63]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [64]:
# System instructions for the answering model. The adjacent string literals
# are concatenated into one message; "{context}" is the template placeholder
# for the retrieved context.
system_prompt = (
    "You are an General story assistant for question-answering tasks. You understand the context very well and Guide us for answers very precisly "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use two sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

# Two-message chat template: the system instructions above, followed by the
# user's question supplied through the "{input}" placeholder.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [65]:
# Chain that feeds documents plus the prompt to the chat model.
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
# Full RAG pipeline: the retriever supplies documents to the question-answer chain.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [66]:
# Ask the RAG chain a question and print only the generated answer text.
response = rag_chain.invoke({"input": "Who is sudha murthy ?"})
print(response["answer"])


Sudha Murty is an Indian educator, social worker, and prolific writer born in Shiggaon, Karnataka, in 1950. She received the Padma Shri Award for Literature and has authored numerous books that have been translated into various Indian languages.


In [67]:
# Factual lookup question; `response` is overwritten by each query cell.
response = rag_chain.invoke({"input": "Who did the children go with to the paddy fields ?"})
print(response["answer"])

The children went with their grandmother, Ajja.


In [68]:
# "Why" question that needs an explanation rather than a single fact.
response = rag_chain.invoke({"input": "Why were the children surprised by the farming activities they saw?"})
print(response["answer"])

In the city where most things come from supermarkets, witnessing how food is grown and produced through farming was a novel experience for them. The sight of clean seeds being prepared or separating straw from paddy showed them behind-the-scenes work that isn't apparent in everyday urban life.


In [69]:
# Interpretive question about the theme of a story.
response = rag_chain.invoke({"input": "What does Ajji’s story suggest about the role of intelligence and resourcefulness in overcoming poverty??"})
print(response["answer"])

Ajji's story suggests that a person can overcome poverty through intelligence and being helpful to others. The woman in her tale demonstrates this by using clever methods, such as selling tree fruit directly from trees rather than relying on intermediaries like wholesalers or supermarkets.
