In [1]:
# Data Ingesion 

from langchain_community.document_loaders import TextLoader


In [2]:
loader = TextLoader("speach.txt")
text_documents = loader.load()
text_documents

[Document(metadata={'source': 'speach.txt'}, page_content='Here’s a short sample motivational speech you could adapt:\n\n---\n\n**Title:** *The Power of Small Steps*\n\nGood morning everyone,\n\nWhen we think about success, we often imagine giant leaps — moments of dramatic change or overnight breakthroughs. But the truth is, most great achievements aren’t born in a single leap. They’re built, brick by brick, in the quiet persistence of small steps.\n\nEvery time you wake up a little earlier, choose to try again after a failure, or push yourself when you’d rather quit — you’re making progress. You may not see the results right away, but like drops of water carving stone, your daily actions are shaping your future.\n\nThe biggest obstacle isn’t the size of your dream, it’s the doubt that whispers, *“It’s too far.”* The answer to that doubt is simple: take the next step. And then the next. Because success isn’t about how fast you move, but how consistently you keep moving.\n\nSo today, d

In [3]:
import os
from dotenv import load_dotenv
load_dotenv()

os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

In [4]:
#web based Loader
from langchain_community.document_loaders import WebBaseLoader
import bs4

#Load chunk and index the context of the html page

loader = WebBaseLoader(web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",), 
bs_kwargs=dict(parse_only=bs4.SoupStrainer(
    class_=("psot-title", "post_content", "post-header")
)))

text_documents = loader.load()

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [5]:
text_documents

[Document(metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, page_content='\n\n      LLM Powered Autonomous Agents\n    \nDate: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng\n\n\n')]

In [6]:
#pdf reader
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader('attention.pdf')

In [7]:
docs = loader.load()

docs

[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-07-31T20:30:22-05:00', 'author': 'Shyam Sankeerth Chanda', 'moddate': '2025-07-31T20:30:22-05:00', 'source': 'attention.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}, page_content='SAI MOUNIK KAMISETTY \nHartford, CT | +1 (302) 312-1377 | saictvl.k@gmail.com \n \nPROFESSIONAL SUMMARY \n• Over 6 years of experience in designing,  developing,  and deploying scalable enterprise Java \napplications.  \n• Hands-on expertise in Java/J2EE, Spring Boot,  Hibernate, SQL, PL/SQL, and web services \nimplementation.  \n• Proficient in using Git, Bitbucket, Jenkins, and practicing CI/CD methodologies in agile development \nenvironments.  \n• Well-versed in testing and quality assurance using tools like JUnit, jMock, Selenium, SOAP UI, and \nPostman.  \n• Experienced in implementing Kafka and Active MQ for messaging, and secure software design using \nSOA

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = text_splitter.split_documents(docs)

In [9]:
documents[:5]

[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-07-31T20:30:22-05:00', 'author': 'Shyam Sankeerth Chanda', 'moddate': '2025-07-31T20:30:22-05:00', 'source': 'attention.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}, page_content='SAI MOUNIK KAMISETTY \nHartford, CT | +1 (302) 312-1377 | saictvl.k@gmail.com \n \nPROFESSIONAL SUMMARY \n• Over 6 years of experience in designing,  developing,  and deploying scalable enterprise Java \napplications.  \n• Hands-on expertise in Java/J2EE, Spring Boot,  Hibernate, SQL, PL/SQL, and web services \nimplementation.  \n• Proficient in using Git, Bitbucket, Jenkins, and practicing CI/CD methodologies in agile development \nenvironments.  \n• Well-versed in testing and quality assurance using tools like JUnit, jMock, Selenium, SOAP UI, and \nPostman.  \n• Experienced in implementing Kafka and Active MQ for messaging, and secure software design using \nSOA

In [10]:
#vector embedding and vector store
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
db = Chroma.from_documents(documents[:20], OpenAIEmbeddings())

  db = Chroma.from_documents(documents[:20], OpenAIEmbeddings())


In [None]:
## Chroma VECTOR DATABASE  

query = "What are the skill this person has"
result = db.similarity_search(query)
result[0].page_content

'TECHNICAL SKILLS \nLanguages: Java, SQL, PL/SQL \nFrameworks: Spring Boot, Hibernate, JUnit, Selenium \nTools: Git, Bitbucket, Postman, SOAP UI, Maven, Jenkins \nMessaging: Kafka, ActiveMQ  \nDatabases: Oracle, MySQL \nWeb Technologies: REST, SOAP, JBoss \nDevOps: CI/CD, Docker (basic), Agile, Scrum \nVersion Control: Git, Bitbucket \nCloud: AWS (EC2, S3, EKS, Lambda,  CloudWatch, SNS, SQS, API Gateway, DynamoDB) , Azure \nDevOps: JUnit, jMock, Selenium \n \nCERTIFICATIONS \n• AWS Solutions  Architect Associate  \n• Java Developer Nano Degree (Udacity) \n• JetBrains Academy:  Java Developer Track \n \nPROFESSIONAL EXPERIENCE \nRole – Software Engineer Aug 2022 – till date \nCognizant – DE, USA \nProject: Enterprise-scale backend development using Java/J2EE \n \n• Developed backend services using Java and J2EE technologies to support chatbot workflows \nintegrated with third-party APIs. \n• Created secure SOAP and RESTful services based on SOA design patterns for NLP engine integration

In [18]:
#FAISS vector Database

from langchain_community.vectorstores import FAISS
db1=FAISS.from_documents(documents[:20], OpenAIEmbeddings())

In [19]:
db1

<langchain_community.vectorstores.faiss.FAISS at 0x11e5e4810>

In [22]:
from langchain_openai import ChatOpenAI

model=ChatOpenAI()
model


ChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x169b45990>, async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x169b46290>, root_client=<openai.OpenAI object at 0x169946190>, root_async_client=<openai.AsyncOpenAI object at 0x12fa583d0>, model_kwargs={}, openai_api_key=SecretStr('**********'))

In [None]:
#desgin chatpromt template
from langchain_core.prompts import ChatPromptTemplate
prompt = ChatPromptTemplate.from_template("""
Answer the following question based only on theprovided context. 
Think step by step before providing a detailed answer.
I will tip you $1000 if the user finds the answer helpful.
<context>
{context}
</context>
Question: {input}
"""
)