In [5]:
#Data Ingestion
from langchain_community.document_loaders import TextLoader
loader=TextLoader("sample.txt")
text_documents=loader.load()


text_documents

[Document(metadata={'source': 'sample.txt'}, page_content='Time management is a crucial skill that affects every aspect of our lives, from our professional success to personal well-being. It involves planning and organizing how much time to spend on specific activities to increase efficiency and productivity. Good time management allows individuals to accomplish more in less time, which leads to more free time, less stress, and better focus.\n\nOne of the key benefits of effective time management is the ability to meet deadlines and reduce procrastination. When tasks are broken down into smaller, manageable parts and scheduled appropriately, it becomes easier to stay on track and avoid last-minute pressure. This not only improves the quality of work but also builds trust and reliability in a professional setting.')]

In [1]:
import os

from dotenv import load_dotenv
load_dotenv()

#environment variables
os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_API_KEY")

In [None]:
#Web based loader
from langchain_community.document_loaders import WebBaseLoader
import bs4

loader = WebBaseLoader(web_paths=("https://en.wikipedia.org/wiki/Time_management",))
text_documents = loader.load()


text_documents

[Document(metadata={'source': 'https://en.wikipedia.org/wiki/Time_management', 'title': 'Time management - Wikipedia', 'language': 'en'}, page_content='\n\n\n\nTime management - Wikipedia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJump to content\n\n\n\n\n\n\n\nMain menu\n\n\n\n\n\nMain menu\nmove to sidebar\nhide\n\n\n\n\t\tNavigation\n\t\n\n\nMain pageContentsCurrent eventsRandom articleAbout WikipediaContact us\n\n\n\n\n\n\t\tContribute\n\t\n\n\nHelpLearn to editCommunity portalRecent changesUpload fileSpecial pages\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nAppearance\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nDonate\n\nCreate account\n\nLog in\n\n\n\n\n\n\n\n\nPersonal tools\n\n\n\n\n\nDonate Create account Log in\n\n\n\n\n\n\t\tPages for logged out editors learn more\n\n\n\nContributionsTalk\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nContents\nmove to sidebar\nhide\n\n\n\n\n(Top)\n\n\n\n

In [8]:
#pdf reader
from langchain_community.document_loaders import PyPDFLoader
loader=PyPDFLoader("sample.pdf")
docs=loader.load()

In [9]:
docs

[Document(metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-01-19T16:16:47+08:00', 'author': 'Gaurav Narkhede, Anil Hiwale, Bharat Tidke and Chetan Khadse', 'keywords': 'MIA-LSTM; data preprocessing; iterative imputation; autoencoder; LSTM', 'moddate': '2023-01-20T02:59:16+01:00', 'subject': 'Day by day pollution in cities is increasing due to urbanization. One of the biggest challenges posed by the rapid migration of inhabitants into cities is increased air pollution. Sustainable Development Goal 11 indicates that 99 percent of the world’s urban population breathes polluted air. In such a trend of urbanization, predicting the concentrations of pollutants in advance is very important. Predictions of pollutants would help city administrations to take timely measures for ensuring Sustainable Development Goal 11. In data engineering, imputation and the removal of outliers are very important steps prior to forecasting the concentration of air p

In [10]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
#chunking the text
text_splitter=RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = text_splitter.split_documents(docs)
documents[:5]

[Document(metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-01-19T16:16:47+08:00', 'author': 'Gaurav Narkhede, Anil Hiwale, Bharat Tidke and Chetan Khadse', 'keywords': 'MIA-LSTM; data preprocessing; iterative imputation; autoencoder; LSTM', 'moddate': '2023-01-20T02:59:16+01:00', 'subject': 'Day by day pollution in cities is increasing due to urbanization. One of the biggest challenges posed by the rapid migration of inhabitants into cities is increased air pollution. Sustainable Development Goal 11 indicates that 99 percent of the world’s urban population breathes polluted air. In such a trend of urbanization, predicting the concentrations of pollutants in advance is very important. Predictions of pollutants would help city administrations to take timely measures for ensuring Sustainable Development Goal 11. In data engineering, imputation and the removal of outliers are very important steps prior to forecasting the concentration of air p

In [31]:
 #vector embedddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
db=Chroma.from_documents(documents[:20], embedding=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"))


In [32]:
query="what models are dicuessed to predict the concentration of air pollutants?"
results=db.similarity_search(query)
results[0].page_content

'Information about these pollutants was gathered with the help of an ambient information\nsystem [3]. Due to the small size of pollutants, ﬁne particulates (particulate matter with an\naerodynamic diameter <2.5 mm; PM2.5) can inﬁltrate the respiratory system’s bronchioles\nand alveolar region as well as migrate into blood vessels [4]. PM10 and PM2.5 are the most\ndangerous contaminants. Their pollution levels can be used by government organiza-\ntions and authorities to take preventative measures and necessary action to control and\ndecrease pollution. Predicting PM 2.5 and PM10 concentrations could be of great help to\nadministrations in mitigating the negative consequences of these pollutants. As a result,\nnew approaches for estimating PM 2.5 and PM10 concentrations are always required to\nbe searched for by researchers. Quality of air and weather are inextricably linked with\nmeteorological elements, such as air pressure, humidity, temperature, cloud coverage, wind'

In [1]:
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
embeddings=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
db1=FAISS.from_documents(documents[:20], embedding=embeddings)

  embeddings=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'documents' is not defined

In [30]:
query="what models are dicuessed to predict the concentration of air pollutants?"
results=db1.similarity_search(query)
results[0].page_content

'Information about these pollutants was gathered with the help of an ambient information\nsystem [3]. Due to the small size of pollutants, ﬁne particulates (particulate matter with an\naerodynamic diameter <2.5 mm; PM2.5) can inﬁltrate the respiratory system’s bronchioles\nand alveolar region as well as migrate into blood vessels [4]. PM10 and PM2.5 are the most\ndangerous contaminants. Their pollution levels can be used by government organiza-\ntions and authorities to take preventative measures and necessary action to control and\ndecrease pollution. Predicting PM 2.5 and PM10 concentrations could be of great help to\nadministrations in mitigating the negative consequences of these pollutants. As a result,\nnew approaches for estimating PM 2.5 and PM10 concentrations are always required to\nbe searched for by researchers. Quality of air and weather are inextricably linked with\nmeteorological elements, such as air pressure, humidity, temperature, cloud coverage, wind'

In [37]:
from dotenv import load_dotenv
load_dotenv()  # 👈 loads your .env file

from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0)
print(llm)


model='models/gemini-2.0-flash' google_api_key=SecretStr('**********') temperature=0.0 client=<google.ai.generativelanguage_v1beta.services.generative_service.client.GenerativeServiceClient object at 0x13edd1e50> default_metadata=() model_kwargs={}


E0000 00:00:1760427274.878334 5309863 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


In [40]:
from langchain_core.prompts import ChatPromptTemplate
prompt=ChatPromptTemplate.from_template("""
Answer the follow question based only on the provided context.
Think step by step before providing a detailed answer.
I will tip you $10000 if the user finds the answr helpful.
<context>
{context}
</context>
Question: {input} 
""")


In [46]:
#Chain

from langchain.chains.combine_documents import create_stuff_documents_chain
document_chain=create_stuff_documents_chain(llm, prompt)

In [50]:

retriever = db1.as_retriever()
retriever 

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x13d0e8f10>, search_kwargs={})

User---Enquiry--->Retriever---Vector DB
Vector---Retriever--->LLM (Prompt)--->Stuff documents chain--Response


In [51]:
from langchain.chains import create_retrieval_chain
retrieval_chain=create_retrieval_chain(retriever, document_chain)


In [59]:
response = retrieval_chain.invoke({"input": "Whats the title?"})

In [60]:
response['answer']

'The title is "Novel MIA-LSTM Deep Learning Hybrid Model with Data Preprocessing for Forecasting of PM2.5".'