In [2]:
!pip install chromadb langchain tiktoken



In [5]:
!pip install huggingface langchain_community

Collecting langchain_community
  Downloading langchain_community-0.2.5-py3-none-any.whl.metadata (2.5 kB)
Downloading langchain_community-0.2.5-py3-none-any.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: langchain_community
Successfully installed langchain_community-0.2.5


### Setting up the huggingface llm

In [65]:


from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain_community.llms import HuggingFacePipeline

model_id = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

pipe = pipeline(
    "text2text-generation",
    model=model, 
    tokenizer=tokenizer,
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.15
)

llm = HuggingFacePipeline(pipeline=pipe)

# Use the model
response = llm("What is artificial intelligence?")
print(response)





artificial intelligence is the development of computer programs that can learn from data and apply them to solve problems.


### Downloading the data

In [9]:
import os
import requests

# Create a 'pdfs' folder if it doesn't exist
if not os.path.exists('pdfs'):
    os.makedirs('pdfs')

# URL of the PDF you want to download
pdf_url = "https://core.ac.uk/download/pdf/71818866.pdf"

# Get the filename from the URL
filename = pdf_url.split("/")[-1]

# Full path where the PDF will be saved
save_path = os.path.join('pdfs', filename)

# Download the PDF
response = requests.get(pdf_url)

# Check if the request was successful
if response.status_code == 200:
    # Write the content to a file
    with open(save_path, 'wb') as file:
        file.write(response.content)
    print(f"PDF downloaded and saved to {save_path}")
else:
    print(f"Failed to download PDF. Status code: {response.status_code}")

PDF downloaded and saved to pdfs/71818866.pdf


In [12]:
### Converting Pdf to txt file
!pip install pyPDF2
import PyPDF2

def pdf_to_text(pdf_path, output_txt):
    # Open the PDF file in read-binary mode
    with open(pdf_path, 'rb') as pdf_file:
        # Create a PdfReader object instead of PdfFileReader
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        # Initialize an empty string to store the text
        text = ''

        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            text += page.extract_text()

    # Write the extracted text to a text file
    with open(output_txt, 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)

if __name__ == "__main__":
    pdf_path = 'pdfs/71818866.pdf'

    output_txt = 'climate.txt'

    pdf_to_text(pdf_path, output_txt)

    print("PDF converted to text successfully!")

PDF converted to text successfully!


In [7]:
from langchain.vectorstores import Chroma
from langchain.document_loaders import DirectoryLoader, TextLoader

In [14]:
loader = DirectoryLoader("/kaggle/working",glob="./*.txt" )

In [16]:
!pip install unstructured

Collecting unstructured
  Downloading unstructured-0.14.8-py3-none-any.whl.metadata (28 kB)
Collecting chardet (from unstructured)
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Collecting filetype (from unstructured)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting python-magic (from unstructured)
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting python-iso639 (from unstructured)
  Downloading python_iso639-2024.4.27-py3-none-any.whl.metadata (13 kB)
Collecting langdetect (from unstructured)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting rapidfuzz (from unstructured)
  Downloading rapidfuzz-3.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting unstructured-client (from unstructured)
  D

In [18]:
document = loader.load()

In [20]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = 200
    
)
text = text_splitter.split_documents(document)

In [23]:
print(text[0].page_content)

Parkland C olle ge A w ith H onor s Projects Honor s Program 2016 Climate Change: C auses, E ffects, and S olut ions Jameel R . Kaddo Parkland Co llege Open access to thi s Es say is brought to you b y Parkland C ollege's institutional reposit ory,SPARK: Scholarship a t Parkland. For mor e infor mation, please contactspark@p arkland.edu.Recomme nded Citation Kaddo , Jameel R., "Climate Change: C auses, Effe cts, a nd S olutions" (2016). A with Honors Projects. 164. http://sp ark.parkland.edu/ah/164 brought to you by CORE View metadata, citation and similar papers at core.ac.uk provided by Scholarship at Parkland


In [24]:
print(len(text))

33


### Creating DB

In [30]:
from langchain import embeddings
persist_directory = 'db'


In [26]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[0mInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.0.1


In [34]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# Initialize the HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Generate embeddings for a query
query = "Hi how are you"
query_embedding = embeddings.embed_query(query)

print(len(query_embedding))

384


In [36]:
vectordb = Chroma.from_documents(
    documents=text,
    embedding=embeddings,
    persist_directory=persist_directory
)

In [40]:
# ## persists the db to disk
# vectordb.persist()
# warning method depreciated in new version

In [41]:
vectordb = None

In [42]:
vectordb = Chroma(persist_directory=persist_directory,embedding_function=embeddings)

In [43]:
vectordb

<langchain_community.vectorstores.chroma.Chroma at 0x7f98790193c0>

### Make a retriever

In [44]:
retriever = vectordb.as_retriever()

In [69]:
docs = retriever.get_relevant_documents("What are the reason for climate changes?")

  warn_deprecated(


In [75]:
print(docs[2].page_content)

Source:  (“Climate Change” graph done by Robert  Simmon.) The graph was done by taking a sample of ice and another sample was taken from the atmosphere. For the ice sample, drilling a hole through the ice sheets and looking at the air molecules inside the sample determined the concentration of CO 2 and m ethane (Chasing Ice). The graph illustrates that carbon dioxide levels have increased nearly 38 percent from 1750 - 2009 and methane levels have incre ased 148 percent (Riebeek ). Effects of Climate Change Climate change has affected many aspects of our planet. One aspect that has been greatly affected by climate change is the weather. In Romania, for instance, extreme weather events have multiplied since 2002. Burghila et al. stated in “Climate Change Effe cts- Where to Next?”,  that the country’s 2007 drought was the severest in 60 years ( 408). B y increasing the concentration of the greenhouse gases, we are increasing the amount of heat that is in our atmosphere (NASA). Hurricanes


In [73]:
print(len(docs))

4


In [51]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

In [52]:
prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Answer:"""
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

In [61]:
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT}
)

In [None]:
query = "What are the reasons for climate changes?"
result = qa({"query": query})

print("Answer:", result["result"])
print("\nSource Documents:")
for doc in result["source_documents"]:
    print(doc.page_content[:200] + "...")  # Print first 200 characters of each source