## Import Packages
Packages and helper functions

In [3]:
import google.generativeai as genai
import wikipedia as wiki
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from pathlib import Path
import os
import textwrap
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from IPython.display import Markdown


def to_markdown(text):
    """Wrap model output in a Markdown blockquote for notebook display.

    Every "•" bullet is rewritten as "  *", then each line (blank lines
    included) is prefixed with "> " before being handed to `Markdown`.
    """
    bulleted = text.replace("•", "  *")
    quoted = textwrap.indent(bulleted, "> ", predicate=lambda _line: True)
    return Markdown(quoted)

## Get API Key
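Read the Gemini (Google) API key from the `GOOGLE_API_KEY` environment variable (for example, exported from a `.env` file before the kernel is started).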

In [4]:
# The key is looked up in the process environment; the lookup yields None when
# GOOGLE_API_KEY is not set.
api = os.environ.get("GOOGLE_API_KEY")
genai.configure(api_key=api)

# Shared Gemini model handle used by the cells below.
model = genai.GenerativeModel("gemini-1.5-flash")

### Example
An example use case

In [5]:
%%time
# Fetch the article summary first: nesting a double-quoted string inside a
# double-quoted f-string only parses on Python 3.12+, so the call is kept out
# of the f-string to run on older interpreters as well.
falcon_summary = wiki.summary("Peregrine Falcon", sentences=20)
response = model.generate_content(f"Summarize the following text in 40 words: {falcon_summary}")
to_markdown(response.text)

CPU times: total: 93.8 ms
Wall time: 4.63 s


> Peregrine falcons, the world's fastest animals, are large, cosmopolitan birds of prey with distinctive blue-grey and white plumage.  Reaching speeds exceeding 320 km/h in dives, they inhabit diverse regions globally, except for extreme polar areas, high mountains, and most rainforests.  Females are larger than males.


## Custom Data
Load the text of the bird encyclopedia PDFs stored in the `Data` folder.

### PDF -> Vectors -> Database

In [8]:
data_folder = "Data"

# os.listdir returns entries in arbitrary order, so sort to keep the
# file1..fileN numbering stable between runs and machines.
files = sorted(
    os.path.join(data_folder, name)
    for name in os.listdir(data_folder)
    if os.path.isfile(os.path.join(data_folder, name))
)

# Later cells refer to file1, file2, ... by name, so those globals are still
# published here; `files` holds the same paths as a list.
for i, file in enumerate(files, start=1):
    globals()[f"file{i}"] = file
    print(f"file{i}: {file}")

file1: Data\Animals- A Visual Encyclopedia - DK - Birds section.pdf
file2: Data\Book of Indian Birds.pdf
file3: Data\Encyclopedia of birds - International Masters Publishing_2007.pdf
file4: Data\Illustrated Encyclopedia of Birds - DK.pdf


In [13]:
# Keep only regular files with a .pdf extension so the PDF reader is never
# handed a sub-directory or an unrelated file. Sort because os.listdir order is
# arbitrary and the text1..textN keys built from this list depend on position.
pdf_files = sorted(
    os.path.join(data_folder, name)
    for name in os.listdir(data_folder)
    if name.lower().endswith(".pdf")
    and os.path.isfile(os.path.join(data_folder, name))
)

In [16]:
data_pdf = {}

# Store each document's text under text1, text2, ... following pdf_files order.
for i, file_path in enumerate(pdf_files, start=1):
    with open(file_path, "rb") as pdf_handle:
        reader = PdfReader(pdf_handle)
        # Each page's text is preceded by a newline, then all pages are joined.
        data_pdf[f"text{i}"] = "".join(
            "\n" + page.extract_text() for page in reader.pages
        )

In [21]:
def _read_pdf_text(path):
    """Return the text of every page of the PDF at `path`.

    Each page's extracted text is preceded by a newline, and the pages are
    concatenated in document order.
    """
    with open(path, "rb") as handle:
        reader = PdfReader(handle)
        return "".join("\n" + page.extract_text() for page in reader.pages)


# One helper call per file instead of four copy-pasted loops. Each name is
# assigned exactly once, so text4 is always defined and text3 keeps the third
# document's text.
text1 = _read_pdf_text(file1)
text2 = _read_pdf_text(file2)
text3 = _read_pdf_text(file3)
text4 = _read_pdf_text(file4)

In [18]:
def get_chunks(text: str):
    """Split `text` into overlapping pieces ready for embedding.

    Uses a RecursiveCharacterTextSplitter configured with chunk_size=2000 and
    chunk_overlap=200 and returns the result of its `split_text` call.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=200,
    ).split_text(text)

def get_vectorstore(chunks):
    """Embed `chunks` and add them to the FAISS index stored in ``faiss_index``.

    If the ``faiss_index`` folder already exists, the saved index is loaded and
    the new chunks are appended to it; otherwise a new index is built from the
    chunks. Either way the result is written back to ``faiss_index``.

    Args:
        chunks: Text chunks to embed, e.g. the output of ``get_chunks``.

    Returns:
        The FAISS vector store that was saved, or ``None`` when ``chunks`` is
        empty and nothing was embedded or written.

    Note:
        Every call appends to the saved index, so running this twice with the
        same chunks stores them twice.
    """
    if not chunks:
        # Nothing to embed (e.g. a document that produced no chunks): skip the
        # embedding call instead of asking FAISS to build an index from nothing.
        return None

    index_dir = "faiss_index"
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

    if Path(index_dir).exists():
        # NOTE(security): allow_dangerous_deserialization=True lets load_local
        # deserialize the stored index; only point this at an index folder that
        # this notebook created itself.
        vectorstore = FAISS.load_local(
            index_dir,
            embeddings=embeddings,
            allow_dangerous_deserialization=True,
        )
        vectorstore.add_texts(chunks)
    else:
        vectorstore = FAISS.from_texts(chunks, embedding=embeddings)

    vectorstore.save_local(index_dir)
    return vectorstore

In [19]:
# Chunk every extracted document (text1..textN) and fold it into the on-disk
# FAISS index, one document at a time.
for doc_number in range(1, len(data_pdf) + 1):
    pdf_chunks = get_chunks(data_pdf[f"text{doc_number}"])
    get_vectorstore(pdf_chunks)

## Prompting
Prepare the prompt template and the function that answers a user's query, followed by an example.

In [20]:
# Prompt text for the bird chatbot. `{content}` and `{bird}` are the template
# placeholders; `user_input` fills them via `PromptTemplate.format`.
prompt_template = """You are a knowledgeable bird encyclopedia chatbot. 
Use {content} (from a PDF encyclopedia) and Wikipedia API to provide accurate information about the bird "{bird}". 
If no reliable information is found, respond with "No information is available about {bird}." 

Include the following details: (if available otherwise skip)
- Scientific name 
- Physical description
- Length
- Wingspan
- Weight
- Lifespan
- Habitat
- Diet
- Behavior
- Distribution
- Conservation status
- Interesting fact

Ensure responses are concise, factual, and sourced from given data. If {bird} is fictional or misspelled, clarify that no data exists.
Note: The response should not write the source or document ID, if some information is not available, just skip it."""

In [21]:
def user_input(bird: str):
    """Ask the Gemini model about `bird`, grounded in the PDF index and Wikipedia.

    The saved ``faiss_index`` is searched for passages similar to `bird`, the
    Wikipedia page content for `bird` is added, and the combined text is
    placed in the ``{content}`` slot of `prompt_template`.

    Args:
        bird: Name of the bird to look up.

    Returns:
        The response object returned by ``model.generate_content``.

    Raises:
        Any exception raised by ``wiki.page(bird)`` (presumably for an unknown
        or ambiguous title, to be confirmed against the wikipedia package)
        propagates unchanged.
    """
    prompt = PromptTemplate(template=prompt_template, input_variables=["content", "bird"])
    database = FAISS.load_local(
        "faiss_index",
        GoogleGenerativeAIEmbeddings(model="models/embedding-001"),
        allow_dangerous_deserialization=True,
    )

    # list.append returns None, so the context is assembled in separate steps
    # rather than chained onto similarity_search(...); chaining would hand the
    # prompt the literal text "None" instead of the retrieved passages.
    passages = [doc.page_content for doc in database.similarity_search(bird)]
    passages.append(wiki.page(bird).content)
    encyclopedia_info = "\n\n".join(passages)

    response = model.generate_content(prompt.format(content=encyclopedia_info, bird=bird))
    return response

In [23]:
%%time
# End-to-end example: query one bird and render the model's answer.
output = user_input(bird="Golden Eagle")
to_markdown(output.text)

CPU times: total: 93.8 ms
Wall time: 5.59 s


> Scientific name: *Aquila chrysaetos*
> 
> Physical description: Large, powerful bird of prey with dark brown body, golden-brown head and neck.
> 
> Length: 76-102 cm
> 
> Wingspan: 1.8-2.3 m
> 
> Weight: 3.8-6.5 kg
> 
> Lifespan: 20-40 years in the wild
> 
> Habitat: Mountainous regions, open woodlands, and grasslands.
> 
> Diet: Primarily mammals such as rabbits, marmots, and ground squirrels; also birds, reptiles, and carrion.
> 
> Behavior:  Solitary or paired birds, highly territorial.  Excellent fliers, known for their powerful talons and hunting prowess.
> 
> Distribution:  Northern Hemisphere, including North America, Europe, and Asia.
> 
> Conservation status: Least Concern (though populations can vary regionally)
> 
> Interesting fact: Golden eagles have exceptionally sharp eyesight, enabling them to spot prey from great distances.
