# General Imports

In [128]:
import os
import time
import re

# General Tasks

In [4]:
from dotenv import load_dotenv
# Read the local ".env" file so the os.getenv lookups below can see its values.
load_dotenv(dotenv_path=".env")

# API credentials used by later cells. os.getenv returns None when a variable
# is not set, so a missing entry only surfaces when the key is first used.
google_search_project_api_key = os.getenv('google_search_api_key')
google_search_project_id = os.getenv('google_search_project_id')
google_gemini_api_key = os.getenv('google_gemini_api_key')


# LLM

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Gemini chat model client used by the cells below, authenticated with the key
# read from the environment.
llm = ChatGoogleGenerativeAI(
        model='gemini-1.5-pro',
        temperature=0.9,  # NOTE(review): fairly high for short factual answers — confirm this is intended
        google_api_key=google_gemini_api_key,
    )

In [11]:
# Smoke-test the model with a sample nutrition question and print the text of its reply.
response = llm.invoke('how many calories are in 100 grams of wheat flour roti in asia?(Give me short and to the point answer only. do not add markdowns, etc)')
print(response.content)

297 calories 



# Embeddings

In [23]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# Embedding model "all-MiniLM-L12-v2", constructed with "temp" as its cache folder.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L12-v2", cache_folder="temp")

In [None]:
# Quick check of the embedding model: embed a single sample sentence.
embeddings.embed_query("what is your name, guru g?")

# Vector DB

In [14]:
from langchain_community.vectorstores import FAISS


# Search Engine

In [84]:
import requests
def googlesearch_results(query:str, number_of_results:int=10):
    """
    Search the web with the Google Custom Search JSON API and return the result links

    INPUT:
        query: search text
        number_of_results: how many results to request (sent as the API's `num` parameter)
    OUTPUT:
        list of result urls (empty when the response contains no 'items')
    RAISES:
        requests.HTTPError: when the API answers with an error status
    """
    # Let requests build the query string so spaces, '?', '&', etc. in the
    # query are percent-encoded instead of corrupting the URL.
    search_params = {
        "key": google_search_project_api_key,
        "cx": google_search_project_id,
        "q": query,
        "num": number_of_results,
        # Country / language restrictions: Pakistan results, English interface and pages.
        "gl": "pk",
        "cr": "countryPK",
        "hl": "en",
        "lr": "lang_en",
    }
    response = requests.get(
        "https://www.googleapis.com/customsearch/v1",
        params=search_params,
        timeout=30,  # upper bound so a stalled connection cannot hang the notebook cell
    )
    # Surface quota / auth / bad-request failures as an HTTP error instead of a
    # confusing KeyError on the payload below.
    response.raise_for_status()
    # 'items' is absent from the payload when the search has no hits.
    return [item['link'] for item in response.json().get('items', [])]


from duckduckgo_search import DDGS
def duckduckgo_results(query:str, num_results:int=10):
    """
    Run a DuckDuckGo text search and collect the result urls

    INPUT:
        query: search text (" filetype:html" is appended before searching)
        num_results: value passed to the search as max_results
    OUTPUT:
        list with the "href" value of every returned result
    """
    search_terms = query+" filetype:html"
    hits = DDGS().text(search_terms, max_results=num_results, region="pk")
    return [hit["href"] for hit in hits]

# from googlesearch import search
# def googlesearch_results(query:str, num_results:int=10):
#     result_obj = search(
#         query, num_results=num_results,
#         lang="en"
#     )
#  
#     result_list = []
#     for i in result_obj:
#         result_list.append(i)
#     return result_list

In [122]:
# Try the Google search helper on the sample question, asking for 4 links.
# NOTE(review): the query text ends with a stray ':' — confirm whether it is intended.
googlesearch_results("how many calories are in 100 grams of wheat flour roti in asia?:", 4)

['https://www.pc.gov.pk/uploads/report/Pakistan_Dietary_Nutrition_2019.pdf',
 'https://www.hofo.pk/buckwheat-101-nutrition-facts-and-health-benefits/',
 'https://himalayanchef.pk/blogs/blog/myths-about-rice',
 'https://www.foodnerd.pk/blogs/calories-in-roti']

In [125]:
# Run the same sample question through the DuckDuckGo helper, asking for 6 links.
duckduckgo_results("how many calories are in 100 grams of wheat flour roti in asia?", 6)

['https://rotimatic.com/blogs/roti/calorie-of-roti',
 'https://www.irastoworldhealth.com/nutrition/calories-roti',
 'https://foodstruct.com/food/chapati',
 'https://www.fatsecret.com/calories-nutrition/generic/roti?portionid=333753&portionamount=100.000',
 'https://www.jcookingodyssey.com/whole-wheat-roti/',
 'https://www.livestrong.com/article/305496-the-calories-in-roti/']

# Text Loading

In [126]:
from langchain.document_loaders import WebBaseLoader



In [132]:
def load_webpages(website_url:list):
    """
    Fetch a set of web pages through langchain's WebBaseLoader

    INPUT:
        website_url: list of urls to load
    OUTPUT:
        whatever WebBaseLoader(...).load() returns for those urls
        (langchain document objects)
    """
    return WebBaseLoader(website_url).load()


def text_cleaner(text:str):
    """
    Strip html tags and normalise whitespace in a piece of text
    INPUT:
        text: string
    OUTPUT:
        cleaned_text: string with tags removed, non-breaking spaces / carriage
            returns / tabs turned into spaces, and every run of two or more
            whitespace characters (blank lines included) collapsed to one space
    """
    # Drop tags first: removing them after the whitespace pass would leave the
    # spaces on either side of a tag behind as a fresh double space.
    cleaned_text = re.sub(r"<[^>]+>", "", text)
    cleaned_text = re.sub(r"\xa0|\r|\t", " ", cleaned_text)
    # \s also matches newlines, so stacked blank lines collapse here as well;
    # a lone newline between two words is kept.
    cleaned_text = re.sub(r"\s{2,}", " ", cleaned_text)
    return cleaned_text

def langchain_document_cleaner(document_obj):
    """
    Run text_cleaner over the page_content of every document, in place
    INPUT:
        document_obj: list of langchain document objects
    OUTPUT:
        the same list, with each document's page_content replaced by its cleaned text
    """
    for doc in document_obj:
        doc.page_content = text_cleaner(doc.page_content)
    return document_obj

In [138]:
# Load the pages behind the top 4 DuckDuckGo hits for the sample question.
documents = load_webpages(duckduckgo_results("how many calories are in 100 grams of wheat flour roti in asia?", 4))
# Clean every document's page_content; the cleaner updates the documents in place
# and returns the same list, which is rebound to `documents`.
documents = langchain_document_cleaner(documents)

In [139]:
# Inspect the cleaned text of the second loaded document.
print(documents[1].page_content)

 How Many Calories in a Roti/Chapati? Nutrition Facts & More - Rotimatic Cart 0
items Shop All Rotimatic Machine Accessories TheGoodRoti Mixes (All-new Range) Gift Rotimatic Reviews Online Reviews Website Reviews Learn Technology Recipes FAQs Blog Support Warranty Track Your Order User Manual Seeking Support Contact Us About Us Request a Demo Shop The Rotimatic Cart 0
items Shop All Shop All Menu Rotimatic Machine Accessories TheGoodRoti Mixes (All-new Range) Gift Rotimatic Reviews Reviews Menu Online Reviews Website Reviews Learn Learn Menu Technology Recipes FAQs Blog Support Support Menu Warranty Track Your Order User Manual Seeking Support Contact Us About Us Log In Skip to content Just added to your cart Qty: View cart () Continue shopping Calories in Roti/Chapati & Other Nutrition Facts
by Rotimatic Team August 10, 2018 Humble little discs that form a staple in every thali or meal, the roti is perhaps the unsung hero of the Indian gastronomic experience. Call it by any name, the 

In [140]:
# List the URLs DuckDuckGo returns for the same query when asked for 4 results.
duckduckgo_results("how many calories are in 100 grams of wheat flour roti in asia?", 4)

['https://rotimatic.com/blogs/roti/calorie-of-roti',
 'https://www.fatsecret.com/calories-nutrition/generic/roti?portionid=333753&portionamount=100.000',
 'https://foodstruct.com/food/chapati',
 'https://www.fatsecret.co.in/calories-nutrition/generic/roti?portionid=333753&portionamount=100.000']