# General Imports

In [1]:
import os
import time
import re
import warnings
import gc

gc.collect()
warnings.filterwarnings('ignore')

# General Tasks

In [2]:
from dotenv import load_dotenv
load_dotenv(dotenv_path=".env")

google_search_project_api_key = os.getenv('google_search_api_key')
google_search_project_id = os.getenv('google_search_project_id')
google_gemini_api_key = os.getenv('google_gemini_api_key')


# LLM

In [3]:
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(
        model='gemini-1.5-flash',
        temperature=0.9,
        google_api_key=google_gemini_api_key,
    )

gc.collect()

20

In [4]:
response = llm.invoke('how many calories are in 100 grams of wheat flour roti in asia?(Give me short and to the point answer only. do not add markdowns, etc)')
print(response.content)

Approximately 120-130 calories. 



# Search Engine

In [5]:
import requests
def googlesearch_results(query:str, number_of_results:int=10):
    query = "how many calories are in 100 grams of wheat flour roti in asia?"
    google_search_url_template = f"https://www.googleapis.com/customsearch/v1?key={google_search_project_api_key}&cx={google_search_project_id}&q={query}&num={number_of_results}&gl=pk&cr=countryPK&hl=en&lr=lang_en"
    response = requests.get(google_search_url_template)
    results = []
    for item in response.json()['items']:
        results.append(item['link'])
    return results


from duckduckgo_search import DDGS
def duckduckgo_results(query:str, num_results:int=10):
    results = DDGS().text(
        query+" filetype:html",
        max_results=num_results, region="pk"
    )
    result_list = []
    for result in results:
        result_list.append(result["href"])
    return result_list

# from googlesearch import search
# def googlesearch_results(query:str, num_results:int=10):
#     result_obj = search(
#         query, num_results=num_results,
#         lang="en"
#     )
#  
#     result_list = []
#     for i in result_obj:
#         result_list.append(i)
#     return result_list

gc.collect()

0

In [6]:
googlesearch_results("how many calories are in 100 grams of wheat flour roti in asia?:", 4)

['https://www.pc.gov.pk/uploads/report/Pakistan_Dietary_Nutrition_2019.pdf',
 'https://www.hofo.pk/buckwheat-101-nutrition-facts-and-health-benefits/',
 'https://himalayanchef.pk/blogs/blog/myths-about-rice',
 'https://www.foodnerd.pk/blogs/calories-in-roti']

In [7]:
duckduckgo_results("how many calories are in 100 grams of wheat flour roti in asia?", 6)

['https://rotimatic.com/blogs/roti/calorie-of-roti',
 'https://www.fatsecret.co.in/calories-nutrition/generic/roti?portionid=333753&portionamount=100.000',
 'https://foodstruct.com/food/chapati',
 'https://www.fatsecret.com/calories-nutrition/generic/roti?portionid=333753&portionamount=100.000',
 'https://www.livestrong.com/article/305496-the-calories-in-roti/',
 'https://redcliffelabs.com/myhealth/food-and-nutrition/food-for-overall-health/roti-nutrition-facts-how-many-calories-in-one-roti-health-benefits-advantages/']

# Text Loading

In [None]:
from langchain.document_loaders import WebBaseLoader

In [9]:
def load_webpages(website_url:list):
    """
    Load webpages from a list of urls

    INPUT:
        website_url: list of urls
    OUTPUT:
        langchain document object
    """
    loader = WebBaseLoader(website_url)
    return loader.load()

def text_cleaner(text:str):
    """
    Clean text from html tags, extra spaces, newlines, etc
    INPUT:
        text: string
    OUTPUT:
        cleaned_text: string
    """
    cleaned_text = re.sub(r"\n{3,}", "\n\n", text)
    cleaned_text = re.sub(r"\xa0|\r|\t", " ", cleaned_text)
    # cleaned_text = re.sub(r"\s+", " ", cleaned_text)
    cleaned_text = re.sub(r"\s{2,}", " ", cleaned_text)
    cleaned_text = re.sub(r"<[^>]+>", "", cleaned_text)
    return cleaned_text

def langchain_document_cleaner(document_obj):
    """
    Clean page_content of langchain document object
    INPUT:
        document: langchain document object
    OUTPUT:
        cleaned_document: langchain document object
    """
    for i in range(len(document_obj)):
        document_obj[i].page_content = text_cleaner(document_obj[i].page_content)
    return document_obj

def loader_with_cleaner(website_url:list):
    """
    Load webpages from a list of urls and clean them
    INPUT:
        website_url: list of urls
    OUTPUT:
        langchain document object
    """
    return langchain_document_cleaner(load_webpages(website_url))


In [11]:
# documents = load_webpages(duckduckgo_results("how many calories are in 100 grams of wheat flour roti in asia?", 10))
# documents = langchain_document_cleaner(documents)
# gc.collect()

documents = loader_with_cleaner(duckduckgo_results("how many calories are in 100 grams of wheat flour roti in asia?", 5))
documents

[Document(page_content=" How Many Calories in a Roti/Chapati? Nutrition Facts & More - Rotimatic Cart 0\nitems Shop All Rotimatic Machine Accessories TheGoodRoti Mixes (All-new Range) Gift Rotimatic Reviews Online Reviews Website Reviews Learn Technology Recipes FAQs Blog Support Warranty Track Your Order User Manual Seeking Support Contact Us About Us Request a Demo Shop The Rotimatic Cart 0\nitems Shop All Shop All Menu Rotimatic Machine Accessories TheGoodRoti Mixes (All-new Range) Gift Rotimatic Reviews Reviews Menu Online Reviews Website Reviews Learn Learn Menu Technology Recipes FAQs Blog Support Support Menu Warranty Track Your Order User Manual Seeking Support Contact Us About Us Log In Skip to content Just added to your cart Qty: View cart () Continue shopping Calories in Roti/Chapati & Other Nutrition Facts\nby Rotimatic Team August 10, 2018 Humble little discs that form a staple in every thali or meal, the roti is perhaps the unsung hero of the Indian gastronomic experience

# Creating Chunks

In [12]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# all-MiniLM-L12-v2 model has a limit of 256 words. ==> 4 x 256 = 1024 characters max
def splitter(documents, chunk_size:int=900, chunk_overlap:int=200):
    """
    Get chunks of text from the documents
    INPUT:
        documents: langchain document objects
        chunk_size: size of each chunk
        chunk_overlap: overlap between chunks
    OUTPUT:
        chunks: langchain document objects
    """
    s = RecursiveCharacterTextSplitter(chunk_size=900, chunk_overlap=200)
    chunks = s.split_documents(documents)
    return chunks

In [13]:
chunks = splitter(documents)

# Embeddings

In [14]:
from langchain_community.embeddings import HuggingFaceEmbeddings
# embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L12-v2", cache_folder="temp")
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2", cache_folder="temp")

gc.collect()

embeddings.embed_query("Warm Up")

[-0.06030871719121933,
 0.023757167160511017,
 0.03897421434521675,
 0.05911151319742203,
 0.045805588364601135,
 -0.00245306477881968,
 0.01700669713318348,
 -0.03242778405547142,
 -0.025306468829512596,
 0.023358669131994247,
 -0.10341684520244598,
 -0.029500722885131836,
 0.05319252610206604,
 0.0054579307325184345,
 0.09033812582492828,
 0.04292973130941391,
 -0.036903031170368195,
 0.038174521178007126,
 -0.04131007939577103,
 -0.003640115959569812,
 -0.09145521372556686,
 0.0248503927141428,
 -0.023028353229165077,
 0.04863588884472847,
 -0.00832334067672491,
 0.048258136957883835,
 0.046705104410648346,
 0.0513889379799366,
 -0.029691914096474648,
 0.042939312756061554,
 -0.09940437972545624,
 -0.03028726577758789,
 0.013974303379654884,
 -0.07470056414604187,
 0.007879376411437988,
 0.07336735725402832,
 -0.021044546738266945,
 -0.03558522090315819,
 -0.09405802935361862,
 0.014925166964530945,
 0.04721025004982948,
 -0.06556256860494614,
 -0.02081584557890892,
 -0.066652171313

# Vector DB

In [15]:
from langchain_community.vectorstores import FAISS

def get_vDB(chunks, embeddings, folder_path:str=None, index_name:str=None):
    '''
    Get vector database from chunks
    INPUT:
        chunks: langchain document objects
        embeddings: langchain embeddings object
        folder_path: path to save the index
        index_name: name of the index
    OUTPUT:
        vector database
    '''
    db = FAISS.from_documents(chunks, embeddings)
    if folder_path and index_name:
        db.save_local(folder_path=folder_path, index_name=index_name)
        print(f"Index saved successfully at {folder_path}/{index_name}")
    return db

In [16]:
db = get_vDB(chunks, embeddings)

In [17]:
db.similarity_search("how many calories are in 100 grams of wheat flour roti in asia?", k=2)

[Document(page_content='recipes and differences in manufacturing. Advertisement Tip According to the U.S. Department of Agriculture (USDA), there are 297 calories in a 100-gram serving, or about 3.5 ounces, of plain roti. This can vary considerably, though. Video of the Day Roti Nutrition Facts The calories in roti can vary based on what it is made with. The USDA indicates that a single piece of plain roti weighing 68 grams offers 202 calories, while a 100-gram serving offers 297. However, it does not specify what that hypothetical roti is made of, in terms of ingredients. Other varieties of roti listed have different calorie counts. Video of the Day For example, another USDA listing for "bread, chappatti or roti (Indian bread), wheat" indicates that 100 grams of roti has 299 calories. An 8-inch, or 52-gram piece has 155 calories. Ultimately, the difference comes down to slight variations in manufacturers and', metadata={'source': 'https://www.livestrong.com/article/305496-the-calories

# Main Flow

In [18]:
from langchain_core.prompts import PromptTemplate

prompt_template = PromptTemplate.from_template(
    '''You are an expert content writer and RAG specialist. Your task is to:
1. Rephrase the user query into list of concise search string optimized for search engines. Use multiple search queries if needed. and return as a list of strings.
2. "Use general values in search queries which have higher chances of existance on search engines, not specific numbers which is difficult to find (Example: if user asks for finding price of 340 grams of potatos, then you can search for general/standard/easy to find value such as 100 grams or 1 kilogram). 
3. Then calculate the result for specific numbers from the general/standard/easy to find value from search results. (Like instead of searching for "calories in 340 grams of wheat flour roti in asia", search for "calories in 100 grams of wheat flour roti in asia" and then calculate the value for 500 grams using values fom 100 grams)
4. Maximum number of search queries should be 4.
5. *Always give answer as a valid JSON object with keys: "original_query", "search_query_list".*

Example:
User Query: "I'm feeling tired all the time. What foods can help?"
Answer:
{{
    "original_query": "how many calories are in 500 grams of wheat flour roti in asia?",
    "search_query": ["calories in 100g wheat flour roti in aisa",
                      "calories in 1kg wheat flour roti in asia",
                      "calories in wheat flour roti in asia"],
}}

User Query: "{user_query}"
Answer:

    '''
)

# prompt = prompt_template.format(user_query=user_query)

In [19]:
import json
def string_to_json(string):
    try:
        string = string.replace('```json', '').replace('```', '').strip()
        json_obj = json.loads(string)
        return json_obj
    except json.JSONDecodeError:
        print("Invalid JSON string")
        return None

In [24]:
user_query = "how many calories in 300 grams of beaf cury"

prompt = prompt_template.format(user_query=user_query)
llm_res = llm.invoke(prompt)
json_res = string_to_json(llm_res.content)

In [25]:
json_res

{'original_query': 'how many calories in 300 grams of beaf cury',
 'search_query_list': ['calories in 100 grams beef curry',
  'calories in beef curry recipe',
  'calories in beef curry']}

In [31]:
llm_res.usage_metadata

{'input_tokens': 374, 'output_tokens': 74, 'total_tokens': 448}

In [35]:
similar_docs = db.similarity_search(json_res['vector_store_query'], k=5)
similar_docs

In [37]:
def documents_to_contextText(documents):
    context_text = ""
    for doc in documents:
        context_text += f"CONTENT:\n'{doc.page_content}'\n\n"
    return context_text

In [38]:
context = documents_to_contextText(similar_docs)

In [39]:
prompt_template_2 = PromptTemplate.from_template(
    ''' Your are a Nutrition Expert. Your task is to:
Give ansswers to the user query in a concise and informative manner. Only Give the answer. Dot not add any markdowns, etc.
Give answer from the given context only. Do not add any extra information. 
*Use maths and calculations to calculate the results if needed.*

##### Context #######
    "{context}"



#### User Query #####
User Query: "{user_query}"
Answer:
    '''
)

prompt_2 = prompt_template_2.format(context=context ,user_query=user_query)

In [40]:
llm_res_2 = llm.invoke(prompt_2)
llm_res_2.content.strip()

# Main Work Pipline

In [45]:
user_query = "is there any chance of increase in the price of stocks of Nvidia in the next 6 months? Also tell the estimated chances of increase in the price of stocks of Nvidia in the next 6 months."
prompt_1 = prompt_template.format(user_query=user_query)
llm_res_1 = string_to_json(llm.invoke(prompt_1).content)

# chunks = splitter(loader_with_cleaner(duckduckgo_results(llm_res_1["search_query"], 5)))
# db = get_vDB(chunks, embeddings)

similar_chunks = db.similarity_search(llm_res_1["vector_store_query"], k=5)
context = documents_to_contextText(similar_chunks)

prompt_2 = prompt_template_2.format(context=context, user_query=user_query)
llm_res_2 = llm.invoke(prompt_2)

llm_res_2.content.strip()

'This document does not contain information about stock prices or investment advice.'

In [30]:
gc.collect()

33728

In [122]:
print(context)

CONTENT:
'Estimate: A typical whole wheat roti, which is about 6-7 inches wide, contains roughly 70-80 calories.Variation: The calorie count may vary slightly depending on factors like how thick it is and how it’s cooked.Factors that Affect Roti CaloriesType of Flour:Whole Wheat Roti: This is the most common and healthy option.Multigrain Roti: It’s made from a mix of flours, which can affect its calorie content.Other Flour Types: Using different flours like millet or barley can change the calorie count.Additions During Cooking:Oil or Ghee: Putting oil or ghee while cooking can add calories.Butter or Ghee Topping: Spreading butter or ghee on top adds more calories.Size and Thickness:Larger or Thicker Rotis: If your rotis are bigger or thicker than usual, they’ll have more calories.What You Eat with Roti MattersCurries: The type of curry or side dish you have with roti can vary in'

CONTENT:
'made from whole wheat flour can vary depending on the exact size and thickness, but as a general