# General Imports

In [1]:
import os
import time
import re
import warnings
import gc

# Run a garbage-collection pass up front (the return value is not used).
gc.collect()
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported further down — confirm that is intended.
warnings.filterwarnings('ignore')

# General Tasks

In [2]:
from dotenv import load_dotenv
# Point python-dotenv at the local ".env" file so its entries are available via os.getenv.
load_dotenv(dotenv_path=".env")

# Credentials used by the search and LLM cells below.
# os.getenv returns None when a variable is not set, so a missing entry is NOT reported here.
# Note: the API key variable is read from 'google_search_api_key' (no "project" in the env name).
google_search_project_api_key = os.getenv('google_search_api_key')
google_search_project_id = os.getenv('google_search_project_id')
google_gemini_api_key = os.getenv('google_gemini_api_key')


# LLM

In [77]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Chat model object shared by every llm.invoke(...) call in this notebook.
# NOTE(review): temperature=0.9 is a fairly high sampling temperature for
# factual question answering — confirm this is intentional.
llm = ChatGoogleGenerativeAI(
        model='gemini-1.5-flash',
        temperature=0.9,
        google_api_key=google_gemini_api_key,
    )

gc.collect()

148517

In [78]:
# Ask the model directly: no retrieved context is passed in this call.
response = llm.invoke('how many calories are in 100 grams of wheat flour roti in asia?(Give me short and to the point answer only. do not add markdowns, etc)')
print(response.content)

Approximately 270-300 calories. 



# Search Engine

In [5]:
import requests
def googlesearch_results(query:str, number_of_results:int=10):
    """
    Get result links from the Google Custom Search JSON API
    INPUT:
        query: search string
        number_of_results: number of links to request
    OUTPUT:
        results: list of result urls (empty list when the response has no "items")
    """
    # Let requests build the query string so spaces, '?', '&', etc. in the
    # user query are percent-encoded instead of corrupting the URL.
    params = {
        "key": google_search_project_api_key,
        "cx": google_search_project_id,
        "q": query,
        "num": number_of_results,
        "gl": "pk",
        "cr": "countryPK",
        "hl": "en",
        "lr": "lang_en",
    }
    response = requests.get(
        "https://www.googleapis.com/customsearch/v1",
        params=params,
        timeout=30,  # do not hang the notebook forever on a stalled connection
    )
    # Surface HTTP errors (bad key, quota exceeded, ...) here rather than
    # failing later with an unrelated KeyError.
    response.raise_for_status()
    # "items" may be missing from the payload (e.g. a search with no hits).
    return [item['link'] for item in response.json().get('items', [])]


from duckduckgo_search import DDGS
def duckduckgo_results(query:str, num_results:int=10):
    """
    Get result links from a DuckDuckGo text search
    INPUT:
        query: search string (" filetype:html" is appended before searching)
        num_results: value passed to the search as max_results
    OUTPUT:
        list with the "href" value of every hit
    """
    html_only_query = query + " filetype:html"
    hits = DDGS().text(html_only_query, max_results=num_results, region="pk")
    return [hit["href"] for hit in hits]

# from googlesearch import search
# def googlesearch_results(query:str, num_results:int=10):
#     result_obj = search(
#         query, num_results=num_results,
#         lang="en"
#     )
#  
#     result_list = []
#     for i in result_obj:
#         result_list.append(i)
#     return result_list

gc.collect()

0

In [6]:
googlesearch_results("how many calories are in 100 grams of wheat flour roti in asia?:", 4)

['https://www.pc.gov.pk/uploads/report/Pakistan_Dietary_Nutrition_2019.pdf',
 'https://www.hofo.pk/buckwheat-101-nutrition-facts-and-health-benefits/',
 'https://himalayanchef.pk/blogs/blog/myths-about-rice',
 'https://www.foodnerd.pk/blogs/calories-in-roti']

In [7]:
duckduckgo_results("how many calories are in 100 grams of wheat flour roti in asia?", 6)

['https://rotimatic.com/blogs/roti/calorie-of-roti',
 'https://foodstruct.com/food/chapati',
 'https://www.irastoworldhealth.com/nutrition/calories-roti',
 'https://www.fatsecret.com/calories-nutrition/generic/roti?portionid=333753&portionamount=100.000',
 'https://www.fatsecret.co.in/calories-nutrition/generic/roti?portionid=333753&portionamount=100.000',
 'https://redcliffelabs.com/myhealth/food-and-nutrition/food-for-overall-health/roti-nutrition-facts-how-many-calories-in-one-roti-health-benefits-advantages/']

# Text Loading

In [8]:
from langchain.document_loaders import WebBaseLoader



In [9]:
def load_webpages(website_url:list):
    """
    Fetch the given webpages through WebBaseLoader

    INPUT:
        website_url: list of urls
    OUTPUT:
        the result of WebBaseLoader(website_url).load() (langchain document objects)
    """
    return WebBaseLoader(website_url).load()

def text_cleaner(text:str):
    """
    Clean text from html tags, extra spaces, newlines, etc
    INPUT:
        text: string
    OUTPUT:
        cleaned_text: string
    """
    # Turn non-breaking spaces, carriage returns and tabs into plain spaces
    cleaned_text = re.sub(r"\xa0|\r|\t", " ", text)
    # Strip tags BEFORE collapsing whitespace: removing a tag that sits between
    # two spaces would otherwise leave a double space in the final text
    cleaned_text = re.sub(r"<[^>]+>", "", cleaned_text)
    # Collapse every run of 2+ whitespace characters (blank lines included)
    # into one space; a lone space or newline is kept as it is. Runs of 3+
    # newlines are covered by this pattern too, so no separate newline pass
    # is needed.
    cleaned_text = re.sub(r"\s{2,}", " ", cleaned_text)
    return cleaned_text

def langchain_document_cleaner(document_obj):
    """
    Run text_cleaner over the page_content of every document, in place
    INPUT:
        document_obj: list of langchain document objects
    OUTPUT:
        the same list, with each page_content replaced by its cleaned text
    """
    for doc in document_obj:
        doc.page_content = text_cleaner(doc.page_content)
    return document_obj

def loader_with_cleaner(website_url:list):
    """
    Load the webpages for a list of urls, then clean their text
    INPUT:
        website_url: list of urls
    OUTPUT:
        the cleaned langchain document objects returned by langchain_document_cleaner
    """
    raw_documents = load_webpages(website_url)
    cleaned_documents = langchain_document_cleaner(raw_documents)
    return cleaned_documents


In [None]:
# Earlier two-step version of the same load + clean, kept commented out:
# documents = load_webpages(duckduckgo_results("how many calories are in 100 grams of wheat flour roti in asia?", 10))
# documents = langchain_document_cleaner(documents)
# gc.collect()

# Search DuckDuckGo (up to 10 links), then load and clean those pages in one call.
documents = loader_with_cleaner(duckduckgo_results("how many calories are in 100 grams of wheat flour roti in asia?", 10))
# Bare expression: displays the loaded documents as the cell output.
documents

# Creating Chunks

In [27]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# all-MiniLM-L12-v2 model has a limit of 256 words. ==> 4 x 256 = 1024 characters max
def splitter(documents, chunk_size:int=900, chunk_overlap:int=200):
    """
    Get chunks of text from the documents
    INPUT:
        documents: langchain document objects
        chunk_size: size of each chunk (default 900)
        chunk_overlap: overlap between chunks (default 200)
    OUTPUT:
        chunks: langchain document objects
    """
    # Forward the caller's sizes to the splitter instead of fixed literals,
    # so non-default chunk_size / chunk_overlap values actually take effect.
    s = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return s.split_documents(documents)

In [39]:
chunks = splitter(documents)

# Embeddings

In [41]:
from langchain_community.embeddings import HuggingFaceEmbeddings
# Embedding model object used to vectorise the chunks; "temp" is passed as the cache folder.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L12-v2", cache_folder="temp")

gc.collect()

57937

# Vector DB

In [42]:
from langchain_community.vectorstores import FAISS

def get_vDB(chunks, embeddings, folder_path:str=None, index_name:str=None):
    '''
    Build a FAISS vector database from chunks, optionally saving it to disk
    INPUT:
        chunks: langchain document objects
        embeddings: langchain embeddings object
        folder_path: path to save the index
        index_name: name of the index
    OUTPUT:
        vector database
    '''
    vector_db = FAISS.from_documents(chunks, embeddings)
    # Saving is skipped unless BOTH folder_path and index_name are truthy
    if not (folder_path and index_name):
        return vector_db
    vector_db.save_local(folder_path=folder_path, index_name=index_name)
    print(f"Index saved successfully at {folder_path}/{index_name}")
    return vector_db

In [38]:
db = get_vDB(chunks, embeddings)

In [None]:
db.similarity_search("how many calories are in 100 grams of wheat flour roti in asia?", k=2)

# Main Flow

In [16]:
from langchain_core.prompts import PromptTemplate

# Query-rewriting prompt: asks the model to turn the user query into a
# search-engine string and a vector-store query, returned as a JSON object.
# The doubled braces {{ }} around the example are escapes, so that
# {user_query} is the only placeholder filled in by .format(...).
prompt_template = PromptTemplate.from_template(
    '''You are an expert content writer and RAG specialist. Your task is to:
1. Rephrase the user query into a concise search string optimized for search engines.
2. Rephrase the query to capture the semantic intent for searching in a vector store.
3. *Always Return the result as a JSON object with keys: "original_query", "search_query", "vector_store_query".*

Example:
User Query: "I'm feeling tired all the time. What foods can help?"
Answer:
{{
    "original_query": "I'm feeling tired all the time. What foods can help?",
    "search_query": "foods to boost energy levels and reduce fatigue",
    "vector_store_query": "foods to help with fatigue"
}}

User Query: "{user_query}"
Answer:

    '''
)

# prompt = prompt_template.format(user_query=user_query)

In [17]:
import json
def string_to_json(string):
    """
    Parse the JSON value contained in an LLM reply
    INPUT:
        string: reply text; may be wrapped in ```json fences and/or surrounded by prose
    OUTPUT:
        json_obj: the parsed value, or None (after printing a message) when no valid JSON is found
    """
    # Non-string input (e.g. None) cannot be parsed; follow the same
    # "print + return None" contract instead of raising AttributeError.
    if not isinstance(string, str):
        print("Invalid JSON string")
        return None

    cleaned = string.replace('```json', '').replace('```', '').strip()
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        pass

    # Fallback: models often add text before/after the object. Try decoding
    # from each '{' or '[' in turn; raw_decode ignores whatever trails the value.
    decoder = json.JSONDecoder()
    for position, char in enumerate(cleaned):
        if char in '{[':
            try:
                json_obj, _ = decoder.raw_decode(cleaned, position)
                return json_obj
            except json.JSONDecodeError:
                continue

    print("Invalid JSON string")
    return None

In [18]:
user_query = "how many calories are in 100 grams of wheat flour roti in asia?"

# Fill the query-rewriting prompt, send it to the LLM, and parse the reply.
prompt = prompt_template.format(user_query=user_query)
llm_res = llm.invoke(prompt)
# string_to_json returns None when the reply cannot be parsed, so json_res
# may be None here and later json_res[...] lookups would then fail.
json_res = string_to_json(llm_res.content)

In [19]:
json_res

{'original_query': 'how many calories are in 100 grams of wheat flour roti in asia?',
 'search_query': 'calories in 100g wheat flour roti',
 'vector_store_query': 'calorie count for wheat roti, portion size 100g'}

In [20]:
similar_docs = db.similarity_search(json_res['vector_store_query'], k=5)

In [21]:
similar_docs

[Document(page_content='Calories in 100 g of Roti and Nutrition Facts Register | Sign In India Search: Foods Recipes Fitness Members My Fatsecret Foods Recipes Fitness Community Foods Roti Food database and calorie counter 100 g\nRoti Nutrition Facts Serving Size\n100 g per serve Energy\n1105 kj 264 kcal Fat\n1.3g Saturated Fat\n0.228g Monounsaturated Fat\n0.158g Polyunsaturated Fat\n0.552g Carbohydrates\n55.81g Sugar\n0.28g Fibre\n7.1g Protein\n9.61g Sodium\n486mg Cholesterol\n0mg Potassium\n239mg Last updated: 07 May 20 09:27 PM\nSource: FatSecret Platform API 13% of RDI*\n(264 cal) Calorie Breakdown: Carbohydrate (82%) Fat (4%) Protein (14%) *Based on an RDI of 2000 calories What is my Recommended Daily Intake? Photos View Photos Nutritional Summary: Cals\n264 Fat\n1.3g Carbs\n55.81g Prot\n9.61g There are 264 calories in 100 grams of Roti. Calorie Breakdown: 4% fat, 82% carbs, 14% prot. Common serving sizes: Serving Size', metadata={'source': 'https://www.fatsecret.co.in/calories-nu

In [49]:
def documents_to_contextText(documents):
    """
    Join the page_content of every document into one context string
    INPUT:
        documents: iterable of objects with a page_content attribute
    OUTPUT:
        context_text: one "CONTENT:" section per document, in input order
    """
    sections = [f"CONTENT:\n'{doc.page_content}'\n\n" for doc in documents]
    return "".join(sections)

In [51]:
context = documents_to_contextText(similar_docs)

In [105]:
# Answering prompt: instructs the model to answer the user query using only
# the supplied context. Placeholders: {context} and {user_query}.
# NOTE(review): the instruction text contains typos ("Your are", "ansswers",
# "Dot not"). They are part of the runtime prompt, so they are left untouched
# here — correct them deliberately if desired.
prompt_template_2 = PromptTemplate.from_template(
    ''' Your are a Nutrition Expert. Your task is to:
Give ansswers to the user query in a concise and informative manner. Only Give the answer. Dot not add any markdowns, etc.
Give answer from the given context only. Do not add any extra information. If you do not find the answer in the context, you can say 'Sorry, I don't know'. 
*Use maths and calculations if needed.*


##### Context #######
    "{context}"





#### User Query #####
User Query: "{user_query}"
Answer:
    '''
)

# Fill the template with the retrieved context and the original user query.
prompt_2 = prompt_template_2.format(context=context ,user_query=user_query)

In [64]:
llm_res_2 = llm.invoke(prompt_2)

In [66]:
llm_res_2.content.strip()

'299 calories'

# Main Work Pipeline

In [107]:
# End-to-end run of the pipeline on a new question.
user_query = "how much market value of nvidia stocks increased/decreased in today?"
# 1. Rewrite the query with the LLM and parse the JSON reply.
#    string_to_json returns None on a parse failure, in which case the
#    llm_res_1[...] lookups below would fail.
prompt_1 = prompt_template.format(user_query=user_query)
llm_res_1 = string_to_json(llm.invoke(prompt_1).content)

# 2. Search DuckDuckGo (5 results), load + clean the pages, split them into
#    chunks, and build a FAISS index over the chunks.
chunks = splitter(loader_with_cleaner(duckduckgo_results(llm_res_1["search_query"], 5)))
db = get_vDB(chunks, embeddings)

# 3. Retrieve the 5 chunks most similar to the vector-store query and format them as context.
similar_chunks = db.similarity_search(llm_res_1["vector_store_query"], k=5)
context = documents_to_contextText(similar_chunks)

# 4. Ask the LLM to answer the original query from that context.
prompt_2 = prompt_template_2.format(context=context, user_query=user_query)
llm_res_2 = llm.invoke(prompt_2)

# Bare expression: displays the stripped answer text as the cell output.
llm_res_2.content.strip()

'$2.27'

In [79]:
gc.collect()

33