In [1]:
from langgraph.graph import StateGraph, START, END
from langchain_core.tools import tool
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.documents import Document
from langchain_community.document_loaders import WebBaseLoader

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
from Tutor.Tools.OCRModel import OCRModel
from Tutor.Tools.WebSearch import WebSearch
from Tutor.Services.InputHandler import InputHandler
from Tutor.Services.Routing import Routing
from Tutor.Services.EmbeddingModel import EmbeddingModel
from Tutor.Services.VectorStore import VectorStore
from Tutor.Data.PushToDB import PushToDB
from Tutor.Services.ReasoningModel import ReasoningModel
from Tutor.Services.TeachingModel import TeachingModel

  from .autonotebook import tqdm as notebook_tqdm

For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from Tutor.Services.ReasoningModel import ReasoningModel


In [3]:
import os
from typing import Literal, Optional, TypedDict

import yaml
from IPython.display import Image, display

In [4]:
# Load the notebook's settings (model names, vector-store parameters, data folder)
# from the YAML file, resolved relative to the current working directory.
yaml_path = "params.yaml"
# Decode explicitly as UTF-8 so parsing does not depend on the platform's default locale encoding.
with open(yaml_path, encoding="utf-8") as yaml_file:
    content = yaml.safe_load(yaml_file)

In [5]:
content

{'embedding_model': 'sentence-transformers/all-MiniLM-L6-v2',
 'reasoning': {'name': 'llama-3.3-70b-versatile', 'task': 'Reasoning'},
 'teaching_model': {'name': 'deepseek-r1-distill-llama-70b',
  'task': 'Question Answering'},
 'vectorstore': {'num_results': 5, 'similarity_threshold': 0.5},
 'data': {'folder_name': 'Tutor/Data/Math/'}}

In [6]:
# Data folder path read from the `data.folder_name` entry of the loaded configuration.
DATA_DIR = content['data']['folder_name']
DATA_DIR

'Tutor/Data/Math/'

### Testing our Embedding Model

In [7]:
query = "This is a sample query"

In [8]:
documents = [
    "This is sentence 1",
    "This is sentence 2"
]

In [9]:
# Build the project's embedding wrapper using the model name from the loaded configuration.
embedding_model = EmbeddingModel(model_name = content['embedding_model'])

[2025-05-11 09:55:09,164] - 24 logger - INFO - EmbeddingModel: - Embedding model 'sentence-transformers/all-MiniLM-L6-v2' initialized successfully.


In [10]:
# Embed the sample query and print only the first five elements of the result to keep the output short.
print(embedding_model.embed_query(text=query)[:5])

[2025-05-11 09:55:09,704] - 36 logger - INFO - EmbeddingModel: - Embedding text: This is a sample query...
[2025-05-11 09:55:10,174] - 40 logger - ERROR - EmbeddingModel: - Embedding error: Expecting value: line 1 column 1 (char 0)


TutorException: Error occurred while running the python script, in file [d:\Codes\RAG Models\Math Tutor\Tutor\Services\EmbeddingModel.py], line [37]. 
Error Message: [Expecting value: line 1 column 1 (char 0)]

In [None]:
# Embed the sample documents and print the first five elements of the returned collection.
# NOTE(review): the slice applies to the returned collection, not to each embedding —
# presumably one embedding per document is returned, so with two documents both would be
# printed in full; confirm against EmbeddingModel.embed_documents.
print(embedding_model.embed_documents(texts=documents)[:5])

When the embedding API is reachable, the embedding model works for both a single query text and a list of documents.<br>
Keep in mind that the embedding model is accessed via an API, so a stable internet connection is required; without one, calls can fail (the `Expecting value` error shown above is likely such a failure).

### Test the vector store

In [None]:
# Create the vector store, passing it the embedding model name from the loaded configuration.
vector_store = VectorStore(model_name = content['embedding_model'])

 Checking if we can add data into our vector DB 

In [None]:
documents = [
    "IPL is going on and RCB is the team at the first position",
    "There has been a huge attack on the man living the next door"
]

In [None]:
# Wrap every raw string in a Document, tagging each with a placeholder source.
document_objects = [
    Document(page_content=raw_text, metadata={"source": "example"})
    for raw_text in documents
]

# Insert the wrapped documents into the vector store.
vector_store.add_documents(document_objects)

Data is added to our vector database successfully; the only requirement is that each item is a `Document` object.

Check if we are able to retrieve data from the vector database

In [None]:
# Query the vector store for "RCB", taking the similarity threshold and the
# result count from the `vectorstore` section of the loaded configuration.
results = vector_store.retrieve(
    query="RCB",
    threshold=content['vectorstore']['similarity_threshold'],
    num_results=content['vectorstore']['num_results'],
)

In [None]:
results

We can see that retrieving data from the vector database also works.

This means that our vector store is working properly and successfully.

### Testing the web search tool

In [None]:
query = "Can you calculate the differential of x^2+3x+2 at x=2?"

In [None]:
def web_search_tool(query):
    """Run a web search for ``query`` and return the search results.

    A new ``WebSearch`` instance is created on every call; its ``max_results``
    is taken from the ``vectorstore.num_results`` entry of the loaded configuration.
    """
    result_limit = content['vectorstore']['num_results']
    return WebSearch(max_results=result_limit).search(query)

In [None]:
response = web_search_tool(query)

In [None]:
response

In [None]:
web_search_tool("How many 4 digit numbers can be made with 1,2,3,4,5,6,7,8,9 such that no two consecutive digits are same?")

We can see that our web search tool is working and that it returns a dictionary. The main thing is the URL, from which we can scrape the data.

In [None]:
# Point the web loader at the URL of the first search result.
loader = WebBaseLoader(response['results'][0]['url'])

In [None]:
doc = loader.load()

In [None]:
doc

In [None]:
def ocr_tool(image):
    """Return the text that a freshly constructed ``OCRModel`` extracts from ``image``."""
    return OCRModel().get_text(image)

In [None]:
# Create the vector store used by the graph section.
# Note: this rebinds `vector_store`, replacing the instance created in the earlier test cell.
vector_store = VectorStore(model_name = content['embedding_model'])

In [None]:
# Build the input handler and the router; the router is given the vector store.
# Their methods are registered as graph nodes further down.
input_handler = InputHandler()
routing = Routing(vector_store)

In [None]:
class State(TypedDict):
    """State shared between the nodes of the graph.

    Keys:
        messages: List of messages; ``generate_response`` appends its reply to it.
        input_type: Detected kind of input, ``"text"`` or ``"image"``, or ``None``
            when not yet determined. The lowercase spellings match the values
            compared in ``generate_response`` and the keys of the conditional
            edges leaving ``DetectInputType``.
        route_type: Routing outcome, ``"Found in DB"`` or ``"Not Found in DB"``,
            or ``None`` when not yet determined. These match the keys of the
            conditional edges leaving ``router-node``.
    """
    messages: list
    # Real type expressions instead of list literals, which are not valid annotations.
    input_type: Optional[Literal["text", "image"]]
    route_type: Optional[Literal["Found in DB", "Not Found in DB"]]

In [None]:
def generate_response(state: State) -> State:
    """Append an AI reply describing how the input was interpreted.

    Reads ``state["input_type"]`` and the optional ``"ocr_result"`` entry
    (looked up with ``.get``, so it may be absent). Text input and image input
    with a non-empty OCR result each get their own reply; anything else gets
    a fallback reply.

    Returns:
        A dict with a single ``"messages"`` key holding the existing messages
        followed by the new ``AIMessage``.
    """
    kind = state["input_type"]
    ocr_text = state.get("ocr_result")

    # Start from the fallback and override it for the two recognised cases.
    reply = "I couldn't properly process your input."
    if kind == "text":
        reply = "I detected your input as text. Processing directly."
    elif kind == "image" and ocr_text:
        reply = f"I detected your input as an image. OCR extracted the following text: {ocr_text}"

    updated_messages = state["messages"] + [AIMessage(content=reply)]
    return {"messages": updated_messages}

In [None]:
# Start a graph builder whose state schema is the State TypedDict.
graph_builder = StateGraph(State)

In [None]:
# Register the graph's nodes, one call per node.
# NOTE(review): ocr_tool and web_search_tool are declared above with an `image` / `query`
# parameter; confirm they receive what they expect when invoked as graph nodes.

# Entry step: identifies the input type.
graph_builder.add_node("DetectInputType", input_handler.check_input_type)

# OCR step.
graph_builder.add_node("ocr_tool", ocr_tool)

# Routing step.
graph_builder.add_node("router-node", routing.route)

# Web search step.
graph_builder.add_node("web_search_tool", web_search_tool)

In [None]:
# Branch on the detected input type: text goes straight to the router,
# while images go through the OCR node first.
graph_builder.add_conditional_edges(
    "DetectInputType",
    lambda state: state["input_type"],
    {"text": "router-node", "image": "ocr_tool"},
)

In [None]:
# After OCR, continue to the router.
graph_builder.add_edge("ocr_tool", "router-node")

# Branch on the router's decision: end the run on "Found in DB",
# otherwise continue to the web search node.
graph_builder.add_conditional_edges(
    "router-node",
    lambda state: state["route_type"],
    {"Found in DB": END, "Not Found in DB": "web_search_tool"},
)

# The web search node leads to the end of the graph.
graph_builder.add_edge("web_search_tool", END)

In [None]:
# Start execution at the input-type detection node.
graph_builder.add_edge(START, "DetectInputType")

In [None]:
# Compile the builder (nodes and edges registered above) into the graph object.
graph = graph_builder.compile()

In [None]:
# Render the compiled graph as a Mermaid PNG and show it inline.
# NOTE(review): PNG rendering may rely on an external service depending on the draw method — confirm.
display(Image(graph.get_graph().draw_mermaid_png()))