In [1]:
%%capture
!pip install lancedb embed-anything-gpu datasets huggingface_hub smolagents

In [None]:
import os
import requests
from urllib.parse import urlparse

# PDF URLs (all under content.dgft.gov.in/Website) that are passed to
# download_files() further down in this cell.
urls = [
    "https://content.dgft.gov.in/Website/CIEP.pdf",
    "https://content.dgft.gov.in/Website/GAE.pdf",
    "https://content.dgft.gov.in/Website/HTE.pdf",
]

# A single bioRxiv full-text PDF, passed to med_files() further down in this
# cell. The name is singular but the value is a list, because the download
# helpers iterate over their argument.
med_url = ["https://www.biorxiv.org/content/10.1101/2025.01.23.634433v1.full.pdf"]

def download_files(urls, folder_name="downloaded_docs", timeout=30):
    """Download every URL in ``urls`` into ``folder_name``.

    Each file is streamed to disk in 8 KiB chunks and saved under the last
    path component of its URL. A failed request is reported on stdout and the
    loop moves on to the next URL (best effort); nothing is returned.

    Args:
        urls: Iterable of URL strings to fetch.
        folder_name: Destination directory; created (with parents) if missing.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without one, a stalled server would block the cell forever.

    Raises:
        OSError: If the destination directory or a target file cannot be
            created or written (only request errors are swallowed).
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(folder_name, exist_ok=True)

    for url in urls:
        try:
            # The context manager closes the streamed response even when
            # raise_for_status() fails before the body has been consumed.
            with requests.get(url, stream=True, timeout=timeout) as response:
                response.raise_for_status()  # Raise an exception for bad status codes

                filename = os.path.basename(urlparse(url).path)
                filepath = os.path.join(folder_name, filename)

                with open(filepath, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=8192):
                        file.write(chunk)

            print(f"Downloaded {filename} to {folder_name}")

        except requests.exceptions.RequestException as e:
            print(f"Error downloading {url}: {e}")


def med_files(urls, folder_name="medical_docs", timeout=30):
    """Download every URL in ``urls`` into ``folder_name``.

    Each file is streamed to disk in 8 KiB chunks and saved under the last
    path component of its URL. A failed request is reported on stdout and the
    loop moves on to the next URL (best effort); nothing is returned.

    Args:
        urls: Iterable of URL strings to fetch. The function reads only this
            argument and never a module-level list.
        folder_name: Destination directory; created (with parents) if missing.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without one, a stalled server would block the cell forever.

    Raises:
        OSError: If the destination directory or a target file cannot be
            created or written (only request errors are swallowed).
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(folder_name, exist_ok=True)

    for url in urls:
        try:
            # The context manager closes the streamed response even when
            # raise_for_status() fails before the body has been consumed.
            with requests.get(url, stream=True, timeout=timeout) as response:
                response.raise_for_status()  # Raise an exception for bad status codes

                filename = os.path.basename(urlparse(url).path)
                filepath = os.path.join(folder_name, filename)

                with open(filepath, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=8192):
                        file.write(chunk)

            print(f"Downloaded {filename} to {folder_name}")

        except requests.exceptions.RequestException as e:
            print(f"Error downloading {url}: {e}")

# Fetch both document sets. The destination folders are spelled out because
# the retriever tools are later pointed at these same directory names.
download_files(urls, folder_name="downloaded_docs")
med_files(med_url, folder_name="medical_docs")

In [None]:
import os
import requests
from urllib.parse import urlparse

def download_files_generic(urls, folder_name, chunk_size=8192, timeout=30):
    """
    Downloads files from a list of URLs to a specified folder.

    Each response is streamed to disk and saved under the last path component
    of its URL. When that component is empty (e.g. the URL path ends in "/"),
    the file is named ``downloaded_file_<k>``, where ``k`` is the number of
    results recorded so far. Request failures are reported on stdout, recorded
    in the result list, and do not stop the remaining downloads.

    Args:
        urls (list | tuple): URLs to download files from
        folder_name (str): Name of the folder to save files to; created
            (with parents) if it does not exist
        chunk_size (int, optional): Size of chunks to download. Defaults to 8192
        timeout (float, optional): Seconds passed to ``requests.get`` as its
            ``timeout`` so a stalled server cannot hang the call. Defaults to 30

    Returns:
        list: One tuple per URL, in input order.
            On success: ``(filename, True, None)``.
            On a request failure: ``(url, False, error_message)`` - note the
            first element is the URL, since no file was written.

    Raises:
        ValueError: If ``urls`` is not a list/tuple or ``folder_name`` is empty.
        OSError: If the folder or a target file cannot be created or written
            (only request errors are captured in the results).
    """
    if not isinstance(urls, (list, tuple)):
        raise ValueError("URLs must be provided as a list or tuple")

    if not folder_name:
        raise ValueError("Folder name must be provided")

    # Create results list to track downloads
    results = []

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(folder_name, exist_ok=True)

    for url in urls:
        try:
            # The context manager closes the streamed response even when
            # raise_for_status() fails before the body has been consumed.
            with requests.get(url, stream=True, timeout=timeout) as response:
                response.raise_for_status()

                filename = os.path.basename(urlparse(url).path)

                # Handle empty filenames
                if not filename:
                    filename = f"downloaded_file_{len(results)}"

                filepath = os.path.join(folder_name, filename)

                with open(filepath, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=chunk_size):
                        file.write(chunk)

            print(f"Downloaded {filename} to {folder_name}")
            results.append((filename, True, None))

        except requests.exceptions.RequestException as e:
            error_msg = str(e)
            print(f"Error downloading {url}: {error_msg}")
            results.append((url, False, error_msg))

    return results

In [16]:
import datasets
from smolagents import Tool

import lancedb
import embed_anything
from embed_anything import EmbeddingModel, WhichModel, ONNXModel
from uuid import uuid4


class RetrieverTool(Tool):
    """Agent tool that answers a query with the closest indexed text chunks.

    Construction loads the quantized AllMiniLM-L6-v2 ONNX embedder and
    connects to the LanceDB database at ``tmp/general``. If that database
    already has a ``docs`` table it is opened unchanged and ``directory`` is
    not read; otherwise every item produced by
    ``embed_anything.embed_directory(directory, ...)`` is stored in a new
    ``docs`` table.
    """

    name = "retriever"
    description = "Uses semantic search to retrieve policies about india that could be most relevant to answer your query."
    inputs = {
        "query": {
            "type": "string",
            "description": "The query to perform. This should be semantically close to your target documents. Use the affirmative form rather than a question.",
        }
    }
    output_type = "string"

    def __init__(self, directory, **kwargs):
        """Load the embedder and open, or build, the ``docs`` table.

        Args:
            directory: Folder handed to ``embed_anything.embed_directory``
                when the table has to be built.
            **kwargs: Forwarded untouched to ``Tool.__init__``.
        """
        super().__init__(**kwargs)
        self.model = EmbeddingModel.from_pretrained_onnx(WhichModel.Bert, ONNXModel.AllMiniLML6V2Q)
        self.connection = lancedb.connect("tmp/general")

        if "docs" not in self.connection.table_names():
            # First run against this database: embed the folder and persist
            # one row per embedded item, each with a fresh random id.
            self.embeddings = embed_anything.embed_directory(directory, embedder=self.model)
            rows = [
                {"vector": item.embedding, "text": item.text, "id": str(uuid4())}
                for item in self.embeddings
            ]
            self.table = self.connection.create_table("docs", rows)
        else:
            # Reuse the existing index as-is.
            self.table = self.connection.open_table("docs")

    def forward(self, query: str) -> str:
        """Return the five nearest text chunks for ``query`` as one string.

        Args:
            query: Search text; must be a ``str``.

        Returns:
            A string starting with ``"\\nRetrieved documents:\\n"`` followed by
            one ``===== Document <i> =====`` section per hit, numbered from 0.

        Raises:
            AssertionError: If ``query`` is not a string.
        """
        assert isinstance(query, str), "Your search query must be a string"

        query_embedding = embed_anything.embed_query([query], embedder=self.model)[0].embedding
        hits = self.table.search(query_embedding).limit(5).to_pandas()["text"]

        sections = []
        for idx, doc in enumerate(hits):
            sections.append(f"\n\n===== Document {idx} =====\n" + doc)
        return "\nRetrieved documents:\n" + "".join(sections)

class MedicalRetrieverTool(Tool):
    """Agent tool that retrieves medical text chunks by semantic similarity.

    Construction loads the ``NeuML/pubmedbert-base-embeddings`` model through
    ``EmbeddingModel.from_pretrained_hf`` and connects to the LanceDB database
    at ``tmp/medical``. If that database already has a ``docs`` table it is
    opened unchanged and ``directory`` is not read; otherwise every item
    produced by ``embed_anything.embed_directory(directory, ...)`` is stored
    in a new ``docs`` table.
    """

    name = "medical_retriever"
    # This text is what the agent reads when choosing a tool, so spelling
    # matters: "documents" and "related" were previously misspelled.
    description = "Uses semantic search to retrieve medicine related documents most relevant to answer your query. Use this when the query is related to medicine or medical science or physiotherapy or psychology. "
    inputs = {
        "query": {
            "type": "string",
            "description": "The query to perform. This should be semantically close to your target documents. Use the affirmative form rather than a question.",
        }
    }
    output_type = "string"

    def __init__(self, directory, **kwargs):
        """Load the embedder and open, or build, the ``docs`` table.

        Args:
            directory: Folder handed to ``embed_anything.embed_directory``
                when the table has to be built.
            **kwargs: Forwarded untouched to ``Tool.__init__``.
        """
        super().__init__(**kwargs)
        self.model = EmbeddingModel.from_pretrained_hf(WhichModel.Bert, model_id='NeuML/pubmedbert-base-embeddings')
        self.connection = lancedb.connect("tmp/medical")
        if "docs" in self.connection.table_names():
            # Reuse the existing index as-is; `directory` is ignored here.
            self.table = self.connection.open_table("docs")
        else:
            # First run against this database: embed the folder and persist
            # one row per embedded item, each with a fresh random id.
            self.embeddings = embed_anything.embed_directory(directory, embedder=self.model)
            docs = []
            for e in self.embeddings:
                docs.append({
                    "vector": e.embedding,
                    "text": e.text,
                    "id": str(uuid4())
                })
            self.table = self.connection.create_table("docs", docs)

    def forward(self, query: str) -> str:
        """Return the five nearest text chunks for ``query`` as one string.

        Args:
            query: Search text; must be a ``str``.

        Returns:
            A string starting with ``"\\nRetrieved documents:\\n"`` followed by
            one ``===== Document <i> =====`` section per hit, numbered from 0.

        Raises:
            AssertionError: If ``query`` is not a string.
        """
        assert isinstance(query, str), "Your search query must be a string"

        query_vec = embed_anything.embed_query([query], embedder=self.model)[0].embedding
        docs = self.table.search(query_vec).limit(5).to_pandas()["text"]
        return "\nRetrieved documents:\n" + "".join(
            [f"\n\n===== Document {str(i)} =====\n" + doc for i, doc in enumerate(docs)]
        )

In [None]:
import datasets
from smolagents import Tool

import lancedb
import embed_anything
from embed_anything import EmbeddingModel, WhichModel, ONNXModel
from uuid import uuid4

# class RetrieverTool(Tool):
#     name = "retriever"
#     description = "Uses semantic search to retrieve policies about india that could be most relevant to answer your query."
#     inputs = {
#         "query": {
#             "type": "string",
#             "description": "The query to perform. This should be semantically close to your target documents. Use the affirmative form rather than a question.",
#         }
#     }
#     output_type = "string"

#     def __init__(self, directory, **kwargs):
#         super().__init__(**kwargs)
#         self.model = EmbeddingModel.from_pretrained_onnx(WhichModel.Bert, ONNXModel.AllMiniLML6V2Q)
#         self.connection = lancedb.connect("tmp/lancedb")
#         if "docs" in self.connection.table_names():
#             self.table = self.connection.open_table("docs")
#         else:
#             self.embeddings = embed_anything.embed_directory(directory, embedder = self.model)
#             docs = []
#             for e in self.embeddings:
#                 docs.append({
#                     "vector": e.embedding,
#                     "text": e.text,
#                     "id": str(uuid4())
#                 })
#             self.table = self.connection.create_table("docs", docs)

#     def forward(self, query: str) -> str:
#         assert isinstance(query, str), "Your search query must be a string"

#         query_vec = embed_anything.embed_query([query], embedder = self.model)[0].embedding
#         docs = self.table.search(query_vec).limit(5).to_pandas()["text"]
#         return "\nRetrieved documents:\n" + "".join(
#             [f"\n\n===== Document {str(i)} =====\n" + doc for i, doc in enumerate(docs)]
#         )


from smolagents import CodeAgent, OpenAIServerModel, TransformersModel, DuckDuckGoSearchTool
from google.colab import userdata
# Prefer an OPENAI_API_KEY environment variable. If it is unset or empty, fall
# back to the Colab secret of the same name via the `userdata` module imported
# above. NOTE(review): Colab secrets are presumably not exported as environment
# variables, which would leave the env lookup alone returning None - confirm.
# `userdata.get` raises its own descriptive error if the secret is missing or
# the notebook has not been granted access to it.
api_key = os.environ.get('OPENAI_API_KEY') or userdata.get('OPENAI_API_KEY')

# One retriever per corpus; each is pointed at the folder its PDFs were
# downloaded into.
retriever_tool = RetrieverTool("downloaded_docs")
medical_tool = MedicalRetrieverTool("medical_docs")

agent = CodeAgent(
    tools=[retriever_tool, medical_tool],
    model=OpenAIServerModel(model_id = "gpt-4o-mini", api_base = "https://api.openai.com/v1/", api_key = api_key),
    verbosity_level=2,
)

# A two-part question, so the agent has reason to call both tools.
agent_output = agent.run("What are the different policies for indian manufacturing and what are the medical risks of radiotherapy?")

print("Final output:")
print(agent_output)