In [1]:
from ddg import Duckduckgo
from duckduckgo_search import DDGS
import pandas as pd
import requests
from dotenv import load_dotenv
import os
from pydantic_ai import Agent, RunContext
from pydantic_ai.models.groq import GroqModel
from pydantic_ai.models.ollama import OllamaModel
import random
import nest_asyncio
from pydantic import BaseModel, Field
from devtools import debug
from schema.schema import ResearchSchema, ScrapeDataSchema
from langchain_ollama import ChatOllama

# Apply nest_asyncio's patch before any agent is run (presumably so awaited agent
# runs can nest inside the notebook's already-running event loop — confirm).
nest_asyncio.apply()
# Load .env-style settings into the process environment; later cells read
# JINA_API_KEY through os.getenv. Kept as the last expression of the cell, so
# its return value is what the cell displays.
load_dotenv()

True

In [2]:
# Single source of truth for the Ollama model tag shared by both clients below.
OLLAMA_MODEL_TAG = "llama3.2:latest"

# pydantic-ai model object handed to the Agent constructors.
agent_model = OllamaModel(model_name=OLLAMA_MODEL_TAG)
# LangChain chat client used directly for the scraped-text extraction prompt.
chat_model = ChatOllama(model=OLLAMA_MODEL_TAG)

# Alternative Groq-backed model (disabled):
# model = GroqModel('llama-3.3-70b-versatile', api_key=os.getenv('GROQ_API_KEY'))


In [3]:
from tqdm import tqdm


async def _parseScrapedData(scrapedData: str, searchQuery: str)->str:
    """Ask the chat model to extract query-relevant text from scraped page data.

    Args:
        scrapedData: Raw text of a scraped web page.
        searchQuery: The search query used to decide which text is relevant.

    Returns:
        The ``content`` of the chat model's reply to the extraction prompt.
    """

    parse_prompt = f'''

    You are an advanced text extraction model. Your task is to extract only the main textual content from the provided scraped webpage data, removing all unrelated elements such as URLs, images, JavaScript, or redundant navigational text. 
    Focus on delivering clear, meaningful content that reflects the primary information and purpose of the webpage. You will also be given a search query which you will use to find the textual data, from the web scraped data, that is relevant to the search query  
    
    Instructions:
     - Don't modify the text extracted from the data into your own words. Just extract the main textual data as it is.
     - Don't say 'Here is the extracted content..' or 'This is a web page...', just remove the unwanted data from the entire input and only output the main textual data as it is.
     - Only find out textual data that is relevant to the search query given to you

    Search Query: {searchQuery}
    Web Scraped Input: (Data to be parsed) 

    {scrapedData}

    Output: 

    '''
    # Use the async entry point: the synchronous `invoke` would block the event
    # loop for the whole duration of the model call inside this coroutine.
    reply = await chat_model.ainvoke(parse_prompt)
    return reply.content

async def _webSearch(query: str, max_results: int = 5, request_timeout: float = 60.0)->str:
    """Search DuckDuckGo for ``query``, scrape each hit, and return the relevant text.

    Each result link is fetched through the ``r.jina.ai`` endpoint (authorised
    with ``JINA_API_KEY`` from the environment) and the response text is passed
    to ``_parseScrapedData`` together with the query.

    Args:
        query: Search string sent to DuckDuckGo and used as the relevance filter.
        max_results: Maximum number of search hits to scrape.
        request_timeout: Seconds to wait for each page fetch before giving up.

    Returns:
        The parsed text of every successfully scraped page, each followed by a
        newline, concatenated in result order. Empty string when nothing could
        be scraped.
    """
    print(f"Recieved Query: {query}. Starting Web Search")
    search_hits = DDGS().text(
        keywords = query,
        region = 'wt-wt',
        safesearch = 'off',
        timelimit = '7d',
        max_results = max_results
    )

    headers = {
        "Authorization": f"Bearer {os.getenv('JINA_API_KEY')}"
    }

    parsed_pages = []

    print("Web Search Complete. Starting Scraping")

    for result in tqdm(search_hits, desc="Scraping Links"):
        # Resolve the link outside the try block so `url` is always bound when
        # the error handler below formats its message.
        href = result.get('href')
        if not href:
            print(f"Skipping search result without a link: {result}")
            continue
        url = f"https://r.jina.ai/{href}"

        try:
            # A timeout keeps one unresponsive page from hanging the whole run.
            response = requests.get(url, headers=headers, timeout=request_timeout)
            # Reject HTTP errors *before* parsing, so error pages are never
            # sent through the (slow) model call.
            response.raise_for_status()

            parsed_data = await _parseScrapedData(
                scrapedData = response.text,
                searchQuery = query
            )
            parsed_pages.append(parsed_data)

        except Exception as e:
            # Best effort: report the failure and continue with the next link.
            print(f"Error fetching URL {url}: {e}")

    return "".join(page + "\n" for page in parsed_pages)


In [None]:
# Agent that is instructed to answer only from web-search tool results.
# The first prompt line originally ended in a truncated sentence
# ("You must execute the"); it is completed here to name the registered tool.
webSearch_agent = Agent(
    agent_model,
    system_prompt = '''
        You are a web searching assistant that does web searches based on query. You must execute the `searchWeb` tool for every query you receive.
        You will be given a query and you must use the tool that Searches across the web using query. 
        You will not answer the query based on your data and will always rely upon web scraping results.
    '''
)

# Tool wrapper for the web-search agent. The search/scrape/parse pipeline was a
# line-for-line copy of `_webSearch`, so it now delegates to that helper.
# NOTE: the docstring is left as written because the agent framework derives
# the tool description shown to the model from it.
@webSearch_agent.tool
async def searchWeb(ctx: RunContext[str], query: str) -> str:
    """Search across the web using query.

    Args: 

        query: query to search
    """
    # `ctx` is unused here; the parameter is kept so the tool signature is unchanged.
    return await _webSearch(query)

results = await webSearch_agent.run(
    'Tata Steel Vision'
)


Recieved Query: Tata Steel Vision. Starting Web Search
Web Search Complete. Starting Scraping


Scraping Links:  20%|██        | 1/5 [00:00<00:03,  1.14it/s]

In [5]:
# Print the `data` attribute of whichever agent run was last assigned to `results`.
print(results.data)

Tata Steel Vision is about Tata Steel Limited's vision and mission statements outlining its strategic goals, values, and culture.

**Mission Statement:**
"Tata Steel aims to be the world's best steel company by being the best in everything we do."

**Vision Statement:**
"To become the global leader in steel production, innovation, and sustainability, while creating long-term value for our stakeholders."

The mission statement emphasizes Tata Steel's commitment to excellence and its desire to be the best in every aspect of operations. The vision statement highlights the company's ambition to become a global leader in steel production, innovation, and sustainability.

Tata Steel core values guide operations:

1.  **Integrity:** Tatesh steel commitment ethical practices honesty transparency.
2.  **Excellence:** Continuously strive quality technology investments innovative approach.
3.  **Responsibility:** Promote sustainable development community well-being.
4.  **Innovation:** Fosters gr

In [44]:
# Research agent that must return a ResearchSchema-shaped result.
# It previously referenced `model`, which is never defined in this notebook
# (the `model = GroqModel(...)` line is commented out), so a fresh kernel raised
# NameError here. `agent_model` is the model the notebook actually creates;
# swap it for another model object if a different backend is wanted.
research_agent = Agent(
    agent_model,
    result_type = ResearchSchema,
    system_prompt = '''
        You are a research expert agent tasked with conducting a thorough research of a company given to you. You will be given a company name using which you have to do multiple searches using the tools given to you.
        During these multiple searches you must use the top 5 links for each search to gather information about the company. You must gather the following information:

        1. About the Company
            - Industry and Market Segment:
                - Primary industry (e.g., Healthcare, Automotive, Retail).
                - Sub-segment specialization (e.g., diagnostic tools, electric vehicles, e-commerce logistics).

            - Key Offerings:
                - Products and services.
                - Unique selling propositions (USPs).
                - Technological capabilities (e.g., AI-based solutions, IoT integration).

            - Strategic Focus Areas:
                - Operational improvements (e.g., supply chain optimization).
                - Customer experience (e.g., personalized recommendations, chatbots).
                - Sustainability initiatives, if applicable.

            - Current Technology Adoption:
                - Existing AI/ML systems or tools in use.
                - Public partnerships with AI/ML providers (e.g., AWS, Azure, Google Cloud).

        2. About the Industry
            - Market Trends:
                - Industry-specific AI and GenAI adoption trends.
                - Emerging technologies and use cases in the sector.

            - Competitor Analysis:
                - Key competitors and their offerings.
                - AI/ML initiatives or breakthroughs by competitors.
                - Relevant partnerships and collaborations.

            - Challenges and Opportunities:
                - Pain points the industry faces.
                - Opportunities for AI to address these challenges.

        You are free to do multiple searches but must provide all the data/ study done and gathered for this company based on the above points.
        Use the `searchWeb` tool to search the web and gather data for each of the points listed above.

    '''
)

# Tool for the research agent: returns the raw scraped text of each search hit
# (no model-based extraction step). Reuses the name `searchWeb`, which rebinds
# the module-level name defined by the earlier web-search tool.
# NOTE: the docstring is left as written because the agent framework derives
# the tool description shown to the model from it.
@research_agent.tool
async def searchWeb(ctx: RunContext[str], query: str) -> str:
    """Search across the web using query.

    Args: 

        query: query to search
    """

    print(f"Recieved Query: {query}. Starting Web Search")
    search_hits = DDGS().text(
        keywords = query,
        region = 'wt-wt',
        safesearch = 'off',
        timelimit = '7d',
        max_results = 5
    )

    headers = {
        "Authorization": f"Bearer {os.getenv('JINA_API_KEY')}"
    }

    scraped_pages = []

    print("Web Search Complete. Starting Scraping")

    for result in tqdm(search_hits, desc="Scraping Links"):
        # Resolve the link outside the try block so `url` is always bound when
        # the error handler below formats its message.
        href = result.get('href')
        if not href:
            print(f"Skipping search result without a link: {result}")
            continue
        url = f"https://r.jina.ai/{href}"

        try:
            # A timeout keeps one unresponsive page from hanging the whole run.
            response = requests.get(url, headers=headers, timeout=60)
            response.raise_for_status()
            scraped_pages.append(response.text)
        except Exception as e:
            # Best effort: report the failure and continue with the next link.
            print(f"Error fetching URL {url}: {e}")

    # The original built this text but never returned it, so the agent always
    # received None from the tool.
    return "".join(page + "\n" for page in scraped_pages)

results = await research_agent.run(
    'Tata Steel'
)
