In [4]:
from ddg import Duckduckgo
from duckduckgo_search import DDGS
import pandas as pd
import requests
from dotenv import load_dotenv
import os
from pydantic_ai import Agent, RunContext
from pydantic_ai.models.groq import GroqModel
from pydantic_ai.models.ollama import OllamaModel
import random
import nest_asyncio
from pydantic import BaseModel, Field
from typing import List
from devtools import debug
from schema.schema import ResearchSchema, WebSearchEvent, ScrapeURLEvent, ScrapeResultsSchema
from langchain_ollama import ChatOllama
from llama_index.llms.ollama import Ollama
from llama_index.core.settings import Settings
from llama_index.core.workflow import (
    Event,
    StartEvent,
    StopEvent,
    Workflow,
    Context,
    step
)
from tqdm import tqdm

# Patch asyncio so event loops can be nested; the notebook kernel already runs
# a loop and later cells `await` agent/workflow runs from inside it.
nest_asyncio.apply()
# Load variables from a local .env file into the process environment
# (later cells read JINA_API_KEY via os.getenv).
load_dotenv()

ModuleNotFoundError: No module named 'schema'

In [5]:
from workflows.web_search_workflow import WebSearchWorkflow
from utils.settings import project_settings

print("Config Complete")

search_workflow = WebSearchWorkflow(timeout=60, verbose=True)
result = await search_workflow.run(search_query = "Tata Motors vision", jina_api_key = project_settings.JINA_API_KEY)

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


Config Complete
Running step _webSearch
Step _webSearch produced event WebSearchEvent
Running step _scrapeURL


Scraping search results: 100%|██████████| 2/2 [00:20<00:00, 10.27s/it]


Step _scrapeURL produced event StopEvent


In [7]:
# Print the `main` field of the workflow result as plain text.
print(result.main)

VISION

As a high-performance Organization, we are by FY2019

*   Among the Top 3 Global CV and Domestic PV
*   Achieving Sustainable Financial Performance
*   Delivering Exciting Innovation

To become the most aspirational  
Indian auto brand by 2024

Our vision is to be a world-class, future-ready organization, driven by a passion for innovation, customer-centricity, and sustainability. We aim to be among the top three global commercial vehicle and domestic passenger vehicle manufacturers, achieving sustainable financial performance and delivering exciting innovations. 

Our mission is to innovate mobility solutions with passion to enhance the quality of life. We are committed to making responsible choices, creating a positive legacy, and connecting aspirations. 

We have a strong focus on sustainability, with an emphasis on decarbonization, circular economy, and preservation of the natural environment. We are integrating sustainability into our business strategy and are dedicated to

In [11]:
class WebSearchWorkflow(Workflow):
    """Two-step workflow: search DuckDuckGo, then scrape and condense the hits.

    Step routing is driven by the event type annotations on each step
    (StartEvent -> WebSearchEvent -> StopEvent), so those annotations are part
    of the behaviour and must not be changed casually.
    """

    @step
    async def _webSearch(self, ctx: Context, ev: StartEvent) -> WebSearchEvent:
        """Run the text search for ``ev.search_query`` and emit the raw hits.

        The query is also stored in the workflow context under the key
        ``"search_query"`` so the scraping step can read it back.
        """
        query = ev.search_query
        await ctx.set("search_query", query)

        # Worldwide region, safe-search off, last 7 days, at most 2 results.
        hits = DDGS().text(
            keywords=query,
            region="wt-wt",
            safesearch="off",
            timelimit="7d",
            max_results=2,
        )
        return WebSearchEvent(url_list=hits)

    @step
    async def _scrapeURL(self, ctx: Context, ev: WebSearchEvent) -> StopEvent:
        """Fetch every hit's ``href``, parse the combined text, and stop.

        The final result is a ``ScrapeResultsSchema`` holding the fetched URLs,
        the original query, and the text returned by ``_parseScrapedData``.
        """
        query: str = await ctx.get("search_query")

        page_chunks = []
        fetched_links = []
        for hit in tqdm(ev.url_list, desc="Scraping search results"):
            link = hit['href']
            # NOTE(review): `_apiRequest` is not defined in the visible part of
            # this notebook; presumably it returns the page text as a str.
            page_chunks.append(_apiRequest(url=link) + "\n")
            fetched_links.append(link)

        # All pages are parsed in one model call on the concatenated text.
        condensed = await _parseScrapedData(
            scrapedData="".join(page_chunks),
            searchQuery=query,
        )

        return StopEvent(
            result=ScrapeResultsSchema(urls=fetched_links, query=query, main=condensed)
        )

search_workflow = WebSearchWorkflow(timeout=60, verbose=True)
result = await search_workflow.run(search_query = "Tata Motors vision")
    
result

Running step _webSearch
Step _webSearch produced event WebSearchEvent
Running step _scrapeURL


Scraping search results: 100%|██████████| 2/2 [00:01<00:00,  1.60it/s]


Step _scrapeURL produced event StopEvent


ScrapeResultsSchema(urls=['https://tatamotors.co.id/about-us/visionmision-values/', 'https://www.tatamotors.com/organisation/about-us/'], query='Tata Motors vision', main='Tata Motors vision: "Our vision is to be the world\'s best automotive brand, with a focus on innovation, sustainability and customer delight. We aim to create a sustainable future for our customers, employees and the environment. Our long-term strategy is to become a leader in electric vehicles, while also expanding our portfolio of passenger and commercial vehicles."')

In [2]:
# Ollama model handed to the pydantic-ai agents defined in this notebook.
agent_model = OllamaModel(model_name="llama3.2:latest")
# LangChain chat client for the same model tag; used for direct prompt calls.
chat_model = ChatOllama(model="llama3.2:latest")
# Alternative Groq-hosted backend, currently disabled:
# model = GroqModel('llama-3.3-70b-versatile', api_key=os.getenv('GROQ_API_KEY'))


In [18]:
# Ad-hoc look at the raw DuckDuckGo result dicts (title / href / body).
list_ = DDGS().text(
    keywords="pm modi",
    region="wt-wt",
    safesearch="off",
    timelimit="7d",
    max_results=5,
)

list_

[{'title': 'pm modi book - Huge Selection Of Products - amazon.in',
  'href': 'https://www.bing.com/aclick?ld=e8403E3nGawVVyHT7z5CMvejVUCUxDdN8EY8nXDeRFntyutzKsxqk75zn_iX9wWzzL6jHJc585Q-z-mwEgBAojUyGwh-yxMQ-jmeEBEZa3DTT0vroBS_LmqeY2iNLnvTHjNl7ms9SFuvGl5OHOQxXta9yT7dzOqDkgezDD2zTL6ZfIa2Yf9GxnfZyx-V74066SYn1a8A&u=aHR0cHMlM2ElMmYlMmZ3d3cuYW1hem9uLmluJTJmcyUyZiUzZmllJTNkVVRGOCUyNmtleXdvcmRzJTNkcG0lMmJtb2RpJTJiYm9vayUyNmluZGV4JTNkYXBzJTI2dGFnJTNkbXNuZGVza3N0ZGluLTIxJTI2cmVmJTNkcGRfc2xfdGtzdmV5ZDcxX2IlMjZhZGdycGlkJTNkMTMxMzkxODAwMDkxNzAyNSUyNmh2YWRpZCUzZDgyMTIwMTQ1OTU3NjM1JTI2aHZuZXR3JTNkcyUyNmh2cW10JTNkYiUyNmh2Ym10JTNkYmIlMjZodmRldiUzZGMlMjZodmxvY2ludCUzZCUyNmh2bG9jcGh5JTNkMTU3NjQ5JTI2aHZ0YXJnaWQlM2Rrd2QtODIxMjA3NTE2NTg4MjElM2Fsb2MtOTAlMjZoeWRhZGNyJTNkMjM2NDJfMjI5MTc5MSUyNm1jaWQlM2QlMjZtc2Nsa2lkJTNkYzZiZjVkMTQ5NDZkMThkNzc5MTUzYTY1MWZhODkxZDc&rlid=c6bf5d14946d18d779153a651fa891d7',
  'body': 'Choose From a Wide Selection Of Informative and Comprehensive Books For You. Amazon Offers an Array 

In [3]:
from tqdm import tqdm

async def _parseScrapedData(scrapedData: str, searchQuery: str)->str:
    """Ask the chat model to extract the query-relevant text from scraped data.

    Args:
        scrapedData: Raw scraped text for one or more pages; interpolated
            verbatim into the prompt.
        searchQuery: The search query; interpolated into the prompt so the
            model keeps only content relevant to it.

    Returns:
        The ``content`` of the chat model's reply.
    """

    parse_prompt = f'''

    You are an advanced text extraction model. Your task is to extract only the main textual content from the provided scraped webpage data, removing all unrelated elements such as URLs, images, JavaScript, or redundant navigational text. 
    Focus on delivering clear, meaningful content that reflects the primary information and purpose of the webpage. You will also be given a search query which you will use to find the textual data, from the web scraped data, that is relevant to the search query  
    
    Instructions:
     - Don't modify the text extracted from the data into your own words. Just extract the main textual data as it is.
     - Don't say 'Here is the extracted content..' or 'This is a web page...', just remove the unwanted data from the entire input and only output the main textual data as it is.
     - Only find out textual data that is relevant to the search query given to you

    Search Query: {searchQuery}
    Web Scraped Input: (Data to be parsed) 

    {scrapedData}

    Output: 

    '''
    # Use the async entry point: the synchronous `invoke` would block the event
    # loop for the whole generation, stalling other coroutines (including
    # workflow timeouts) even though this function is declared `async`.
    reply = await chat_model.ainvoke(parse_prompt)
    return reply.content

async def _webSearch(query: str, max_results: int = 5, timeout: float = 30.0)->str:
    """Search DuckDuckGo for ``query``, scrape each hit, and return parsed text.

    Each hit's ``href`` is fetched through the ``https://r.jina.ai/`` prefix
    with a bearer token taken from the ``JINA_API_KEY`` environment variable,
    then passed to ``_parseScrapedData``.

    Args:
        query: Search query, also forwarded to the parsing prompt.
        max_results: Maximum number of search hits to scrape (default 5, the
            previously hard-coded value).
        timeout: Per-request timeout in seconds for the scrape call.

    Returns:
        The parsed text of every successfully scraped hit, each followed by a
        newline. Hits that fail to fetch or parse are reported via ``print``
        and skipped, so the result may be an empty string.
    """
    print(f"Recieved Query: {query}. Starting Web Search")
    search_hits = DDGS().text(
        keywords=query,
        region='wt-wt',
        safesearch='off',
        timelimit='7d',
        max_results=max_results,
    )

    headers = {
        "Authorization": f"Bearer {os.getenv('JINA_API_KEY')}"
    }

    print("Web Search Complete. Starting Scraping")

    parsed_pages = []
    for hit in tqdm(search_hits, desc="Scraping Links"):
        href = hit.get('href')
        if not href:
            # Nothing to fetch; skipping here also keeps `url` well-defined
            # for the error message below.
            continue
        url = f"https://r.jina.ai/{href}"

        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            # Check the HTTP status *before* the model call so error pages are
            # not sent through a slow parse and then thrown away.
            response.raise_for_status()

            parsed_data = await _parseScrapedData(
                scrapedData=response.text,
                searchQuery=query,
            )
            parsed_pages.append(parsed_data + "\n")

        except Exception as e:
            # Best-effort: one bad hit must not abort the whole search.
            print(f"Error fetching URL {url}: {e}")

    return "".join(parsed_pages)


In [4]:
# Agent that is instructed to answer only from live web-search results obtained
# through its registered tool. The system prompt previously contained an
# unfinished sentence ("You must execute the"), which has been removed; the
# remaining sentences are unchanged.
webSearch_agent = Agent(
    agent_model,
    system_prompt = '''
        You are a web searching assistant that does web searches based on query.
        You will be given a query and you must use the tool that Searches across the web using query. 
        You will not answer the query based on your data and will always rely upon web scraping results.
    '''
)

@webSearch_agent.tool
async def searchWeb(ctx: RunContext[str], query: str) -> str:
    """Search across the web using query.

    Args: 

        query: query to search
    """
    # The docstring above is intentionally left as-is: it is the text the agent
    # framework can expose to the model as this tool's description.
    #
    # Returns the parsed text of every successfully scraped hit (each followed
    # by a newline); failed hits are printed and skipped.

    # Per-request timeout (seconds) so one stalled page cannot hang the agent run.
    request_timeout = 30

    search_hits = DDGS().text(
        keywords=query,
        region='wt-wt',
        safesearch='off',
        timelimit='7d',
        max_results=2,
    )

    headers = {
        "Authorization": f"Bearer {os.getenv('JINA_API_KEY')}"
    }

    parsed_pages = []
    for hit in tqdm(search_hits, desc=f"Scraping and Parsing Links for query {query}"):
        href = hit.get('href')
        if not href:
            # Nothing to fetch; also keeps `url` well-defined for the error
            # message below.
            continue
        url = f"https://r.jina.ai/{href}"

        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
            # Validate the HTTP status before spending a model call on the
            # body; previously error pages were parsed first and then dropped.
            response.raise_for_status()

            parsed_data = await _parseScrapedData(
                scrapedData=response.text,
                searchQuery=query,
            )
            parsed_pages.append(parsed_data + "\n")

        except Exception as e:
            # Best-effort: report the failing hit and continue with the rest.
            print(f"Error fetching URL {url}: {e}")

    return "".join(parsed_pages)


In [5]:
results = await webSearch_agent.run(
    'Tata Steel Vision'
)

print(results.data)

Scraping and Parsing Links for query Tata Steel Vision: 100%|██████████| 2/2 [00:19<00:00,  9.71s/it]


Based on the web search results, it appears that Tata Steel is a steel company based in India with a long history of innovation and a strong commitment to employee welfare and safety. The company operates globally and has developed a range of business models and strategies to ensure sustainability and growth. Tata Steel offers a range of steel products and solutions for different industries and is committed to engaging with local communities and supporting development initiatives.


In [15]:
# Dependency payload for `judging_agent` (passed as `deps=` at run time and
# read by its tool through `ctx.deps`).
class MyDeps(BaseModel):
    # Candidate answer text to be judged.
    response: str
    # Query the answer is judged against.
    query: str

# Structured verdict type. Currently unused: the `result_type` argument on
# `judging_agent` and the tool's structured return are both commented out.
class JudgeResponseType(BaseModel):
    # Boolean verdict flag.
    similar: bool

# Agent that judges whether a response answers a query.
# - Runs on the shared `agent_model`.
# - `deps_type = MyDeps`: each run is given a MyDeps instance via `deps=`.
# - `result_type` is commented out, so no structured result type is requested
#   here even though the prompt asks for `true` / `false`.
judging_agent = Agent(
    agent_model,
    # result_type = JudgeResponseType,
    deps_type = MyDeps,
    system_prompt='''
        You are an expert agent tasked with determining if a given response answers a specific query.
        You will be provided with:
        - A query: a question or request for information.
        - A response: the potential answer to the query.

        Your job is to:
        1. Fully understand the query.
        2. Analyze whether the response directly, indirectly, or partially answers the query.
        3. Return `true` if the response answers the query in any capacity, otherwise return `false`.

        Be objective and ensure that your judgments are clear and based only on the content of the query and response.
    '''
)

@judging_agent.tool
async def judge_responses(ctx: RunContext[MyDeps], query: str) -> str:
    """Decide whether the stored response answers the stored query.

    Args:
        query: The query being judged.
    """
    # The run dependencies are authoritative: the verdict is always computed
    # from `ctx.deps`, not from the `query` argument the model supplies. The
    # original code overwrote `query` silently; distinct local names make that
    # explicit. The context is typed as RunContext[MyDeps] to match the agent's
    # `deps_type`.
    stored_response = ctx.deps.response
    stored_query = ctx.deps.query

    prompt = f"""
        Query: "{stored_query}"
        Response: "{stored_response}"
        
        Does the response answer the query? Answer 'true' if yes, and 'false' if no. Don't give me or use any python code. Just answer logically 
        """

    # Await the async entry point so the model call does not block the event
    # loop inside this async tool.
    reply = await chat_model.ainvoke(prompt)
    result = reply.content
    # Echo the raw verdict so the notebook output shows the tool actually ran.
    print(result)

    # is_answering = result.strip().lower() == "true"

    # return JudgeResponseType(similar=is_answering)
    return result


In [16]:
deps = MyDeps(response=results.data, query='Tata Steel Vision')

res = await judging_agent.run(
    'Is the response is answering the query or not',
    deps = deps
)

print(res.data)

In order to determine if a response answers a query, we need more information about both the query and the response.

Please provide the query and the response, and I'll do my best to determine whether the response answers the query or not. 

For example:

Query: "What is the capital of France?"
Response: "The capital of France is Paris."

You can format your input as a dictionary with two keys: 'query' and 'response'. For example:
```
{
    'query': 'What is the capital of France?',
    'response': 'The capital of France is Paris.'
}
```

Given this input, I will return `True` if the response answers the query in any capacity. Otherwise, I will return `False`.


In [3]:
import requests
# Direct (no proxy) request to an IP-location endpoint; the body is printed as-is.
# NOTE(review): the saved output of this cell contains the machine's public IP
# and ISP details - clear the output before sharing the notebook.
response = requests.get('https://ip.oxylabs.io/location')
print(response.text)

{"ip":"122.161.78.200","providers":{"dbip":{"country":"IN","asn":"AS24560","org_name":"Bharti Airtel Limited","city":"Shahdara","zip_code":"","time_zone":"","meta":"\u003ca href='https://db-ip.com'\u003eIP Geolocation by DB-IP\u003c/a\u003e"},"ip2location":{"country":"IN","asn":"","org_name":"","city":"Almora","zip_code":"263137","time_zone":"+05:30","meta":"This site or product includes IP2Location LITE data available from \u003ca href=\"https://lite.ip2location.com\"\u003ehttps://lite.ip2location.com\u003c/a\u003e."},"ipinfo":{"country":"IN","asn":"AS24560","org_name":"Bharti Airtel Ltd., Telemedia Services","city":"","zip_code":"","time_zone":"","meta":"\u003cp\u003eIP address data powered by \u003ca href=\"https://ipinfo.io\" \u003eIPinfo\u003c/a\u003e\u003c/p\u003e"},"maxmind":{"country":"IN","asn":"AS24560","org_name":"Bharti Airtel Ltd., Telemedia Services","city":"New Delhi","zip_code":"","time_zone":"+05:30","meta":"This product includes GeoLite2 Data created by MaxMind, avail

In [44]:
# research_agent = Agent(
#     model,
#     result_type = ResearchSchema,
#     system_prompt = 
    
#     '''
#         You are a research expert agent tasked with conducting a thorough research of a company given to you. You will be given a company name using which you have to do multiple searches using the tools given to you.
#         During these multiple searches you must use the top 5 links for each search to gather information about a company's and must gather the following information:

#         1. About the Company
#             - Industry and Market Segment:
#                 - Primary industry (e.g., Healthcare, Automotive, Retail).
#                 - Sub-segment specialization (e.g., diagnostic tools, electric vehicles, e-commerce logistics).

#             - Key Offerings:
#                 - Products and services.
#                 - Unique selling propositions (USPs).
#                 - Technological capabilities (e.g., AI-based solutions, IoT integration).

#             - Strategic Focus Areas:
#                 - Operational improvements (e.g., supply chain optimization).
#                 - Customer experience (e.g., personalized recommendations, chatbots).
#                 - Sustainability initiatives, if applicable.

#             - Current Technology Adoption:
#                 - Existing AI/ML systems or tools in use.
#                 - Public partnerships with AI/ML providers (e.g., AWS, Azure, Google Cloud).

#         2. About the Industry
#             - Market Trends:
#                 - Industry-specific AI and GenAI adoption trends.
#                 - Emerging technologies and use cases in the sector.

#             - Competitor Analysis:
#                 - Key competitors and their offerings.
#                 - AI/ML initiatives or breakthroughs by competitors.
#                 - Relevant partnerships and collaborations.

#             - Challenges and Opportunities:
#                 - Pain points the industry faces.
#                 - Opportunities for AI to address these challenges. 

#         You are free to do multiple searches but must provide all the data/ study done and gathered for this company based on the above points.
#         You are a research expert tasked with gathering information about a company. Use the `searchWeb` tool to search the web and return data for the following queries:

#     '''
# )

# @research_agent.tool
# async def searchWeb(ctx: RunContext[str], query: str):
#     """Search across the web using query.

#     Args: 

#         query: query to search
#     """

#     print(f"Recieved Query: {query}. Starting Web Search")
#     list_ = DDGS().text(  
#                 keywords = query,
#                 region = 'wt-wt',
#                 safesearch = 'off',
#                 timelimit = '7d',
#                 max_results = 5
#             )

#     headers = {
#     "Authorization": f"Bearer {os.getenv('JINA_API_KEY')}"
#     }

#     data = ""

#     print(f"Web Search Complete. Starting Scraping")

#     for result in tqdm(list_, desc="Scraping Links"):
#         try:
#             url = f"https://r.jina.ai/{result['href']}"
#             response = requests.get(url, headers=headers)
#             response.raise_for_status()
#             data += response.text + "\n"
#         except Exception as e:
#             print(f"Error fetching URL {url}: {e}")

# results = await research_agent.run(
#     'Tata Steel'
# )


In [7]:
from requests.exceptions import ProxyError, ReadTimeout, ConnectTimeout
import requests


import os

# Seconds to wait for the proxy/target before giving up. Without an explicit
# timeout requests may wait indefinitely on a stalled proxy, and the timeout
# handlers below would rarely, if ever, be reached.
PROXY_TIMEOUT_SECONDS = 10

# Proxies keyed by URL scheme (or by scheme://host for host-specific overrides).
scheme_proxy_map = {
    'http': "http://2.56.215.247:3128",
    'https': "https://88.198.24.108:8080",
}

# The host-specific proxy needs credentials. Read the full proxy URL
# (e.g. "https://user:password@host:port") from the environment instead of
# committing a username/password to the notebook.
example_org_proxy = os.getenv("EXAMPLE_ORG_PROXY_URL")
if example_org_proxy:
    scheme_proxy_map['https://example.org'] = example_org_proxy

try:
    response = requests.get(
        'https://ip.oxylabs.io/location',
        proxies=scheme_proxy_map,
        timeout=PROXY_TIMEOUT_SECONDS,
    )
except (ProxyError, ReadTimeout, ConnectTimeout) as error:
    print('Unable to connect to the proxy: ', error)
else:
    print(response.text)

Unable to connect to the proxy:  HTTPSConnectionPool(host='ip.oxylabs.io', port=443): Max retries exceeded with url: /location (Caused by ProxyError('Unable to connect to proxy', NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000226BDEF5650>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it')))


In [None]:
import threading
import queue
import requests

# Work queue of candidate proxies, consumed by check_proxies().
q = queue.Queue()
valid_proxies = []

with open('proxies.txt', 'r', encoding='utf-8') as file:
    # One proxy per line. Blank lines (including the empty entry produced by a
    # trailing newline) are dropped; otherwise an empty string would be queued
    # and tested as if it were a proxy.
    proxies = [line.strip() for line in file.read().splitlines() if line.strip()]

for p in proxies:
    q.put(p)

def check_proxies():
    global q
    while not q.empty():
        proxy = q.get()
        try:
            res = requests.get("http://ipinfo.io/json", proxies={"http":proxy, "https": proxy})
        except:
            continue

        if res.status_code ==200:
            print(proxy)