In [3]:
import nest_asyncio
from browser_use import Agent
# NOTE(review): presumably applied so code that needs to start/re-enter an
# event loop can run inside the notebook's already-running loop — confirm.
nest_asyncio.apply()

INFO     [browser_use] BrowserUse logging setup complete with level info
INFO     [root] Anonymized telemetry enabled. See https://github.com/gregpr07/browser-use for more information.


In [1]:
from langchain_google_vertexai.model_garden import ChatAnthropicVertex
from langchain_google_vertexai import ChatVertexAI

In [2]:
# Google Cloud project id and region handed to the Vertex AI client below.
project = "ai-call-bot-440714"
location = "us-east5"

# Initialise the Claude model (via the Vertex AI Model Garden wrapper);
# `model` is later passed to the browser agent as its LLM.
model = ChatAnthropicVertex(
    model_name="claude-3-5-sonnet-v2@20241022",
    project=project,
    location=location,
)

In [4]:
# Describe the scraping job in natural language and let the browser agent,
# driven by the Claude `model`, carry it out.
agent = Agent(
    task="Go to https://wargaming.com/en/careers/ and extract the list of all paths of the vacancies by href, if needed click next page button",
    llm=model,
)
# Top-level await of the agent run; the log output below shows its steps.
result = await agent.run()

INFO     [agent] 🚀 Starting task: Go to https://wargaming.com/en/careers/ and extract the list of all paths of the vacancies by href, if needed click next page button
INFO     [agent] 
📍 Step 1
INFO     [agent] 🤷 Eval: Unknown - Starting fresh on a blank page
INFO     [agent] 🧠 Memory: Need to go to Wargaming careers page and extract vacancy hrefs
INFO     [agent] 🎯 Next goal: Navigate to the Wargaming careers page
INFO     [agent] 🛠️  Action 1/1: {"go_to_url":{"url":"https://wargaming.com/en/careers/"}}
ERROR    [agent] ❌ Result failed 1/3 times:
 Error executing action go_to_url: Page.goto: Timeout 30000ms exceeded.
Call log:
  - navigating to "https://wargaming.com/en/careers/", waiting until "load"

INFO     [agent] 
📍 Step 2
INFO     [agent] ⚠ Eval: Failed - The initial URL navigation timed out, but we landed on the correct page anyway
INFO     [agent] 🧠 Memory: We are on the Wargaming careers page and need to click on Latest job opportunities to find the vacancy listings
INFO    

In [4]:
from playwright.async_api import async_playwright
import asyncio

async def extract_vacancy_paths():
    """Collect the unique vacancy link hrefs from the Wargaming careers page.

    Launches headless Chromium, opens https://wargaming.com/en/careers/,
    tries to click the ``#jobs_list`` anchor and waits for ``.job-listing``
    elements, then reads ``href`` attributes using the first candidate
    selector that matches anything.

    Every navigation / wait step is best-effort: a failure is printed and the
    function carries on with whatever the page currently holds.

    Returns:
        list[str]: Non-empty href values, de-duplicated, in the order they
        were first seen on the page. Empty if no selector matched.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,
            # Chromium command-line switches, passed through unchanged.
            # NOTE(review): presumably chosen to make the headless browser
            # behave more like an interactive one — confirm each is needed.
            args=[
                '--no-sandbox',
                '--disable-blink-features=AutomationControlled',
                '--disable-infobars',
                '--disable-background-timer-throttling',
                '--disable-popup-blocking',
                '--disable-backgrounding-occluded-windows',
                '--disable-renderer-backgrounding',
                '--disable-window-activation',
                '--disable-focus-on-load',
                '--no-first-run',
                '--no-default-browser-check',
                '--no-startup-window',
                '--window-position=0,0',
                '--window-size=1280,1000',
                '--disable-web-security',
                '--disable-site-isolation-trials',
                '--disable-features=IsolateOrigins,site-per-process',
            ],
        )

        # Close the browser on every exit path (normal return or exception),
        # matching what extract_vacancy_details does.
        try:
            page = await browser.new_page()

            vacancy_paths = []

            try:
                await page.goto("https://wargaming.com/en/careers/", timeout=30000)
                await page.wait_for_load_state('networkidle')
            except Exception as e:
                print(f"Navigation timeout occurred: {e}")
                # Continue execution as the page might still be usable

            # Wait for and click the jobs list link
            try:
                await page.wait_for_selector('a[href="#jobs_list"]', timeout=30000)
                await page.click('a[href="#jobs_list"]')
                await page.wait_for_load_state('networkidle')
            except Exception as e:
                print(f"Selector 'a[href='#jobs_list']' not found: {e}")
                # Continue execution as the page might still be usable

            # Wait for job listings to appear
            try:
                await page.wait_for_selector('.job-listing', timeout=30000)
            except Exception as e:
                print(f"Selector '.job-listing' not found: {e}")
                # Continue execution as the page might still be usable

            # Candidate selectors for job links, tried in order; the first one
            # that yields any links wins and the rest are skipped.
            selectors = [
                '.job-listing a',
                'a[href*="/careers/vacancy/"]',
                '[data-vacancy-link]',
            ]

            for selector in selectors:
                try:
                    links = await page.eval_on_selector_all(
                        selector,
                        "elements => elements.map(el => el.getAttribute('href'))"
                    )
                    if links:
                        vacancy_paths.extend(links)
                        break
                except Exception as e:
                    print(f"Selector {selector} failed: {e}")

            # Drop missing/empty hrefs and duplicates. dict.fromkeys keeps
            # first-seen order, so the result is stable between runs
            # (a plain set() would return the paths in arbitrary order).
            return list(dict.fromkeys(path for path in vacancy_paths if path))
        finally:
            await browser.close()

In [16]:
# Run the scraper and list every path it returned, one per line.
paths = await extract_vacancy_paths()
print(f"Found {len(paths)} unique vacancy paths:")
for path in paths:
    print(path)

Navigation timeout occurred: Page.goto: Timeout 30000ms exceeded.
Call log:
  - navigating to "https://wargaming.com/en/careers/", waiting until "load"

Selector 'a[href='#jobs_list']' not found: Timeout 30000ms exceeded.
"load" event fired
Selector '.job-listing' not found: Page.wait_for_selector: Timeout 30000ms exceeded.
Call log:
  - waiting for locator(".job-listing") to be visible

Found 0 unique vacancy paths:


In [20]:
async def extract_vacancy_details(url: str) -> str:
    """Open a vacancy page in headless Chromium and return its HTML.

    Navigation is best-effort: if ``page.goto`` or the ``networkidle`` wait
    raises (for example on a timeout), the exception is printed and the
    content the page currently holds is still read and returned.

    Args:
        url (str): Full URL of the vacancy page.

    Returns:
        str: The HTML reported by ``page.content()``, or ``""`` when reading
        the content fails.
    """
    # Only these three switches are passed here (a shorter list than the one
    # used by extract_vacancy_paths).
    launch_flags = [
        '--no-sandbox',
        '--disable-blink-features=AutomationControlled',
        '--disable-infobars',
    ]

    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(headless=True, args=launch_flags)
        page = await browser.new_page()

        try:
            try:
                await page.goto(url, timeout=30000)
                await page.wait_for_load_state('networkidle')
            except Exception as navigation_error:
                # Keep going: the page may still have usable content.
                print(navigation_error)
            return await page.content()
        except Exception as content_error:
            print(f"Error extracting vacancy details from {url}: {content_error}")
            return ""
        finally:
            # Runs on both the success and the failure path.
            await browser.close()

# Example usage: fetch a single vacancy page with the browser-based helper.
# `url` and `html_content` are notebook globals reused by later cells.
url = "https://wargaming.com/en/careers/vacancy_3013800_belgrade/"
html_content = await extract_vacancy_details(url)
print(f"Retrieved {len(html_content)} characters of HTML content")

Page.goto: Timeout 30000ms exceeded.
Call log:
  - navigating to "https://wargaming.com/en/careers/vacancy_3013800_belgrade/", waiting until "load"

Retrieved 329360 characters of HTML content


In [21]:
# Show only the beginning of the page: the full document is several hundred
# thousand characters long and would swamp the notebook output.
HTML_PREVIEW_CHARS = 2000
print(html_content[:HTML_PREVIEW_CHARS])


<!DOCTYPE html><html lang="en" data-reactid=".2f5zg8kupeu" data-react-checksum="-837268163"><head data-reactid=".2f5zg8kupeu.0"><title data-reactid=".2f5zg8kupeu.0.0">Wargaming Vacancies | Wargaming</title><script id="facebook-jssdk" src="//connect.facebook.net/en_US/sdk.js#xfbml=1&amp;version=v2.5&amp;appId=937703406266328"></script><script async="" src="https://www.googletagmanager.com/gtm.js?id=GTM-58QVDL8"></script><script type="text/javascript" src="https://cdn.cookielaw.org/consent/dd285fd9-b01f-4f61-9f32-df1f9805693e/OtAutoBlock.js" data-reactid=".2f5zg8kupeu.0.1"></script><script src="https://cdn.cookielaw.org/scripttemplates/otSDKStub.js" data-document-language="true" type="text/javascript" charset="UTF-8" data-domain-script="dd285fd9-b01f-4f61-9f32-df1f9805693e" data-reactid=".2f5zg8kupeu.0.2"></script><script data-reactid=".2f5zg8kupeu.0.3">function OptanonWrapper() { }</script><script data-reactid=".2f5zg8kupeu.0.4">(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push(
  {'gtm.star

In [19]:
import httpx

# Fetch the same vacancy page with a plain HTTP GET (no browser involved).
url = "https://wargaming.com/en/careers/vacancy_3013800_belgrade/"
response = httpx.get(url)

if response.status_code != 200:
    # "Ошибка" is Russian for "Error".
    print(f"Ошибка: {response.status_code}")
else:
    # `html` is only bound on a 200 response; later cells read it.
    html = response.text
    print(len(html))

184155


In [11]:
from pydantic import BaseModel, Field


class Vacancy(BaseModel):
    """Structured representation of a job vacancy extracted by Gemini from HTML content.

    This model defines the expected format and structure for vacancy information,
    helping to standardize LLM output parsing.
    """

    # NOTE(review): this class is passed to `llm.with_structured_output(...)`;
    # the docstring and the Field descriptions are presumably forwarded to the
    # model as schema text, so treat them as prompt wording — confirm before
    # rephrasing them.

    # Required: position title.
    title: str = Field(
        description="Job position title or role name extracted from the vacancy posting",
    )
    # Required: free-text description of the position.
    description: str = Field(
        description="Full job description including overview and key details of the position",
    )
    # Required: list of skills (technical and soft).
    skills: list[str] = Field(
        description="List of required technical and soft skills for the position",
    )
    # Optional: defaults to None when not supplied.
    required_experience: str | None = Field(
        default=None,
        description="Required years of experience or experience level (e.g., '3-5 years', 'Entry level')",
    )
    # Required: where the job is located.
    location: str = Field(
        description="Physical location or geographical area of the job position",
    )

    # Required: seniority level as a plain string (no fixed set of values is enforced).
    level: str = Field(description="Seniority or professional level (e.g., 'Junior', 'Middle', 'Senior')")
    # Optional: defaults to None when not supplied.
    salary: str | None = Field(
        default=None,
        description="Salary range or compensation information if provided",
    )
    # Required: list of duties.
    responsibilities: list[str] = Field(
        description="List of key job duties and responsibilities for the position",
    )
    # Optional: defaults to None when not supplied.
    benefits: list[str] | None = Field(
        default=None,
        description="List of company benefits, perks, and additional compensation offerings",
    )
    # Required: plain string; the three values named in the description are
    # guidance only and are not validated by the type.
    remote_type: str = Field(
        description="Work arrangement type: 'office' (on-site), 'hybrid', or 'remote'",
    )

    # Optional: defaults to None when not supplied.
    department: str | None = Field(
        default=None,
        description="Company department, division, or business unit offering the position",
    )

In [12]:
from langchain_google_vertexai import ChatVertexAI

# Gemini chat model used below for structured extraction from vacancy HTML.
# NOTE(review): unlike the ChatAnthropicVertex cell, no project/location is
# passed here; presumably they are picked up from the environment — confirm.
llm = ChatVertexAI(
    model="gemini-1.5-flash-002",
    temperature=0,
    max_tokens=None,
    max_retries=6,
    stop=None,
)


In [13]:
# Bind the `Vacancy` schema to the model so its answers come back as `Vacancy` objects.
structured_llm = llm.with_structured_output(Vacancy)

In [14]:
# Extract a Vacancy from the HTML returned by the browser-based helper (`html_content`).
vacancy = structured_llm.invoke(html_content)

In [16]:
# Same extraction, this time from the HTML fetched with httpx (`html`), for comparison.
vacancy2 = structured_llm.invoke(html)
print(vacancy2)

title='Customer Relations Advisor - Russian' description='\\"<p style=\\"margin-top: 12pt; margin-bottom: 12pt;\\"><strong><span style=\\"font-size: 11pt; font-family: Nunito,sans-serif; color: #000000;\\">Customer Relations Advisor - Russian </span></strong></p><p style=\\"margin-top: 12pt; margin-bottom: 12pt;\\"><strong><span style=\\"font-size: 11pt; font-family: Nunito,sans-serif; color: #000000;\\">About Us:</span></strong></p><p style=\\"margin-top: 12pt; margin-bottom: 12pt;\\">\\"<span style=\\"font-size: 11pt; font-family: Nunito,sans-serif; color: #000000; font-weight: 400;\\">At Booking.com, data drives our decisions. Technology is at our core. And innovation is everywhere. But our company is more than datasets, lines of code or A/B tests. We&rsquo;re the thrill of the first night in a new place. The excitement of the next morning. The friends you encounter. The journeys you take. The sights you see. And the memories you make. Through our products, partners and people, we m

In [18]:
# Imported here so this cell does not rely on a later cell (the one that
# pretty-prints `vacancy`) having been executed first.
from pprint import pprint

# Pretty-print the fields extracted from the httpx-fetched HTML.
pprint(vacancy2.dict())

{'benefits': ['25 (rising to 28) days of annual paid time off and generous '
              'paid leave scheme including: parent, grandparent, bereavement, '
              'and care leave',
              'Hybrid working including flexible working arrangements, and up '
              'to 20 days per year working from abroad (home country)',
              'Industry leading product discounts - up to &euro;1400 per year '
              '- for yourself, including automatic Genius Level 3 status and '
              'Booking.com wallet credit',
              'Working hours on a Monday to Friday basis',
              'Huge learning and development platform tailored to you',
              'Shopping and leisure discounts through Perkbox',
              'Wellbeing focus',
              'Mental health first-aiders and free supports',
              'Performance-based annual bonuses.',
              'Health Insurance Discounts.',
              'Contributory pension plan.',
              'Delicious ca

In [15]:
from pprint import pprint
# Pretty-print the fields extracted from the browser-rendered HTML.
pprint(vacancy.dict())

{'benefits': ['25 (rising to 28) days of annual paid time off and generous '
              'paid leave scheme including: parent, grandparent, bereavement, '
              'and care leave',
              'Hybrid working including flexible working arrangements, and up '
              'to 20 days per year working from abroad (home country)',
              'Industry leading product discounts - up to €1400 per year - for '
              'yourself, including automatic Genius Level 3 status and '
              'Booking.com wallet credit',
              'Working hours on a Monday to Friday basis',
              'Huge learning and development platform tailored to you',
              'Shopping and leisure discounts through Perkbox',
              'Wellbeing focus',
              'Mental health first-aiders and free supports',
              'Performance-based annual bonuses.',
              'Health Insurance Discounts.',
              'Contributory pension plan.',
              'Delicious catered