In [3]:
from playwright.async_api import async_playwright
from langchain_google_vertexai.model_garden import ChatAnthropicVertex
from langchain_google_vertexai import ChatVertexAI
import nest_asyncio
from pprint import pprint
from langchain_core.prompts import PromptTemplate
from langchain_core.tools import Tool
from langchain_experimental.utilities import PythonREPL

nest_asyncio.apply()

In [4]:
# Wrap a Python REPL as a LangChain tool so a model can ask for code to be run.
# NOTE(review): the tool executes whatever command string it receives —
# presumably inside this kernel process; confirm before exposing it to a model.
python_repl = PythonREPL()
repl_tool = Tool(
    func=python_repl.run,
    name="python_repl",
    # Adjacent literals concatenate into the exact single-line description.
    description=(
        "A Python shell. Use this to execute python commands. "
        "Input should be a valid python command. "
        "If you want to see the output of a value, "
        "you should print it out with `print(...)`."
    ),
)

In [23]:
# Chat model that is piped after the extractor-writing prompt further down.
llm = ChatVertexAI(
    model="gemini-1.5-pro-002",
    max_retries=6,    # retry budget handed to the client
    temperature=0,
    max_tokens=None,  # no explicit output-length cap is passed
    stop=None,        # no stop sequences are passed
)

In [25]:
# Prompt that asks the model to write a link-extractor class for a page.
# The template holds an instruction, one worked example (BookingLinkExtractor)
# and finally the page HTML injected through the `html_content` variable.
# NOTE(review): `{html_content}` is the only brace pair in the template; if the
# example code ever gains literal `{` / `}` they will presumably need escaping
# for the template formatter — confirm against PromptTemplate's format rules.
prompt = PromptTemplate(
    input_variables=["html_content"],
    template="""
    analyze the following html content and write a python class for extracting links vacancies from this page


this is example of class:
import traceback

from loguru import logger
from playwright.async_api import Page

from app.link_extractor.base import BaseLinkExtractor


class BookingLinkExtractor(BaseLinkExtractor):

    def __init__(self, logger: logger = logger) -> None:
        super().__init__("https://jobs.booking.com/booking/jobs", logger)
        self.all_links = set()

    @property
    def name(self) -> str:
        return "Booking"

    async def _load_all_content(self, page: Page) -> None:
        try:
            await page.wait_for_selector('a[href*="/booking/jobs/"]', timeout=self.timeout)
        except TimeoutError as e:
            self.logger.info("Timeout while loading content", error=e)

        while True:
            current_links = await page.eval_on_selector_all(
                'a[href*="/booking/jobs/"]',
                "elements => elements.map(el => el.getAttribute('href'))",
            )
            self.all_links.update(current_links)
            self.logger.info(
                "Found vacancies on current page",
                current_links=len(current_links),
                total_links=len(self.all_links),
            )

            try:
                next_button = await page.query_selector(
                    "button.mat-focus-indicator.mat-tooltip-trigger"
                    ".mat-paginator-navigation-next.mat-icon-button"
                    ".mat-button-base",
                )

                if not next_button or await next_button.is_disabled():
                    self.logger.info("No more links on the pages available")
                    break

                await next_button.click()
                await page.wait_for_load_state("networkidle")
                await page.wait_for_timeout(2000)
            except Exception:  # noqa: BLE001
                self.logger.info("Error during pagination", error=traceback.format_exc())
                break

    async def _extract_links(self, page: Page) -> list[str]:  # noqa: ARG002
        return list(getattr(self, "all_links", set()))

    {html_content}
    """,
)

In [26]:
# NOTE(review): `tools` is built here but is not bound to the model in this
# cell; despite its name, `llm_with_tools` is just the prompt piped into `llm`.
tools = [repl_tool]
llm_with_tools = prompt | llm

In [20]:
async with async_playwright() as p:
    browser = await p.chromium.launch(
        headless=False,  # Запускаем браузер в видимом режиме
    )
    url = "https://www.tbank.ru/career/vacancies/all/moscow/"
    context = await browser.new_context()
    page = await context.new_page()
    try:
        await page.goto(url, timeout=30000) 
        await page.wait_for_load_state('domcontentloaded')
        await page.wait_for_load_state('networkidle')
        await page.wait_for_selector('.loading-spinner', state='hidden', timeout=10000)
    except:
        pass
    html_content = await page.content()


In [27]:
# Fill the prompt with the captured page HTML and run the prompt -> model chain once.
result = llm_with_tools.invoke({"html_content": html_content})

In [28]:
# Show the text of the model's reply (the generated extractor class).
print(result.content)


```python
import traceback

from loguru import logger
from playwright.async_api import Page

from app.link_extractor.base import BaseLinkExtractor


class TinkoffLinkExtractor(BaseLinkExtractor):

    def __init__(self, logger: logger = logger) -> None:
        super().__init__("https://www.tbank.ru/career/vacancies/all/moscow/", logger)
        self.all_links = set()

    @property
    def name(self) -> str:
        return "Tinkoff"

    async def _load_all_content(self, page: Page) -> None:
        await page.wait_for_load_state("networkidle")  # Ensure initial content loads

        while True:
            try:
                # Wait for vacancy cards to appear
                await page.wait_for_selector(".VacancyCard__panel-desktop_aq6NiZ", timeout=self.timeout)

                # Extract links from current page
                current_links = await page.eval_on_selector_all(
                    '.VacancyCard__button-desktop_kq6NiZ a',  # Select links within the button
           

In [3]:
# Vertex AI project and region used for the Anthropic model below.
project, location = "ai-call-bot-440714", "us-east5"

# Claude 3.5 Sonnet served through Vertex AI Model Garden.
model = ChatAnthropicVertex(
    location=location,
    project=project,
    model_name="claude-3-5-sonnet-v2@20241022",
)

In [6]:
async def extract_vacancy_paths() -> list[str]:
    """Collect vacancy link hrefs from the Wargaming careers page.

    Launches headless Chromium, opens the careers page, tries to open the jobs
    list, and then reads ``href`` attributes with the first selector (from a
    short fallback list) that matches anything. Every navigation/wait step is
    best-effort: a failure is printed and scraping continues with whatever has
    loaded so far.

    Returns:
        Unique, non-empty href values in the order they were found on the
        page; an empty list when no selector matched.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,
            args=[
            '--no-sandbox',
            '--disable-blink-features=AutomationControlled',
            '--disable-infobars',
            '--disable-background-timer-throttling',
            '--disable-popup-blocking',
            '--disable-backgrounding-occluded-windows',
            '--disable-renderer-backgrounding',
            '--disable-window-activation',
            '--disable-focus-on-load',
            '--no-first-run',
            '--no-default-browser-check',
            '--no-startup-window',
            '--window-position=0,0',
            '--window-size=1280,1000',
            '--disable-web-security',
            '--disable-site-isolation-trials',
            '--disable-features=IsolateOrigins,site-per-process'
        ])
        try:
            page = await browser.new_page()

            vacancy_paths = []

            try:
                await page.goto("https://wargaming.com/en/careers/", timeout=30000)
                await page.wait_for_load_state('networkidle')
            except Exception as e:
                print(f"Navigation timeout occurred: {e}")
                # Continue execution as the page might still be usable

            # Wait for and click the jobs list link
            try:
                await page.wait_for_selector('a[href="#jobs_list"]', timeout=30000)
                await page.click('a[href="#jobs_list"]')
                await page.wait_for_load_state('networkidle')
            except Exception as e:
                print(f"Selector 'a[href='#jobs_list']' not found: {e}")
                # Continue execution as the page might still be usable

            # Wait for job listings to appear
            try:
                await page.wait_for_selector('.job-listing', timeout=30000)
            except Exception as e:
                print(f"Selector '.job-listing' not found: {e}")
                # Continue execution as the page might still be usable

            # Try different selectors for job links; the first one that yields
            # any link wins and the remaining ones are not consulted.
            selectors = [
                '.job-listing a',
                'a[href*="/careers/vacancy/"]',
                '[data-vacancy-link]',
            ]

            for selector in selectors:
                try:
                    links = await page.eval_on_selector_all(
                        selector,
                        "elements => elements.map(el => el.getAttribute('href'))"
                    )
                except Exception as e:
                    print(f"Selector {selector} failed: {e}")
                else:
                    if links:
                        vacancy_paths.extend(links)
                        break

            # Drop missing hrefs and duplicates. dict.fromkeys keeps the first
            # occurrence of each path, so the result order is deterministic
            # (page order) instead of arbitrary set order.
            return list(dict.fromkeys(path for path in vacancy_paths if path))
        finally:
            # Always release the browser, including when a step above raises.
            await browser.close()

In [16]:
paths = await extract_vacancy_paths()
print(f"Found {len(paths)} unique vacancy paths:")
for path in paths:
    print(path)

Navigation timeout occurred: Page.goto: Timeout 30000ms exceeded.
Call log:
  - navigating to "https://wargaming.com/en/careers/", waiting until "load"

Selector 'a[href='#jobs_list']' not found: Timeout 30000ms exceeded.
"load" event fired
Selector '.job-listing' not found: Page.wait_for_selector: Timeout 30000ms exceeded.
Call log:
  - waiting for locator(".job-listing") to be visible

Found 0 unique vacancy paths:


In [43]:
async def extract_visible_text(url: str) -> str | None:
    """Return the visible text of a web page's main content area.

    Opens ``url`` in a headed Chromium window, waits (best-effort) for the page
    to settle, and reads ``innerText`` from the first matching container among
    ``.vacancy-description``, ``main``, ``article`` and ``#content``. When none
    of them exists, the text of ``<body>`` is returned instead.

    Args:
        url (str): The URL of the web page.

    Returns:
        The extracted text, or ``None`` when reading the text fails (the
        error is printed).
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=False,  # run the browser in headed (visible) mode
        )
        try:
            context = await browser.new_context()
            page = await context.new_page()

            try:
                await page.goto(url, timeout=30000)
                await page.wait_for_load_state('domcontentloaded')
                await page.wait_for_load_state('networkidle')
                await page.wait_for_selector('.loading-spinner', state='hidden', timeout=10000)
            except Exception as exc:
                # Loading stays best-effort (e.g. the page never goes idle),
                # but the reason is reported instead of being silently
                # dropped. `Exception` rather than a bare `except` lets task
                # cancellation and KeyboardInterrupt propagate.
                print(f"Page load did not complete cleanly, continuing: {exc}")

            try:
                # Extract the visible text (not the HTML markup) in the page.
                return await page.evaluate("""
                () => {
                    // Try different selectors for the main content
                    const selectors = [
                        '.vacancy-description',  // Common class for vacancy content
                        'main',                 // Main content area
                        'article',              // Article content
                        '#content'              // Content div
                    ];
                    
                    for (const selector of selectors) {
                        const element = document.querySelector(selector);
                        if (element) {
                            return element.innerText;
                        }
                    }
                    
                    // Fallback: return body text if no specific container found
                    return document.body.innerText;
                }
            """)
            except Exception as e:
                print(f"Error: {e}")
                return None

        finally:
            # Close the browser on every path, including context/page setup
            # failures.
            await browser.close()

# Example usage:
# NOTE(review): this rebinds the notebook-level `url` and `html_content` names
# set by earlier cells, and `html_content` is None when the extraction fails
# (the function only returns a value on success).
url = "https://wargaming.com/en/careers/vacancy_3050484_guildford"
html_content = await extract_visible_text(url)
print(f"Retrieved text content:\n{html_content}")

Retrieved text content:
Vacancy
HOMECAREERSWORK AT WARGAMINGVACANCY
Русский
 WARGAMING Explore our offices 
Home
About
News
Careers
Work at Wargaming
FAQ
Games
World of Tanks
World of Warships
World of Warships Blitz
World of Warplanes
World of Tanks Blitz
World of Tanks Modern Armor
Master of Orion
Partnership
Privacy
GDPR Request Form
CCPA Request Form
Giving Back
Home
About
News
Careers
Work at Wargaming
FAQ
Games
World of Tanks
World of Warships
World of Warships Blitz
World of Warplanes
World of Tanks Blitz
World of Tanks Modern Armor
Master of Orion
Partnership
Privacy
GDPR Request Form
CCPA Request Form
Giving Back
Home
About
News
Careers
Work at Wargaming
FAQ
Games
World of Tanks
World of Warships
World of Warships Blitz
World of Warplanes
World of Tanks Blitz
World of Tanks Modern Armor
Master of Orion
Partnership
Privacy
GDPR Request Form
CCPA Request Form
Giving Back
Lead Technical Artist (Unannounced Project)
GUILDFORD, UK
Art Production
Area
APPLY NOW

Job Overview

Based 

In [40]:
# Quick check of what kind of object the extraction step produced.
type(html_content)


str

In [41]:
# Pretty-print the text that is later passed to the structured-output model.
# NOTE(review): the saved output shows a Booking.com vacancy while the
# extraction cell targets a Wargaming URL, and this cell's execution count
# (41) is lower than the extraction cell's (43) — the output is stale.
pprint(html_content)


('About Us: At Booking.com, data drives our decisions. Technology is at our '
 'core. And innovation is everywhere. But our company is more than datasets, '
 'lines of code or A/B tests. We’re the thrill of the first night in a new '
 'place. The excitement of the next morning. The friends you encounter. The '
 'journeys you take. The sights you see. And the memories you make. Through '
 'our products, partners and people, we make it easier for everyone to '
 'experience the world.\n'
 '\n'
 '\xa0\n'
 '\n'
 'Leadership/Team Quote:\n'
 '\n'
 'The team is at the forefront of Generative AI innovation, driving solutions '
 'for travel-related chatbots, text generation and summarization applications, '
 'Q&A systems, and free-text search.\xa0\n'
 '\n'
 'The team you are applying to is spearheading a research effort to develop '
 'leading LLMs, specifically designed for the travel domain.\xa0\n'
 '\n'
 'This pioneering initiative combines cutting-edge AI research with practical '
 'applicati

In [9]:
import requests

# Fetch the vacancy page with a plain HTTP GET (no browser) to see what the
# server returns without JavaScript rendering.
url = "https://careers.indrive.com/vacancies/dbe16ffafceca21025542e8035cdd4d5"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
}
# requests applies no timeout by default, so a stalled server would block this
# cell indefinitely; bound the wait explicitly.
response = requests.get(url, headers=headers, timeout=30)

if response.status_code == 200:
    print(response.text)
else:
    # Surface failures instead of finishing silently with no output.
    print(f"Request failed with HTTP status {response.status_code}")

<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=edge"/><meta name="keywords" content="Keywords"/><meta name="viewport" content="minimum-scale=1, initial-scale=1, width=device-width, shrink-to-fit=no, user-scalable=no, viewport-fit=cover"/><meta name="mobile-web-app-capable" content="yes"/><meta name="apple-mobile-web-app-capable" content="yes"/><meta name="application-name" content="inDrive"/><meta name="apple-mobile-web-app-title" content="inDrive"/><meta name="theme-color" content="#fff"/><meta name="msapplication-navbutton-color" content="#fff"/><meta name="apple-mobile-web-app-status-bar-style" content="black-translucent"/><link rel="canonical" href="https://careers.indrive.com/vacancies/[id]"/><link rel="shortcut icon" href="/favicon.svg"/><meta name="msapplication-starturl" content="/"/><meta name="description" content="Dive into the world of career possibilities at Indrive. Join our dynamic and innovative team today."/><meta prope

In [1]:
import httpx

# Same vacancy page fetched with httpx; only the size of the body is reported.
url = "https://careers.indrive.com/vacancies/dbe16ffafceca21025542e8035cdd4d5"
response = httpx.get(url)

if response.status_code != 200:
    # Any status other than 200 is reported as an error.
    print(f"Ошибка: {response.status_code}")
else:
    html = response.text
    print(len(html))

21978


Pretty printing has been turned OFF


In [44]:
from pydantic import BaseModel, Field


# Schema for the structured vacancy data requested from the model.
# NOTE(review): pydantic exposes the class docstring and every Field
# description in the model's JSON schema, which is presumably what
# `with_structured_output` sends to the LLM — treat that text as prompt
# wording, and treat the field order as part of the output format.
class Vacancy(BaseModel):
    """Structured representation of a job vacancy extracted by Gemini from HTML content.

    This model defines the expected format and structure for vacancy information,
    helping to standardize LLM output parsing.
    """

    # Required fields: no default is given, so validation fails without them.
    title: str = Field(
        description="Job position title or role name extracted from the vacancy posting",
    )
    description: str = Field(
        description="Full job description including overview and key details of the position",
    )
    skills: list[str] = Field(
        description="List of required technical and soft skills for the position",
    )
    # Optional: defaults to None when the posting does not state it.
    required_experience: str | None = Field(
        default=None,
        description="Required years of experience or experience level (e.g., '3-5 years', 'Entry level')",
    )
    location: str = Field(
        description="Physical location or geographical area of the job position",
    )

    level: str = Field(description="Seniority or professional level (e.g., 'Junior', 'Middle', 'Senior')")
    # Optional: defaults to None when no compensation is given.
    salary: str | None = Field(
        default=None,
        description="Salary range or compensation information if provided",
    )
    responsibilities: list[str] = Field(
        description="List of key job duties and responsibilities for the position",
    )
    # Optional: defaults to None when no benefits are listed.
    benefits: list[str] | None = Field(
        default=None,
        description="List of company benefits, perks, and additional compensation offerings",
    )
    # Free-form string; the three values named in the description are not
    # enforced by the type.
    remote_type: str = Field(
        description="Work arrangement type: 'office' (on-site), 'hybrid', or 'remote'",
    )

    # Optional: defaults to None when the department is not stated.
    department: str | None = Field(
        default=None,
        description="Company department, division, or business unit offering the position",
    )

In [45]:
# Flash model used for structured vacancy extraction.
# NOTE(review): this rebinds `llm`, shadowing the Pro model created earlier.
llm = ChatVertexAI(
    model="gemini-1.5-flash-002",
    max_retries=6,    # retry budget handed to the client
    temperature=0,
    max_tokens=None,  # no explicit output-length cap is passed
    stop=None,        # no stop sequences are passed
)


In [46]:
# Ask the model to answer in the shape of the Vacancy schema.
structured_llm = llm.with_structured_output(Vacancy)

In [47]:
# Parse the extracted page text into a Vacancy object.
# NOTE(review): `html_content` is whatever the most recently run cell left in
# it (and may be None if extraction failed) — confirm it holds vacancy text.
vacancy = structured_llm.invoke(html_content)

In [48]:
# Show the parsed vacancy as a plain dict.
# NOTE(review): `.dict()` is the pydantic v1 spelling; on pydantic v2 it is
# deprecated in favour of `.model_dump()` — confirm the installed version.
pprint(vacancy.dict())

{'benefits': ['5 weeks of Annual Leave per calendar year - increasing over '
              'time to a maximum 30 days per year, with the ability to buy up '
              'to an additional 5 days through our holiday buy scheme',
              'Additional paid time off (5 Personal Days, Birthday Leave, '
              'Marriage Leave, Compassionate Leave)',
              'Sick Leave Compensation and enhanced Maternity Leave Benefits',
              'Dog-friendly office',
              'Onsite canteen (with an endless supply of free snacks and '
              'drinks)',
              'Video games area',
              'Family coverage for Private Medical Insurance (pre-existing '
              'conditions are covered), Life Insurance, Pension scheme '
              'offering a matched 4% tax free employee contribution',
              'Mental well-being program (iFeel)',
              'Wellbeing perks (Mental Health Days, Dedicated Well-being room, '
              'Team of Mental Heath Fir

In [15]:
# Same dump as the previous cell; the saved output comes from an earlier run
# (execution count 15) on a different vacancy.
pprint(vacancy.dict())

{'benefits': ['25 (rising to 28) days of annual paid time off and generous '
              'paid leave scheme including: parent, grandparent, bereavement, '
              'and care leave',
              'Hybrid working including flexible working arrangements, and up '
              'to 20 days per year working from abroad (home country)',
              'Industry leading product discounts - up to €1400 per year - for '
              'yourself, including automatic Genius Level 3 status and '
              'Booking.com wallet credit',
              'Working hours on a Monday to Friday basis',
              'Huge learning and development platform tailored to you',
              'Shopping and leisure discounts through Perkbox',
              'Wellbeing focus',
              'Mental health first-aiders and free supports',
              'Performance-based annual bonuses.',
              'Health Insurance Discounts.',
              'Contributory pension plan.',
              'Delicious catered