In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import asyncio
try:
    import nest_asyncio
    nest_asyncio.apply()
except ImportError:
    pass
from bs4 import BeautifulSoup
import random
import pandas as pd
import logging
import os
from tqdm import tqdm

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/113.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:112.0) Gecko/20100101 Firefox/112.0",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15 Version/16.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/113.0.0.0 Edg/113.0.0.0"
]

TIME_PARAMS = ["r86400", "r604800", "r2592000"]  # 1d, 7d, 30d
GERMANY_CITIES = [
    "Berlin", "Hamburg", "Munich", "Frankfurt", "Cologne", "Stuttgart",
    "Duesseldorf", "Leipzig", "Dresden", "Hanover", "Nuremberg",
    "Bremen", "Essen", "Dortmund"
]
KEYWORDS = [
    "Systemadministrator",           # Системный администратор, прям классика
    "IT Systemadministrator",        # IT системный админ
    "IT Administrator",              # IT-админ
    "Administrator",                 # Иногда просто так
    "System Engineer",               # Сис. инженер
    "IT System Engineer",            # IT системный инженер
    "Systembetreuer",                # Обслуживание систем
    "Netzwerkadministrator",         # Админ сетей
    "Netzwerkadministratorin",       # Политкорректная версия
    "Systemtechniker",               # Системный техник
    "IT Support Administrator",      # Смесь саппорта и админства
    "Serveradministrator",           # Серверный админ
    "IT Fachinformatiker Systemintegration", # Популярно для немцев
    "IT Specialist Systemintegration",       # Английская версия
    "Fachinformatiker Systemintegration"     # Просто и понятно
]

CONCURRENCY_LIMIT = 20
RETRIES = 3
TIMEOUT = 15
MAX_PAGES = 40
OUTPUT_CSV = 'techsupport_germany_Big.csv.csv'
ERROR_LOG = 'errors.log'

logging.basicConfig(
    filename=ERROR_LOG,
    filemode='a',
    level=logging.WARNING,
    format='%(asctime)s %(levelname)s %(message)s'
)

def sanitize(text: str) -> str:
    return text.replace(" ", "%20") \
               .replace("ü", "ue") \
               .replace("ö", "oe") \
               .replace("ä", "ae") \
               .replace("ß", "ss")

def run_coro(coro):
    try:
        return asyncio.run(coro)
    except RuntimeError:
        loop = asyncio.get_event_loop()
        return loop.run_until_complete(coro)

async def fetch(session, url: str) -> str:
    for attempt in range(1, RETRIES + 1):
        try:
            headers = {"User-Agent": random.choice(USER_AGENTS)}
            async with session.get(url, headers=headers, timeout=TIMEOUT) as resp:
                resp.raise_for_status()
                return await resp.text()
        except Exception as e:
            logging.warning(f"[{attempt}/{RETRIES}] {e} @ {url}")
            await asyncio.sleep(2 ** attempt)
    logging.error(f"All retries failed for {url}")
    return ""

def parse_job_cards(html: str, kw: str, city: str) -> list:
    soup = BeautifulSoup(html, 'html.parser')
    cards = soup.find_all('div', class_='base-card')
    jobs = []
    for card in cards:
        title = (card.select_one('h3.base-search-card__title') or '').get_text(strip=True)
        company = (card.select_one('h4.base-search-card__subtitle') or '').get_text(strip=True)
        location = (card.select_one('span.job-search-card__location') or '').get_text(strip=True)
        link_el = card.select_one('a.base-card__full-link')
        link = link_el['href'] if link_el and link_el.has_attr('href') else ''
        # Только если есть всё необходимое!
        if title and company and location and link:
            jobs.append({
                'Title':    title,
                'Company':  company,
                'Location': location,
                'Link':     link,
                'Keyword':  kw,
                'City':     city
            })
    return jobs

async def gather_search_pages() -> list:
    import aiohttp
    urls = []
    for tp in TIME_PARAMS:
        for city in GERMANY_CITIES:
            for kw in KEYWORDS:
                for page in range(MAX_PAGES):
                    start = page * 25
                    url = (
                        "https://www.linkedin.com/jobs/search/"
                        f"?keywords={sanitize(kw)}"
                        f"&location={sanitize(city)}"
                        f"&f_TPR={tp}"
                        f"&start={start}"
                    )
                    urls.append((url, kw, city))

    print(f"🔗 Total URLs to fetch: {len(urls)}")
    connector = aiohttp.TCPConnector(limit=CONCURRENCY_LIMIT)
    async with aiohttp.ClientSession(connector=connector) as session:
        sem = asyncio.Semaphore(CONCURRENCY_LIMIT)
        jobs = []

        async def worker(u, kw, city):
            async with sem:
                html = await fetch(session, u)
                if not html:
                    logging.warning(f"No HTML fetched for {u}")
                    return
                cards = await asyncio.get_event_loop().run_in_executor(
                    None, parse_job_cards, html, kw, city
                )
                jobs.extend(cards)

        tasks = [worker(u, kw, city) for u, kw, city in urls]
        for f in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Gathering links"):
            await f

    # Убираем дубли по ссылкам
    unique = {j['Link']: j for j in jobs if j['Link']}
    print(f"🟢 Links parsed: {len(jobs)}, unique: {len(unique)}")
    return list(unique.values())

def main():
    if os.path.exists(OUTPUT_CSV):
        os.remove(OUTPUT_CSV)
    print("🔍 Gathering job links...")
    jobs = run_coro(gather_search_pages())
    print(f"✅ Collected {len(jobs)} unique job links.")
    df = pd.DataFrame(jobs)
    df.to_csv(OUTPUT_CSV, index=False)
    print("🎉 All done! CSV заполнен базовыми данными.")

if __name__ == "__main__":
    main()

🔍 Gathering job links...
🔗 Total URLs to fetch: 25200


Gathering links: 100%|█████████████████████████████████████████████████████████| 25200/25200 [5:02:18<00:00,  1.39it/s]


🟢 Links parsed: 29100, unique: 29100
✅ Collected 29100 unique job links.
🎉 All done! CSV заполнен базовыми данными.
