In [1]:
import aiohttp
import asyncio
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
async def fetch(url, session):
    """Download ``url`` through ``session`` and return the response body as text.

    Args:
        url: Address to request.
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with ``status`` and an awaitable ``text()``.

    Returns:
        The body text when the response status is 200; ``None`` for any other
        status or when any ``Exception`` is raised during the request. In both
        failure cases a diagnostic line is printed instead of raising.
    """
    try:
        async with session.get(url) as resp:
            # Guard clause: anything other than a plain 200 is reported and dropped.
            if resp.status != 200:
                print(f"Failed to fetch {url}: HTTP {resp.status}")
                return None
            return await resp.text()
    except Exception as exc:
        # Best-effort: one failing URL must not abort the other downloads.
        print(f"An error occurred while fetching {url}: {exc}")
        return None

In [3]:
def safe_extract(element, selector, attr=None):
    """Safely extract stripped text, or an attribute value, from the first match.

    Args:
        element: Object exposing ``select_one(selector)`` (e.g. a parsed tag).
        selector: CSS selector passed to ``select_one``.
        attr: Optional attribute name. When truthy, the attribute value of the
            matched node is returned instead of its text.

    Returns:
        The stripped string, or ``"N/A"`` when nothing matches, when the
        requested attribute is missing, or when the value cannot be stripped.
    """
    try:
        match = element.select_one(selector)
        value = match[attr] if attr else match.text
        return value.strip()
    except (AttributeError, TypeError, KeyError):
        # AttributeError: no match (None.text), element is None, or the value
        #                 has no .strip() (e.g. a list-valued attribute).
        # TypeError:      no match and an attribute was requested (None[attr]).
        # KeyError:       the node matched but does not carry ``attr``.
        return "N/A"


def extract_job_data(html):
    """Parse a job-listing page into a list of flat job records.

    Every ``<tr class="job">`` row becomes one dict with the keys
    ``job_title``, ``company``, ``location``, ``salary`` and ``tags``.
    Fields that cannot be located are filled with ``"N/A"`` by
    ``safe_extract``.

    Args:
        html: Markup of the listing page as a string.

    Returns:
        A list of dicts, one per job row, in document order. ``tags`` is a
        comma-separated string of the row's unique tag labels in the order
        they first appear on the page.
    """
    soup = BeautifulSoup(html, "html.parser")
    records = []

    for job in soup.find_all("tr", class_="job"):
        title = safe_extract(job, "td.company_and_position h2")
        company = safe_extract(job, "td.company_and_position h3")
        location = safe_extract(job, "td.company_and_position div.location")
        # NOTE(review): assumes the tooltip div directly following the location
        # div holds the salary text — confirm against the live markup.
        salary = safe_extract(job, "td.company_and_position div.location + div.tooltip")

        tag_labels = [tag.text.strip() for tag in job.select("td.tags h3")]
        # dict.fromkeys de-duplicates while preserving first-seen order; a set
        # would make the column order vary between runs (string hash
        # randomization), producing non-reproducible CSV output.
        unique_tags = dict.fromkeys(tag_labels)

        records.append(
            {
                "job_title": title,
                "company": company,
                "location": location,
                "salary": salary,
                "tags": ",".join(unique_tags),
            }
        )
    return records

In [4]:
async def scrape_jobs(categories):
    """Fetch each category's listing page concurrently and parse its jobs.

    Args:
        categories: Iterable of category slugs; each is interpolated into
            ``https://remoteok.com/remote-{category}-jobs``.

    Returns:
        Dict mapping category -> list of job records. Categories whose page
        could not be fetched, or that yielded no jobs, are omitted.
    """
    site_root = "https://remoteok.com"
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    jobs_by_category = {}

    async with aiohttp.ClientSession(headers=request_headers) as session:
        # NOTE(review): the category is placed in the URL verbatim, so a value
        # containing a space (e.g. "sys admin") is not slugified — confirm the
        # site accepts that form.
        pending = [
            fetch(f"{site_root}/remote-{category}-jobs", session)
            for category in categories
        ]
        # gather() returns results in the same order as the awaitables passed
        # in, which keeps the zip with `categories` below aligned.
        pages = await asyncio.gather(*pending)

        for category, page in zip(categories, pages):
            if not page:
                continue
            parsed = extract_job_data(page)
            if not parsed:
                continue
            jobs_by_category[category] = parsed

    return jobs_by_category

In [5]:
def save_to_csv(data):
    """Write one ``{category}_jobs.csv`` file per category into the working directory.

    Args:
        data: Mapping of category name -> list of job record dicts.

    Each non-empty list is written as UTF-8 CSV without the index column and a
    confirmation line is printed; an empty list only prints a notice and
    produces no file.
    """
    for category, records in data.items():
        if not records:
            print(f"No jobs found for {category}")
            continue

        out_name = f"{category}_jobs.csv"
        frame = pd.DataFrame(records)
        frame.to_csv(out_name, index=False, encoding="utf-8")
        print(f"Saved {len(records)} jobs to {out_name}")

In [6]:
import nest_asyncio
import asyncio

# Apply the nest_asyncio patch before any coroutine in this cell is run.
# NOTE(review): presumably needed because the notebook kernel already has a
# running event loop — confirm it is still required given that the cell uses
# top-level `await` rather than `asyncio.run()`.
nest_asyncio.apply()


async def main():
    """Scrape the configured job categories and save one CSV file per category."""
    # Add more categories to this list as needed.
    job_categories = [
        "engineer",
        "executive",
        "senior",
        "developer",
        "finance",
        "sys admin",
    ]
    save_to_csv(await scrape_jobs(job_categories))


# Run the main function. Top-level `await` works here because the cell is
# executed by the notebook kernel; in a plain Python script this statement
# would be a syntax error and would need an explicit event-loop runner.
await main()

Saved 20 jobs to engineer_jobs.csv
Saved 15 jobs to executive_jobs.csv
Saved 20 jobs to senior_jobs.csv
Saved 20 jobs to developer_jobs.csv
Saved 19 jobs to finance_jobs.csv
Saved 1 jobs to sys admin_jobs.csv
