In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
from datetime import datetime


In [4]:
def _text_or_none(row, selector):
    """Return the stripped text of the first node in ``row`` matching ``selector``, or None."""
    node = row.select_one(selector)
    if node is None:
        return None
    return node.text.strip()


def _split_salary(location):
    """Split a "$min - $max" style string into ``(min, max)``.

    Returns ``("", "")`` when ``location`` is missing, has no "$", or does not
    split into exactly two parts.
    """
    if not location or "$" not in location:
        return "", ""
    # Normalise the en dash to a hyphen so both separators split the same way.
    parts = location.replace("$", "").replace("–", "-").split("-")
    if len(parts) != 2:
        return "", ""
    return parts[0].strip(), parts[1].strip()


def _format_epoch(epoch):
    """Format an epoch-seconds value as 'YYYY-MM-DD'; "" if missing or invalid.

    The conversion uses the machine's local timezone (``datetime.fromtimestamp``).
    """
    if not epoch:
        return ""
    try:
        return datetime.fromtimestamp(int(epoch)).strftime('%Y-%m-%d')
    except (ValueError, OverflowError, OSError):
        # Malformed or out-of-range attribute: treat as "no date" rather than
        # aborting the whole page.
        return ""


def scrape_jobs_from_page(page_num, timeout=30):
    """Fetch one listing page and return its job rows as a list of dicts.

    Parameters
    ----------
    page_num : int
        Page number substituted into the ``?page=`` query parameter.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 30).

    Returns
    -------
    list[dict]
        One dict per ``tr.job`` row with keys ``id``, ``date_posted``,
        ``company``, ``position``, ``location``, ``tags``, ``salary_min``,
        ``salary_max`` and ``salary_range``. ``company``, ``position`` and
        ``location`` are None when the element is absent. An empty list is
        returned when the request fails or the status code is not 200.
    """
    url = f"https://remoteok.com/remote-dev-jobs?page={page_num}"
    headers = {
        "User-Agent": "Mozilla/5.0"
    }

    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Same contract as a non-200 response: report and return no jobs.
        print(f"Page {page_num}: Failed ({exc})")
        return []

    if response.status_code != 200:
        print(f"Page {page_num}: Failed (Status {response.status_code})")
        return []

    soup = BeautifulSoup(response.text, "html.parser")

    jobs = []
    for row in soup.select("tr.job"):
        location = _text_or_none(row, "div.location")
        # Salary, when present, is embedded in the location text; a missing
        # location yields empty salary fields instead of raising TypeError.
        salary_min, salary_max = _split_salary(location)

        jobs.append({
            "id": row.get("data-id"),
            "date_posted": _format_epoch(row.get("data-epoch")),
            "company": _text_or_none(row, "td.company .companyLink"),
            "position": _text_or_none(row, "h2"),
            "location": location,
            "tags": ", ".join([tag.text.strip() for tag in row.select("td.tags a")]),
            "salary_min": salary_min,
            "salary_max": salary_max,
            "salary_range": f"{salary_min} - {salary_max}",
        })

    print(f"✅ Page {page_num}: {len(jobs)} jobs found")
    return jobs


In [8]:
all_jobs = []
seen_ids = set()    # job ids already collected, used to drop repeated rows
total_pages = 5000  # upper bound; the loop stops early once pages stop yielding new jobs
max_empty_pages = 3  # consecutive pages with no new jobs tolerated before stopping

# Set output filename
filename = "remoteok_tech_jobs.csv"

# Loop through pages
empty_streak = 0
for page in range(1, total_pages + 1):
    jobs = scrape_jobs_from_page(page)

    # Keep only rows not seen before; rows without an id cannot be compared,
    # so they are always kept.
    new_jobs = []
    for job in jobs:
        job_id = job.get("id")
        if job_id is not None:
            if job_id in seen_ids:
                continue
            seen_ids.add(job_id)
        new_jobs.append(job)

    if new_jobs:
        empty_streak = 0
        all_jobs.extend(new_jobs)
        # Save progress whenever the data actually changed
        pd.DataFrame(all_jobs).to_csv(filename, index=False)
    else:
        # A failed request or an exhausted listing both show up as "nothing new";
        # tolerate a few in a row so one transient failure does not end the crawl.
        empty_streak += 1
        if empty_streak >= max_empty_pages:
            print(f"Stopping at page {page}: no new jobs in {empty_streak} consecutive pages")
            break

    # Delay to be polite
    time.sleep(random.uniform(1.5, 3.0))

# Always write the final result so the file exists even if nothing was scraped
df = pd.DataFrame(all_jobs)
df.to_csv(filename, index=False)

print(f"\n✅ Scraping complete: {len(all_jobs)} total jobs saved to '{filename}'")


✅ Page 1: 20 jobs found
✅ Page 2: 20 jobs found
✅ Page 3: 20 jobs found
✅ Page 4: 20 jobs found
✅ Page 5: 20 jobs found
✅ Page 6: 20 jobs found
✅ Page 7: 20 jobs found
✅ Page 8: 20 jobs found
✅ Page 9: 20 jobs found
✅ Page 10: 20 jobs found
✅ Page 11: 20 jobs found
✅ Page 12: 20 jobs found
✅ Page 13: 20 jobs found
✅ Page 14: 20 jobs found
✅ Page 15: 20 jobs found
✅ Page 16: 20 jobs found
✅ Page 17: 20 jobs found
✅ Page 18: 20 jobs found
✅ Page 19: 20 jobs found
✅ Page 20: 20 jobs found
✅ Page 21: 20 jobs found
✅ Page 22: 20 jobs found
✅ Page 23: 20 jobs found
✅ Page 24: 20 jobs found
✅ Page 25: 20 jobs found
✅ Page 26: 20 jobs found
✅ Page 27: 20 jobs found
✅ Page 28: 20 jobs found
✅ Page 29: 20 jobs found
✅ Page 30: 20 jobs found
✅ Page 31: 20 jobs found
✅ Page 32: 20 jobs found
✅ Page 33: 20 jobs found
✅ Page 34: 20 jobs found
✅ Page 35: 20 jobs found
✅ Page 36: 20 jobs found
✅ Page 37: 20 jobs found
✅ Page 38: 20 jobs found
✅ Page 39: 20 jobs found
✅ Page 40: 20 jobs found
✅ Page 41