<a href="https://colab.research.google.com/github/Anuragpandey2005/freelance-scrapper/blob/main/freelance_scrapper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import time
import logging
# Send ERROR-and-above log records to scraper.log (path is relative to the
# current working directory).
# NOTE(review): basicConfig() does nothing if the root logger already has
# handlers, which some notebook environments pre-install — confirm that
# scraper.log is actually written where this runs.
logging.basicConfig(filename='scraper.log', level=logging.ERROR)

# 1. Step 1: Environment Setup

In [None]:
!pip install requests beautifulsoup4 pandas
!pip install selenium webdriver-manager

# 2. Step 2: Choose 3 Platforms

We will use:

* RemoteOK

* PeoplePerHour

* Wellfound (AngelList)



# 3. Step 3: Write the Scraper for Each Site

🔹 remoteok.com

In [None]:
import requests
import pandas as pd

def scrape_remoteok():
    """Fetch the current job listings from the RemoteOK JSON API.

    Returns:
        pandas.DataFrame: one row per job with the columns "Job Title",
        "Job Description", "Category/Tags", "Budget/Rate",
        "Client Location", "Posting Date" and "Job URL".

    Raises:
        requests.RequestException: on a network failure, a timeout or a
            4xx/5xx HTTP status.
        ValueError: if the response body cannot be decoded as JSON.
    """
    base_url = "https://remoteok.com"
    # A timeout keeps a stalled connection from hanging the whole run.
    response = requests.get(
        f"{base_url}/api",
        headers={'User-Agent': 'Mozilla/5.0'},
        timeout=10,
    )
    # Fail loudly on an error status instead of parsing an error page.
    response.raise_for_status()
    data = response.json()[1:]  # First item is metadata

    jobs = []
    for job in data:
        # "tags" may be missing or explicitly null; treat both as "no tags".
        tags = job.get("tags") or []

        job_url = job.get("url")
        # Prefix the site root only onto relative paths: absolute URLs are
        # kept as-is and a missing URL stays None instead of "...comNone".
        if job_url and not job_url.startswith(("http://", "https://")):
            job_url = f"{base_url}{job_url}"

        jobs.append({
            "Job Title": job.get("position"),
            "Job Description": job.get("description"),
            "Category/Tags": ", ".join(tags),
            "Budget/Rate": job.get("salary"),
            "Client Location": job.get("location"),
            "Posting Date": job.get("date"),
            "Job URL": job_url,
        })

    return pd.DataFrame(jobs)



🔹 peopleperhour.com (uses static HTML scraping)


In [None]:
from bs4 import BeautifulSoup

def scrape_peopleperhour():
    """Scrape the public PeoplePerHour job listing page.

    Returns:
        pandas.DataFrame: one row per ".job-search-result" card with the
        columns "Job Title", "Job Description", "Category/Tags",
        "Budget/Rate", "Client Location", "Posting Date" and "Job URL".
        Fields whose element is missing from a card fall back to None
        (or "N/A" for tags and posting date).

    Raises:
        requests.RequestException: on a network failure, a timeout or a
            4xx/5xx HTTP status.
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import urljoin

    base_url = "https://www.peopleperhour.com"
    # Same browser-like User-Agent as the RemoteOK scraper; the timeout keeps
    # a stalled connection from hanging the whole run.
    response = requests.get(
        f"{base_url}/freelance-jobs",
        headers={'User-Agent': 'Mozilla/5.0'},
        timeout=10,
    )
    # Fail loudly on an error status instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    def text_of(card, selector, default=None):
        """Return the stripped text of the first match, or *default*."""
        node = card.select_one(selector)
        return node.text.strip() if node is not None else default

    jobs = []
    for job_card in soup.select(".job-search-result"):
        breadcrumb = text_of(job_card, ".breadcrumb")
        # First anchor that actually carries an href; a card without one
        # yields a None URL instead of crashing the whole scrape.
        link = job_card.find("a", href=True)

        jobs.append({
            "Job Title": text_of(job_card, ".job-title"),
            "Job Description": text_of(job_card, ".job-description"),
            "Category/Tags": breadcrumb.replace("\n", ", ") if breadcrumb is not None else "N/A",
            "Budget/Rate": text_of(job_card, ".budget"),
            # PeoplePerHour doesn't show client location without login
            "Client Location": "",  # Field disabled due to login requirement
            "Posting Date": text_of(job_card, ".job-footer > span", "N/A"),
            # urljoin handles both relative and already-absolute hrefs.
            "Job URL": urljoin(base_url, link["href"]) if link is not None else None,
        })

    return pd.DataFrame(jobs)

🔹 wellfound.com (Selenium example — the page is rendered with JavaScript, so plain static scraping is very limited)

In [None]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service

def scrape_wellfound():
    """Scrape job cards from wellfound.com/jobs using headless Chrome.

    Wellfound uses React, so Selenium is needed to render the JavaScript
    before any job card exists in the DOM.

    Warning: the site's CSS classes change frequently. Re-inspect the page
    and update the selectors below if this returns no rows.

    Returns:
        pandas.DataFrame: one row per matched card with the columns
        "Job Title", "Job Description", "Category/Tags", "Budget/Rate",
        "Client Location", "Posting Date" and "Job URL". Tags, rate,
        location and date are fixed placeholder values; title, description
        and URL are None when the card has no matching child element.
    """
    # Local imports keep this block self-contained.
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.common.by import By

    def first_text(element, selector):
        """Return the text of the first match under *element*, or None."""
        matches = element.find_elements(By.CSS_SELECTOR, selector)
        return matches[0].text if matches else None

    options = Options()
    options.add_argument('--headless')  # Run Chrome in background

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.get("https://wellfound.com/jobs")

        # Wait for JavaScript to load (adjust time if needed)
        time.sleep(5)

        jobs = []
        # Wellfound uses dynamic classes; inspect the page to update
        # selectors if broken.
        job_cards = driver.find_elements(By.CSS_SELECTOR, ".JobCard__Title")
        for job in job_cards:
            # find_elements returns an empty list instead of raising, so one
            # incomplete card cannot abort the whole scrape.
            links = job.find_elements(By.CSS_SELECTOR, "a")
            jobs.append({
                "Job Title": first_text(job, ".job-title"),
                "Job Description": first_text(job, ".description"),
                "Category/Tags": "Tech, Startup",
                "Budget/Rate": "Varies",
                "Client Location": "Global",
                "Posting Date": "NA",
                "Job URL": links[0].get_attribute("href") if links else None,
            })
    finally:
        # Always shut the browser down, even if scraping raised.
        driver.quit()

    return pd.DataFrame(jobs)

 * Exception Handling — `scrape_remoteok` redefined with a request timeout, an HTTP status check and error logging

In [None]:
def scrape_remoteok():
    """Fetch RemoteOK job listings, returning an empty frame on any failure.

    This is the error-tolerant variant: any exception raised while fetching
    or parsing is logged and swallowed so one failing platform cannot abort
    the whole run.

    Returns:
        pandas.DataFrame: one row per job with the columns "Job Title",
        "Job Description", "Category/Tags", "Budget/Rate",
        "Client Location", "Posting Date" and "Job URL"; an empty
        DataFrame if the request or the parsing failed.
    """
    base_url = "https://remoteok.com"
    try:
        response = requests.get(
            f"{base_url}/api",
            headers={'User-Agent': 'Mozilla/5.0'},
            timeout=10,
        )
        response.raise_for_status()  # Check for HTTP errors
        data = response.json()[1:]  # First item is metadata

        jobs = []
        for job in data:
            # "tags" may be missing or explicitly null.
            tags = job.get("tags") or []

            job_url = job.get("url")
            # Prefix the site root only onto relative paths; a missing URL
            # stays None instead of becoming "...comNone".
            if job_url and not job_url.startswith(("http://", "https://")):
                job_url = f"{base_url}{job_url}"

            jobs.append({
                "Job Title": job.get("position"),
                "Job Description": job.get("description"),
                "Category/Tags": ", ".join(tags),
                "Budget/Rate": job.get("salary"),
                "Client Location": job.get("location"),
                "Posting Date": job.get("date"),
                "Job URL": job_url,
            })

        # Return on the success path too: falling off the end would yield
        # None, which pd.concat silently drops.
        return pd.DataFrame(jobs)
    except Exception as e:
        # Deliberate catch-all at the scraper boundary; the traceback is
        # kept in the log for debugging.
        logging.exception("RemoteOK failed: %s", e)
        return pd.DataFrame()  # Return empty DataFrame


# STEP 4: Combine data and save results

In [None]:
# ================================================
# STEP 4: Combine data and save results
# ================================================
def main():
    """Run every platform scraper, merge the results and save them as CSV.

    Each scraper runs in isolation: if one raises or returns something other
    than a DataFrame, the failure is reported and logged and the remaining
    platforms are still scraped. The combined rows are written to
    ``freelance_jobs_<YYYY-MM-DD>.csv`` in the working directory.

    Returns:
        None. Progress and errors are reported with print() and logging.
    """
    from datetime import datetime

    try:
        # (progress banner, platform name, scraper) for each platform.
        scrapers = [
            ("🟡 Scraping RemoteOK...", "RemoteOK", scrape_remoteok),
            ("🟡 Scraping PeoplePerHour...", "PeoplePerHour", scrape_peopleperhour),
            ("🟡 Scraping Wellfound (this takes 10 seconds)...", "Wellfound", scrape_wellfound),
        ]

        # Scrape all platforms
        frames = []
        for banner, platform, scraper in scrapers:
            print(banner)
            try:
                frame = scraper()
            except Exception as e:
                # One broken platform must not discard the others' data.
                print(f"⚠️ {platform} failed: {e}")
                logging.exception("%s failed: %s", platform, e)
                continue

            if not isinstance(frame, pd.DataFrame):
                # pd.concat silently drops None, so surface it here instead.
                print(f"⚠️ {platform} returned no DataFrame; skipping")
                logging.error("%s returned %r instead of a DataFrame", platform, frame)
                continue

            frames.append(frame)

        if not frames:
            # pd.concat raises an opaque ValueError on an empty list.
            raise RuntimeError("no platform returned any data")

        # Combine data
        all_jobs = pd.concat(frames, ignore_index=True)

        # Save with today's date
        today = datetime.now().strftime("%Y-%m-%d")
        filename = f"freelance_jobs_{today}.csv"
        all_jobs.to_csv(filename, index=False)

        print(f"✅ Success! Saved {len(all_jobs)} jobs to {filename}")

    except Exception as e:
        print(f"❌ Critical error: {e}")
        logging.error(f"Main function failed: {e}")

# Run the script. The guard keeps main() from running when this file is
# imported as a module; when executed directly __name__ is "__main__".
if __name__ == "__main__":
    main()