In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import os
import time
import random
import re
from datetime import datetime

# 📁 Struktur direktori
TODAY = datetime.now().strftime('%Y-%m-%d')
DATA_DIR = "data"
os.makedirs(DATA_DIR, exist_ok=True)
LINKS_CSV = os.path.join(DATA_DIR, f"pending_links_{TODAY}.csv")
OUTPUT_CSV = os.path.join(DATA_DIR, f"job_details_{TODAY}.csv")


# ✅ Kata kunci yang dicari (Fokus pada pembuatan kode Python)
KEYWORDS = [
    "python", "web scraper", "python developer",
    "python programming",
    "python script",
    "python automation",
    "python project",
    "python coding",
    "django",
    "flask",
    "python web development",
    "fastapi",
    "python data analysis",
    "web scraping",
    "python scraper",
    "selenium",
    "beautifulsoup",
    "pandas",
    "deep learning",
    "tensorflow",
    "keras",
    "scikit-learn",
    "pyqt",
    "tkinter",
    "data visualization python",
    "excel automation python",
    "pdf automation python",
    "etl python",
]

# ❌ Kalimat deadline yang dihindari (dilonggarkan)
WARNING_LIST = [
    "within 1 hour", "in 1 hour", "immediately", "asap", "super urgent",
    "need fast", "quick delivery"
]

# 🔎 Hitung skor urgensi, jika terlalu tinggi → ditolak
def is_deadline_unreasonable(text):
    text = text.lower()
    score = sum(1 for kw in WARNING_LIST if kw in text)
    return score >= 1  # Tolak jika ada ≥ 1 kata terburu-buru dari WARNING_LIST yang diperbarui

# 💵 Ambil budget minimum dari teks dan cek apakah hourly
def extract_price_info(price_text):
    price_text = price_text.lower()
    is_hourly = 'hourly' in price_text or '/hr' in price_text
    match = re.search(r'\$([\d,.]+)', price_text)
    min_budget = 0
    if match:
        try:
            min_budget = float(match.group(1).replace(',', ''))
        except:
            min_budget = 0
    return min_budget, is_hourly


# 🔍 Cek apakah teks mengandung keyword relevan
def is_relevant(text):
    text = text.lower()
    # Gunakan kata kunci yang lebih spesifik untuk memastikan fokus pada 'pembuatan kode python'
    relevant_keywords = KEYWORDS
    return any(re.search(rf"\b{re.escape(kw)}\b", text) for kw in relevant_keywords)

# 🧼 Bersihkan URL
def normalize_url(url):
    return url.strip().split('?')[0]

# 🔐 Hindari formula injection Excel
def escape_formula(val):
    if isinstance(val, str):
        val = val.strip()
        if val.startswith(('=', '+', '-', '@')):
            return "'" + val
    return val

# 🌐 Ambil job dari 1 halaman (Sesi 1: Mengumpulkan Link)
def get_freelancer_links(page=1):
    url = f"https://www.freelancer.com/jobs?page={page}"
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        response = requests.get(url, headers=headers, timeout=15)
        if response.status_code != 200:
            print(f"[WARN] Gagal ambil halaman {page}: {response.status_code}")
            return []

        soup = BeautifulSoup(response.text, 'html.parser')
        job_cards = soup.find_all('div', class_='JobSearchCard-item')
        jobs = []

        for card in job_cards:
            title_tag = card.find('a', class_='JobSearchCard-primary-heading-link')
            if not title_tag:
                continue
            title = title_tag.text.strip()
            link = f"https://www.freelancer.com{title_tag['href']}"

            price_tag = card.find('div', class_='JobSearchCard-primary-price')
            bid_info_tag = card.find('div', class_='JobSearchCard-secondary-entry')
            bid_text = price_tag.text if price_tag else (bid_info_tag.text if bid_info_tag else '')
            client_price = re.sub(r'\s+', ' ', bid_text.strip()) or 'N/A'

            # 🎯 Ambil timestamp (jika ada)
            timestamp_tag = card.find('span', class_='JobSearchCard-secondary-entry')
            timestamp = timestamp_tag.text.strip() if timestamp_tag else 'N/A'


            combined_text = f"{title} {client_price} {timestamp}"
            min_budget, is_hourly = extract_price_info(client_price)


            # 🎯 Filter relevansi, deadline, budget (Min price $9), dan hindari hourly
            if is_relevant(combined_text) and not is_deadline_unreasonable(combined_text) and min_budget >= 9 and not is_hourly:
                jobs.append({
                    'title': title,
                    'client_price': client_price,
                    'link': normalize_url(link),
                    'timestamp': timestamp
                })
            else:
                reason = []
                if not is_relevant(combined_text):
                    reason.append("❌ irrelevant keyword")
                if is_deadline_unreasonable(combined_text):
                    reason.append("⚠️ bad deadline")
                if min_budget < 9:
                    reason.append(f"💲 too low (${min_budget})")
                if is_hourly:
                    reason.append("⏱️ hourly job")
                print(f"⛔ Skipped: {title} → {', '.join(reason)}")


        return jobs

    except Exception as e:
        print(f"[ERROR] Halaman {page}: {e}")
        return []

# 💾 Simpan link ke CSV
def save_links_to_csv(jobs):
    if not jobs:
        return

    existing_links = set()
    if os.path.exists(LINKS_CSV):
        with open(LINKS_CSV, newline='', encoding='utf-8-sig') as f:
            reader = csv.DictReader(f)
            for row in reader:
                existing_links.add(normalize_url(row['link']))

    new_entries = 0
    with open(LINKS_CSV, mode='a', newline='', encoding='utf-8-sig') as file:
        writer = csv.DictWriter(file, fieldnames=['title', 'client_price', 'link', 'timestamp'])
        if file.tell() == 0:
            writer.writeheader()
        for job in jobs:
            clean_link = normalize_url(job['link'])
            if clean_link not in existing_links:
                writer.writerow({k: escape_formula(v) for k, v in job.items()})
                existing_links.add(clean_link)
                new_entries += 1

    print(f"✅ {new_entries} link baru disimpan di {LINKS_CSV}")
    return new_entries > 0 # Return True if new entries were added

# 🌐 Ambil deskripsi dari 1 halaman job (Sesi 2: Mengambil Detail)
def scrape_job_detail(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        res = requests.get(url, headers=headers, timeout=15)
        if res.status_code != 200:
            print(f"[WARN] Gagal buka {url}")
            return None
        soup = BeautifulSoup(res.text, 'html.parser')

        # 🎯 Cari tag <p class="Project-description whitespace-pre-line">
        desc_tag = soup.find('p', attrs={'class': 'Project-description whitespace-pre-line'})

        if not desc_tag:
            return None

        return desc_tag.get_text(separator=' ', strip=True)
    except Exception as e:
        print(f"[ERROR] Gagal scrape {url} → {e}")
        return None

# 🚀 Main loop
def main():
    # Sesi 1: Mengumpulkan Link
    print("--- Memulai Sesi 1: Mengumpulkan Link ---")
    new_links_found = False
    for page in range(1, 6):  # 6 halaman pertama
        print(f"🔎 Scraping page {page}...")
        jobs = get_freelancer_links(page)
        if save_links_to_csv(jobs):
            new_links_found = True
        time.sleep(random.uniform(2, 4))

    # Sesi 2: Mengambil Detail
    print("\n--- Memulai Sesi 2: Mengambil Detail Job ---")
    if not new_links_found or not os.path.exists(LINKS_CSV):
        print("[X] Tidak ditemukan link baru atau file CSV tidak ada.")
        return

    with open(LINKS_CSV, newline='', encoding='utf-8-sig') as infile:
        reader = csv.DictReader(infile)
        jobs_to_scrape = list(reader)

    if not jobs_to_scrape:
        print("[X] Tidak ada link untuk di-scrape.")
        return


    with open(OUTPUT_CSV, mode='w', newline='', encoding='utf-8-sig') as outfile:
        fieldnames = ['title', 'client_price', 'link', 'timestamp', 'description']
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()

        for job in jobs_to_scrape:
            print(f"🔍 Mengambil deskripsi: {job['title']}")
            desc = scrape_job_detail(job['link'])
            if desc:
                writer.writerow({
                    'title': job['title'],
                    'client_price': job['client_price'],
                    'link': job['link'],
                    'timestamp': job['timestamp'],
                    'description': desc
                })
            else:
                print(f"⚠️ Deskripsi kosong: {job['link']}")
            time.sleep(random.uniform(2, 4)) # Jeda antar permintaan detail

    print(f"\n✅ Selesai! Hasil disimpan di: {OUTPUT_CSV}")


if __name__ == "__main__":
    while True:
        main()
        print("⏳ Tunggu 300 detik sebelum scraping ulang...\n") # Timer 300 detik
        time.sleep(300) # Timer 300 detik

--- Memulai Sesi 1: Mengumpulkan Link ---
🔎 Scraping page 1...
⛔ Skipped: WordPress Elementor Website Enhancement → ❌ irrelevant keyword
⛔ Skipped: Part-Time Job Listing Creation → ❌ irrelevant keyword
⛔ Skipped: Email Tracking Setup → ❌ irrelevant keyword
⛔ Skipped: Informatica Data Quality (IDQ) Training → ❌ irrelevant keyword
⛔ Skipped: ambalaj tasarımı → ❌ irrelevant keyword
⛔ Skipped: Rotate and Edit Product Videos → ❌ irrelevant keyword
⛔ Skipped: Aspiring Copywriter Focusing on Storytelling → ❌ irrelevant keyword
⛔ Skipped: Cartoonish 2D Train Animation → ❌ irrelevant keyword
⛔ Skipped: Tech Self-Help Book Writer → ❌ irrelevant keyword
⛔ Skipped: Excel Data Entry for Manufacturer → ❌ irrelevant keyword
⛔ Skipped: Architect Needed for Modern Commercial Layout → ❌ irrelevant keyword
⛔ Skipped: Industrial Machinery Drafting → ❌ irrelevant keyword
⛔ Skipped: Experienced PPC Expert for Online Campaign Optimization → ❌ irrelevant keyword
⛔ Skipped: Wordpress Website Development → ❌ ir

KeyboardInterrupt: 