In [15]:
import csv
import json
import logging
import os
import random
import re
import time

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities  # Import for network capture
from selenium.webdriver.edge.options import Options
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Pull settings from a local .env file (if present) into the process environment
# so the os.getenv() calls below can see them.
load_dotenv()

# --- Configuration ---
BASE_URL = "https://www.fiverr.com/"  # Fiverr homepage; the only page the browser loads
SEARCH_QUERY = "Artificial Intelligence"  # (Currently not used due to robots.txt)
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 11.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
# Tunables read from the environment; the second argument is the fallback value.
NUM_PAGES_TO_SCRAPE = int(os.getenv("NUM_PAGES", 1))
OUTPUT_FORMAT = os.getenv("OUTPUT_FORMAT", "json")
MIN_DELAY = int(os.getenv("MIN_DELAY", 3))  # lower bound, in seconds, of the pause before each HTTP request
MAX_DELAY = int(os.getenv("MAX_DELAY", 7))  # upper bound, in seconds, of that pause

# --- WebDriver location ---
# Path to the Microsoft Edge WebDriver executable (the constant keeps its historical
# "CHROMEDRIVER" name so existing references continue to work). Set EDGEDRIVER_PATH
# in the environment or .env file to override; the default is a raw string so the
# single backslashes of the Windows path are taken literally.
CHROMEDRIVER_PATH = os.getenv(
    "EDGEDRIVER_PATH",
    r'C:\Users\Anshd\Downloads\edgedriver_win64\msedgedriver.exe',
)

# --- Logging Setup ---
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")

# logging.getLogger() returns the same logger object every time this cell is run,
# so handlers attached by a previous run are still present. Drop (and close) them
# first; otherwise every record is emitted once per earlier run of the cell.
for _stale_handler in list(logger.handlers):
    logger.removeHandler(_stale_handler)
    _stale_handler.close()

console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter)

file_handler = logging.FileHandler("fiverr_scraper.log")
file_handler.setFormatter(formatter)

logger.addHandler(console_handler)
logger.addHandler(file_handler)


def log_error(message):
    """Log ``message`` at ERROR level on the module logger.

    Args:
        message: Text (or any object accepted by ``logger.error``) to record.
    """
    logger.error(message)

# --- Verify Driver Path ---
# Report up front whether the configured WebDriver executable is present on disk.
print(
    f"Driver found at: {CHROMEDRIVER_PATH}"
    if os.path.exists(CHROMEDRIVER_PATH)
    else f"Driver not found at: {CHROMEDRIVER_PATH}"
)

Driver found at: C:\Users\Anshd\Downloads\edgedriver_win64\msedgedriver.exe


In [16]:
class Scraper:
    """Fetches pages with a randomized delay and a hand-rolled robots.txt filter.

    Also offers a helper that pulls a response body out of the browser's
    performance log for a Selenium driver.
    """

    # Path fragments treated as disallowed. Presumably mirrored by hand from
    # Fiverr's robots.txt -- verify against the live file before relying on it.
    # A "*" inside a fragment is a wildcard for any run of characters.
    DISALLOWED_PATTERNS = (
        "/orders/timeline/",
        "/pinned_flashes/",
        "/gigs/*/share/",
        "/specials/",
        "/packages/",
        "/categories/silly",
        "/categories/fifa",
        "/categories/Halloween",
        "/categories/Postcards",
        "/purchases",
        "/user_sessions",
        "/users/",
        "/counter/",
        "/collaborate/",
        "/search/",
        "/search_results/gigs/",
        "/match/website/",
        "/pages/website-developer-match",
        "/v4/",
        "/pro/",
        "/about-pro",
        "/pro-resources",
        "/pro-solutions",
        "/gigs/search",
        "/recommendations/",
        "/contact_me/",
        "/conversations/",
        "/bookmarks/",
        "/inbox/",
        "/seller_onboarding/",
        "/checkout/package/",
        "/match/",
        "/cdn-proxy/px/",
        "/cdn-proxy/pim/",
        "/search_results/",
        "/custom_orders/",
        "/studios/",
        "/v1/stats",
        "/v1/browser-performance",
        "/assets/shared/",
        "/logo-maker/brief/",
        "/logo-maker/choose-variation/",
        "/logo-maker/wordpress",
        "/logo-maker/woo",
        "/api/v1/activities",
    )

    # Each fragment compiled to a regex: literal pieces are escaped and every
    # "*" becomes ".*", so "/gigs/*/share/" matches "/gigs/123/share/". A plain
    # substring test can never match a fragment containing "*".
    _DISALLOWED_REGEXES = tuple(
        re.compile(".*".join(re.escape(piece) for piece in pattern.split("*")))
        for pattern in DISALLOWED_PATTERNS
    )

    def __init__(self, base_url, headers, min_delay, max_delay, timeout=30):
        """Store the scraping settings.

        Args:
            base_url: Site root URL (stored; not used by the methods in this class).
            headers: Mapping of HTTP headers sent with every ``fetch_page`` request.
            min_delay: Lower bound, in seconds, of the random pause before a request.
            max_delay: Upper bound, in seconds, of the random pause before a request.
            timeout: Seconds ``requests`` may wait on the server before giving up,
                so a stalled connection cannot hang the scraper indefinitely.
        """
        self.base_url = base_url
        self.headers = headers
        self.min_delay = min_delay
        self.max_delay = max_delay
        self.timeout = timeout

    def fetch_page(self, url):
        """Download ``url`` and return the raw response body.

        The URL is first checked with ``is_allowed_by_robots_txt``; a random
        pause between ``min_delay`` and ``max_delay`` seconds precedes the request.

        Args:
            url: Absolute URL to fetch.

        Returns:
            The response body as ``bytes``, or ``None`` when the URL is
            disallowed or the request fails (the failure is logged).
        """
        try:
            # Basic robots.txt check
            if not self.is_allowed_by_robots_txt(url):
                log_error(f"Access to {url} is disallowed by robots.txt")
                return None

            time.sleep(random.uniform(self.min_delay, self.max_delay))
            # A timeout is required: without one, requests waits forever on a
            # server that accepts the connection but never answers.
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
            return response.content
        except requests.exceptions.RequestException as e:
            log_error(f"Error fetching {url}: {e}")
            return None

    def is_allowed_by_robots_txt(self, url):
        """Return whether ``url`` passes the built-in disallow list.

        This is a very basic check: a URL is rejected when any entry of
        ``DISALLOWED_PATTERNS`` occurs anywhere in it ("*" matching any run of
        characters). No robots.txt file is downloaded or parsed.

        Args:
            url: URL string to test.

        Returns:
            ``False`` if the URL matches a disallowed pattern or is a paginated
            URL rejected by the ``page=`` rule below; ``True`` otherwise.
        """
        if any(regex.search(url) for regex in self._DISALLOWED_REGEXES):
            return False

        # Paginated URLs are rejected unless the URL ends with "page=2".
        # NOTE(review): the reason only page 2 is accepted is not visible here -- confirm.
        if "page=" in url and not url.endswith("page=2"):
            return False

        return True

    def parse_project_listings(self, html_content):
        """Placeholder parser for project listings; always returns an empty list.

        Not used at present because data comes from the ``/g?payload`` URL
        rather than from page HTML.

        Args:
            html_content: Page HTML (ignored).

        Returns:
            An empty list.
        """
        return []

    def scrape_categories(self, category_urls):
        """Placeholder for category scraping; always returns an empty list.

        Category URLs are not scraped directly; a GIF response that carries the
        data is fetched instead, and extraction from it is still to be written.

        Args:
            category_urls: Category URLs (ignored).

        Returns:
            An empty list.
        """
        return []

    def capture_network_requests(self, driver, target_url_part):
        """Return the body of the first logged response whose URL contains ``target_url_part``.

        Reads the driver's "performance" log, looks for
        ``Network.responseReceived`` events and asks the browser for the body
        of the first matching one via the ``Network.getResponseBody`` command.

        Args:
            driver: Selenium driver with performance logging enabled.
            target_url_part: Substring to look for in the response URL.

        Returns:
            The ``"body"`` field of the command result, or ``None`` when no
            response matches or the body of the first match cannot be retrieved
            (the error is logged and later matches are not tried).
        """
        performance_logs = driver.get_log("performance")

        for entry in performance_logs:
            # Each log entry wraps the actual event as a JSON string under "message".
            log = json.loads(entry["message"])["message"]
            if (
                log["method"] == "Network.responseReceived"
                and target_url_part in log["params"]["response"]["url"]
            ):
                request_id = log["params"]["requestId"]
                try:
                    response_body = driver.execute_cdp_cmd("Network.getResponseBody", {"requestId": request_id})
                    return response_body["body"]
                except Exception as e:
                    log_error(f"Error getting response body: {e}")
                    return None
        return None

In [17]:
class DataHandler:
    """Writes scraped records to disk as JSON or CSV."""

    def __init__(self, output_format="json", output_filename="fiverr_ai_projects"):
        """Remember the output settings.

        Args:
            output_format: ``"json"`` or ``"csv"``; anything else is reported
                as an error when ``save_data`` is called.
            output_filename: Output path without extension; ``.json`` or
                ``.csv`` is appended.
        """
        self.output_format = output_format
        self.output_filename = output_filename

    def save_data(self, data):
        """Save ``data`` to ``<output_filename>.<output_format>``.

        Args:
            data: For JSON, any JSON-serializable object. For CSV, a sequence
                of dicts; the header is the union of all row keys in first-seen
                order, and a row lacking a key gets an empty cell for it.

        I/O errors, type errors and value errors (including an unsupported
        ``output_format``) are logged rather than raised.
        """
        try:
            if self.output_format == "json":
                with open(f"{self.output_filename}.json", "w", encoding="utf-8") as f:
                    json.dump(data, f, indent=4, ensure_ascii=False)
                logger.info(f"Data successfully saved to {self.output_filename}.json")
            elif self.output_format == "csv":
                # Collect columns from every row, not just the first: DictWriter
                # raises ValueError on a row holding a key missing from
                # fieldnames, which would abort after the file was truncated.
                # dict.fromkeys de-duplicates while keeping first-seen order.
                fieldnames = list(dict.fromkeys(key for row in data for key in row.keys()))
                with open(f"{self.output_filename}.csv", "w", newline="", encoding="utf-8") as f:
                    writer = csv.DictWriter(f, fieldnames=fieldnames)
                    writer.writeheader()
                    writer.writerows(data)
                logger.info(f"Data successfully saved to {self.output_filename}.csv")
            else:
                raise ValueError(f"Unsupported output format: {self.output_format}")
        except (IOError, TypeError, ValueError) as e:
            log_error(f"Error saving data: {e}")

In [18]:
def get_fiverr_category_urls(base_url, headers):
    """Open ``base_url`` in headless Edge and wait for the category tree to render.

    The intended next step -- capturing the ``/g?payload=...`` request and
    saving its GIF data -- is not implemented here yet, so no file paths are
    produced.

    Args:
        base_url: URL to load in the browser.
        headers: Mapping with a ``"User-Agent"`` key; that value is passed to
            the browser as its user-agent string.

    Returns:
        A list of saved file paths. Currently always ``[]``: either one of the
        waits failed (the error is logged) or the capture step is still missing.
    """
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument(f"user-agent={headers['User-Agent']}")

    # Add options to enable network performance logging
    # NOTE(review): this is the Chrome-prefixed capability name; confirm that
    # msedgedriver honours it (Edge also defines "ms:loggingPrefs").
    options.set_capability("goog:loggingPrefs", {"performance": "ALL"})

    service = Service(executable_path=CHROMEDRIVER_PATH)
    driver = webdriver.Edge(service=service, options=options)

    # Conditions awaited in order (up to 30 s each), with the message printed on
    # success and the prefix logged on failure.
    wait_steps = (
        (
            # Wait for the iframe to be available and switch to it
            EC.frame_to_be_available_and_switch_to_it((By.NAME, "ftrif")),
            "Switched to iframe successfully.",
            "Error switching to iframe",
        ),
        (
            # Wait for a placeholder element inside the iframe
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.iy5y630.iy5y632q.iy5y631ko")),
            "Placeholder element found within iframe.",
            "Error waiting for placeholder element",
        ),
        (
            # Wait for any a tag with data-testid category_tree to appear
            EC.presence_of_element_located((By.CSS_SELECTOR, "a[data-testid='category_tree']")),
            "Category links found within iframe.",
            "Error waiting for category links",
        ),
    )

    # try/finally guarantees the browser process is shut down on every exit
    # path, including the success path and an exception from driver.get().
    try:
        driver.get(base_url)

        for condition, success_message, error_prefix in wait_steps:
            try:
                WebDriverWait(driver, 30).until(condition)
                print(success_message)
            except Exception as e:
                log_error(f"{error_prefix}: {e}")
                return []

        # Capture network requests to get the GIF data
        target_url_part = "/g?payload="  # Part of the request URL to look for
        # TODO: capture the matching response and save the GIF; until then
        # report "nothing saved" explicitly instead of falling off the end.
        return []
    finally:
        driver.quit()

In [19]:
def main():
    """Entry point: run the Fiverr fetch step and log what it produced."""
    logger.info("Starting the Fiverr AI project scraper...")

    # The fetch step yields saved GIF files rather than category URLs.
    saved_gifs = get_fiverr_category_urls(BASE_URL, HEADERS)

    if saved_gifs:
        # Extraction of data from the GIFs is still to be implemented;
        # for now only report where each one was saved.
        for saved_gif in saved_gifs:
            logger.info(f"GIF data saved to {saved_gif}")

        logger.info("Scraping finished.")
    else:
        logger.error("Could not retrieve GIF data. Exiting.")

In [20]:
# Run the scraper when this cell/script is executed directly. In this notebook
# session __name__ is "__main__" (it appears as the logger name in the output
# below), so executing the cell calls main().
if __name__ == "__main__":
    main()

2025-01-09 23:41:54,125 - __main__ - INFO - Starting the Fiverr AI project scraper...
2025-01-09 23:41:54,125 - __main__ - INFO - Starting the Fiverr AI project scraper...
2025-01-09 23:41:54,125 - __main__ - INFO - Starting the Fiverr AI project scraper...
2025-01-09 23:41:54,125 - __main__ - INFO - Starting the Fiverr AI project scraper...
2025-01-09 23:42:29,083 - __main__ - ERROR - Error switching to iframe: Message: 
Stacktrace:
	(No symbol) [0x00007FF7D8246B05]
	Microsoft::Applications::Events::EventProperty::empty [0x00007FF7D856F464+1437348]
	sqlite3_dbdata_init [0x00007FF7D8612D96+643174]
	(No symbol) [0x00007FF7D816C9DB]
	(No symbol) [0x00007FF7D816CAE3]
	(No symbol) [0x00007FF7D81A92F7]
	(No symbol) [0x00007FF7D818C1DF]
	(No symbol) [0x00007FF7D8163437]
	(No symbol) [0x00007FF7D81A6BFF]
	(No symbol) [0x00007FF7D818BE03]
	(No symbol) [0x00007FF7D8162984]
	(No symbol) [0x00007FF7D8161E30]
	(No symbol) [0x00007FF7D8162571]
	Microsoft::Applications::Events::EventProperty::empty 