In [None]:
import pathlib
import re
import subprocess
import typing


def is_in_jupyter_notebook() -> bool:
    """Tell whether the code is executing inside an IPython session.

    The check only looks up the name ``get_ipython``: when the lookup succeeds
    the answer is ``True``, when it raises ``NameError`` the answer is ``False``.
    """
    try:
        get_ipython  # Name lookup only; the function is never called here
    except NameError:
        return False
    else:
        return True


def is_on_gcolab() -> bool:
    """Tell whether the notebook is being run on Google Colab.

    Outside of an IPython session the answer is always ``False``; otherwise the
    string form of the object returned by ``get_ipython()`` is searched for
    ``google.colab``.
    """
    return is_in_jupyter_notebook() and "google.colab" in str(get_ipython())


def is_ubuntu_20_04() -> bool:
    """Tell whether the host operating system reports itself as Ubuntu 20.04.

    The answer comes from the ``ID`` (compared case-insensitively) and
    ``RELEASE`` entries returned by ``lsb_release.get_os_release()``.
    """
    # Imported inside the function, so the module is required only when called
    import lsb_release

    os_release = lsb_release.get_os_release()
    detected = (os_release["ID"].lower(), os_release["RELEASE"])

    return detected == ("ubuntu", "20.04")


def setup_ubuntu_20_04() -> None:
    """It sets up a Ubuntu 20.04 container with the Chromium browser

    The steps, in order: it registers the Debian buster repositories, imports
    their signing keys and exports them to the keyrings named in the sources,
    pins the Debian origin so that only `chromium*` packages are preferred
    from it, and finally installs Chromium, its driver and Xvfb.

    The lines starting with `!` are IPython shell escapes, so this function
    can only run inside an IPython session. They write under `/etc/apt` and
    `/usr/share/keyrings` and call `apt-get`; presumably this requires root
    privileges -- TODO confirm.

    For more information, see
    https://github.com/googlecolab/colabtools/issues/3347#issuecomment-1387453484
    """
    # It adds debian buster
    # Each `signed-by` entry points to a keyring that is exported further below
    EOF_debian_buster = """\
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
"""
    # `$EOF_debian_buster` refers to the Python variable defined just above
    !echo "$EOF_debian_buster" > /etc/apt/sources.list.d/debian.list

    # It adds keys
    # First they are received from the keyserver...
    !apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
    !apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
    !apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

    # ...then each one is exported (by the last 8 characters of its id) into
    # the keyring file referenced by the matching source entry
    !apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg
    !apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
    !apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

    # It adds the debian repo for chromium* packages only
    # Note the double-blank lines between entries
    # Priorities: 500 for the `a=eoan` release, 300 for any package coming from
    # deb.debian.org, 700 for `chromium*` packages coming from deb.debian.org
    EOF_chromium_pref = """\
Package: *
Pin: release a=eoan
Pin-Priority: 500


Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300


Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
"""
    !echo "$EOF_chromium_pref" > /etc/apt/preferences.d/chromium.pref

    # It installs the packages
    # The package lists are refreshed first so that the new sources are seen
    !apt-get update
    !apt-get install -y chromium chromium-driver
    !apt-get install -y xvfb


def setup_requirements() -> None:
    requirements = " ".join([
        "PyVirtualDisplay==3.0",  # To run a virtual display
        "undetected-chromedriver==3.2.1",
    ])

    !python3 -m pip install --upgrade pip
    !python3 -m pip install --upgrade $requirements


def get_module_path(module: str) -> typing.Optional[pathlib.Path]:
    """Locate an installed module by asking ``pip show`` where it lives.

    Args:
        module: Name passed to ``pip show``; it is also appended to the
            reported location to build the returned path.

    Returns:
        The resolved ``Location`` joined with ``module``, or ``None`` when
        ``pip show`` exits with a non-zero status or its output holds no
        ``Location:`` line.
    """
    pip_show = subprocess.run(
        ["python3", "-m", "pip", "show", module],
        capture_output=True
    )

    # A non-zero exit status means pip could not describe the package
    if pip_show.returncode != 0:
        return None

    location = re.search(
        "\nLocation: (?P<abspath>.*)\n", pip_show.stdout.decode()
    )

    # No `Location:` line in the output
    if location is None:
        return None

    return pathlib.Path(location.group("abspath")).resolve() / module


def patch_undetected_chromedriver() -> None:
    """It forces `undetected-chromedriver` to run the Chromium webdriver

    For more information, see 
    https://github.com/ultrafunkamsterdam/undetected-chromedriver/issues/108#issuecomment-1170269377
    """
    chromedriver_filename = "chromedriver_linux64.zip"

    src_chromedriver_filepath = ROOT / chromedriver_filename
    dst_chromedriver_filepath = pathlib.Path("/tmp") / chromedriver_filename

    !zip -j "$src_chromedriver_filepath" /usr/bin/chromedriver

    module = "undetected_chromedriver"
    module_path = get_module_path(module)

    patcher_filepath = module_path / "patcher.py"

    with patcher_filepath.open("rt") as f:
        contents = f.read()

    src = f"'file://{src_chromedriver_filepath}'"
    dst = f"'{dst_chromedriver_filepath}'"

    # It is forced to use the local webdriver
    contents = contents.replace(
        f"return urlretrieve(u)[0]",
        f"return urlretrieve({src}, filename={dst})[0]"
    )

    with patcher_filepath.open("wt") as f:
        f.write(contents)


def setup_container() -> None:
    """Prepare the running container for the scraper.

    The Ubuntu 20.04 system setup runs first, and only when that release is
    detected; the Python requirements and the `undetected-chromedriver` patch
    always follow, in that order.
    """
    steps = [setup_requirements, patch_undetected_chromedriver]

    if is_ubuntu_20_04():
        steps.insert(0, setup_ubuntu_20_04)

    for step in steps:
        step()


# It stops the cell here unless the notebook is being run on Google Colab
assert is_on_gcolab(), "It seems you are not on Google Colab"


# Resolved current working directory; `patch_undetected_chromedriver` writes
# the webdriver archive here and the output folder is created under it
ROOT = pathlib.Path().resolve()
# Marker file: its presence means the container setup has already been done
anchor = ROOT / "anchor.txt"


# The setup runs only when the marker is missing, and the marker is created
# only after the setup has returned, so a failed setup is retried on re-run
if not anchor.exists():
    setup_container()
    anchor.touch()

In [None]:
import pathlib
import time
from dataclasses import dataclass, asdict
from datetime import datetime, timedelta

import bs4
import pandas as pd
import pytz
import pyvirtualdisplay
import undetected_chromedriver.v2 as uc
from bs4 import BeautifulSoup as BS
from joblib import delayed, Parallel
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [None]:
# Timezone in which dates and timestamps are localized
# (Bidoo's timezone, according to `current_localized_date`)
CET = pytz.timezone("CET")

In [None]:
@dataclass
class Metadata:
    """A class that maps the metadata of a Bidoo closed auction

    Instances are built by `htmlparse_metadata` from one auction node of the
    closed-auctions page; the comments on the fields tell where each value is
    read from in that function.
    """
    # Integer parsed from the part of `auction_src` that follows its last "_"
    auction: int
    # `referer` followed by the `href` of the product link
    auction_src: str
    # `src` attribute of the auction image
    product_image: str
    # Text of the product link
    product: str
    # `data-utime` attribute converted to an integer; it is handed to
    # `datetime.fromtimestamp` by `filter_metadata_by_date`
    timestamp: int
    # Text of the winner's username node
    winner_username: str

In [None]:
def current_localized_date() -> datetime.date:
    """Return today's calendar date as seen in Bidoo's timezone (`CET`)."""
    return datetime.now().astimezone(CET).date()

In [None]:
def htmlparse_metadata(referer: str, auction: bs4.Tag) -> typing.Optional[Metadata]:
    """Extract the metadata of one closed auction from its HTML node.

    Args:
        referer: Prefix prepended to the product link to build `auction_src`.
        auction: Node of a single closed auction.

    Returns:
        The parsed `Metadata`, or `None` when anything goes wrong while reading
        the node (a missing element or attribute, a non-numeric value, ...).
        The parsing is best-effort on purpose: callers drop the `None` entries.
    """
    try:
        utime = auction.find("abbr", {"data-utime": True})["data-utime"]

        row = auction.find(class_="row")
        image = row.select_one("a.closed-auction-img > img")
        link = row.select_one(".media-heading > a")
        winner = row.select_one(".username > span")

        href = link["href"]
        url = f"{referer}{href}"

        return Metadata(
            auction=int(url.split("_")[-1]),  # Id is the suffix after the last "_"
            auction_src=url,
            product_image=image["src"],
            product=link.text,
            timestamp=int(utime),
            winner_username=winner.text,
        )
    except Exception:
        return None

In [None]:
def filter_metadata_by_date(
    auctions: typing.List[Metadata], reference: datetime.date
) -> typing.List[Metadata]:
    """Keep the auctions whose timestamp falls on `reference` in the `CET` timezone.

    Args:
        auctions: Metadata to be filtered; the input order is preserved.
        reference: Calendar date each localized timestamp is compared with.

    Returns:
        A new list with the matching entries only.
    """
    return [
        metadata
        for metadata in auctions
        if datetime.fromtimestamp(metadata.timestamp, CET).date() == reference
    ]

In [None]:
def scrape_closed_auctions_metadata(
    endpoint: str,
    referer: str,
    timeout_secs: float,
    webdriver: uc.Chrome,
    **joblib_kwargs
) -> typing.List[typing.Optional[Metadata]]:
    """Load one page of closed auctions and parse every auction found on it.

    Args:
        endpoint: URL of the page to be loaded.
        referer: Prefix forwarded to `htmlparse_metadata`.
        timeout_secs: Longest wait for the `div.list.media.loading` element
            to become visible.
        webdriver: Browser used to load the page.
        **joblib_kwargs: Options forwarded to `joblib.Parallel`.

    Returns:
        One entry per `div.data_offset` node, as returned by
        `htmlparse_metadata` (so `None` for the nodes that failed to parse).
    """
    webdriver.get(endpoint)

    locator = (By.CSS_SELECTOR, "div.list.media.loading")
    wait = WebDriverWait(webdriver, timeout_secs)
    wait.until(EC.visibility_of_element_located(locator))

    # The page is parsed from a snapshot of the source taken after the wait
    soup = BS(webdriver.page_source, "html.parser")
    nodes = soup.find_all("div", class_="data_offset")

    parse = delayed(htmlparse_metadata)
    run_all = Parallel(**joblib_kwargs)

    return run_all(parse(referer, node) for node in nodes)

In [None]:
def scraping_closed_auctions_metadata(
    closed_auctions_endpoint: str,
    output_location: pathlib.Path,
    referer: str,
    sleep_secs: float,
    timeout_secs: float,
    **joblib_kwargs
) -> None:
    """Scrape the auctions closed yesterday and store them as one CSV per page.

    "Yesterday" is the current date in the `CET` timezone minus one day. Pages
    are requested one after the other, starting from page 1, in a Chrome
    instance running on a virtual display. The auctions of a page that closed
    on that date are written to `<output_location>/<date>/<page>.csv`; pages
    whose file already exists are not requested again.

    The loop ends at the first page where some parsed auction does not belong
    to that date, or where no auction could be parsed at all.

    Args:
        closed_auctions_endpoint: Path of the closed-auctions page, appended
            to `referer` after a slash.
        output_location: Folder under which the per-date folder is created.
        referer: Base URL of the website.
        sleep_secs: Pause between two consecutive page requests.
        timeout_secs: Longest wait for a page to show its content.
        **joblib_kwargs: Options forwarded to `joblib.Parallel`.
    """
    shard_date = current_localized_date() - timedelta(days=1)  # It runs after midnight

    shard_location = output_location / str(shard_date)
    shard_location.mkdir(parents=True, exist_ok=True)

    base_endpoint = f"{referer}/{closed_auctions_endpoint}"

    options = uc.ChromeOptions()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    with pyvirtualdisplay.Display(visible=0, size=(800, 600)):
        webdriver = uc.Chrome(options=options)

        try:
            finished = False
            page = 0

            while not finished:
                page += 1
                page_filepath = shard_location / f"{page}.csv"

                # A page saved by a previous run is skipped without any request
                if page_filepath.exists():
                    continue

                endpoint = f"{base_endpoint}?_={page}"

                shard = scrape_closed_auctions_metadata(
                    endpoint, referer, timeout_secs, webdriver, **joblib_kwargs
                )

                # The nodes that failed to parse come back as None
                shard = [metadata for metadata in shard if metadata is not None]

                filtered = filter_metadata_by_date(shard, shard_date)

                if filtered:
                    # The rows come from `filtered`: iterating over the
                    # `finished` flag (a bool) raised a TypeError here
                    frame = pd.DataFrame([asdict(metadata) for metadata in filtered])
                    frame.to_csv(page_filepath, index=False)

                # An empty page also ends the run: `0 < 0` would never be true
                # and the loop would keep requesting pages forever.
                # NOTE(review): a page mixing auctions of `shard_date` with
                # auctions of any other date (including those closed after
                # midnight today) ends the run too -- confirm that the order
                # of the listing makes this safe
                finished = not shard or len(filtered) < len(shard)

                if not finished:
                    time.sleep(sleep_secs)
        finally:
            webdriver.quit()

In [None]:
# Options forwarded, through the scraping functions, to `joblib.Parallel`
joblib_kwargs = dict(backend="threading", n_jobs=-1)

In [None]:
# Settings of the scraping run; the joblib options are merged in because
# `scraping_closed_auctions_metadata` takes them as extra keyword arguments
kwargs = dict(
    closed_auctions_endpoint="closed_auctions.php",
    output_location=ROOT / "output",
    referer="https://it.bidoo.com",
    sleep_secs=30.,
    timeout_secs=30.,
    **joblib_kwargs
)

In [None]:
# It launches the scraping run with the settings collected in `kwargs`
scraping_closed_auctions_metadata(**kwargs)