# BidooBot: Data scraping

## Website analysis

### Requirements

In [None]:
# Python packages used by the scraping cells below.
!pip install selenium
!pip install cloudscraper

# System package providing the chromedriver binary for Selenium.
!apt-get update
!apt install chromium-chromedriver

# Copy the chromedriver binary into /usr/bin
# (presumably so that it can be found on PATH - confirm on the target runtime).
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

In [None]:
# Project folders: everything lives under DIR_root, with one sub-folder
# for data files and one for log files.
DIR_root = '/content/drive/MyDrive/Colab Notebooks/bidoobot'
DIR_data = DIR_root + '/data'
DIR_logs = DIR_root + '/logs'

In [None]:
# Folder holding the closed-auction CSV files written by this notebook.
DIR_data_raw = DIR_data + '/raw'

Prevent Colab from disconnecting due to inactivity (after about 30 minutes) by running the cell below, which injects JavaScript that clicks a page element every 60 seconds.

In [None]:
import IPython

# JavaScript snippet that schedules `preventDisconnect` every T seconds;
# each call logs a message and clicks the element matching "#comments > span".
JS_prevent_disconnect = '''
T = 60; // Seconds

function preventDisconnect() {
  console.log("Clicking some button...");
  document.querySelector("#comments > span").click()
}

setInterval(preventDisconnect, T*1000);
'''

# Left as the last expression of the cell so that the Javascript object
# is displayed as the cell output.
IPython.display.Javascript(JS_prevent_disconnect)

### Closed auctions

Collect data from closed auctions on https://it.bidoo.com/

In [None]:
import os
import sys
import threading
import time
import typing as t
from dataclasses import dataclass
from datetime import datetime
from multiprocessing import Manager

import cloudscraper
import pandas as pd
import pytz
from bs4 import BeautifulSoup as bs
from joblib import Parallel
from joblib import delayed

In [None]:
sys.path.insert(0, '/usr/lib/chromium-browser/chromedriver')

from selenium import webdriver

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [None]:
# Base URL of the site; endpoints and relative auction links are appended to it.
SITE_referer = 'https://it.bidoo.com'

In [None]:
# Period of the scraping loop: one scrape of the closed-auctions page
# is started every T_min_Delta minutes.
T_min_Delta = 5 # Minutes

In [None]:
# Bid economics used to derive fields that are not read from the page:
#   BID_cost  - multiplied by the winner's bid count to get the winner's expenses
#   BID_raise - divisor used when deriving the number of bids from the final price
#   BID_start - offset subtracted from the final price in that same derivation
BID_cost  = 0.13 # EUR
BID_raise = 0.01 # EUR
BID_start = 0.01 # EUR

In [None]:
# Scraping window, expressed as hours of the day in the AUCTION_tz timezone.
# The scraping loop stops when the current hour is < AUCTION_h_start
# or > AUCTION_h_end.
# NOTE(review): the current hour is always in 0..23, so with AUCTION_h_end = 24
# the upper bound is never exceeded; the loop ends once the hour wraps to 0.
AUCTION_h_start = 12
AUCTION_h_end   = 24
AUCTION_tz      = 'CET'

In [None]:
@dataclass(eq=False)
class Auction:
    """
    Record describing one closed auction.

    The field order below is also the positional order of the constructor
    arguments and the key order of `vars(auction)`, which determines the
    column order when instances are turned into a DataFrame.

    `eq=False` keeps the default identity-based `==` and hashing, i.e. the
    same comparison semantics as a plain class with a hand-written
    `__init__`.
    """
    URL: str                    # Absolute URL of the auction page
    bid: float                  # Final auction price
    currency: str               # Currency code of the monetary fields
    id: int                     # Numeric auction id (trailing part of the URL)
    image_URL: str              # URL of the product image
    n_bids: int                 # Total number of bids, derived from the final price
    name: str                   # Product name
    shipping_expenses: float    # Shipping cost (0.0 when shipping is free)
    timestamp: float            # UNIX timestamp attached to the closed auction
    timer: int                  # Auction timer value shown on the auction page
    value: float                # Declared product value
    winner_expenses: float      # Shipping cost plus the cost of the winner's bids
    winner_modality: str        # Bidding modality of the first auction-history row
    winner_n_bids: int          # Number of bids placed by the winner
    winner_savings_pct: float   # 1 - winner_expenses / value
    winner_username: str        # Username of the winner

In [None]:
def get_current_datetime() -> datetime:
    """
    Return the present moment as a datetime expressed in the
    `AUCTION_tz` timezone.
    """
    return datetime.now().astimezone(pytz.timezone(AUCTION_tz))

In [None]:
def get_current_date() -> str:
    """
    Return today's date in the auction timezone, formatted as YYYYMMDD.
    """
    return '{:%Y%m%d}'.format(get_current_datetime())

In [None]:
def get_current_time() -> str:
    """
    Return the current wall-clock time in the auction timezone,
    formatted as HH:MM:SS.
    """
    return '{:%H:%M:%S}'.format(get_current_datetime())

In [None]:
def get_current_hour() -> int:
    """
    Return the current hour of the day (0-23) in the auction timezone.
    """
    return get_current_datetime().hour

In [None]:
# CSV file collecting the closed auctions; the date in its name is
# evaluated once, when this cell runs.
PATH_closed_auctions = f'{DIR_data_raw}/closed_auctions_{get_current_date()}.csv'

In [None]:
# Log files (one for regular activity, one for errors); the date in each
# name is evaluated once, when this cell runs.
PATH_log_access = f'{DIR_logs}/access_{get_current_date()}.log'

PATH_log_error = f'{DIR_logs}/error_{get_current_date()}.log'

Cloudscraper HTTP session, configured to present itself as desktop Chrome on Windows

In [None]:
# Shared HTTP session used to download the closed-auctions listing.
WEB_scraper = cloudscraper.create_scraper(
    browser=dict(browser='chrome', platform='windows', desktop=True),
)

Options shared by every Selenium Chrome WebDriver instance

In [None]:
# Command-line switches passed to every Chrome instance started by Selenium.
options = webdriver.ChromeOptions()

for flag in ('--headless',
             '--no-sandbox',
             '--disable-dev-shm-usage'):
    options.add_argument(flag)
# options.add_argument("--disable-extensions")

In [None]:
def extract_auction(drivers: t.Dict, r: str) \
                   -> t.Optional[Auction]:
    """
    Build an `Auction` from the HTML of one closed-auction entry.

    The entry itself provides the timestamp, image URL, product name,
    auction URL, id and winner username (parsed with BeautifulSoup).
    The remaining fields are read from the auction's own page, which is
    loaded with a Selenium Chrome driver.

    Parameters
    ----------
    drivers:
        Mapping from thread name to Chrome driver. It is mutated: a driver
        is created and stored for the calling thread the first time that
        thread needs one. Drivers are not closed here; the caller owns them.
    r:
        HTML markup of a single closed-auction entry.

    Returns
    -------
    The populated `Auction`, or `None` if parsing the entry or reading the
    auction page raises. A failure to start Chrome is not swallowed and
    propagates to the caller.
    """
    # Part 1: fields available in the listing entry (no browser needed).
    try:
        auction_HTML = bs(r, 'html.parser')

        # 01. UNIX timestamp
        timestamp_HTML = auction_HTML.find('abbr', {'data-utime': True})
        timestamp = float(timestamp_HTML['data-utime'])

        row_HTML = auction_HTML.find(class_='row')

        # 02. Product image URL
        image_URL = row_HTML.select_one('a.closed-auction-img > img')['src']

        # 03. Product name
        name_HTML = row_HTML.select_one('.media-heading > a')
        name = name_HTML.text

        # 04. URL
        URL = '{}{}'.format(SITE_referer, name_HTML['href'])

        # 05. Id (the part of the URL after the last underscore)
        id = int(URL.split('_')[-1])

        # 06. Winner username
        winner_username = row_HTML.select_one('.username > span').text
    except Exception:
        return None

    # Part 2: fields available only on the auction page (Chrome webdriver).

    # One driver per worker thread, keyed by the thread's name.
    T_id = threading.current_thread().name

    driver = drivers.get(T_id)
    if driver is None:
        # Only keyword arguments: recent Selenium releases no longer accept
        # the driver path as first positional argument. The chromedriver
        # binary is expected to be discoverable (e.g. on PATH).
        driver = webdriver.Chrome(options=options)
        drivers[T_id] = driver

    try:
        driver.get(URL)
        wait = WebDriverWait(driver, 10)

        def visible_text(css: str) -> str:
            # Wait until the first element matching `css` is visible and
            # return its text (the wait returns the located element).
            element = wait.until(
                EC.visibility_of_element_located((By.CSS_SELECTOR, css)))
            return element.text

        def leading_token(text: str) -> str:
            # "12,34 EUR"-like text -> "12.34"
            return text.replace(',', '.').split(' ')[0]

        # 07. Bid
        bid = float(leading_token(visible_text('.auction-action-price')))

        # 08. Number of bids
        # Rounded before truncating: float division can land just below the
        # integer (e.g. 0.29/0.01 -> 28.999999999999996), which a bare int()
        # would turn into an off-by-one count.
        n_bids = int(round((bid - BID_start) / BID_raise))

        # 09. Product value
        value = float(leading_token(visible_text('.product-value.hidden-xs')))

        # 10. Shipping expenses
        shipping_expenses = leading_token(visible_text('.text-success')).strip()

        if shipping_expenses == 'Gratuite': # Free shipping expenses
            shipping_expenses = 0.0
        else:
            shipping_expenses = float(shipping_expenses)

        # 11. Winner bids
        winner_n_bids = int(visible_text('.bids-used > span'))

        # 12. Winner expenses
        winner_expenses = shipping_expenses + winner_n_bids*BID_cost

        # 13. Winner savings %
        # A zero product value raises here and the auction is discarded
        # by the handler below.
        winner_savings_pct = 1. - winner_expenses / value

        # 14. Timer
        timer = int(
            visible_text('.auction-action-timer > p > strong').split(' ')[0])

        # 15. Winner modality (first row of the auction history table)
        winner_modality = visible_text(
            '.auction-history > table > tbody > tr > td.tB')

        # TODO: Auction history

        # 16. Currency
        # TODO: Currency with Babel library
        currency = 'EUR'

        return Auction(URL=URL,
                       bid=bid,
                       currency=currency,
                       id=id,
                       image_URL=image_URL,
                       n_bids=n_bids,
                       name=name,
                       shipping_expenses=shipping_expenses,
                       timestamp=timestamp,
                       timer=timer,
                       value=value,
                       winner_expenses=winner_expenses,
                       winner_modality=winner_modality,
                       winner_n_bids=winner_n_bids,
                       winner_savings_pct=winner_savings_pct,
                       winner_username=winner_username)
    except Exception:
        # Timeouts, missing elements and unparsable values all mean the
        # auction cannot be described completely: report it as lost.
        return None

In [None]:
def scrape_closed_auctions(scraper: cloudscraper.CloudScraper,
                           n: int = -1) \
                          -> t.List[Auction]:
    """
    Download the closed-auctions page and extract every listed auction.

    Parameters
    ----------
    scraper:
        Session used to download the listing page.
    n:
        Maximum number of listing entries to process; any value below 0
        (the default) processes all of them.

    Returns
    -------
    One item per processed entry, in listing order. An item is `None`
    when `extract_auction` could not build the auction for that entry.

    Every Selenium driver opened by the worker threads is quit before
    returning, also when the extraction raises.
    """
    # Closed auctions endpoint
    SITE_endpoint = 'closed_auctions.php'

    URL = '{}/{}'.format(SITE_referer,
                         SITE_endpoint)

    r = scraper.get(URL)
    soup = bs(r.content, 'html.parser')

    auctions_HTML = soup.find_all('div', class_='data_offset')

    if n > -1:
        auctions_HTML = auctions_HTML[:n]

    WEB_drivers = {} # Selenium drivers, filled in by the worker threads

    try:
        auctions = Parallel(n_jobs=-1, backend="threading")(
            delayed(extract_auction)(WEB_drivers,
                                     str(a_HTML))
            for a_HTML in auctions_HTML
        )
    finally:
        # Always release the browsers, otherwise a failed scrape leaves
        # Chrome processes running until the runtime is restarted.
        for driver in WEB_drivers.values():
            driver.quit()

    return auctions

In [None]:
def write2file(f: t.TextIO,
               msg: str,
               print_: bool = True,
               newline: bool = True,
               flush_: bool = True):
    """
    Write `msg` to the open text file `f`.

    Parameters
    ----------
    f:       Writable text stream.
    msg:     Text to write.
    print_:  Also echo `msg` with `print` (before writing).
    newline: Append a line feed to the text written to `f`.
    flush_:  Flush `f` after writing.
    """
    if print_:
        print(msg)

    f.write((msg + '\n') if newline else msg)

    if flush_:
        f.flush()

Scrape closed auctions every `T_min_Delta` minutes

In [None]:
# Periodic scraping loop.
#
# Every `T_min_Delta` minutes the closed-auctions page is scraped, the new
# auctions are merged into `df_auctions` (indexed by auction id) and the
# whole table is written to `PATH_closed_auctions`. The loop ends when the
# current hour falls outside [AUCTION_h_start, AUCTION_h_end].

df_auctions = None
t_Delta = T_min_Delta*60 # Seconds between the start of two scrapes

# Resume from the auctions already saved, if the CSV file exists.
if os.path.isfile(PATH_closed_auctions):
    df_auctions = pd.read_csv(
        PATH_closed_auctions,
        header='infer',
        index_col='id'
        )

with open(PATH_log_access, 'a') as f_log_access, \
        open(PATH_log_error, 'a') as f_log_error:
    if df_auctions is None:
        msg = '{} - {} - Opening auction market...' \
              .format(get_current_date(),
                      get_current_time())

        write2file(f_log_access, msg)

    while True:
        h = get_current_hour()

        # NOTE(review): h is always in 0..23, so when AUCTION_h_end is 24
        # only the lower bound can trigger (once the hour wraps to 0).
        if h < AUCTION_h_start \
                or h > AUCTION_h_end:
            msg = '{} - {} - Closing auction market...' \
                  .format(get_current_date(),
                          get_current_time())

            write2file(f_log_access, msg)

            break

        t_start = time.time()

        try:
            auctions = scrape_closed_auctions(WEB_scraper)

            if len(auctions) == 0:
                raise Exception('no closed auctions found on the page')

            # `None` items are auctions that could not be extracted.
            good_auctions = [a for a in auctions if a is not None]

            len_good_auctions = len(good_auctions)
            len_evil_auctions = len(auctions) - len_good_auctions

            if len_evil_auctions > 0:
                msg = '{} - {} - Lost some closed auctions: {}' \
                      .format(get_current_date(),
                              get_current_time(),
                              len_evil_auctions)

                write2file(f_log_access, msg)

            if len_good_auctions > 0:
                df_auctions_n = pd.DataFrame(
                        data=[vars(a) for a in good_auctions]).set_index('id')

                if df_auctions is not None:
                    df_auctions_n = pd.concat([df_auctions,
                                               df_auctions_n])

                # De-duplicate on the auction id (the index), keeping the
                # row seen first. `drop_duplicates()` ignores the index and
                # compares column values, so a row reloaded from the CSV
                # file may not compare equal to the same auction scraped
                # again.
                df_auctions = df_auctions_n[
                        ~df_auctions_n.index.duplicated(keep='first')] \
                    .sort_values(by=['timestamp'],
                                 ascending=False)

                df_auctions.to_csv(PATH_closed_auctions)
        except ConnectionError:
            # ConnectionResetError is a subclass of ConnectionError.
            # NOTE(review): HTTP libraries may raise their own connection
            # error types, which end up in the generic handler below.
            msg = '{} - {} - [104] Connection reset by peer' \
                  .format(get_current_date(),
                          get_current_time())

            write2file(f_log_error, msg)
        except Exception as e:
            # Keep the loop alive, but record what went wrong.
            msg = '{} - {} - [500] Something happened: {!r}' \
                  .format(get_current_date(),
                          get_current_time(),
                          e)

            write2file(f_log_error, msg)

        # Sleep for whatever is left of the period after the scrape.
        t_end = time.time()
        t_sleep = t_Delta - (t_end - t_start)

        if t_sleep > 0:
            time.sleep(t_sleep)

In [None]:
# def scrape_auction_categories(
#         scraper: cloudscraper.CloudScraper) \
#         -> t.Dict[str, t.List[t.Tuple[str, str]]]:
#     WEB_drivers = {} # Selenium drivers
    
#     # Main auctions endpoint
#     SITE_endpoint = ''

#     URL = '{}/{}'.format(SITE_referer,
#                          SITE_endpoint)

#     r = scraper.get(URL)
#     soup = bs(r.content, 'html.parser')

#     categories_HTML = soup.find_all('div', class_='CategoryMenu')

#     return r.status_code, categories_HTML

#     # if n > -1:
#     #     auctions_HTML = auctions_HTML[:n]

#     # auctions = Parallel(n_jobs=-1, backend="threading")(
#     #     delayed(extract_auction)(WEB_drivers,
#     #                              str(a_HTML))
#     #     for a_HTML in auctions_HTML
#     # )

#     # for driver in WEB_drivers.values():
#     #     driver.quit()

#     # return auctions

In [None]:
# WEB_scraper.get(SITE_referer)

In [None]:
# scrape_auction_categories(WEB_scraper)