### SCRAPER DO BRICKECONOMY

Ten odpali się elegancko. 

In [None]:
#Importuję biblioteki 

import datetime as dt
import time
import logging
import pickle

# from pyvirtualdisplay import Display
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup

logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.INFO, datefmt='%I:%M:%S')
logger = logging.getLogger("BriconomyScrapper")

In [2]:
def create_driver(use_headless = False, exec_path = "./drivers/chromedriver"):
    """Create a Chrome WebDriver instance.

    Args:
        use_headless: When true, the "headless" argument is passed to Chrome
            and an info message is logged.
        exec_path: Path to the chromedriver executable handed to the
            Selenium ``Service``.

    Returns:
        A ``webdriver.Chrome`` configured to ignore certificate errors.
    """
    chrome_options = webdriver.ChromeOptions()

    # Collect the command-line switches first, then apply them in order.
    switches = []
    if use_headless:
        logger.info("Using driver in headless mode")
        switches.append("headless")
    switches.append("--ignore-certificate-errors")
    for switch in switches:
        chrome_options.add_argument(switch)

    return webdriver.Chrome(
        service=Service(executable_path=exec_path),
        options=chrome_options,
    )

def get_page_src(url: str) -> str:
    """Load ``url`` in a fresh Chrome driver and return the page source.

    A new driver is created for every call (via ``create_driver`` with its
    defaults) and is always shut down again, even when loading the page or
    reading its source raises.

    Args:
        url: Address of the page to load.

    Returns:
        The page source reported by the driver after a two-second wait.

    Raises:
        Whatever ``create_driver`` or the driver itself raises; the exception
        propagates after the driver has been quit.
    """
    logger.debug("Creating driver")
    driver = create_driver()
    try:
        driver.get(url)
        # Fixed pause before reading the source -- presumably to let
        # client-side rendering finish (TODO confirm; an explicit wait on the
        # chart element would be more reliable).
        time.sleep(2)
        logger.debug(f"Getting page source for {url}")
        return driver.page_source
    finally:
        # Without this, a failed page load would leave a Chrome process
        # running for every failing URL.
        driver.quit()
        logger.debug("Exiting driver")

def get_table(page_src, chart_id):
    """Locate the table nested in the ``<div>`` whose id is ``chart_id``.

    Args:
        page_src: HTML of the page, parsed with ``html.parser``.
        chart_id: ``id`` attribute of the ``<div>`` that wraps the table.

    Returns:
        A ``(thead, rows)`` tuple: the table's ``<thead>`` element and the
        list of ``<tr>`` elements found inside its ``<tbody>``.

    Raises:
        RuntimeError: If any step of the lookup fails (for example the div,
            table or tbody is missing); the original exception is chained.
    """
    try:
        container = BeautifulSoup(page_src, "html.parser").find("div", id=chart_id)
        table = container.find("table")
        header = table.find("thead")
        rows = table.find("tbody").find_all("tr")
    except Exception as err:
        raise RuntimeError(f"Fetching table {chart_id} from page source failed") from err
    return header, rows

def get_info(page_src):
    """Extract the first three label/value pairs from the page.

    Labels are taken from the ``<div class="col-xs-5">`` elements and values
    from the ``<div class="col-xs-7">`` elements, in document order. The
    search is page-wide, not restricted to a particular container.

    Args:
        page_src: HTML of the page, parsed with ``html.parser``.

    Returns:
        A list of six strings:
        ``[label0, value0, label1, value1, label2, value2]``.
        NOTE(review): presumably set number, name and theme, judging by the
        original variable names -- confirm against the live page layout.

    Raises:
        RuntimeError: If parsing fails or fewer than three labels/values are
            present; the original exception is chained and a warning logged.
    """
    try:
        soup = BeautifulSoup(page_src, "html.parser")

        labels = soup.find_all("div", class_="col-xs-5")
        values = soup.find_all("div", class_="col-xs-7")

        # Interleave label and value for each of the first three rows.
        info = [
            cell.text
            for row in range(3)
            for cell in (labels[row], values[row])
        ]
        logger.debug("Successfully retrieved set info")
        return info
    except Exception as e:
        logger.warning(f"Failed to extract set info due to {e}")
        raise RuntimeError("Failed to extract set info") from e
    
def get_price_data(page_src):
    """Read the rows of the ``pricechart`` table into a list of records.

    Args:
        page_src: HTML of the page.

    Returns:
        A list of ``{"Date": datetime.date, "Price": float}`` dicts, one per
        table row, in table order. Dates are parsed with ``"%b %d, %Y"``.

    Raises:
        RuntimeError: If the table cannot be found or a row cannot be parsed;
            the original exception is chained and a warning is logged.
    """
    def parse_row(row):
        # First cell holds the date, second cell the price.
        cells = row.find_all("td")
        return {
            "Date": dt.datetime.strptime(cells[0].text, "%b %d, %Y").date(),
            "Price": float(cells[1].text),
        }

    try:
        _, rows = get_table(page_src, "pricechart")
        records = [parse_row(row) for row in rows]
        logger.debug("Successfully retrieved price data")
        return records
    except Exception as err:
        logger.warning(f"Failed to extract prices due to {err}")
        raise RuntimeError("Failed to extract prices") from err
    
def get_sales_data(page_src):
    """Read the rows of the ``saleschartmonth`` table into a list of records.

    Args:
        page_src: HTML of the page.

    Returns:
        A list of dicts with keys ``"Date"`` (``datetime.date``), ``"Min"``,
        ``"Q1"``, ``"Q3"`` and ``"Max"`` (floats), one per table row, in
        table order. Dates are parsed with ``"%b %d, %Y"``.

    Raises:
        RuntimeError: If the table cannot be found or a row cannot be parsed;
            the original exception is chained and a warning is logged.
    """
    # Column index of each numeric field within a table row.
    price_columns = (("Min", 1), ("Q1", 2), ("Q3", 3), ("Max", 4))

    def parse_row(row):
        cells = row.find_all("td")
        record = {"Date": dt.datetime.strptime(cells[0].text, "%b %d, %Y").date()}
        for key, column in price_columns:
            record[key] = float(cells[column].text)
        return record

    try:
        _, rows = get_table(page_src, "saleschartmonth")
        records = [parse_row(row) for row in rows]
        logger.debug("Successfully retrieved monthly sales data")
        return records
    except Exception as err:
        logger.warning(f"Failed to extract monthly sales price ranges due to {err}")
        raise RuntimeError("Failed to extract monthly sales price ranges") from err
    
def get_data(url):
    """Fetch one page and extract its info, price and sales data.

    This function never raises for ordinary errors: failures are converted
    to ``repr(exception)`` strings so a long scraping run can continue.

    Args:
        url: Address of the page to scrape.

    Returns:
        - If the page itself cannot be fetched: a string, ``repr`` of the
          exception.
        - Otherwise: a dict with keys ``"Info"``, ``"Prices"`` and
          ``"Monthly sales price ranges"``. Each value is the corresponding
          extractor's result, or ``repr`` of the exception if that extractor
          failed.
    """
    try:
        page_src = get_page_src(url)
    except Exception as e:
        # Previously swallowed silently; the extractors log their own
        # failures, so log this one too.
        logger.warning(f"Failed to fetch page source for {url} due to {e}")
        return repr(e)

    # Extractors run independently so one failing section does not discard
    # the others.
    extractors = (
        ("Info", get_info),
        ("Prices", get_price_data),
        ("Monthly sales price ranges", get_sales_data),
    )
    result = {}
    for key, extract in extractors:
        try:
            result[key] = extract(page_src)
        except Exception as e:
            result[key] = repr(e)
    return result

In [3]:
# I saved the data in batches, by year: BrickEconomy often hung or
# blocked my IP, the internet connection dropped, etc.

if __name__ == "__main__":
    # Optional virtual display for running without a screen:
    # display = Display(visible=False, size=(1600, 1200))
    # display.start()

    # NOTE: pickle.load can execute arbitrary code; only load link files that
    # were produced locally.
    with open('./remaining_links_from_2013_to_2024.pkl', 'rb') as fp:
        links_by_year = pickle.load(fp)

    data_by_set = {}

    # For filtering by years; set to None to process every year in the file.
    # Assumes the keys of links_by_year are ints -- TODO confirm.
    year_range = range(2018, 2025)
    if year_range is not None:
        links_by_year = {k: v for k, v in links_by_year.items() if k in year_range}

    all_links_count = sum(1 for links in links_by_year.values() for _ in links)
    link_count = 0

    try:
        for links in links_by_year.values():
            for link in links:
                link_count += 1
                url = f"https://www.brickeconomy.com{link}"
                logger.info(f"Fetching data from {url} [{link_count}/{all_links_count}]")
                data_by_set[link] = get_data(url)
    finally:
        # Save whatever has been collected even if the run is interrupted
        # (e.g. KeyboardInterrupt), so a partial batch is not lost.
        # NOTE(review): the file name says 1993_2002 while year_range covers
        # 2018-2024 -- confirm the intended output path.
        with open('./brickonomy_sets_1993_2002.pkl', 'wb') as fp:
            pickle.dump(data_by_set, fp, protocol=pickle.HIGHEST_PROTOCOL)
        logger.info(f"Saved data for {len(data_by_set)} of {all_links_count} sets")

12:47:15 INFO:Fetching data from https://www.brickeconomy.com/set/911831-1/lego-star-wars-kylo-rens-shuttle [1/4488]
12:47:25 INFO:Fetching data from https://www.brickeconomy.com/set/10260-1/lego-downtown-diner [2/4488]


KeyboardInterrupt: 