# Lego set info scraper

## Importing needed packages

In [13]:
#Import Libraries
import os
import numpy as np
import pandas as pd
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from tqdm.notebook import tqdm
print('Packages Imported')

Packages Imported


## Loading the set IDs scraped using GetIDFromSetList jupyter notebook as dataframes.
I will be ignoring any of the IDs that start with ISBN as these are books.

In [2]:

ID_df_raw = pd.read_csv('SetIDs.csv', header=None)
ID_df_mod = ID_df_raw[ID_df_raw[0].str.contains("ISBN")==False]


Just a quick look at the data to get a feel for how many entries there are.

In [3]:
ID_df_mod.describe()

Unnamed: 0,0
count,19144
unique,19144
top,700-12
freq,1


Setting up the selenium options, again headless

In [8]:
#selenium options
def selenium_setup():
    options = Options()

    options.add_argument("--window-size=1920,1200")
    options.add_argument('--headless')
    return webdriver.Chrome(options=options)



## Functions for scraping data
The first function is designed to scrape text elements from the webpage.

In [5]:

def get_textElement(driver,pageSource, text):
    return_text='null'
    if "<dt>"+text+"</dt>"in  pageSource:
        return_text = driver.find_element(By.XPATH, "//dt[text() = '"+text+"']/following-sibling::dd").text
    return return_text

The second function extracts the Price from the webpage and formats it so only the price in dollars is returned.

In [6]:
#get dollar price
def get_price_dollars(driver,pageSource):
    price = 0
    if "<dt>RRP</dt>"in  pageSource:
        price_element = driver.find_element(By.XPATH, "//dt[text()= 'RRP']/following-sibling::dd")
        for i in price_element.text.split('/'):
            if '$' in i:
                price = i.strip().replace('$','')
    return price


I create the dataframe with the necessary headings that can be populated from the website.

In [7]:

legoData_df = pd.DataFrame(columns=['Set_number','Name','Set_type','Theme','Theme_group','Subtheme','Year_released', 'Pieces','Minifigs','Price', 'Age_range' ])
legoData_df

Unnamed: 0,Set_number,Name,Set_type,Theme,Theme_group,Subtheme,Year_released,Pieces,Minifigs,Price,Age_range


## Main loop for extracting data
Here the main loop is executed for each ID that has been read in to extract the needed data.
Of note is the minifigs feature as it not only contains the number of minifigs in a set but also the number of *unique* minifigures in brackets. Here I have formatted it to remove the unique minifig values as minifigs lose their unique status as time goes on and new sets come out. A value for "unique minifigs on release" would be of much greater interest.

Here the value is printed every 100 IDs in order to get a feel for how much longer is left to run in the script.

In [11]:
driver = selenium_setup()

for i,legoID in enumerate(tqdm(ID_df_mod[0])):

    driver.get('https://brickset.com/sets/'+str(legoID))
    pageSource = driver.page_source


    lego_name = get_textElement(driver,pageSource,"Name")
    lego_setType = get_textElement(driver,pageSource,"Set type")
    lego_theme = get_textElement(driver,pageSource,"Theme")
    lego_themeGroup = get_textElement(driver,pageSource,"Theme group")
    lego_subtheme = get_textElement(driver,pageSource,"Subtheme")
    lego_yearReleased = get_textElement(driver,pageSource,"Year released")
    lego_pieces = get_textElement(driver,pageSource,"Pieces")
 
    lego_minifigs = get_textElement(driver,pageSource,"Minifigs").split(" ",1)[0]
    lego_price = get_price_dollars(driver,pageSource)
    lego_ageRange = get_textElement(driver,pageSource,"Age range")


    legoData_df.loc[len(legoData_df.index)] = [legoID,lego_name,lego_setType,lego_theme,lego_themeGroup,lego_subtheme, lego_yearReleased, 
                                     lego_pieces,lego_minifigs, lego_price, lego_ageRange]

print("Completed!")

  0%|          | 0/19144 [00:00<?, ?it/s]

Completed!


Output dataframe to csv.

In [27]:
legoData_df.to_csv('legoData.csv', sep = ',', index=False)


## Update

In [33]:
driver = selenium_setup()
legoDataUpdate_df = pd.DataFrame(columns=['Set_number','Name','Set_type','Theme','Theme_group','Subtheme','Year_released', 'Pieces','Minifigs','Price', 'Age_range' ])
if os.path.exists('legoData.csv'):
    legoData_old_df = pd.read_csv('legoData.csv', index_col=False)
    ID_df_raw = pd.read_csv('SetIDs.csv', header=None)
    ID_df_mod = ID_df_raw[ID_df_raw[0].str.contains("ISBN")==False]
    for i,legoID in enumerate(tqdm(ID_df_mod[0])):
        if ~legoData_old_df['Set_number'].str.contains((legoID)).any():
            
            driver.get('https://brickset.com/sets/'+str(legoID))
            pageSource = driver.page_source


            lego_name = get_textElement(driver,pageSource,"Name")
            lego_setType = get_textElement(driver,pageSource,"Set type")
            lego_theme = get_textElement(driver,pageSource,"Theme")
            lego_themeGroup = get_textElement(driver,pageSource,"Theme group")
            lego_subtheme = get_textElement(driver,pageSource,"Subtheme")
            lego_yearReleased = get_textElement(driver,pageSource,"Year released")
            lego_pieces = get_textElement(driver,pageSource,"Pieces")

            lego_minifigs = get_textElement(driver,pageSource,"Minifigs").split(" ",1)[0]
            lego_price = get_price_dollars(driver,pageSource)
            lego_ageRange = get_textElement(driver,pageSource,"Age range")


            legoDataUpdate_df.loc[len(legoDataUpdate_df.index)] = [legoID,lego_name,lego_setType,lego_theme,lego_themeGroup,lego_subtheme, lego_yearReleased, 
                                     lego_pieces,lego_minifigs, lego_price, lego_ageRange]
       
      
    if len(legoDataUpdate_df)>0:
        fullData  = pd.concat([legoData_old_df,legoDataUpdate_df])    
        fullData.to_csv('legoData.csv', sep = ',', index=False)
    else:
        print("No new entries")
    print("Complete!")
    
else:
   print("Perform initial run!")

  0%|          | 0/19355 [00:00<?, ?it/s]

Complete!


In [34]:
fullData

Unnamed: 0,Set_number,Name,Set_type,Theme,Theme_group,Subtheme,Year_released,Pieces,Minifigs,Price,Age_range
0,700-12,Automatic Binding Bricks,Normal,SYSTEM,Basic,BASIC SET,1949.0,142.0,,0.0,
1,700_1_1-1,Individual 2 x 4 Bricks,Other,SYSTEM,Basic,SUPPLEMENTAL,1950.0,,,0.0,
2,700_1_2-1,Individual 2 x 2 Bricks,Other,SYSTEM,Basic,SUPPLEMENTAL,1950.0,,,0.0,
3,700_A-1,Automatic Binding Bricks Small Brick Set (Lego...,Other,SYSTEM,Basic,BASIC SET,1950.0,,,0.0,
4,700_B_1-1,Individual 1 x 4 x 2 Window (without glass),Other,SYSTEM,Basic,SUPPLEMENTAL,1950.0,,,0.0,
...,...,...,...,...,...,...,...,...,...,...,...
206,IDEASPAB5-1,Modular Racers,Other,MISCELLANEOUS,Miscellaneous,IDEAS PICK A BRICK MODEL,2023,,,0,
207,IDEASPAB6-1,Garden Dreams,Other,MISCELLANEOUS,Miscellaneous,IDEAS PICK A BRICK MODEL,2023,,,0,
208,IDEASPAB7-1,Brick-quarium,Other,MISCELLANEOUS,Miscellaneous,IDEAS PICK A BRICK MODEL,2023,,,0,
209,RABBIT-1,Rabbit,Other,PROMOTIONAL,Miscellaneous,,2023,32,,0,


In [32]:
legoData_old_df['Set_number']

0             700-12
1          700_1_1-1
2          700_1_2-1
3            700_A-1
4          700_B_1-1
            ...     
19139      5007790-1
19140      5007851-1
19141    BLOCKS099-1
19142    BLOCKS100-1
19143    BLOCKS101-1
Name: Set_number, Length: 19144, dtype: object