# Web-scraping for Lego images

In [5]:
import requests
from time import sleep
from bs4 import BeautifulSoup
import pandas as pd
import urllib.request
from datetime import datetime
import csv

In [6]:
# Helper for downloading a single image
def dl_jpg(url, file_path, file_name):
    """Download ``url`` and store it as ``<file_path><file_name>.jpg``.

    The three parts are concatenated verbatim (no path separator is
    inserted), so ``file_path`` has to end with one already, e.g.
    ``"images/"``.
    """
    destination = "".join((file_path, file_name, ".jpg"))
    urllib.request.urlretrieve(url, destination)

## Scraping basic content

### Create dictionaries with page URLs

In [7]:
# Search-result URLs per product line, keyed by zero-based page index.
sw_pages = dict()
dup_pages = dict()
tech_pages = dict()

# Each target dict is paired with the first-page URL of its category.
_category_urls = (
    (sw_pages, "https://www.ricardo.ch/de/c/lego-star-wars-70601/?item_condition=used"),
    (dup_pages, "https://www.ricardo.ch/de/c/duplo-41818/?item_condition=used"),
    (tech_pages, "https://www.ricardo.ch/de/c/lego-technik-41822/?item_condition=used"),
)

# Change the number of pages depending on how many ads you want to scrape.
for i in range(0, 5):
    for pages, first_page_url in _category_urls:
        if i == 0:
            # The first page carries no paging parameters.
            pages[i] = first_page_url
        else:
            # All three lines share one offset formula so their paging stays
            # consistent.
            # NOTE(review): the (i*59)+1 step is taken from the Duplo/Technic
            # URLs; confirm it matches the site's real page size.
            pages[i] = f"{first_page_url}&next_offset={(i*59)+1}&page={i+1}"

In [8]:
# Display the generated Duplo page URLs for a visual check.
dup_pages

{0: 'https://www.ricardo.ch/de/c/duplo-41818/?item_condition=used',
 1: 'https://www.ricardo.ch/de/c/duplo-41818/?item_condition=used&next_offset=60&page=2',
 2: 'https://www.ricardo.ch/de/c/duplo-41818/?item_condition=used&next_offset=119&page=3',
 3: 'https://www.ricardo.ch/de/c/duplo-41818/?item_condition=used&next_offset=178&page=4',
 4: 'https://www.ricardo.ch/de/c/duplo-41818/?item_condition=used&next_offset=237&page=5'}

### Scrape content for each product line

#### Star Wars

In [9]:
# Download and parse every Star Wars search-result page
starwars_pages = list()

for page_number in range(len(sw_pages)):
    response = requests.get(sw_pages[page_number], timeout = 3)
    starwars_pages.append(BeautifulSoup(response.content, "html.parser"))
    sleep(3)  # wait three seconds between requests

In [10]:
# Collect, per result page, the <a> elements that wrap each ad
sw_link_attrs = {
    "class": "MuiGrid-root link--2etfD MuiGrid-item MuiGrid-grid-xs-6 MuiGrid-grid-sm-4 MuiGrid-grid-md-3"
}
starwars_a = [page.findAll("a", sw_link_attrs) for page in starwars_pages]

In [11]:
# Save the urls
starwars_href = list()

# Walk each page's own anchors. Sizing the inner loop by the first page
# would raise IndexError on a shorter page and drop ads on a longer one.
for page_anchors in starwars_a:
    for anchor in page_anchors:
        sw_href = anchor.get("href")
        starwars_href.append(f"https://www.ricardo.ch{sw_href}")

In [12]:
# Scrape individual ads
starwars_ads = list()
starwars_time = list()

# Visit at most 300 ads; slicing (instead of indexing 0..299) avoids an
# IndexError when fewer ad URLs were collected.
for ad_url in starwars_href[:300]:
    page_html = requests.get(ad_url, timeout = 3)
    page_content = BeautifulSoup(page_html.content, "html.parser")
    starwars_ads.append(page_content)
    starwars_time.append(datetime.now().strftime("%d/%m/%Y %H:%M:%S"))   # Add the time of scraping.
    sleep(3)

In [13]:
# Save the contents and timestamp
for i, ad in enumerate(starwars_ads):
    # Write with an explicit UTF-8 encoding so non-ASCII characters in the ad
    # text do not depend on the platform's default encoding.
    with open(f"html/starwars_ads_{i}.html", "w", encoding="utf-8") as file:
        file.write(str(ad))

# One scrape timestamp per ad, kept as a Series for the data frame below.
starwars_time = pd.Series(starwars_time)
starwars_time.to_csv("timestamp_sw.csv")

In [14]:
# Extract the image box
starwars_jpegs = list()

for ad in starwars_ads:
    # The image carries one of two class names; try them in order.
    images = ad.findAll("img", {"class": "jss156"}) or ad.findAll("img", {"class": "jss164"})
    # Keep an empty placeholder when neither class matches, as the Duplo and
    # Technic cells do, so the list holds exactly one entry per ad and stays
    # aligned with the scrape timestamps.
    starwars_jpegs.append(images if images else "")

In [15]:
# Flatten the list of lists
# One entry per image tag; ads with an empty image list become "NaN"
flat_starwars_jpegs = []

for ad_images in starwars_jpegs:
    if ad_images:
        flat_starwars_jpegs.extend(ad_images)
    else:
        flat_starwars_jpegs.append("NaN")

In [16]:
# Collect image URL, ad title and product line for every flattened entry
starwars_jpegs_links = list()
list_starwars_titles = list()
list_starwars_line = list()

for entry in flat_starwars_jpegs:
    is_missing = entry == "NaN"   # placeholder inserted by the flatten step
    starwars_jpegs_links.append("NaN" if is_missing else entry.get("src"))
    list_starwars_titles.append("NaN" if is_missing else entry.get("alt"))
    list_starwars_line.append("starwars")

# Assemble the Star Wars frame together with the scrape timestamps
lego_sw_df = pd.DataFrame({
    "line": list_starwars_line,
    "title": list_starwars_titles,
    "image_url": starwars_jpegs_links,
    "scraped_at": starwars_time,
})

In [18]:
# Save images to the drive
for i in range(len(lego_sw_df)):
    url = lego_sw_df["image_url"][i]
    # Ads without an image carry the "NaN" placeholder (not "missing"), so
    # that is the value to skip; otherwise "NaN" would be requested as a URL.
    if url == "NaN":
        continue
    file_name = f"{datetime.now().strftime('%Y%m%d')}_legoset_{i}"
    dl_jpg(url, "images/", file_name)

#### Duplo

In [19]:
# Download and parse every Duplo search-result page
duplo_pages = list()

for page_number in range(len(dup_pages)):
    response = requests.get(dup_pages[page_number], timeout = 3)
    parsed_page = BeautifulSoup(response.content, "html.parser")
    duplo_pages.append(parsed_page)
    sleep(3)  # wait three seconds between requests

In [20]:
# Collect, per result page, the <a> elements that wrap each ad
duplo_link_attrs = {
    "class": "MuiGrid-root link--2etfD MuiGrid-item MuiGrid-grid-xs-6 MuiGrid-grid-sm-4 MuiGrid-grid-md-3"
}
duplo_a = [page.findAll("a", duplo_link_attrs) for page in duplo_pages]

In [21]:
# Save the urls
duplo_href = list()

# Walk each page's own anchors. Sizing the inner loop by the first page
# would raise IndexError on a shorter page and drop ads on a longer one.
for page_anchors in duplo_a:
    for anchor in page_anchors:
        dp_href = anchor.get("href")
        duplo_href.append(f"https://www.ricardo.ch{dp_href}")

In [None]:
# Scrape individual ads
duplo_ads = list()
duplo_time = list()

# Visit at most 300 ads; slicing (instead of indexing 0..299) avoids an
# IndexError when fewer ad URLs were collected.
for ad_url in duplo_href[:300]:
    page_html = requests.get(ad_url, timeout = 3)
    page_content = BeautifulSoup(page_html.content, "html.parser")
    duplo_ads.append(page_content)
    duplo_time.append(datetime.now().strftime("%d/%m/%Y %H:%M:%S"))   # time of scraping
    sleep(3)

In [None]:
# Save the contents and timestamp
for i, ad in enumerate(duplo_ads):
    # Write with an explicit UTF-8 encoding so non-ASCII characters in the ad
    # text do not depend on the platform's default encoding.
    with open(f"html/duplo_ads_{i}.html", "w", encoding="utf-8") as file:
        file.write(str(ad))

# One scrape timestamp per ad, kept as a Series for the data frame below.
duplo_time = pd.Series(duplo_time)
duplo_time.to_csv("timestamp_duplo.csv")

In [None]:
# Pick the image tags of each ad: class "jss156" first, then "jss164",
# and an empty string when neither is present
duplo_jpegs = list()

for ad in duplo_ads:
    images = (
        ad.findAll("img", {"class": "jss156"})
        or ad.findAll("img", {"class": "jss164"})
        or ""
    )
    duplo_jpegs.append(images)

In [None]:
# One entry per image tag; ads with no image become the string "NaN"
flat_duplo_jpegs = []

for ad_images in duplo_jpegs:
    if ad_images:
        flat_duplo_jpegs.extend(ad_images)
    else:
        flat_duplo_jpegs.append("NaN")

In [None]:
# Collect image URL, ad title and product line for every flattened entry
duplo_jpegs_links = list()
list_duplo_titles = list()
list_duplo_line = list()

for entry in flat_duplo_jpegs:
    if entry == "NaN":
        # placeholder inserted by the flatten step
        image_source, ad_title = "NaN", "NaN"
    else:
        image_source, ad_title = entry.get("src"), entry.get("alt")

    duplo_jpegs_links.append(image_source)
    list_duplo_titles.append(ad_title)
    list_duplo_line.append("duplo")

# Assemble the Duplo frame together with the scrape timestamps
lego_duplo_df = pd.DataFrame({
    "line": list_duplo_line,
    "title": list_duplo_titles,
    "image_url": duplo_jpegs_links,
    "scraped_at": duplo_time,
})

In [None]:
# Download every Duplo image; file numbering continues after the Star Wars rows
for i in range(len(lego_duplo_df)):
    url = lego_duplo_df["image_url"][i]
    if url != "NaN":   # "NaN" marks ads without an image
        file_name = f"{datetime.now().strftime('%Y%m%d')}_legoset_{i + len(lego_sw_df)}"
        dl_jpg(url, "images/", file_name)

#### Technic

In [None]:
# Download and parse every Technic search-result page
technic_pages = list()

for page_number in range(len(tech_pages)):
    response = requests.get(tech_pages[page_number], timeout = 3)
    parsed_page = BeautifulSoup(response.content, "html.parser")
    technic_pages.append(parsed_page)
    sleep(3)  # wait three seconds between requests

In [None]:
# Collect, per result page, the <a> elements that wrap each ad
technic_link_attrs = {
    "class": "MuiGrid-root link--2etfD MuiGrid-item MuiGrid-grid-xs-6 MuiGrid-grid-sm-4 MuiGrid-grid-md-3"
}
technic_a = [page.findAll("a", technic_link_attrs) for page in technic_pages]

In [None]:
# Save the urls
technic_href = list()

# Walk each page's own anchors. Sizing the inner loop by the first page
# would raise IndexError on a shorter page and drop ads on a longer one.
for page_anchors in technic_a:
    for anchor in page_anchors:
        tc_href = anchor.get("href")
        technic_href.append(f"https://www.ricardo.ch{tc_href}")
        

In [None]:
# Scrape individual ads
technic_ads = list()
technic_time = list()

# Visit at most 300 ads; slicing (instead of indexing 0..299) avoids an
# IndexError when fewer ad URLs were collected.
for ad_url in technic_href[:300]:
    page_html = requests.get(ad_url, timeout = 3)
    page_content = BeautifulSoup(page_html.content, "html.parser")
    technic_ads.append(page_content)
    technic_time.append(datetime.now().strftime("%d/%m/%Y %H:%M:%S"))   # time of scraping
    sleep(3)

In [None]:
# Save the contents and timestamp
for i, ad in enumerate(technic_ads):
    # Write with an explicit UTF-8 encoding so non-ASCII characters in the ad
    # text do not depend on the platform's default encoding.
    with open(f"html/technic_ads_{i}.html", "w", encoding="utf-8") as file:
        file.write(str(ad))

# One scrape timestamp per ad, kept as a Series for the data frame below.
technic_time = pd.Series(technic_time)
technic_time.to_csv("timestamp_technic.csv")

In [None]:
# Pick the image tags of each ad: class "jss156" first, then "jss164",
# and an empty string when neither is present
technic_jpegs = list()

for ad in technic_ads:
    images = ad.findAll("img", {"class": "jss156"})
    if not images:
        images = ad.findAll("img", {"class": "jss164"})
    technic_jpegs.append(images if images else "")

In [None]:
# Show the first ten extracted image entries for a visual check.
technic_jpegs[0:10]

In [None]:
# One entry per image tag; ads with no image become the string "NaN"
flat_technic_jpegs = []

for ad_images in technic_jpegs:
    if ad_images:
        flat_technic_jpegs.extend(ad_images)
    else:
        flat_technic_jpegs.append("NaN")

In [None]:
# Collect image URL, ad title and product line for every flattened entry
technic_jpegs_links = list()
list_technic_titles = list()
list_technic_line = list()

for entry in flat_technic_jpegs:
    if entry == "NaN":
        # placeholder inserted by the flatten step
        image_source, ad_title = "NaN", "NaN"
    else:
        image_source, ad_title = entry.get("src"), entry.get("alt")

    technic_jpegs_links.append(image_source)
    list_technic_titles.append(ad_title)
    list_technic_line.append("technic")

# Assemble the Technic frame together with the scrape timestamps
lego_tech_df = pd.DataFrame({
    "line": list_technic_line,
    "title": list_technic_titles,
    "image_url": technic_jpegs_links,
    "scraped_at": technic_time,
})

In [None]:
# Download every Technic image; file numbering continues after the
# Star Wars and Duplo rows
for i in range(len(lego_tech_df)):
    url = lego_tech_df["image_url"][i]
    if url != "NaN":   # "NaN" marks ads without an image
        file_name = f"{datetime.now().strftime('%Y%m%d')}_legoset_{i + len(lego_sw_df) + len(lego_duplo_df)}"
        dl_jpg(url, "images/", file_name)

In [None]:
# Row counts of the Star Wars, Duplo and Technic frames, in that order.
len(lego_sw_df), len(lego_duplo_df), len(lego_tech_df)

## Compile data frame

In [None]:
# Combine the three lines into one frame with a fresh 0..n-1 index
lego_df = pd.concat([lego_sw_df, lego_duplo_df, lego_tech_df], ignore_index=True)
# The timestamps were written as day/month/year. Pass the format explicitly
# so a value such as "03/12/2020 ..." is not parsed month-first.
lego_df["scraped_at"] = pd.to_datetime(lego_df["scraped_at"], format="%d/%m/%Y %H:%M:%S")
lego_df.tail()

In [None]:
# Add a column holding one image file name per row, numbered by row position.
# NOTE(review): the download cells save files as "<YYYYMMDD>_legoset_<n>.jpg",
# which differs from the "<n>_legosets.jpg" pattern used here - confirm the
# files are renamed elsewhere.
image_name = [f"{row_number}_legosets.jpg" for row_number in range(len(lego_df))]

lego_df["image_name"] = image_name
lego_df.head()

In [None]:
# Extract the info box with interesting information.
# Each product line is processed over its own ads, so the three lists may
# differ in length without raising IndexError or silently skipping ads.
info_box_attrs = {"class": "mainInfo--hdpPQ"}

starwars_info = [ad.findAll("div", info_box_attrs) for ad in starwars_ads]
duplo_info = [ad.findAll("div", info_box_attrs) for ad in duplo_ads]
technic_info = [ad.findAll("div", info_box_attrs) for ad in technic_ads]
    

In [None]:
# Flatten the per-ad info boxes; an ad with an empty result is represented
# by the string "NaN".
flat_starwars_info = []
for boxes in starwars_info:
    if boxes:
        flat_starwars_info.extend(boxes)
    else:
        flat_starwars_info.append("NaN")

flat_duplo_info = []
for boxes in duplo_info:
    if boxes:
        flat_duplo_info.extend(boxes)
    else:
        flat_duplo_info.append("NaN")

flat_technic_info = []
for boxes in technic_info:
    if boxes:
        flat_technic_info.extend(boxes)
    else:
        flat_technic_info.append("NaN")

In [None]:
# Count the "NaN" placeholders per product line and report their share
sw_missing = sum(1 for entry in flat_starwars_info if entry == 'NaN')
dup_missing = sum(1 for entry in flat_duplo_info if entry == 'NaN')
tech_missing = sum(1 for entry in flat_technic_info if entry == 'NaN')

print(f"Star Wars: {sw_missing/len(flat_starwars_info)*100}% missing")
print(f"Duplo: {dup_missing/len(flat_duplo_info)*100}% missing")
print(f"Technic: {tech_missing/len(flat_technic_info)*100}% missing")

### Next bid and purchase price

In [None]:
def _next_bid(info):
    """Return the next-bid input value of one ad info box, or "NaN".

    ``info`` is either a parsed info box or the string "NaN" that the
    flatten step stores for ads without one.
    """
    # A placeholder string has no tag search; calling .find on it with a
    # dict argument would raise TypeError.
    if isinstance(info, str):
        return "NaN"
    # The bid input appears under one of two class combinations.
    for css_class in ("jss176 jss175", "jss173 jss172"):
        bid_input = info.find("input", {"class": css_class})
        if bid_input:
            return bid_input.get("value")
    return "NaN"


# Same order as the combined frame: Star Wars, Duplo, Technic.
next_bid_auction = [
    _next_bid(info)
    for info in flat_starwars_info + flat_duplo_info + flat_technic_info
]

next_bid_auction[25:30]

In [None]:
def _buy_now_price(info):
    """Return the text of the buy-now price element of one info box, or "NaN".

    ``info`` is either a parsed info box or the string "NaN" that the
    flatten step stores for ads without one.
    """
    # A placeholder string has no tag search; calling .find on it with a
    # dict argument would raise TypeError.
    if isinstance(info, str):
        return "NaN"
    price_box = info.find("div", {"class": "price--rC2BI"})
    if price_box:
        return price_box.get_text()
    return "NaN"


# Same order as the combined frame: Star Wars, Duplo, Technic.
price_buy_now = [
    _buy_now_price(info)
    for info in flat_starwars_info + flat_duplo_info + flat_technic_info
]

price_buy_now[25:30]

In [None]:
# Attach both price lists as columns and cast each one to float
for column, values in (
    ("next_bid_auction", next_bid_auction),
    ("price_buy_now", price_buy_now),
):
    lego_df[column] = values
    lego_df[column] = lego_df[column].astype(float)

# Show the row at position 40 as a spot check
lego_df.iloc[40,]

### End date and time remaining

In [None]:
def _parse_end_date(info):
    """Return the end date of one ad as a ``datetime``, or "NaN".

    ``info`` is either a parsed info box or the string "NaN" that the
    flatten step stores for ads without one.
    """
    # A placeholder string has no .findAll; skip it instead of raising
    # AttributeError.
    if isinstance(info, str):
        return "NaN"

    # The date span appears under one of two class names.
    for css_class in ("jss171", "jss168"):
        spans = info.findAll("span", {"class": css_class})
        if not spans:
            continue

        # str.strip removes any of the characters in "schedule" from both
        # ends (it is a character set, not a literal prefix).
        # NOTE(review): only "Dez" is translated to an English abbreviation;
        # other non-English month abbreviations would make strptime raise -
        # confirm which months can occur.
        date_stripped = spans[0].get_text().strip("schedule").replace("Dez", "Dec")

        # Longer texts carry extra content before a '|'; keep what follows it.
        if len(date_stripped) > 25:
            date_stripped = date_stripped[date_stripped.find('|'):].strip("| ")

        return datetime.strptime(date_stripped, "%d. %b. %Y, %H:%M")

    return "NaN"


# Same order as the combined frame: Star Wars, Duplo, Technic.
end_date = [
    _parse_end_date(info)
    for info in flat_starwars_info + flat_duplo_info + flat_technic_info
]

In [None]:
# Attach the end dates and derive the time left at the moment of scraping
lego_df["ends_on"] = end_date
lego_df["ends_on"] = pd.to_datetime(lego_df["ends_on"])
lego_df["time_remaining"] = lego_df["ends_on"] - lego_df["scraped_at"]
# .dt.seconds is only the seconds component (whole days are dropped);
# .dt.total_seconds() gives the full remaining duration in seconds.
lego_df["seconds_remaining"] = lego_df["time_remaining"].dt.total_seconds()
lego_df.head()

### Sale type

In [None]:
# Classify each row by which prices are present.
# The presence masks are computed once up front instead of re-evaluating
# .isna() on the whole column in every loop iteration.
has_bid = lego_df["next_bid_auction"].notna()
has_buy_now = lego_df["price_buy_now"].notna()

sale_type = list()

for i in range(len(lego_df)):
    if has_bid[i] and has_buy_now[i]:
        sale_type.append("both")
    elif has_bid[i]:
        sale_type.append("auction")
    elif has_buy_now[i]:
        sale_type.append("buy_now")
    else:
        sale_type.append("NaN")

len(sale_type)

In [None]:
# Attach the derived sale type as a new column and display the frame.
lego_df["sale_type"] = sale_type
lego_df

In [None]:
# Show the dtype of every column of the combined frame.
lego_df.dtypes

In [None]:
# Write the combined frame to a CSV file prefixed with today's date.
# The inner format string uses single quotes: reusing the outer double quotes
# inside an f-string is a SyntaxError before Python 3.12.
lego_df.to_csv(f"{datetime.now().strftime('%Y%m%d')}_lego_sets.csv")