# eBay Sold Item Scraper

## Imports

In [1]:
import os
import time
import urllib.request
from datetime import datetime

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

## Functions

In [2]:
def create_card_list(scraped_article_nos):
    """Collect the search-result fields of every listing not scraped before.

    Walks the pagination links of the result list currently open in the
    module-level ``driver`` and reads each result card on every page.

    Args:
        scraped_article_nos: Container of article numbers (ints) that are
            skipped. Only membership tests are performed on it.

    Returns:
        A tuple of nine parallel lists, one entry per new listing:
        ``(cards_urls, article_numbers, names, conditions, final_prices,
        currencies, bids, auction_types, shipping_list)``.
    """
    cards_urls, article_numbers, names, conditions = [], [], [], []
    final_prices, currencies, bids, auction_types, shipping_list = [], [], [], [], []

    try:
        pagination = driver.find_element(By.CLASS_NAME, "pages")
        page_urls = [link.get_attribute("href")
                     for link in pagination.find_elements(By.CLASS_NAME, "pg")]
    except NoSuchElementException:
        page_urls = []
    if not page_urls:
        # No pagination links: still scrape the page that is already open.
        page_urls = [None]

    for page_number, page_url in enumerate(page_urls):
        if page_number == 0:
            # The first pagination entry is the page that is already loaded.
            print("First page")
        else:
            driver.get(page_url)
            print("New page opened.")

        for card in driver.find_elements(By.CSS_SELECTOR, ".sresult.lvresult.clearfix.li"):
            article_number = int(card.get_attribute("listingid"))
            if article_number in scraped_article_nos:
                print(f"Card {article_number} already scraped. Skipping.")
                continue

            title = card.find_element(By.CLASS_NAME, "lvtitle")
            cards_urls.append(title.find_element(By.CLASS_NAME, "vip").get_attribute("href"))
            article_numbers.append(article_number)
            names.append(title.text)

            try:
                condition = card.find_element(By.CLASS_NAME, "lvsubtitle").text
            except NoSuchElementException:
                condition = np.nan
            conditions.append(condition)

            # The price text is split on spaces: first token is stored as the
            # currency, last token as the amount.
            price_parts = card.find_element(By.CSS_SELECTOR, ".bold.bidsold").text.split(" ")
            # str.replace returns a new string, so the result must be kept:
            # drop "." thousands separators, then turn the decimal "," into ".".
            final_prices.append(price_parts[-1].replace(".", "").replace(",", "."))
            currencies.append(price_parts[0])

            auction_type = card.find_element(By.CLASS_NAME, "lvformat").text
            if "Gebot" in auction_type:
                # Text mentions bids ("Gebot"); its first token is kept as the bid count.
                bid = auction_type.split(" ")[0]
                auction_type = "Auction"
            else:
                bid = np.nan
            bids.append(bid)
            auction_types.append(auction_type)

            try:
                shipping = card.find_element(By.CLASS_NAME, "bfsp").text
            except NoSuchElementException:
                # Fall back to the "fee" element; its third token is used as the amount.
                shipping = card.find_element(By.CLASS_NAME, "fee").text.split(" ")[2].replace(",", ".")
            shipping_list.append(shipping)

    return (cards_urls, article_numbers, names, conditions, final_prices,
            currencies, bids, auction_types, shipping_list)


def find_in_list(target, target_list):
    """Locate the title-cased ``target`` inside a list of lists.

    Args:
        target: String to look for; it is title-cased before comparison.
        target_list: Iterable of rows, each an iterable of entries.

    Returns:
        ``(row_index, column_index)`` of the first entry equal to
        ``target.title()``, scanning row by row, or ``(None, None)`` when no
        entry matches.
    """
    positions = (
        (row_index, column_index)
        for row_index, row in enumerate(target_list)
        for column_index, entry in enumerate(row)
        if entry == target.title()
    )
    return next(positions, (None, None))

def convert_name(name_list, search_list):
    """Find the first search word that names an entry of ``name_list``.

    Args:
        name_list: Rows of names. The first entry of the matching row is
            returned as the card name; a match in column 0, 1 or 2 is
            reported as English, French or German respectively.
        search_list: Candidate words (for example the words of an offer
            title). Each word is title-cased before it is compared.

    Returns:
        ``(card_name, card_language)`` for the first word that equals an
        entry of some row, or ``None`` when no word matches.
        ``card_language`` is ``None`` when the match sits in a column beyond
        the third one (previously this raised ``UnboundLocalError``).
    """
    languages = ("English", "French", "German")
    for word in search_list:
        wanted = word.title()
        for row in name_list:
            if wanted in row:
                # One pass is enough: the first row containing the word is
                # the row a second full scan would have found as well.
                column = row.index(wanted)
                card_language = languages[column] if column < len(languages) else None
                return row[0], card_language
    return None

def skip_unsold():
    """Click the "original listing" link on the current page if there is one.

    Best effort: when the link is missing or cannot be clicked the page is
    left as it is. Uses the module-level ``driver``.
    """
    try:
        driver.find_element(By.CSS_SELECTOR, ".vi-inl-lnk.vi-original-listing").click()
    except WebDriverException:
        # Covers NoSuchElementException and click failures, but no longer
        # swallows KeyboardInterrupt like the former bare ``except``.
        pass

def get_name_language(name_list, offer_name):
    """Derive card name and language from an offer title.

    The title is split on single spaces and the words are handed to
    ``convert_name``.

    Args:
        name_list: Rows of names as expected by ``convert_name``.
        offer_name: Title of the offer.

    Returns:
        ``(card_name, card_language)``, or ``(None, None)`` when the lookup
        fails for any reason (no match, unusable title, ...).
    """
    try:
        card_name, card_language = convert_name(name_list, offer_name.split(" "))
    except Exception:
        # Deliberately best effort, but KeyboardInterrupt/SystemExit must
        # still be able to stop the scrape.
        card_name, card_language = None, None
    return card_name, card_language

def get_seller_info():
    """Read seller name and seller's sold-items text from the current page.

    Uses the module-level ``driver``.

    Returns:
        ``(seller_name, items_sold_seller)``; each value is the string
        ``"Not found."`` when the corresponding element cannot be read.
    """
    try:
        seller_name = driver.find_element(
            By.CSS_SELECTOR, ".ux-textspans--PSEUDOLINK.ux-textspans--BOLD").text
    except WebDriverException:
        seller_name = "Not found."
    try:
        # The third pseudo-link on the page is taken as the sold-items text.
        items_sold_seller = driver.find_elements(By.CLASS_NAME, "ux-textspans--PSEUDOLINK")[2].text
    except (WebDriverException, IndexError):
        items_sold_seller = "Not found."
    return seller_name, items_sold_seller


def get_images(card_name, card_language, final_bid_amount, offer_name, article_number):
    """Download every image of the listing open in the module-level ``driver``.

    Opens the enlarged image layer, saves the displayed image, and clicks
    "next" until that button reports ``aria-disabled`` other than "false".
    Files are written to
    ``cards/ebay/PSA 10s/<card_name>/<card_language>/<final_bid_amount>/<offer_name>/``
    as ``<article_number>_<n>.jpg`` with ``n`` starting at 1.

    Args:
        card_name: Card name used as a directory level.
        card_language: Card language used as a directory level.
        final_bid_amount: Final price used as a directory level.
        offer_name: Offer title used as the innermost directory.
        article_number: Article number used as file name prefix.

    Returns:
        ``(images_list, path_name)``: the image URLs that were downloaded and
        the directory they were stored in.

    Raises:
        Selenium or download errors for every image but the last one
        propagate to the caller; a failure on the last image is only reported.
    """
    driver.find_element(By.CLASS_NAME, "v-pnl-item").click()
    next_image = driver.find_element(By.XPATH, '//*[@id="viEnlargeImgLayer"]/div[1]/div/a[2]')

    # NOTE(review): offer_name is used verbatim, so a "/" in the title creates
    # additional nested directories.
    path_name = f"cards/ebay/PSA 10s/{card_name}/{card_language}/{final_bid_amount}/{offer_name}"
    os.makedirs(path_name, exist_ok=True)

    def current_image_link():
        return driver.find_element(
            By.XPATH, '//*[@id="viEnlargeImgLayer_img_ctr"]').get_attribute("src")

    images_list = []
    image_counter = 1
    while next_image.get_attribute("aria-disabled") == "false":
        image_link = current_image_link()
        urllib.request.urlretrieve(str(image_link), f"{path_name}/{article_number}_{image_counter}.jpg")
        images_list.append(image_link)
        image_counter += 1
        next_image.click()

    # The "next" button is disabled now: the displayed image is the last one.
    image_link = current_image_link()
    try:
        urllib.request.urlretrieve(str(image_link), f"{path_name}/{article_number}_{image_counter}.jpg")
        images_list.append(image_link)
    except Exception as error:
        # Best effort for the final image, but make the failure visible.
        print(f"Could not download image {image_counter} of article {article_number}: {error}")
    return images_list, path_name

def get_auction_date():
    """Read the auction end date from the page open in the module-level ``driver``.

    The trailing time-zone label ("MEZ" or "MESZ") is removed, German month
    abbreviations are mapped to their English counterparts, and the result is
    parsed with the format ``'%d. %b. %Y %H:%M:%S'``.

    Returns:
        A ``datetime``, or ``np.nan`` when the element is missing or its text
        cannot be parsed.
    """
    # "Mär"/"Mrz" are assumed spellings for March; harmless if they never occur.
    # NOTE(review): ``%b`` is locale dependent; English month names are assumed.
    replacements = {"Mär": "Mar", "Mrz": "Mar", "Mai": "May", "Okt": "Oct", "Dez": "Dec"}
    try:
        auction_date = driver.find_element(
            By.CSS_SELECTOR, ".u-flL.vi-bboxrev-posabs.vi-bboxrev-dsplinline").text.strip()
        # Remove the zone label as a suffix. ``str.strip(" MEZ")`` strips a
        # character set and left "MES" behind for "MESZ" (summer time).
        for zone in ("MESZ", "MEZ"):
            if auction_date.endswith(zone):
                auction_date = auction_date[:-len(zone)].rstrip()
                break
        for german, english in replacements.items():
            auction_date = auction_date.replace(german, english)
        auction_date = datetime.strptime(auction_date, '%d. %b. %Y %H:%M:%S')
    except (WebDriverException, ValueError):
        auction_date = np.nan
    return auction_date


def get_article_info(stored, cards_urls, article_numbers, names, conditions, final_prices, currencies, bids, auction_types, shipping_list):
    """Visit every listing, collect its details and persist them to the CSV.

    For each URL the listing page is opened in the module-level ``driver``,
    the page-level fields are read, the images are downloaded, and one row is
    added to ``stored``. The CSV is rewritten after every listing so that an
    interrupted run keeps what was scraped so far.

    Args:
        stored: DataFrame of previously scraped listings; new rows are added
            to a local copy only.
        cards_urls: Listing URLs to visit.
        article_numbers, names, conditions, final_prices, currencies, bids,
        auction_types, shipping_list: Lists parallel to ``cards_urls``.

    Returns:
        None. Results are written to the CSV file.
    """
    names_df = pd.read_csv("pokemon_names.csv")
    name_list = [list(row) for row in names_df.values]

    # Loop through all cards on the page
    for index, card_url in enumerate(cards_urls):
        driver.get(card_url)

        auction_date = get_auction_date()

        # Skip unsold items
        skip_unsold()

        # Get name and language of the card
        card_name, card_language = get_name_language(name_list, names[index])

        # Get name of the seller and items sold
        seller_name, items_sold_seller = get_seller_info()

        # Get images and save them
        try:
            images_list, path_name = get_images(card_name, card_language, final_prices[index], names[index], article_numbers[index])
        except Exception as error:
            # A listing without downloadable images is still recorded.
            print(f"No images stored for article {article_numbers[index]}: {error}")
            images_list = []
            path_name = np.nan

        # Write retrieved fields to a dict and append to existing df
        new_card = {"Article Number": article_numbers[index], "Name of the Offer": names[index], "Card name": card_name,
                    "Card language": card_language, "condition": conditions[index], "auction date": auction_date,
                    "number of bids": bids[index], "final bid": final_prices[index], "currency": currencies[index],
                    "shipping fee": shipping_list[index], "seller": seller_name, "items sold by seller": items_sold_seller,
                    "images": images_list, "local path": path_name, "auction type": auction_types[index], "url": card_url}
        # DataFrame.append was removed in pandas 2.0; concat a one-row frame instead.
        stored = pd.concat([stored, pd.DataFrame([new_card])], ignore_index=True)
        stored.to_csv("/Users/ferris/Dropbox/jupyter notebooks/Scraping/Pokemon/Ebay Scraped.csv", index = False)
    print(f"{len(cards_urls)} cards scraped.")


def gdpr():
    """Click the GDPR banner's accept button if it is present.

    Best effort: nothing happens when the button is missing or cannot be
    clicked. Uses the module-level ``driver``.
    """
    try:
        driver.find_element(By.ID, "gdpr-banner-accept").click()
    except WebDriverException:
        # Covers NoSuchElementException and click failures, but no longer
        # swallows KeyboardInterrupt like the former bare ``except``.
        pass

def scrape(search_term):
    """Scrape all new sold listings for ``search_term`` and report the duration.

    Opens the ebay.de sold-items search in the module-level ``driver``,
    accepts the GDPR banner, loads the previously scraped listings from the
    CSV, and scrapes every listing whose article number is not stored yet.

    Args:
        search_term: Keyword string inserted verbatim into the ``_nkw`` query
            parameter, so it must already be URL-encoded (e.g. "psa+10+base").

    Returns:
        None. Results are written to the CSV by ``get_article_info``.
    """
    start_time = datetime.now()
    url = f"https://www.ebay.de/sch/Sammeln-Seltenes/1/i.html?_from=R40&LH_Sold=1&_mPrRngCbx=1&_udlo=1&_udhi=&_samilow=&_samihi=&_sadis=15&_stpos=80333&_sop=12&_dmd=1&LH_Complete=1&_fosrp=1&_nkw={search_term}&_ipg=200&rt=nc"
    driver.get(url)
    gdpr()
    stored = pd.read_csv("/Users/ferris/Dropbox/jupyter notebooks/Scraping/Pokemon/Ebay Scraped.csv")
    stored["Article Number"] = stored["Article Number"].astype(int)
    # A set keeps the per-card "already scraped?" check O(1) instead of
    # scanning the whole list for every card.
    scraped_article_nos = set(stored["Article Number"].tolist())
    cards_urls, article_numbers, names, conditions, final_prices, currencies, bids, auction_types, shipping_list = create_card_list(scraped_article_nos)
    get_article_info(stored, cards_urls, article_numbers, names, conditions, final_prices, currencies, bids, auction_types, shipping_list)
    print("Scrape completed.")
    end_time = datetime.now()
    print(f"Duration of scrape: {end_time - start_time}")

## Scraping

In [None]:
# Start a headless Chrome session; the scraping functions use the
# module-level name ``driver``.
option = Options()
# The command-line flag works across Selenium versions, whereas the
# ``Options.headless`` setter is deprecated in Selenium 4.
option.add_argument("--headless")
driver = webdriver.Chrome(options = option)
try:
    # scrape() returns None; the name is kept for cells that may refer to it.
    ebay_cards_list = scrape("psa+10+base")
finally:
    # Always shut the browser down so no Chrome processes are left behind.
    driver.quit()

## Analysis

In [26]:
# Load the scraped listings and turn "final bid" into floats.
df = pd.read_csv("/Users/ferris/Dropbox/jupyter notebooks/Scraping/Pokemon/Ebay Scraped.csv")
df["auction date"] = pd.to_datetime(df["auction date"])


def _clean_bid(bid):
    """Return ``bid`` as a string that ``float`` can parse.

    Values containing a comma are treated as formatted amounts: the comma
    becomes a dot and only the last dot is kept as the decimal point, so
    "1.234,56" and "1.234.567,89" become "1234.56" and "1234567.89".
    Values without a comma are returned unchanged (as strings).
    """
    text = str(bid)  # cells may already be numeric if the column parsed as float
    if "," in text:
        whole, point, fraction = text.replace(",", ".").rpartition(".")
        text = whole.replace(".", "") + point + fraction
    return text


bids = df["final bid"].tolist()
bids_cleaned = [_clean_bid(bid) for bid in bids]

df["final bid"] = bids_cleaned
df["final bid"] = df["final bid"].astype(float)
#df[df["Card name"] == "Charizard"].sort_values(by = "auction date")#["final bid"]#.plot(kind = "bar");
len(df)

808

In [29]:
# Summary statistics of the cleaned final bids.
df["final bid"].describe()

count      808.000000
mean       365.299963
std       2865.675320
min          1.000000
25%         71.000000
50%        100.000000
75%        231.000000
max      79999.990000
Name: final bid, dtype: float64