# Pokémon TCG web scraper

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time



## Getting all cards and their data

In [2]:
# Build the formatted card code out of a card link
def get_card_code_from_link(link):
    """Return the card code encoded in the last two segments of a link.

    The second-to-last '/'-separated segment is taken as the set code and
    the last one as the card number. The number is left-padded with zeros
    to a width of three characters (longer numbers are kept unchanged) and
    appended to the set code, e.g. a link ending in '/OBF/1' gives 'OBF001'.

    Raises IndexError if the link has fewer than two '/'-separated segments.
    """
    segments = link.split('/')
    set_part = segments[-2]
    number_part = segments[-1]
    return set_part + number_part.rjust(3, '0')

# Append one "<key>:<v1>,<v2>,...,<vn>" line to the printings file
def write_output(key, value):
    """Append a single record to 'data/card_printings.txt'.

    key is written first, followed by ':' and the items of value joined
    with ',', then a newline. The file is opened in append mode, so any
    existing content is kept.
    """
    with open('data/card_printings.txt', 'a') as out_file:
        record = ''.join((key, ':', ','.join(value), '\n'))
        out_file.write(record)

In [3]:
# Start the printings file with a header line describing the record format.
# NOTE: mode 'w' truncates the file, so any existing content is discarded.
header_line = '<first_print>:<print1>,<print2>,...,<printn>\n'
with open('data/card_printings.txt', 'w') as out_file:
    out_file.write(header_line)

In [6]:
# Set the URL that we will start scraping from
URL = 'https://limitlesstcg.com/cards'


def _read_lines(path):
    """Return the lines of the file at path, or [] if the file does not exist."""
    try:
        with open(path, 'r') as f:
            return f.read().splitlines()
    except FileNotFoundError:
        # First run: no progress has been saved yet
        return []


# Codes of the cards that have already been recorded. The list keeps the
# insertion order for the backup file; the set gives fast membership tests.
scraped_cards = _read_lines('data/scraped_cards.txt')
scraped_lookup = set(scraped_cards)

# Codes of the sets that were completely scraped (read once, updated in memory)
temp_sets = set(_read_lines('data/temp_sets.txt'))

# Open Chrome
driver = webdriver.Chrome()

try:
    # Open the URL
    driver.get(URL)
    time.sleep(0.1)

    # Let's scrape all the links to the sets
    e_sets = driver.find_elements(By.XPATH, '/html/body/main/div/table/tbody/tr/td[1]/a')
    set_links = [element.get_attribute('href') for element in e_sets]

    # Let's enter the page for each set and scrape the links to each card
    for set_link in set_links:

        # Skip the sets that have already been completely scraped
        set_code = set_link.split('/')[-1]
        if set_code in temp_sets:
            continue

        # Output for this set: key is the code of the first print of a card,
        # value is the list of its other printings
        output = {}

        # Go to set link
        driver.get(set_link)
        time.sleep(0.1)

        # Get all the links to the cards
        e_all_cards = driver.find_element(By.CLASS_NAME, 'card-search-grid').find_elements(By.XPATH, './a')
        cards_links = [e.get_attribute('href') for e in e_all_cards]

        # Then enter the page for each card and scrape the data
        for card_link in cards_links:

            # Get the card code
            card_code = get_card_code_from_link(card_link)

            # If this card has already been scraped, then skip it
            if card_code in scraped_lookup:
                continue

            # Go to card link
            driver.get(card_link)
            time.sleep(0.1)

            # Get the links of all the printings listed for this card
            e_other_printings = driver.find_element(By.CLASS_NAME, 'card-prints-versions').find_elements(By.XPATH, './tbody/tr/td[1]/a')

            # The first not-yet-recorded printing becomes the key of the group
            first_print = None

            for e in e_other_printings:

                # An entry without href is the card currently being viewed
                href = e.get_attribute('href')
                code = get_card_code_from_link(href) if href else card_code

                if code in scraped_lookup:
                    continue

                # Record the printing itself (not the current card), so that
                # it is skipped when its own page or set comes up later
                scraped_cards.append(code)
                scraped_lookup.add(code)

                if first_print is None:
                    first_print = code
                    output[first_print] = []
                else:
                    output[first_print].append(code)

        # Persist the results before marking the set as done, so that an
        # interruption between the writes cannot lose the data of this set

        # Write the output to a file
        for key, value in output.items():
            write_output(key, value)

        # Backup the scraped cards
        with open('data/scraped_cards.txt', 'w') as f:
            f.write('\n'.join(scraped_cards))

        # Write the set code to a file for checking progress
        with open('data/temp_sets.txt', 'a') as f:
            f.write(set_code + '\n')
        temp_sets.add(set_code)

finally:
    # End the session and close the browser, even if the scraping failed
    driver.quit()

The chromedriver version (117.0.5938.92) detected in PATH at /opt/homebrew/bin/chromedriver might not be compatible with the detected chrome version (118.0.5993.117); currently, chromedriver 118.0.5993.70 is recommended for chrome 118.*, so it is advised to delete the driver in PATH and retry
