In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import pyperclip



In [2]:
# set up the PATHs
URL_WEBSITE = 'https://limitlesstcg.com/tournaments?time=12months&type=all&format=standard&region=all&show=100'
URL_WEBSITE_TEST = 'https://limitlesstcg.com/tournaments?time=1months&type=all&format=standard&region=all&show=100'

In [3]:
# I need to use selenium to scrape the data from the website below
# https://limitlesstcg.com/tournaments?time=12months&type=all&format=standard&region=all&show=100
# And extract the decklists from the top 8 players of each tournament and save them in a csv file
# The csv file should have the following columns:
# Tournament Name, Date, Player Name, Deck Name, Card1, Card2, ..., Card60
# The csv file should have a row for each decklist
# begin now


# Initialize the WebDriver (in this case, for Chrome)
driver = webdriver.Chrome()

# Go to the limitlesstcg website
driver.get(URL_WEBSITE)

# Get all the urls for the tournaments

# all elements have a tag <a> and the href="/tournament/xxxxx" where xxxxx is the tournament id
# I can use the xpath to find all the elements with the tag <a> and then extract the href attribute
# wait for the page to load
time.sleep(1)

elements = driver.find_elements(By.XPATH, '/html/body/main/div/table/tbody//td[3]/a')
urls = []
for element in elements:
    urls.append(element.get_attribute('href'))

print(len(urls))
print(urls[0])

52
https://limitlesstcg.com/tournaments/385


In [4]:
deck_urls = []

for url in urls:
    driver.get(url)
    time.sleep(1)
    # get the link for the decks of the top 8 players
    elements = driver.find_elements(By.XPATH, '/html/body/main/div/table/tbody//td[4]/a')
    i = 0
    for element in elements:
        deck_url = element.get_attribute('href') + '/cards'
        if len(element.get_attribute('href').split('?')) == 2:
            deck_url = (element.get_attribute('href').split('?')[0] + '/cards?' + element.get_attribute('href').split('?')[1])
        deck_urls.append(deck_url)
        i += 1
        if i == 8:
            break

print(len(deck_urls))
print(deck_urls)

416
['https://limitlesstcg.com/decks/264/cards', 'https://limitlesstcg.com/decks/240/cards?variant=1', 'https://limitlesstcg.com/decks/264/cards', 'https://limitlesstcg.com/decks/264/cards?variant=1', 'https://limitlesstcg.com/decks/212/cards?variant=38', 'https://limitlesstcg.com/decks/264/cards?variant=1', 'https://limitlesstcg.com/decks/255/cards', 'https://limitlesstcg.com/decks/264/cards', 'https://limitlesstcg.com/decks/224/cards?variant=2', 'https://limitlesstcg.com/decks/255/cards', 'https://limitlesstcg.com/decks/264/cards?variant=3', 'https://limitlesstcg.com/decks/247/cards', 'https://limitlesstcg.com/decks/255/cards', 'https://limitlesstcg.com/decks/241/cards', 'https://limitlesstcg.com/decks/241/cards', 'https://limitlesstcg.com/decks/241/cards', 'https://limitlesstcg.com/decks/241/cards', 'https://limitlesstcg.com/decks/257/cards?variant=2', 'https://limitlesstcg.com/decks/211/cards?variant=1', 'https://limitlesstcg.com/decks/247/cards', 'https://limitlesstcg.com/decks/24

In [5]:
def get_set_codes_and_numbers():
    # Getting the a full list of set codes and card amounts (numbers) ever printed
    all_set_codes = []
    all_set_numbers = []
    driver.get('https://limitlesstcg.com/cards')
    time.sleep(1)
    set_codes = driver.find_elements(By.XPATH, '/html/body/main/div/table/tbody//td[1]/a/span')
    set_numbers = driver.find_elements(By.XPATH, '/html/body/main/div/table/tbody//td[3]/a')

    if len(set_codes) != len(set_numbers):
        print('Error: the number of set codes and set numbers is different')
        return None, None

    # then get the content of the element
    for i in range(len(set_codes)):
        set_code, set_number = set_codes[i], set_numbers[i]
        all_set_codes.append(set_code.text)
        all_set_numbers.append(int(set_number.text.split('\n')[0]))

    return all_set_codes, all_set_numbers

In [6]:
def get_clean_decklist(data):
    # Split the decklist into Pokemon, Trainer and Energy. Then remove Energy.
    card_list = data.split('\n\n')[:2]
    # Remove the first element of each list and separates the cards.
    card_list = [x.split('\n')[1:] for x in card_list]
    # join the both lists (append one into other)
    card_list = card_list[0] + card_list[1]
    # split each element by the ' '
    card_list = [x.split(' ') for x in card_list]
    # get the last two elements of each list
    card_list = [x[-2:] for x in card_list]
    # if the second element has just 2 characters, put a 0 in front of it
    card_list = [[x[0], '0' + x[1]] if len(x[1]) == 2 else x for x in card_list]
    # join the elements of each list (append one into other)
    card_list = [x[0] + x[1] for x in card_list]
    
    return card_list

In [7]:
i = 0
with open(f'output/decks.csv', 'w') as f:
    # write the header
    f.write('deck_id,card_id\n')

for deck_url in deck_urls:
    driver.get(deck_url)
    time.sleep(1)
    # get the button by class='export'
    button = driver.find_element(By.CLASS_NAME, 'export')
    button.click()
    copied_text = pyperclip.paste()
    # save the text in a file
    with open(f'output/decks.csv', 'a') as f:
        clean_decklist = get_clean_decklist(copied_text)
        for card in clean_decklist:
            f.write(f'{i},{card}\n')
    i += 1

In [None]:
driver.quit()