# 01 - WebScraper para cartas de Pokémon TCG

Alunos: Arthur Barreto, Enricco Gemha e Felipe Catapano

In [None]:
import ast
import datetime
import os
import re
import shutil
import time
from functools import reduce

from selenium import webdriver
from selenium.webdriver.common.by import By

In [None]:
# Base URL of the site to scrape; the tournament listing path is appended to it
URL = 'https://limitlesstcg.com'

# Location of the CSV file that the scraped rows are appended to
DIRNAME  = 'data'
FILENAME = 'tournaments.csv'
PATH = os.path.join(DIRNAME, FILENAME)

In [None]:
# Create a Chrome driver (no custom options are passed, so defaults are used)
driver = webdriver.Chrome()

In [None]:
def get_all_tournaments(driver):
    """Return the ``href`` of every tournament link on the page the driver is on.

    The links are the anchors found in the third cell of each row of the
    page's main table, in the order the driver returns them.
    """
    anchors = driver.find_elements(By.XPATH, '/html/body/main/div/table/tbody/tr/td[3]/a')
    return [anchor.get_attribute('href') for anchor in anchors]

In [None]:
# Values sent as the `type` query parameter of the tournament listing page
filters = ['regional', 'national', 'international', 'worlds', 'special', 'cl', 'rl', 'online', 'players_cup', 'invitational']
# Links grouped by category; every type without its own key is pooled under 'others'
tournament_links_by_type = {'regional': [], 'national': [], 'international': [], 'worlds': [], 'others': []}

try:
    # `tournament_type` rather than `filter`, so the builtin `filter` is not shadowed
    for tournament_type in filters:
        # Go to the listing of all tournaments of this type (all times, formats and regions)
        driver.get(URL+f'/tournaments?time=all&type={tournament_type}&format=all&region=all')

        # Get all the links to the tournaments
        tls = get_all_tournaments(driver)
        if tournament_type in ['regional', 'national', 'international', 'worlds']:
            tournament_links_by_type[tournament_type] = tls
        else:
            tournament_links_by_type['others'] += tls

    print(f"Regionals: {len(tournament_links_by_type['regional'])}")
    print(f"Nationals: {len(tournament_links_by_type['national'])}")
    print(f"Internationals: {len(tournament_links_by_type['international'])}")
    print(f"Worlds: {len(tournament_links_by_type['worlds'])}")
    print(f"Others: {len(tournament_links_by_type['others'])}")
finally:
    # End the whole WebDriver session even if a page load fails;
    # close() would only close the current window.
    driver.quit()

In [None]:
# PATH_TMP = os.path.join(DIRNAME, 'data_tournaments.txt')
# with open(PATH_TMP, 'w') as f:
#     f.write(str(tournament_links_by_type))

# PATH_TMP = os.path.join(DIRNAME, 'left_to_scrap.txt')
# with open(PATH_TMP, 'w') as f:
#     f.write(str(tournament_links_by_type))

In [None]:
# with open(PATH, 'w') as f:
#     f.write('id_card,name_card,amount_card,price_card,energy_type_card,type_card,combo_type_id,combo_type_name,id_player,name_player,country_player,all_time_score,ranking_player_tournament,id_tournament,category_tournament,name_tournament,country_tournament,year_tournament,month_tournament,day_tournament,valid_rotation_at_tournament\n')

In [None]:
def _card_id_from_url(url):
    """Build a card id from a URL whose last two path segments are set and number.

    The number is left-padded with '0' to at least three characters and
    appended to the set segment.
    """
    parts = url.split('/')
    return parts[-2] + parts[-1].rjust(3, '0')


def _read_card_price(e_card):
    """Return the price shown in a card's price row, or the string 'None'.

    The link in the second cell (stripped of '$') is preferred. When it is
    missing or not numeric, the link in the third cell (stripped of '€') is
    used and multiplied by 1.05 -- presumably a fixed EUR->USD factor chosen
    by the authors. If neither can be read, 'None' is returned.
    """
    try:
        return float(e_card.find_element(By.XPATH, './td[2]/a').text.replace('$',''))
    except Exception:
        pass
    try:
        return float(e_card.find_element(By.XPATH, './td[3]/a').text.replace('€','')) * 1.05
    except Exception:
        return 'None'


def get_and_write_tournament_data(t_category, t_link):
    """Scrape one tournament and append one CSV row per decklist card to PATH.

    A new Chrome session is opened for the tournament and is always shut down
    before returning, including on the early exit and on errors.

    Args:
        t_category: Category label written to the `category_tournament` column.
        t_link: URL of the tournament page; its last path segment must be the
            integer tournament id.

    Returns:
        None. Tournaments whose name contains 'Expanded' or 'expanded' are
        skipped without writing anything.

    Notes:
        Fields are joined with ',' without any quoting, so a card or player
        name containing a comma would shift the columns of its row.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(t_link)

        tournament = {}

        # Tournament identity and name (commas removed to keep the CSV columns aligned)
        tournament['category_tournament'] = t_category
        tournament['id_tournament'] = int(t_link.split('/')[-1])
        e_heading = driver.find_element(By.CLASS_NAME, 'infobox-heading')
        tournament['name_tournament'] = e_heading.text.strip().replace(',','')
        if 'Expanded' in tournament['name_tournament'] or 'expanded' in tournament['name_tournament']:
            return

        # The country and rotation are optional on the page: fall back to 'None'
        try:
            tournament['country_tournament'] = e_heading.find_element(By.XPATH, './img').get_attribute('data-tooltip')
        except Exception:
            tournament['country_tournament'] = 'None'
        e_info_line = driver.find_element(By.CLASS_NAME, 'infobox-line')
        try:
            tournament['valid_rotation_at_tournament'] = e_info_line.find_element(By.XPATH, './a[1]').get_attribute('href').split('=')[-1]
        except Exception:
            tournament['valid_rotation_at_tournament'] = 'None'

        # The date is the text before the first '•', as "<day> <month> <year>"
        t_date = e_info_line.text.split('•')[0].strip()
        t_day, t_month, t_year = t_date.split(' ')
        # Keep only the digits of the day and zero-pad day and month to two characters
        t_day = re.findall(r'\d+', t_day)[0].zfill(2)
        t_month = str(time.strptime(t_month[:3],'%b').tm_mon).zfill(2)
        tournament['year_tournament'] = t_year
        tournament['month_tournament'] = t_month
        tournament['day_tournament'] = t_day

        # Skip first row (header)
        e_decklist_row = driver.find_elements(By.XPATH, '/html/body/main/div/table/tbody/tr')[1:]
        players = []

        # Collect everything needed from the standings table before navigating
        # away, since the row elements belong to the current page.
        for e in e_decklist_row:
            player = {}
            player['ranking_player_tournament'] = e.find_element(By.XPATH, './td[1]').text
            player['country_player'] = e.find_element(By.XPATH, './td[3]/img').get_attribute('data-tooltip')
            # The player link is either directly in the cell or nested in a <div>
            try:
                e_player = e.find_element(By.XPATH, './td[2]/a[1]')
            except Exception:
                e_player = e.find_element(By.XPATH, './td[2]/div/a[1]')
            player['link_player_page'] = e_player.get_attribute('href')
            player['id_player'] = player['link_player_page'].split('/')[-1]
            player['name_player'] = e_player.text

            try:
                player['combo_type_id'] = e.find_element(By.XPATH, './td[4]/a').get_attribute('href').split('/')[-1].split('?')[0]
                player['combo_type_name'] = e.find_element(By.XPATH, './td[4]/a/span').get_attribute('data-tooltip')
            except Exception:
                player['combo_type_id'] = 'None'
                player['combo_type_name'] = 'None'

            try:
                player['link_decklist'] = e.find_element(By.XPATH, './td[5]/a').get_attribute('href')
                player['has_decklist'] = True
            except Exception:
                player['has_decklist'] = False

            players.append(player)

        # Visit each player's page to read the all-time score
        for player in players:
            driver.get(player['link_player_page'])
            player['all_time_score'] = driver.find_element(By.XPATH, '/html/body/main/div/section[2]/div/table[2]/tbody/tr[2]/td[2]').text

        # Visit each decklist and record (url, name, amount) for its Pokémon and trainer cards
        decklists = []
        for player in players:
            if not player['has_decklist']:
                continue
            driver.get(player['link_decklist'])

            e_pokemon_cards = driver.find_elements(By.XPATH, '/html/body/main/div/div[1]/div[2]/div[1]/div[1]/div/a[1]')
            e_trainer_cards = driver.find_elements(By.XPATH, '/html/body/main/div/div[1]/div[2]/div[1]/div[2]/div/a[1]')
            e_cards = list(e_pokemon_cards) + list(e_trainer_cards)
            cards_to_iterate = [
                (
                    element.get_attribute('href'),
                    element.find_element(By.XPATH, './span[2]').text,
                    element.find_element(By.XPATH, './span[1]').text,
                )
                for element in e_cards
            ]

            decklists.append({'player': player, 'cards': cards_to_iterate})

        for decklist in decklists:

            player = decklist['player']

            for card_url, card_name, card_amount in decklist['cards']:

                card = {}
                driver.get(card_url)

                # Cards flagged as not yet released internationally are skipped
                try:
                    e_notice = driver.find_element(By.XPATH, '/html/body/main/div/section[1]/div[2]/table/tbody/tr[2]/td/span')
                    not_released = e_notice.text == 'This card has not been released internationally yet.'
                except Exception:
                    not_released = False
                if not_released:
                    continue

                card['name_card'] = card_name
                card['amount_card'] = card_amount

                e_card = driver.find_element(By.XPATH, '/html/body/main/div/section[1]/div[2]/table/tbody/tr[2]')

                # Prefer the link in the price row for the id; fall back to the card URL itself
                try:
                    card['id_card'] = _card_id_from_url(e_card.find_element(By.XPATH, './td[1]/a').get_attribute('href'))
                except Exception:
                    card['id_card'] = _card_id_from_url(card_url)

                card['price_card'] = _read_card_price(e_card)

                card['type_card'] = driver.find_element(By.XPATH, '/html/body/main/div/section[1]/div[1]/div[2]/div[1]/div[1]/div[1]/p[2]').text.split('-')[0].strip()
                if card['type_card'] == 'Pokémon':
                    card['energy_type_card'] = driver.find_element(By.XPATH, '/html/body/main/div/section[1]/div[1]/div[2]/div[1]/div[1]/div[1]/p[1]').text.split('-')[1].strip()
                else:
                    card['energy_type_card'] = 'None'

                row = [
                    card['id_card'], card['name_card'], card['amount_card'], card['price_card'],
                    card['energy_type_card'], card['type_card'],
                    player['combo_type_id'], player['combo_type_name'], player['id_player'],
                    player['name_player'], player['country_player'], player['all_time_score'],
                    player['ranking_player_tournament'],
                    tournament['id_tournament'], tournament['category_tournament'],
                    tournament['name_tournament'], tournament['country_tournament'],
                    tournament['year_tournament'], tournament['month_tournament'],
                    tournament['day_tournament'], tournament['valid_rotation_at_tournament'],
                ]
                # utf8 is explicit because the later cells read this file as utf8
                # and the rows contain non-ASCII text such as 'Pokémon'.
                with open(PATH, 'a', encoding='utf8') as f:
                    f.write(','.join(str(value) for value in row) + '\n')
    finally:
        # Always end the browser session, including on the early return above
        driver.quit()
    return

In [None]:
# Get the tournament links from `left_to_scrap.txt`
PATH_TMP = os.path.join(DIRNAME, 'left_to_scrap.txt')
with open(PATH_TMP, 'r') as f:
    # The file holds the str() of a dict of lists of links. literal_eval parses
    # that literal without executing arbitrary code the way eval() would.
    tournament_links_by_type = ast.literal_eval(f.read())

In [None]:
# Iterate through all the categories
for category, tournament_links in tournament_links_by_type.items():

    # Iterate through all the tournaments
    for tl in tournament_links:

        # Call the function to get the data from a tournament
        get_and_write_tournament_data(category, tl)

        # Remove the tournament link from the file `left_to_scrap.txt`, so the
        # file always lists only the tournaments that still have to be scraped.
        with open(PATH_TMP, 'r') as f:
            # literal_eval parses the stored dict literal without executing code
            data = ast.literal_eval(f.read())
        data[category].remove(tl)
        if not data[category]:
            del data[category]
        with open(PATH_TMP, 'w') as f:
            f.write(str(data))

In [None]:
# Delete the files `left_to_scrap.txt` and `data_tournaments.txt`, since they are not needed anymore
for tmp_name in ('left_to_scrap.txt', 'data_tournaments.txt'):
    PATH_TMP = os.path.join(DIRNAME, tmp_name)
    try:
        os.remove(PATH_TMP)
    except FileNotFoundError:
        # Nothing to clean up: the file was never created or is already gone
        pass

## Part 2 - Add regions instead of countries

In [None]:
# Auxiliary file that caches the region -> countries mapping between runs
FILENAME_AUX = 'countries.txt'
PATH_AUX = os.path.join(DIRNAME, FILENAME_AUX)

In [None]:
def create_or_get_region_file():
    """Return the region -> list-of-countries mapping, building it if needed.

    If PATH_AUX exists, the mapping is parsed from its first line. Otherwise
    every distinct value of column 16 of PATH (header excluded) is collected,
    the user is asked for the region code of each one, and the resulting
    mapping is written to PATH_AUX before being returned.

    Returns:
        dict: keys are the region codes 'NA', 'EU', 'SA', 'AS-OC', 'JP',
        'ONLINE' and 'AF'; values are lists of country names.
    """
    if os.path.exists(PATH_AUX):
        with open(PATH_AUX, 'r', encoding='utf8') as f:
            # The mapping is stored as a dict literal on the first line;
            # literal_eval parses it without executing code like eval() would.
            return ast.literal_eval(f.readline())

    with open(PATH, 'r', encoding='utf8') as f:
        next(f, None)  # skip the header row
        # Sorted so the prompts come in a reproducible order
        countries = sorted({line.split(',')[16] for line in f})

    retval = {'NA':[], 'EU':[], 'SA':[], 'AS-OC':[], 'JP':[], 'ONLINE':[], 'AF':[]}
    for country in countries:
        region_code = input('Enter the region code of ' + country + ': ').strip().upper()
        # Ask again on a typo instead of raising KeyError and losing the answers so far
        while region_code not in retval:
            region_code = input(
                'Unknown region code, choose one of ' + ', '.join(retval)
                + ' for ' + country + ': '
            ).strip().upper()
        retval[region_code].append(country)

    with open(PATH_AUX, 'w', encoding='utf8') as f:
        # write the dictionary to the file
        f.write(str(retval))
    return retval

In [None]:
country_region = create_or_get_region_file()

# Invert the mapping once so each row needs a single lookup. setdefault keeps
# the first region listed for a country, matching a first-match scan.
region_by_country = {}
for region, region_countries in country_region.items():
    for region_country in region_countries:
        region_by_country.setdefault(region_country, region)

# Rows of tournaments held in these "countries" are left out of the output
excluded_countries = {'South Africa', 'Pokémon Trading Card Game Online'}

lines = []
# substitute the country with the region
with open(PATH, 'r', encoding='utf8') as f:
    for line_number, line in enumerate(f):
        fields = line.split(',')
        if line_number == 0:
            # Header: the new column goes right before the tournament country
            fields.insert(16, 'region_tournament')
            lines.append(','.join(fields))
            continue
        country = fields[16]
        if country in excluded_countries:
            continue
        region = region_by_country.get(country)
        if region is None:
            # Countries missing from the mapping are dropped from the output
            continue
        fields.insert(16, region)
        lines.append(','.join(fields))
print(len(lines))
with open(PATH, 'w', encoding='utf8') as f:
    # Each kept line still ends with its original newline
    f.writelines(lines)

In [None]:
# Delete the auxiliary region mapping file at PATH_AUX
os.remove(PATH_AUX)

## Part 3 - Add format rotation to data

Although this kind of data is not readily available, I was able to find the format rotation information across several sources, mainly thanks to the search engine of X (formerly Twitter). The sources for the tournament rotation data are listed below.

Standard 2023
https://www.pokemon.com/us/pokemon-news/2023-pokemon-tcg-standard-format-rotation-and-pokemon-tool-errata

Standard 2022
https://www.pokemon.com/us/pokemon-news/2022-pokemon-tcg-championship-series-season-format-rotation

Standard 2021
https://www.pokemon.com/us/pokemon-news/2021-season-pokemon-tcg-format-rotation

Standard 2020
https://www.pokemon.com/us/pokemon-news/2020-season-pokemon-tcg-format-rotation

Standard 2019
https://www.pokemon.com/us/pokemon-news/2019-season-pokemon-tcg-format-rotation

Standard 2018
https://www.pokemon.com/uk/strategy/a-look-at-the-2018-pokemon-tcg-standard-rotation

Standard 2017
https://gonintendo.com/stories/260682-pokemon-tcg-format-rotation-for-2017

Standard 2016
https://www.youtube.com/watch?v=aykx4xX5U1M

Standard 2015
https://www.youtube.com/watch?v=tpWUvggYmkU

Standard 2014
https://www.youtube.com/watch?v=C7xuh5ycQMs

Standard 2013
https://www.youtube.com/watch?v=AdcQhG0yXeE

Standard 2012
https://www.youtube.com/watch?v=PtNo5PsUGyc&t=688s

In [None]:
# Copy the file 'tournaments.csv' to 'tournaments_copy.csv'
PATH_TMP = os.path.join(DIRNAME, 'tournaments_copy.csv')

# shutil.copyfile works the same on every operating system, replaces an
# existing copy, and raises if the copy fails. The original is therefore
# never emptied below unless the backup was actually written.
shutil.copyfile(PATH, PATH_TMP)

# Clean the file 'tournaments.csv'
with open(PATH, 'w', encoding='utf8') as f:
    f.write('')

In [None]:
# Each entry is [rotation_name, date on which that rotation begins],
# listed in chronological order.
# The begin date is later split into the CSV columns
# year_begin, month_begin and day_begin.

rotations = [
    ['standard_2012',datetime.date(2011,9,1)],
    ['standard_2013',datetime.date(2012,9,1)],
    ['standard_2014',datetime.date(2013,8,28)],
    ['standard_2015',datetime.date(2014,9,3)],
    ['standard_2016',datetime.date(2015,9,1)],
    ['standard_2017',datetime.date(2016,9,1)],
    ['standard_2018',datetime.date(2017,9,1)],
    ['standard_2019',datetime.date(2018,8,28)],
    ['standard_2020',datetime.date(2019,8,15)],
    ['standard_2021',datetime.date(2020,8,28)],
    ['standard_2022',datetime.date(2021,9,10)],
    ['standard_2023',datetime.date(2023,4,14)],
]

In [None]:
# Rotation assigned to tournaments held before the first entry of `rotations`
rotation_before_first = ['standard_2011', datetime.date(2010,9,1)]

# Both files are opened once and as utf8, matching how the CSV was written.
with open(PATH_TMP, 'r', encoding='utf8') as f_in, open(PATH, 'a', encoding='utf8') as f_out:
    is_first_line = True
    for line in f_in:
        line = line.replace('\n','').split(',')

        if is_first_line:
            # Header: append the names of the four rotation columns
            line.extend(['rotation_name','year_begin','month_begin','day_begin\n'])
            print(line)
            line = ','.join(line)
            print(line)
            f_out.write(line)
            is_first_line = False
            continue

        # Get data from the line
        year_tournament = line[18]
        month_tournament = line[19]
        day_tournament = line[20]
        try:
            date_tournament = datetime.date(int(year_tournament), int(month_tournament), int(day_tournament))
        except (ValueError, OverflowError):
            # Rows without a valid date are reported and left out of the output
            print(f'Error: {len(line)}')
            print(f'Error: {year_tournament}, {month_tournament}, {day_tournament}')
            continue

        # A rotation runs from its begin date until the next rotation begins,
        # so the tournament belongs to the latest rotation that had already
        # begun on its date (`rotations` is in chronological order).
        rotation = next(
            (candidate for candidate in reversed(rotations) if date_tournament >= candidate[1]),
            rotation_before_first,
        )

        # Add the rotation data to the line
        line.extend([rotation[0], str(rotation[1].year), str(rotation[1].month), str(rotation[1].day)+'\n'])
        f_out.write(','.join(line))

# delete the file 'tournaments_copy.csv'
os.remove(PATH_TMP)