# Retrieve tournaments

> Nei dati scaricati mancano: 

Requirements

In [1]:
%pip install -r ../requirements.txt -q

Note: you may need to restart the kernel to use updated packages.


Imports

In [2]:
import os
import time

import pandas as pd

import requests
import xml.etree.ElementTree as ET
import json
from tqdm import tqdm
import logging
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

Logging configuration

In [3]:
# Root logging setup: INFO and above, each record prefixed with timestamp and level.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
# Root logger object used by the scraping cell for error/info messages.
logger = logging.getLogger()

Constants

In [4]:
# Directory and file paths (all relative to the current working directory).
DATA_DIR = os.path.join(os.pardir, "data")

# Ranked list of top games (CSV input of this notebook).
TOP_GAMES_LIST_DIR = os.path.join(DATA_DIR, "temp")
TOP_GAMES_LIST_FILE = os.path.join(TOP_GAMES_LIST_DIR, "boardgames_ranks.csv")

# Usernames JSON file.
USERNAMES_DIR = os.path.join(DATA_DIR, "temp")
USERNAMES_FILE = os.path.join(USERNAMES_DIR, "usernames.json")

# Scraped tournaments (JSON output of this notebook).
TOURNAMENTS_DIR = DATA_DIR
TOURNAMENTS_FILE = os.path.join(DATA_DIR, "raw", "tournaments.json")

In [5]:
# Download parameters
# Seconds slept at the start of each iteration of the scraping loop.
REQUEST_DELAY = 1
# NOTE(review): not referenced by the cells shown in this notebook; presumably a
# checkpoint interval (every N items) -- confirm or remove.
BACKUP_PERIOD = 10
# NOTE(review): not referenced by the cells shown in this notebook; presumably a
# cap on retry attempts -- confirm or remove.
MAX_RETRIES = 5

# Number of rows taken from the top of the ranked-games CSV (with the default
# integer index produced by read_csv).
GAME_NUM = 2

In [6]:
# URLs
# Base of the BoardGameGeek "xmlapi2" endpoint.
BGG_BASE_URL = "https://boardgamegeek.com/xmlapi2"
# Root of the Board Game Arena website.
BGA_BASE_URL = "https://boardgamearena.com"

Utility functions

In [7]:
def save_to_json(data, filename):
    """
    Write ``data`` to ``filename`` as JSON, replacing any existing content.

    The file is UTF-8 encoded, non-ASCII characters are written verbatim
    (not escaped) and the output is indented with 4 spaces.

    Args:
        data: Any JSON-serializable object.
        filename (str): Path of the file to write.
    """
    with open(filename, mode="w", encoding="utf-8") as out_file:
        json.dump(data, out_file, indent=4, ensure_ascii=False)

In [8]:
def append_to_json(new_data, filename):
    """
    Merge new items into a JSON list file, dropping duplicates.

    Two items are duplicates when their JSON serialization with sorted keys
    is identical. Items keep the position of their first occurrence
    (existing items first, then new ones). A missing file is treated as an
    empty list.

    Args:
        new_data (list): Items to add.
        filename (str): Path of the JSON file to update.
    """
    try:
        with open(filename, "r", encoding="utf-8") as source:
            existing_data = json.load(source)
    except FileNotFoundError:
        # First write: nothing stored yet.
        existing_data = []

    # Canonical JSON text -> item; re-assigning a key keeps its original
    # position in the dict, so first-seen order is preserved.
    unique_items = {}
    for item in existing_data + new_data:
        unique_items[json.dumps(item, sort_keys=True)] = item

    with open(filename, "w", encoding="utf-8") as target:
        json.dump(list(unique_items.values()), target, ensure_ascii=False, indent=4)


## 1.1 Functions

In [9]:
def find_tournaments_by_id(id):
    """
    Scrape the list of future Board Game Arena tournaments for one game.

    Relies on the module-level Selenium ``driver`` to load the page.

    Args:
        id (int | str): Value for the ``game=`` query parameter, or the int
            ``-1`` sentinel meaning "game not found".

    Returns:
        pandas.DataFrame: One row per tournament with the text columns
        ``game``, ``reg_players``, ``max_players`` and ``date``. The frame
        is empty (same columns) for the ``-1`` sentinel and when the page
        contains no ``tournament_list`` element.
    """
    columns = ["game", "reg_players", "max_players", "date"]

    # Sentinel id: no request is made, so there is nothing to throttle.
    if id == -1:
        return pd.DataFrame(columns=columns)

    time.sleep(1)  # pause before hitting the site
    driver.get(
        f"{BGA_BASE_URL}/tournamentlist?d&time=0&prestige=0&type=0"
        "&players_per_match_min=0&players_per_match_max=0&gamecateg=3"
        f"&status=future&game={id}&tournament_i_registered=0&full=false"
    )
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(driver.page_source, "html.parser")

    tournament_list = soup.find(id="tournament_list")
    if tournament_list is None:
        logger.warning(f"No tournament list found on the page for game id {id}")
        return pd.DataFrame(columns=columns)

    def _texts(css_class):
        """Return the text of every element in the list carrying ``css_class``."""
        return [node.get_text() for node in tournament_list.find_all(class_=css_class)]

    games_list = _texts("tournaments-list-result__gamename")
    reg_players = _texts("tournaments-list-result__players-reg")
    max_players = _texts("tournaments-list-result__players-max")
    date_texts = _texts("tournaments-list-result__date")

    if len({len(games_list), len(reg_players), len(max_players), len(date_texts)}) > 1:
        logger.warning(
            f"Tournament fields have different lengths for game id {id}; "
            "rows are truncated to the shortest field list"
        )

    rows = [
        # The date cell text is split on spaces and its second token is kept.
        (game, registered, maximum, date_text.split(" ")[1])
        for game, registered, maximum, date_text in zip(
            games_list, reg_players, max_players, date_texts
        )
    ]
    return pd.DataFrame(rows, columns=columns)

In [10]:
def find_id_from_gamename(gamename):
    """
    Look up the id Board Game Arena uses for a game.

    Loads the game's panel page with the module-level Selenium ``driver``
    and extracts the id from the markup of the second matching blue button.

    Args:
        gamename (str): Game name as expected by the ``game=`` query
            parameter of the game panel URL.

    Returns:
        str | int: The extracted id as a string, or the int ``-1`` when the
        expected button is missing or its markup has an unexpected shape.
    """
    from urllib.parse import quote

    # Percent-encode the name so characters such as '&' or '#' cannot break
    # the query string.
    driver.get(f"{BGA_BASE_URL}/gamepanel?game={quote(gamename, safe='')}")
    time.sleep(1)  # presumably gives the page time to finish rendering
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(driver.page_source, "html.parser")
    buttons = soup.find_all(
        class_="bgabutton bgabutton_blue bga-button-inner flex-1 truncate svelte-1154gir"
    )
    # The id is read from the second button, so at least two are required.
    if len(buttons) < 2:
        return -1
    try:
        # Take the text after the third '=' in the tag markup, up to the next quote.
        return str(buttons[1]).split("=")[3].split('"')[0]
    except IndexError:
        # Markup did not have the expected number of '=' separators.
        return -1

In [11]:
# Top games: keep only the id/name columns of the first GAME_NUM rows.
# (.loc slices by label and includes the end label, hence GAME_NUM - 1.)
games = (
    pd.read_csv(TOP_GAMES_LIST_FILE)
    .loc[: GAME_NUM - 1, ["id", "name"]]
)

In [12]:
# Chrome launch flags: headless mode, sandbox disabled, /dev/shm usage disabled.
chrome_options = webdriver.ChromeOptions()
for chrome_flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
    chrome_options.add_argument(chrome_flag)

# Start the browser; ChromeDriverManager().install() supplies the chromedriver
# path handed to the Service.
driver = webdriver.Chrome(
    options=chrome_options,
    service=Service(ChromeDriverManager().install()),
)

2025-01-11 20:18:06,258 - INFO - Get LATEST chromedriver version for google-chrome
2025-01-11 20:18:07,016 - INFO - Get LATEST chromedriver version for google-chrome
2025-01-11 20:18:07,642 - INFO - There is no [win64] chromedriver "131.0.6778.264" for browser google-chrome "131.0.6778" in cache
2025-01-11 20:18:07,643 - INFO - Get LATEST chromedriver version for google-chrome
2025-01-11 20:18:09,147 - INFO - WebDriver version 131.0.6778.264 selected
2025-01-11 20:18:09,155 - INFO - Modern chrome version https://storage.googleapis.com/chrome-for-testing-public/131.0.6778.264/win32/chromedriver-win32.zip
2025-01-11 20:18:09,156 - INFO - About to download new driver from https://storage.googleapis.com/chrome-for-testing-public/131.0.6778.264/win32/chromedriver-win32.zip
2025-01-11 20:18:09,874 - INFO - Driver downloading response is 200
2025-01-11 20:18:16,341 - INFO - Get LATEST chromedriver version for google-chrome
2025-01-11 20:18:17,607 - INFO - Driver has been saved in cache [C:\Us

In [14]:
# Make sure the output directory exists before scraping, so a missing folder
# cannot discard the results at the very end.
tournaments_out_dir = os.path.dirname(TOURNAMENTS_FILE)
if tournaments_out_dir:
    os.makedirs(tournaments_out_dir, exist_ok=True)

# Per-game result frames; concatenated once after the loop instead of growing
# a DataFrame on every iteration.
tournament_frames = []

# Iterate over the game names with a progress bar
for name in tqdm(games["name"], desc="Fetching tournaments", total=games.shape[0]):
    time.sleep(REQUEST_DELAY)  # Delay between games
    try:
        # Resolve the site's game id from the normalized name (lower-case, no spaces)
        game_id = find_id_from_gamename(name.lower().replace(" ", ""))
        df = find_tournaments_by_id(game_id)
        # Skip empty results so they do not take part in the concatenation
        if not df.empty:
            tournament_frames.append(df)
    except Exception as e:
        # Best effort: log the failure and continue with the next game
        logger.error(f"Error fetching tournaments for game {name}: {e}")

if tournament_frames:
    # ignore_index gives the combined frame a clean 0..n-1 index
    df_tournaments = pd.concat(tournament_frames, axis=0, ignore_index=True)
else:
    # Nothing scraped: keep the expected schema
    df_tournaments = pd.DataFrame(columns=["game", "reg_players", "max_players", "date"])

# Save the DataFrame to a JSON file
print()
df_tournaments.to_json(TOURNAMENTS_FILE, orient="records")
logger.info(f"Saved tournaments data to {TOURNAMENTS_FILE}")

Fetching tournaments: 100%|██████████| 2/2 [00:06<00:00,  3.16s/it]
2025-01-11 20:19:15,988 - INFO - Saved tournaments data to ..\data\raw\tournaments.json
