# 2. Retrieve threads

> Note:
> - I dati sono sufficienti ma occorre ristrutturarli, in quanto diversi da come li vorremmo
> - Le reply non sono dirette: il subject delle reply è "Re: [subject originale]" e il body è il testo della reply

Requirements

In [26]:
%pip install -r ../requirements.txt -q

Note: you may need to restart the kernel to use updated packages.


Imports

In [27]:
import os
import time

import pandas as pd

import requests
import xml.etree.ElementTree as ET
import json
from tqdm import tqdm
import logging
from bs4 import BeautifulSoup

Logging configuration

In [28]:
# Configure the root logger for the notebook: INFO level, with records
# formatted as "timestamp - level - message".
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# getLogger() without a name returns the root logger; the fetch functions and
# the crawl loop below log through it.
logger = logging.getLogger()

Constants

In [29]:
# Directories and file paths
# Every path is built relative to the working directory, under ../data.
DATA_DIR = os.path.join("..", "data")

# CSV whose "id" and "name" columns select the games to crawl.
TOP_GAMES_LIST_DIR = DATA_DIR
TOP_GAMES_LIST_FILE = os.path.join(TOP_GAMES_LIST_DIR, "boardgames_ranks.csv")

# JSON list of usernames: loaded before the crawl and rewritten after it.
USERNAMES_DIR = os.path.join(DATA_DIR, "temp")
USERNAMES_FILE = os.path.join(USERNAMES_DIR, "usernames.json")

# JSON file receiving the crawled games with their forums, threads and messages.
FORUMS_DIR = DATA_DIR
FORUMS_FILE = os.path.join(FORUMS_DIR, "raw", "forums.json")

In [30]:
# Download parameters
REQUEST_DELAY = 1  # seconds slept before each forum-list/forum request and after each game
BACKUP_PERIOD = 10  # NOTE(review): not referenced in the visible cells
MAX_RETRIES = 5  # NOTE(review): not referenced in the visible cells; fetch_messages_from_thread has its own max_retries default
MAX_THREADS = 5  # NOTE(review): not referenced in the visible cells

GAME_NUM = 2  # number of games taken from the top of the games CSV

MAX_FORUMS_PER_GAME = 2  # forums kept per game
MAX_THREADS_PER_FORUM = 5  # threads kept per forum
MAX_MESSAGES_PER_THREAD = 5  # messages kept per thread

In [31]:
# URLs
# Base URL of the BoardGameGeek XML API 2; the fetch functions append the
# "forumlist", "forum" and "thread" endpoints to it.
BGG_BASE_URL = "https://boardgamegeek.com/xmlapi2"

Utility functions

In [32]:
def save_to_json(data, filename):
    """
    Write ``data`` to ``filename`` as pretty-printed UTF-8 JSON.

    The file is created if missing and overwritten otherwise. Non-ASCII
    characters are written verbatim (not escaped) and nested structures are
    indented by four spaces.

    Args:
        data: Any JSON-serializable object.
        filename (str): Path of the file to write.
    """
    with open(filename, mode="w", encoding="utf-8") as out_file:
        json.dump(data, out_file, indent=4, ensure_ascii=False)

In [33]:
def append_to_json(new_data, filename):
    """
    Merge new records into a JSON list file, dropping exact duplicates.

    The current content of ``filename`` is read, ``new_data`` is appended to
    it, and the de-duplicated result is written back (pretty-printed UTF-8,
    four-space indent). Two records are duplicates when their JSON
    serialization with sorted keys is identical. Records keep the position of
    their first occurrence.

    A missing file, or a file that is empty / whitespace-only (for instance
    after an interrupted write), is treated as containing no records. A file
    with non-empty but invalid JSON still raises, so its content is never
    silently overwritten.

    Args:
        new_data (list): Records to add; any iterable of JSON-serializable
            items is accepted.
        filename (str): Path of the JSON file holding a list of records.

    Raises:
        json.JSONDecodeError: If the file holds non-empty, invalid JSON.
        TypeError: If the file's top-level JSON value is not a list.
    """
    try:
        with open(filename, "r", encoding="utf-8") as f:
            raw = f.read()
    except FileNotFoundError:
        raw = ""

    # Only parse when there is something to parse: json.loads("") would raise.
    existing_data = json.loads(raw) if raw.strip() else []

    # Key each record by its canonical (sorted-keys) serialization; a dict
    # keeps one entry per key, in first-seen order.
    combined_data = {
        json.dumps(item, sort_keys=True): item
        for item in existing_data + list(new_data)
    }.values()

    with open(filename, "w", encoding="utf-8") as f:
        json.dump(list(combined_data), f, ensure_ascii=False, indent=4)

## 2.1 Functions

In [34]:
def fetch_forum_list(game_id, max_forums=100, timeout=30):
    """
    Fetch the list of forums associated with a game.

    Sleeps ``REQUEST_DELAY`` seconds, then queries the ``forumlist`` endpoint
    for the given game. Any failure (network error, non-200 status, malformed
    XML) is logged and reported as an empty list, so callers can keep going.

    Args:
        game_id: Id of the game, sent as ``id`` with ``type=thing``.
        max_forums (int): Maximum number of forums to return.
        timeout (float): Seconds to wait for the server before giving up on
            the request.

    Returns:
        list[dict]: One dict per forum with the keys ``id``, ``title``,
        ``num_threads``, ``num_posts`` and ``last_post_date``; an empty list
        on failure.
    """
    url = f"{BGG_BASE_URL}/forumlist?id={game_id}&type=thing"
    time.sleep(REQUEST_DELAY)  # Delay between requests

    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        logger.error(f"Error fetching forums for game {game_id}: {e}")
        return []

    if response.status_code != 200:
        logger.error(f"Error fetching forums for game {game_id}: {response.status_code}")
        return []

    try:
        root = ET.fromstring(response.content)
    except ET.ParseError as e:
        logger.error(f"Error fetching forums for game {game_id}: {e}")
        return []

    forums = []
    for forum in root.findall("forum")[:max_forums]:
        attrs = forum.attrib
        forums.append({
            "id": attrs.get("id"),
            "title": attrs.get("title"),
            "num_threads": int(attrs.get("numthreads", 0)),
            # The API names the post counter "numposts"; "posts" is kept as a
            # fallback for responses that use the old lookup key.
            "num_posts": int(attrs.get("numposts", attrs.get("posts", 0))),
            "last_post_date": attrs.get("lastpostdate")
        })
    return forums

In [35]:
def fetch_threads_from_forum(forum_id, max_threads=5, timeout=30):
    """
    Fetch the first threads of a forum.

    Sleeps ``REQUEST_DELAY`` seconds, then requests page 1 of the ``forum``
    endpoint. Only that page is read, so at most the threads it contains can
    be returned regardless of ``max_threads``. Any failure (network error,
    non-200 status, malformed XML) is logged and reported as an empty list.

    Args:
        forum_id: Id of the forum.
        max_threads (int): Maximum number of threads to return.
        timeout (float): Seconds to wait for the server before giving up on
            the request.

    Returns:
        list[dict]: One dict per thread with the keys ``thread_id``,
        ``author``, ``subject``, ``num_articles``, ``post_date`` and
        ``last_post_date``; an empty list on failure.
    """
    url = f"{BGG_BASE_URL}/forum?id={forum_id}&page=1"
    time.sleep(REQUEST_DELAY)  # Delay between requests

    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        logger.error(f"Error fetching threads for forum {forum_id}: {e}")
        return []

    if response.status_code != 200:
        logger.error(f"Error fetching threads for forum {forum_id}: {response.status_code}")
        return []

    try:
        root = ET.fromstring(response.content)
    except ET.ParseError as e:
        logger.error(f"Error fetching threads for forum {forum_id}: {e}")
        return []

    threads = []
    for thread in root.findall("threads/thread")[:max_threads]:
        attrs = thread.attrib
        threads.append({
            "thread_id": attrs.get("id"),
            "author": attrs.get("author"),
            "subject": attrs.get("subject"),
            "num_articles": int(attrs.get("numarticles", 0)),
            "post_date": attrs.get("postdate"),
            "last_post_date": attrs.get("lastpostdate")
        })
    return threads

In [36]:
def fetch_messages_from_thread(thread_id, max_posts=5, sleep_time=0.5, max_retries=5, timeout=30):
    """
    Fetch the first messages of a thread, retrying failed requests.

    The ``thread`` endpoint is requested up to ``max_retries`` times (always
    at least once), sleeping ``sleep_time`` seconds before every attempt. Both
    network errors and non-200 responses count as failed attempts. A single
    request is made per attempt; no pagination is performed, only the first
    ``max_posts`` articles of the response are kept.

    Args:
        thread_id: Id of the thread.
        max_posts (int): Maximum number of messages to return.
        sleep_time (float): Seconds to sleep before each attempt.
        max_retries (int): Maximum number of attempts.
        timeout (float): Seconds to wait for the server on each attempt.

    Returns:
        tuple[list[dict], list[str]]: The messages (dicts with the keys
        ``article_id``, ``username``, ``post_date``, ``edit_date``,
        ``num_edits``, ``subject`` and ``content``) and the distinct usernames
        of their authors. Both lists are empty when the thread could not be
        downloaded or parsed.
    """
    url = f"{BGG_BASE_URL}/thread?id={thread_id}"

    response = None
    failure = None
    # max(1, ...) guarantees termination and one real attempt even when a
    # caller passes max_retries <= 0.
    for _ in range(max(1, max_retries)):
        time.sleep(sleep_time)  # Delay between requests
        try:
            attempt = requests.get(url, timeout=timeout)
        except requests.RequestException as e:
            # Transient network errors are retried like bad status codes.
            failure = e
            continue
        if attempt.status_code == 200:
            response = attempt
            break
        failure = attempt.status_code

    if response is None:
        logger.error(f"Error fetching messages for thread {thread_id}: {failure}. Retries exhausted.")
        return [], []

    try:
        root = ET.fromstring(response.content)
    except ET.ParseError as e:
        logger.error(f"Error fetching messages for thread {thread_id}: {e}")
        return [], []

    articles = root.find("articles")
    if articles is None:
        # A 200 response without an <articles> element carries no messages.
        logger.error(f"Error fetching messages for thread {thread_id}: no <articles> element in response")
        return [], []

    messages = []
    usernames = set()
    for article in articles.findall("article")[:max_posts]:
        username = article.attrib.get("username")
        if username is not None:
            # Articles without a username must not add None to the user list.
            usernames.add(username)
        subject = article.find("subject")
        body = article.find("body")
        messages.append({
            "article_id": article.attrib.get("id"),
            "username": username,
            "post_date": article.attrib.get("postdate"),
            "edit_date": article.attrib.get("editdate"),
            "num_edits": int(article.attrib.get("numedits", 0)),
            "subject": subject.text if subject is not None else None,
            "content": body.text if body is not None else None
        })
    logger.info(f"\t\tDownloaded {len(messages)} messages from thread {thread_id}")

    return messages, list(usernames)

## 2.2 Execution

In [37]:
# Usernames already stored on disk; the crawl below adds newly seen ones.
with open(USERNAMES_FILE, mode="r", encoding="utf-8") as f:
    usernames = {*json.load(f)}

# Games to crawl: "id" and "name" of the rows labelled 0 .. GAME_NUM-1
# (label slicing with .loc includes the end label).
games = pd.read_csv(TOP_GAMES_LIST_FILE).loc[: GAME_NUM - 1, ["id", "name"]]

In [38]:
# False until FORUMS_FILE has been written once by the crawl loop: the first
# game overwrites the file, later games are appended to it.
saved_once = False

In [39]:
# Crawl each selected game (forums -> threads -> messages) and persist it as
# soon as it is complete, so finished games are on disk before the next starts.
for name, game_id in tqdm(zip(games["name"], games["id"]), desc="Fetching forums", total=games.shape[0]):
    print("")  # presumably keeps the log lines off the progress-bar line
    game = {
        "id": game_id,
        "name": name,
        "forums": []
    }
    # At most MAX_FORUMS_PER_GAME forums; an empty list if the request failed
    forums = fetch_forum_list(game_id, max_forums=MAX_FORUMS_PER_GAME)
    logger.info(f"Retrieving {len(forums)} forums for game {game['name']}...")

    # Fill every forum dict in place with its threads
    for forum in forums:
        # At most MAX_THREADS_PER_FORUM threads; an empty list if the request failed
        threads = fetch_threads_from_forum(forum["id"], max_threads=MAX_THREADS_PER_FORUM)
        logger.info(f"\tRetrieving {len(threads)} threads for forum {forum['title']}...")

        # Fill every thread dict in place with its messages
        for thread in threads:
            # Returns the messages and the usernames of their authors
            thread["messages"], users = fetch_messages_from_thread(thread["thread_id"], max_posts=MAX_MESSAGES_PER_THREAD)
            # Merge the authors into the global set of known usernames
            usernames.update(users)

        # Attach the threads to the forum
        forum["threads"] = threads

    # Attach the forums to the game
    game["forums"] = forums

    # Persist the game: overwrite the file for the first game, append afterwards
    if not saved_once:
        # First write of this run: replaces any previous content of FORUMS_FILE
        save_to_json([game], FORUMS_FILE)
        saved_once = True
    else:
        # Later games are merged into the existing file (duplicates dropped)
        append_to_json([game], FORUMS_FILE)
    logger.info(f"Saved data for game {game['name']} into '{FORUMS_FILE}'")

    time.sleep(REQUEST_DELAY)  # Extra pause between games

# Usernames are written only once, after all games have been processed
save_to_json(list(usernames), USERNAMES_FILE)

Fetching forums:   0%|          | 0/2 [00:00<?, ?it/s]




2025-01-11 19:55:01,636 - INFO - Retrieving 2 forums for game Brass: Birmingham...
2025-01-11 19:55:03,476 - INFO - 	Retrieving 5 threads for forum Reviews...
2025-01-11 19:55:04,911 - INFO - 		Downloaded 5 messages from thread 3415529
2025-01-11 19:55:06,183 - INFO - 		Downloaded 5 messages from thread 3363071
2025-01-11 19:55:07,482 - INFO - 		Downloaded 1 messages from thread 3418104
2025-01-11 19:55:09,410 - INFO - 		Downloaded 5 messages from thread 3405292
2025-01-11 19:55:10,887 - INFO - 		Downloaded 3 messages from thread 3375145
2025-01-11 19:55:12,598 - INFO - 	Retrieving 5 threads for forum Sessions...
2025-01-11 19:55:13,821 - INFO - 		Downloaded 5 messages from thread 3043438
2025-01-11 19:55:15,082 - INFO - 		Downloaded 2 messages from thread 2645328
2025-01-11 19:55:16,308 - INFO - 		Downloaded 3 messages from thread 2325688
2025-01-11 19:55:17,561 - INFO - 		Downloaded 3 messages from thread 2067171
2025-01-11 19:55:18,828 - INFO - 		Downloaded 5 messages from thread 20




2025-01-11 19:55:21,589 - INFO - Retrieving 2 forums for game Pandemic Legacy: Season 1...
2025-01-11 19:55:23,408 - INFO - 	Retrieving 5 threads for forum Reviews...
2025-01-11 19:55:24,706 - INFO - 		Downloaded 2 messages from thread 3359270
2025-01-11 19:55:25,975 - INFO - 		Downloaded 2 messages from thread 2914484
2025-01-11 19:55:27,245 - INFO - 		Downloaded 2 messages from thread 2587737
2025-01-11 19:55:28,543 - INFO - 		Downloaded 1 messages from thread 2886655
2025-01-11 19:55:29,864 - INFO - 		Downloaded 5 messages from thread 2512372
2025-01-11 19:55:31,717 - INFO - 	Retrieving 5 threads for forum Sessions...
2025-01-11 19:55:33,143 - INFO - 		Downloaded 1 messages from thread 3398935
2025-01-11 19:55:34,559 - INFO - 		Downloaded 1 messages from thread 3368723
2025-01-11 19:55:35,937 - INFO - 		Downloaded 5 messages from thread 2598830
2025-01-11 19:55:37,232 - INFO - 		Downloaded 2 messages from thread 2273751
2025-01-11 19:55:38,756 - INFO - 		Downloaded 2 messages from t