In this notebook we use the BoardGameGeek XML API to scrape statistics for every board game in their collection. Each request asks for 500 ids, and the API returns any board games, board game expansions, or role-playing game items found among those 500 ids; all other item types are filtered out. We do this for every possible id (1 through 325,000). For each game we collect the id, type of item, name, publishing year, designer's name, artist's name, publisher, minimum number of players, maximum number of players, playing time, suggested minimum age, the categories the game falls into, the mechanics the game incorporates (such as set collection, or rolling a die to move), the families of games the game belongs to, the average reviewer rating, the weighted average reviewer rating, the strategic complexity of the game (which the API calls its weight), and the number of reviews. We write this information to CSV files in groups of 100,000 ids to limit the size of each file. These files are cleaned up in another notebook.

In [1]:
import requests
import time
import datetime
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
# IPython magic: execute feature_extraction.py so the names it defines become
# available to the cells that follow. The scraping cell calls extract_feature,
# extract_multiple_features and print_csv, none of which are defined in this
# notebook -- presumably they come from that script (TODO confirm).
% run feature_extraction.py

In [4]:
# Warning - takes several hours to run
#
# Walks every id from 1 to MAX_ID in batches of BATCH_SIZE, asks the
# BoardGameGeek "thing" endpoint for the board games, expansions and RPG items
# among them, and collects one row per returned item in `games`.  Every
# CSV_CHUNK_SIZE ids the rows gathered so far are handed to print_csv and the
# dict is emptied, which bounds both memory use and the size of each file.
#
# extract_feature, extract_multiple_features and print_csv are not defined in
# this cell -- presumably they come from feature_extraction.py (TODO confirm).
start = datetime.datetime.now()

MAX_ID = 325000          # highest id requested
BATCH_SIZE = 500         # ids per API request
CSV_CHUNK_SIZE = 100000  # hand the collected rows to print_csv every N ids
REQUEST_TIMEOUT = 60     # seconds; without it a stalled connection blocks forever
THING_URL = ('https://www.boardgamegeek.com/xmlapi2/thing'
             '?type=boardgame,boardgameexpansion,rpgitem&stats=1&id=')

games = {}
many_ratings_ids = {}  # never filled in this cell; kept so the notebook namespace is unchanged
id_numbers = ''
# `i` is the LAST id of the current batch (500, 1000, ..., MAX_ID).  Ids past
# the last full batch are not requested; MAX_ID is a multiple of BATCH_SIZE.
for i in range(BATCH_SIZE, MAX_ID + 1, BATCH_SIZE):
    # Plain comma-delimited list: no embedded spaces and no trailing separator.
    id_numbers = ','.join(str(n) for n in range(i - BATCH_SIZE + 1, i + 1))

    try:
        response = requests.get(THING_URL + id_numbers, timeout=REQUEST_TIMEOUT)
    except requests.exceptions.RequestException as error:
        # A network failure should not discard hours of work: report the batch
        # (so it can be re-fetched later) and back off before moving on.
        print(error)
        print(i)
        time.sleep(10)
    else:
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "xml")
            for item in soup.find_all('item'):
                game_id = item.get('id')
                thing_type = item.get('type')
                name = extract_feature(item, 'name', {'type': 'primary'})
                year = extract_feature(item, 'yearpublished')
                designer = extract_feature(item, 'link', {'type': 'boardgamedesigner'})
                artist = extract_feature(item, 'link', {'type': 'boardgameartist'})
                publisher = extract_feature(item, 'link', {'type': 'boardgamepublisher'})
                min_players = extract_feature(item, 'minplayers')
                max_players = extract_feature(item, 'maxplayers')
                play_time = extract_feature(item, 'playingtime')
                min_age = extract_feature(item, 'minage')
                categories = extract_multiple_features(item, 'category')
                mechanics = extract_multiple_features(item, 'mechanic')
                families = extract_multiple_features(item, 'family')
                avg_rating = extract_feature(item, 'average')
                bayes_avg = extract_feature(item, 'bayesaverage')
                weight = extract_feature(item, 'averageweight')
                num_ratings = extract_feature(item, 'usersrated')

                # The order of this list is consumed by print_csv; keep it as is.
                games[game_id] = [
                    name, thing_type, year, designer, artist, publisher,
                    min_players, max_players, play_time, min_age,
                    num_ratings, avg_rating, bayes_avg, weight,
                    categories, mechanics, families,
                ]

            time.sleep(1)  # pause between successful requests
        elif response.status_code == 404:
            print(i)
        else:
            # Any other status: report it with the batch's last id so the
            # batch can be re-requested later, then back off.
            print(response.status_code)
            print(i)
            time.sleep(10)

    # Runs for every batch, whatever the request outcome, so a failed batch
    # that lands on a chunk boundary can no longer skip the flush.
    if i % CSV_CHUNK_SIZE == 0:
        print_csv(games, i)
        games = {}

# Rows collected after the last chunk boundary.
print_csv(games, i)

end = datetime.datetime.now()
print(end-start, "Time Elapsed")

1:21:20.685385 Time Elapsed


In [None]:
# This cell can be used to clean up any dropped requests
#
# NOTE: the whole cell is wrapped in a triple-quoted string, so nothing in it
# executes as written; remove the surrounding quotes to run it.
# As written it requests ids 1-500 in batches of 5 (adjust the range to the
# ids that were dropped), additionally captures each item's description, and
# builds the output with pandas directly instead of calling print_csv.
# NOTE(review) before re-enabling:
#   - extract_feature / extract_multiple_features are called here without the
#     `item` argument that the main scraping cell passes first -- presumably
#     stale; confirm against feature_extraction.py.
#   - `i % 100000 == 0` can never be true for range(1, 501), so only the
#     final to_csv call after the loop writes a file.
#   - the output path f'games_to_{i}' has no file extension.
#   - num_ratings falls back to np.nan through a bare `except:`.
"""start = datetime.datetime.now()

games = {}
many_ratings_ids = {}
id_numbers = ''
for i in range(1, 501):
    
    id_numbers += (str(i) + ', ')
    if i % 5 == 0:
        response = requests.get(f'https://www.boardgamegeek.com/xmlapi2/thing?type=boardgame,boardgameexpansion,rpgitem&stats=1&id={id_numbers}')
        print(id_numbers)
        if response.status_code == 404:
            print(i)
            id_numbers=''
            continue
        elif response.status_code == 200:
            game = response.content
            soup = BeautifulSoup(game, "xml")
            items = soup.findAll('item')
            for item in items:
                game_id = item.get('id')
                thing_type = item.get('type')
                name = extract_feature('name', {'type':'primary'})
                description = item.find('description').text
                year = extract_feature('yearpublished')
                designer = extract_feature('link', {'type':'boardgamedesigner'})
                artist = extract_feature('link', {'type': 'boardgameartist'})
                publisher = extract_feature('link', {'type': 'boardgamepublisher'})
                min_players = extract_feature('minplayers')
                max_players = extract_feature('maxplayers')
                play_time = extract_feature('playingtime')
                min_age = extract_feature('minage')
                categories = extract_multiple_features('category')
                mechanics = extract_multiple_features('mechanic')
                families = extract_multiple_features('family')
                avg_rating = extract_feature('average')
                bayes_avg = extract_feature('bayesaverage')
                weight = extract_feature('averageweight')
                
                try:
                    num_ratings = item.find('usersrated').get('value')

                except:
                    num_ratings = np.nan
            
                games[game_id] = [name, thing_type, description, year, designer, artist, publisher, min_players, max_players, play_time, min_age, num_ratings, avg_rating, bayes_avg, weight, categories, mechanics, families]

            time.sleep(1)
            id_numbers = ''
        else:
            print(response.status_code)
            print(i)
            id_numbers = ''
        if i % 100000 == 0:
            games_df = pd.DataFrame.from_dict(games, orient='index', columns=['name', 'type', 'description', 'year', 'designer', 'artist', 'publisher', 'min_players', 'max_players', 'play_time', 'min_age', 'num_ratings', 'avg_rating', 'bayes_avg', 'weight', 'categories', 'mechanics', 'families'])
            games_df.to_csv(f'games_to_{i}')
            games = {}

games_df = pd.DataFrame.from_dict(games, orient='index', columns=['name', 'type', 'description', 'year', 'designer', 'artist', 'publisher', 'min_players', 'max_players', 'play_time', 'min_age', 'num_ratings', 'avg_rating', 'bayes_avg', 'weight', 'categories', 'mechanics', 'families'])
games_df.to_csv(f'games_to_{i}')

end = datetime.datetime.now()
print(end-start, "Time Elapsed")"""