In [2]:
# Import Statements
import pandas as pd
import numpy as np
import requests
import time
import xmltodict

In [3]:
games = pd.read_csv('../data/ranked_data.csv', index_col = 0)

In [49]:
# API base url
base_url = 'https://www.boardgamegeek.com/xmlapi2/thing?id='

## Function to Read in Text

In [50]:
# Code to turn parsed data into understandable dictionary of data
def parse_to_data(raw):
    
    # Empty dictionary
    data = {}
    
    # Distinguish
    data['id'] = raw['items']['item']['@id']
    try:
        data['name'] = raw['items']['item']['name'][0]['@value']
    except:
        pass
    try:
        data['name'] = raw['items']['item']['name']['@value']
    except:
        pass
    
    # Get yearpublished, minplayers, maxplayers
    data['year'] = raw['items']['item']['yearpublished']['@value']
    data['min_players'] = raw['items']['item']['minplayers']['@value']
    data['max_players'] = raw['items']['item']['maxplayers']['@value']
    
    # Get playtime, age
    data['playtime'] = raw['items']['item']['playingtime']['@value']
    data['min_time'] = raw['items']['item']['minplaytime']['@value']
    data['max_time'] = raw['items']['item']['maxplaytime']['@value']
    data['min_age'] = raw['items']['item']['minage']['@value']
    
    # Get board game categories and mechanics
    mech = 1
    fam = 1
    for i, item in enumerate(raw['items']['item']['link']):

        if item['@type'] == 'boardgamecategory':
            data['cat_' + str(i+1)] = item['@value']
            
        elif item['@type'] == 'boardgamemechanic':
            data['mech_' + str(mech)] = item['@value']
            mech += 1
            
        elif item['@type'] == 'boardgamefamily':
            data['fam_' + str(fam)] = item['@value']
            fam += 1
            
        else:
            pass
#         if item['@type'] in ['boardgamecategory', 'boardgamemechanic', 'boardgamefamily']:
#             data['cat_' + str(i+1)] = item['@value']
    
    # Get user poll data for suggested number of players
    for i, item in enumerate(raw['items']['item']['poll'][0]['results']):
        data['best_players_' + str(i+1)] = item['result'][0]['@numvotes']
    
    # Get user poll data for suggested age
    for i, item in enumerate(raw['items']['item']['poll'][1]['results']['result']):
        data['best_age_' + str(item['@value'])] = item['@numvotes']
        
    # Get user poll data for language dependence
    for i, item in enumerate(raw['items']['item']['poll'][2]['results']['result']):
        data['language_prof_' + str(item['@level'])] = item['@numvotes']
    
    return data

## Scraping from BGG using API

In [33]:
# Function to scrape all ids given for categorical data
def id_scrapes(ids):
    scraped_df = pd.DataFrame(index=['id', 'name', 'year', 'min_players', 'max_players',
                                     'playtime', 'min_time', 'max_time', 'min_age']).T
    
    for game_id in ids:
        print(game_id)
        res = requests.get(base_url + str(game_id))
        try:
            parsed = xmltodict.parse(res.text)
            try:
                if parsed['items']['item']['@type'] == 'boardgame':
                    data = parse_to_data(parsed)

                    # Merge
                    scraped_df = scraped_df.append(data, ignore_index = True)

                    # Sleep
                    time.sleep(8)

            except:
                time.sleep(8)
        except:
            print(f'{game_id} has an xml problem')
            
    return scraped_df

In [30]:
# Initial scrape wasn't able to get all the data.
# I chose to scrape seperately 3 times to meet the 2000 game requirement and solve gaps in data.

all_id = pd.read_csv('../data/ranked_data.csv', index_col=0).iloc[:2000, :]
some_id = pd.read_csv('../data/scrape1.csv', index_col=0)
lost_ids = list(set(all_id['id']) - set(some_id['id']))

lost_df = id_scrapes(lost_ids)
lost_df.to_csv('../data/scrape2.csv')

test = pd.read_csv('../data/ranked_data.csv', index_col=0).iloc[2000:2025]

lost_lost_df = id_scrapes(test['id'])
lost_lost_df.to_csv('../data/scrape3.csv')

In [61]:
# Create a complete dataset based on all scrapes
df1 = pd.read_csv('../data/scrape1.csv', index_col=0)
df2 = pd.read_csv('../data/scrape2.csv', index_col=0)
df3 = pd.read_csv('../data/scrape3.csv', index_col=0)

cat_df = pd.DataFrame()
cat_df = pd.concat([cat_df, df1], ignore_index=True)
cat_df = pd.concat([cat_df, df2], ignore_index=True)
cat_df = pd.concat([cat_df, df3], ignore_index=True)

cat_df

Unnamed: 0,id,name,year,min_players,max_players,playtime,min_time,max_time,min_age,cat_1,...,language_prof_741,language_prof_742,language_prof_743,language_prof_744,language_prof_745,language_prof_581,language_prof_582,language_prof_583,language_prof_584,language_prof_585
0,174430,Gloomhaven,2017,1,4,120,60,120,14,Adventure,...,,,,,,,,,,
1,161936,Pandemic Legacy: Season 1,2015,2,4,60,60,60,13,Environmental,...,,,,,,,,,,
2,224517,Brass: Birmingham,2018,2,4,120,60,120,14,Economic,...,,,,,,,,,,
3,167791,Terraforming Mars,2016,1,5,120,120,120,12,Economic,...,,,,,,,,,,
4,291457,Gloomhaven: Jaws of the Lion,2020,1,4,120,30,120,14,Adventure,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017,83667,First Train to Nuremberg,2010,2,4,120,45,120,12,Trains,...,,,,,,,,,,
2018,949,Twixt,1962,2,4,30,30,30,12,Abstract Strategy,...,,,,,,,,,,
2019,301716,Glasgow,2020,2,2,30,30,30,10,City Building,...,,,,,,,,,,
2020,35488,The Name of the Rose,2008,2,5,75,75,75,10,Bluffing,...,,,,,,,,,,


In [63]:
# Export to csv
cat_df.to_csv('../data/cat_data.csv')