In [49]:
#imports
import numpy as np
import pandas as pd

import re
import time

from bs4 import BeautifulSoup
import requests

# Script for scraping board game data from boardgamegeek.com

This is a script that scrapes 48 attributes for baordgames from boardgamegeek.com. Initially the script is configured to scrape any range of pages starting on the browse menu with the sort by popularity(or rank). It can be modified to scrape in other fashions. The scraper generates a list of links for games for any given page. In this case there are 100 games per page. Once a list of links is generated, it requests the 'stats' page of each game, this was determined to be where the most relevant information was located, however all the game data is delivered via a js script, which is parsed manually into dictionaries because of its messy structure. This was decided as a better route than introducing anything heavier than BS4 (like selenium, phantomjs).

In [50]:
start_time = time.time()

page_n = 1 #keep track of which page the scraper is on
games = []# list to collect data
base_url = 'https://boardgamegeek.com/' #the base url
start_url = 'https://boardgamegeek.com/browse/boardgame/page/' #start page

# a list of attributes to be collected from 'all_d'
collect_list = ["objectid","name","yearpublished","sortindex","minplayers",
                "maxplayers","minplaytime","maxplaytime","minage","best",
                "max","totalvotes","playerage","languagedependence", 
                "usersrated","average","baverage","stddev","avgweight", 
                "numweights","numgeeklists","numtrading","numwanting",
                "numcomments","views","numplays","numplays_month","news",
                "blogs","weblink","podcast","label","boardgamedesigner",
                "boardgameartist","boardgamepublisher","boardgamehonor",
                "boardgamecategory","boardgamemechanic","boardgameexpansion",
                "boardgameversion","boardgamefamily"]
#a list of attributes to be collected from "credit_d"
credit_collect_list = ["boardgamedesigner","boardgameartist","boardgamepublisher",
                       "boardgamehonor","boardgamecategory","boardgameversion"]
# while loop to continue until page_n reached the decided limit
while page_n <= 200 :
    #request the page
    game_names = BeautifulSoup(requests.get(start_url+str(page_n)).text,'lxml')
    #get list of games from page
    game_list = game_names.find_all('td',{'class':'collection_objectname'}) 
    
    #below is logic to stop the loop if no additional links are available
    if not game_list:
        print('No more results: Exiting...') 
        break
    else:
        print('Getting page:{}'.format(page_n))
        
        #iterate through game list for page_n
        for game in game_list:
            game_data = [] #collect data
            #request game page
            game_page = BeautifulSoup(requests.get(base_url+\
                        game.select('a')[0]['href']+'/stats').text,'html.parser')
            #capture whole description of game
            desc_tag = game_page.find_all('meta',{'property':'og:description'})
            desc_str = str(desc_tag).split('>',1)[0][16:-27].replace('&amp;ldquo;','"')\
            .replace('&amp;rdquo;','"').replace('\n',' ')
            
            #manipulate js script data and create dictionaries for attribute look up
            script =game_page.find("script", text=re.compile("GEEK.geekitemPreload\s+="))
            data_list = str(script).replace('=4&mt=8&at=','').replace(':{"link"','')\
            .split("=",9)[9].strip().split(':{')
            
            comm_d = {i.split(':')[0]:' '.join(i.split(':')[1:])\
                      for i in data_list[3][:-1].split(',') if len(i.split(':'))> 1 }
            stats_d = {i.split(':')[0]:' '.join(i.split(':')[1:])\
                       for i in data_list[5][:-1].split(',') if len(i.split(':'))> 1 }
            counts_d = {i.split(':')[0]:' '.join(i.split(':')[1:])\
                        for i in data_list[6][:-1].split(',') if len(i.split(':'))> 1 }
            info_d = {i.split(':')[0]:' '.join(i.split(':')[1:])\
                      for i in data_list[7][:-1].split(',') if len(i.split(':'))> 1 }
            cr_cnt_d = {i.split(':')[0]:' '.join(i.split(':')[1:])\
                        for i in data_list[9][:-1].split(',') if len(i.split(':'))> 1 }
            rank_d = {i.split(':')[0]:' '.join(i.split(':')[1:])\
                      for i in data_list[10][:-1].split(',') if len(i.split(':'))> 1 }
            
            all_d = {**comm_d, **stats_d,**counts_d,**info_d,**cr_cnt_d,**rank_d}
            
            #extract data from messy string
            credit_d = {}
            for i in data_list[8].split('],'):
                item = i.split(":",1)
                try:
                    values = [h.split(',')[0] for h in\
                              [j.split(':')[1] for j in item[1].split('},{')]]
                except:
                    pass
                
                credit_d[item[0]]=values
                
            #iterate through list of desired attributes to capture values
            for i in collect_list:
                if '"'+i+'"' in all_d.keys():
                    game_data.append(all_d['"'+i+'"'].replace('"',''))
                else:
                    game_data.append('NaN')
            for i in credit_collect_list:
                if '"'+i+'"' in credit_d.keys():
                    game_data.append(credit_d['"'+i+'"'])
                else:
                    game_data.append('NaN')
            
            game_data.append(desc_str) #append game description
            game_data.append(game.select('a')[0]['href'])
            games.append(game_data)
        page_n += 1
        
end_time = time.time()

print('total_time: {} minutes'.format((end_time-start_time)/60))

Getting page:1
Getting page:2
Getting page:3
Getting page:4
Getting page:5
Getting page:6
Getting page:7
Getting page:8
Getting page:9
Getting page:10
Getting page:11
Getting page:12
Getting page:13
Getting page:14
Getting page:15
Getting page:16
Getting page:17
Getting page:18
Getting page:19
Getting page:20
Getting page:21
Getting page:22
Getting page:23
Getting page:24
Getting page:25
Getting page:26
Getting page:27
Getting page:28
Getting page:29
Getting page:30
Getting page:31
Getting page:32
Getting page:33
Getting page:34
Getting page:35
Getting page:36
Getting page:37
Getting page:38
Getting page:39
Getting page:40
Getting page:41
Getting page:42
Getting page:43
Getting page:44
Getting page:45
Getting page:46
Getting page:47
Getting page:48
Getting page:49
Getting page:50
Getting page:51
Getting page:52
Getting page:53
Getting page:54
Getting page:55
Getting page:56
Getting page:57
Getting page:58
Getting page:59
Getting page:60
Getting page:61
Getting page:62
Getting page:63
G

In [51]:
df = pd.DataFrame.from_records(games)

In [74]:
naidx = [i for i in range(len(df[1])) if list(df[1])[i] == "NaN"]

In [75]:
naidx

[154,
 649,
 2380,
 3345,
 6890,
 7014,
 7055,
 7207,
 7544,
 7735,
 7757,
 13185,
 13427,
 17617]

The loop needs to run again over the missing games with corrected code

In [213]:
missing_games = ['boardgame/555/princes-florence',
                 'boardgame/52461/legacy-testament-duke-de-crecy',
                 'boardgame/212765/songbirds',
                 'boardgame/266460/yinzi-shining-ming-dynasty',
                 'boardgame/228234/hatsuden',
                 'boardgame/162292/draugr',
                 'boardgame/193584/last-garden',
                 'boardgame/240584/blend-coffee-lab',
                 'boardgame/215469/pyramids-deadline',
                 'boardgame/282438/throne-allegoria/',
                 'boardgame/22203/face-mat',
                 'boardgame/275087/auf-der-walz',
                 'boardgame/208736/lawless-empire',
                 'boardgame/155208/raiders-lost-tomb' ]

In [214]:
m_games_data = []

In [215]:
for game in missing_games:
    game_data = [] #collect data
    #request game page
    game_page = BeautifulSoup(requests.get(base_url+\
                game+'/stats').text,'html.parser')
    #capture whole description of game
    desc_tag = game_page.find_all('meta',{'property':'og:description'})
    desc_str = str(desc_tag).split('>',1)[0][16:-27].replace('&amp;ldquo;','"')\
    .replace('&amp;rdquo;','"').replace('\n',' ')

    #manipulate js script data and create dictionaries for attribute look up
    script =game_page.find("script", text=re.compile("GEEK.geekitemPreload\s+="))
    #added 'replace' in text statements to accomodate dictionary assignment
    data_list = str(script).replace('=4&mt=8&at=','').replace(':{"link"','')\
    .replace(':{"orderurl"','').replace(':{"shopifyname"','').split("=",9)[9].strip().split(':{')

    comm_d = {i.split(':')[0]:' '.join(i.split(':')[1:])\
              for i in data_list[3][:-1].split(',') if len(i.split(':'))> 1 }
    stats_d = {i.split(':')[0]:' '.join(i.split(':')[1:])\
               for i in data_list[5][:-1].split(',') if len(i.split(':'))> 1 }
    counts_d = {i.split(':')[0]:' '.join(i.split(':')[1:])\
                for i in data_list[6][:-1].split(',') if len(i.split(':'))> 1 }
    info_d = {i.split(':')[0]:' '.join(i.split(':')[1:])\
              for i in data_list[7][:-1].split(',') if len(i.split(':'))> 1 }
    cr_cnt_d = {i.split(':')[0]:' '.join(i.split(':')[1:])\
                for i in data_list[9][:-1].split(',') if len(i.split(':'))> 1 }
    rank_d = {i.split(':')[0]:' '.join(i.split(':')[1:])\
              for i in data_list[10][:-1].split(',') if len(i.split(':'))> 1 }

    all_d = {**comm_d, **stats_d,**counts_d,**info_d,**cr_cnt_d,**rank_d}

    #extract data from messy string
    credit_d = {}
    for i in data_list[8].split('],'):
        item = i.split(":",1)
        try:
            values = [h.split(',')[0] for h in\
                      [j.split(':')[1] for j in item[1].split('},{')]]
        except:
            pass

        credit_d[item[0]]=values

    #iterate through list of desired attributes to capture values
    for i in collect_list:
        if '"'+i+'"' in all_d.keys():
            game_data.append(all_d['"'+i+'"'].replace('"',''))
        else:
            game_data.append('NaN')
    for i in credit_collect_list:
        if '"'+i+'"' in credit_d.keys():
            game_data.append(credit_d['"'+i+'"'])
        else:
            game_data.append('NaN')

    game_data.append(desc_str) #append game description
    m_games_data.append(game_data)

Replace rows in df with updated information

In [234]:
for idx,new_data in zip(naidx,m_games_data):
    for i in range(len(new_data)):
        df.loc[idx][i] = new_data[i] 
        

Label the columns

In [238]:
df.columns = ["objectid","name","yearpublished","sortindex","minplayers",
                "maxplayers","minplaytime","maxplaytime","minage","min_community",
                "max_community","totalvotes","playerage","languagedependence", 
                "usersrated","average","baverage","stddev","avgweight", 
                "numweights","numgeeklists","numtrading","numwanting",
                "numcomments","siteviews","numplays","numplays_month","news",
                "blogs","weblink","podcast","label","boardgamedesigner_cnt",
                "boardgameartist_cnt","boardgamepublisher_cnt","boardgamehonor_cnt",
                "boardgamecategory_cnt","boardgamemechanic_cnt","boardgameexpansion_cnt",
                "boardgameversion_cnt","boardgamefamily_cnt","boardgamedesigner",
                 "boardgameartist","boardgamepublisher","boardgamehonor","boardgamecategory",
                 "boardgameversion",'description']

In [239]:
df

Unnamed: 0,objectid,name,yearpublished,sortindex,minplayers,maxplayers,minplaytime,maxplaytime,minage,min_community,...,boardgameexpansion_cnt,boardgameversion_cnt,boardgamefamily_cnt,boardgamedesigner,boardgameartist,boardgamepublisher,boardgamehonor,boardgamecategory,boardgameversion,description
0,174430,Gloomhaven,2017,1,1,4,60,120,12,[{min 3,...,4,19,7,"[""Isaac Childres""]","[""Alexandr Elichev"", ""Josh T. McDowell"", ""Alva...","[""Cephalofair Games"", ""Albi"", ""Asmodee"", ""Feue...","[""2017 Best Science Fiction or Fantasy Board G...","[""Adventure"", ""Exploration"", ""Fantasy"", ""Fight...","[""Chinese edition"", ""Czech edition"", ""English ...",Gloomhaven is a game of Euro-inspired tactica...
1,161936,Pandemic Legacy Season 1,2015,1,2,4,60,60,13,[{min 4,...,0,33,3,"[""Rob Daviau"", ""Matt Leacock""]","[""Chris Quilliams""]","[""Z-Man Games, ""Asterion Press"", ""Devir"", ""Fil...","[""2015 Cardboard Republic Immersionist Laurel ...","[""Environmental"", ""Medical""]","[""Chinese blue edition"", ""Chinese red edition""...",Pandemic Legacy is a co-operative campaign gam...
2,167791,Terraforming Mars,2016,1,1,5,120,120,12,[{min 3,...,15,29,6,"[""Jacob Fryxelius""]","[""Isaac Fryxelius""]","[""FryxGames"", ""Arclight"", ""Fantasmagoria"", ""Gh...","[""2016 Cardboard Republic Architect Laurel Nom...","[""Economic"", ""Environmental"", ""Industry \/ Man...","[""Bulgarian edition"", ""Chinese edition"", ""Czec...","In the 2400s, mankind begins to terraform the ..."
3,182028,Through the Ages A New Story of Civilization,2015,1,2,4,120,120,14,[{min 3,...,1,14,2,"[""Vlaada Chv\u00e1til""]","[""Filip Murmak"", ""Radim Pech"", ""Jakub Politzer...","[""Czech Games Edition"", ""Cranio Creations"", ""D...","[""2015 Golden Geek Best Strategy Board Game No...","[""Card Game"", ""Civilization"", ""Economic""]","[""Chinese edition"", ""Czech edition"", ""English ...",Through the Ages: A New Story of Civilization ...
4,224517,Brass Birmingham,2018,1,2,4,60,120,14,[{min 3,...,0,9,6,"[""Gavan Brown"", ""Matt Tolman"", ""Martin Wallace""]","[""Lina Cossette"", ""David Forest"", ""Damien Mamm...","[""Roxley"", ""BoardM Factory"", ""Conclave Editora...","[""2018 Golden Geek Best Board Game Artwork & P...","[""Economic"", ""Industry \/ Manufacturing"", ""Tra...","[""English deluxe edition"", ""English retail edi...",Brass: Birmingham is an economic strategy game...
5,233078,Twilight Imperium (Fourth Edition),2017,1,3,6,240,480,14,[{min 6,...,0,11,3,"[""Dane Beltrami"", ""Corey Konieczka"", ""Christia...","[""Scott Schomburg""]","[""Fantasy Flight Games"", ""ADC Blackfire Entert...","[""2017 Golden Geek Best Strategy Board Game No...","[""Civilization"", ""Economic"", ""Negotiation"", ""P...","[""Chinese edition"", ""Czech edition"", ""English ...",Twilight Imperium (Fourth Edition) is a game o...
6,12333,Twilight Struggle,2005,1,2,2,120,180,13,[{min 2,...,9,32,5,"[""Ananda Gupta"", ""Jason Matthews""]","[""Viktor Csete"", ""Rodger B. MacGowan"", ""Chechu...","[""GMT Games"", ""(Self-Published)"", ""Asterion Pr...","[""2005 Charles S. Roberts Best Modern Era Boar...","[""Modern Warfare"", ""Political"", ""Wargame""]","[""Bard Centrum Gier Polish deluxe edition"", ""B...","&amp;quot;Now the trumpet summons us again, no..."
7,187645,Star Wars Rebellion,2016,1,2,4,180,240,14,[{min 2,...,1,10,2,"[""Corey Konieczka""]","[""Matt Allsopp"", ""David Ardila"", ""Balaskas"", ""...","[""Fantasy Flight Games"", ""ADC Blackfire Entert...","[""2016 Best Science Fiction or Fantasy Board G...","[""Fighting"", ""Miniatures"", ""Movies \/ TV \/ Ra...","[""Czech edition"", ""English edition"", ""French e...",From the publisher: Star Wars: Rebellion is a...
8,220308,Gaia Project,2017,1,1,4,60,150,12,[{min 3,...,0,13,3,"[""Jens Dr\u00f6gem\u00fcller"", ""Helge Ostertag""]","[""Dennis Lohausen""]","[""Feuerland Spiele"", ""Cranio Creations"", ""Dice...","[""2017 Golden Geek Best Solo Board Game Nomine...","[""Civilization"", ""Economic"", ""Science Fiction""...","[""Chinese edition"", ""Dutch edition"", ""English ...",Gaia Project is a new game in the line of Terr...
9,169786,Scythe,2016,1,1,5,90,115,14,[{min 4,...,16,21,6,"[""Jamey Stegmaier""]","[""Jakub Rozalski""]","[""Stonemaier Games"", ""Albi"", ""Angry Lion Games...","[""2016 Cardboard Republic Architect Laurel Nom...","[""Economic"", ""Fighting"", ""Science Fiction"", ""T...","[""Chinese edition"", ""Czech edition"", ""English ...",It is a time of unrest in 1920s Europa. The as...


In [240]:
df.to_csv('boardgames-raw.csv')