# Board games scraping

In [None]:
import requests as re
from bs4 import BeautifulSoup
import pandas as pd
import json
from datetime import datetime
import time
from tqdm.notebook import tqdm_notebook

In [None]:
def bgg_top(n_pages=20, start_page=1):
    '''
    Scrape the ranked board game listing of boardgamegeek.com into a DataFrame.
    -----
    Parameters
    -----
    n_pages: number of consecutive listing pages to download, default - 20
        (the original author notes that without signing in to
        boardgamegeek.com only the first 20 pages can be loaded).
    start_page: number of the first page to download, default - 1.
    -----
    Returns
    -----
    pandas.DataFrame with one row per game and the columns rank, title,
    game_id, description, year, link, geek_rating, avg_rating, num_votes.
    Every value is kept as the string found in the page (no numeric
    conversion). The index is a plain 0..N-1 range; the frame is empty when
    no rows were found (for example with n_pages=0).
    '''
    url_main = 'https://boardgamegeek.com'
    url_listing = url_main + '/browse/boardgame' + '/page/'
    id_offset = len('/boardgame/')

    # Rows are gathered in a list and converted once at the end: calling
    # pd.concat per row copies the whole frame every time and leaves every
    # row with index label 0.
    records = []
    for page in range(start_page, start_page + n_pages):
        # NOTE: `re` is this file's alias for the requests library
        # (`import requests as re`), not the stdlib regex module.
        r = re.get(url_listing + str(page))
        soup = BeautifulSoup(r.text)
        for position, row in enumerate(soup.find_all('tr', id='row_'), start=1):
            rank = row.find('td', class_='collection_rank').find('a').get('name')
            # The title container's id carries the 1-based position of the
            # row within the page.
            gameinfo = row.find('div', id='results_objectname' + str(position))
            anchor = gameinfo.find('a')
            title = anchor.text

            # Year and description are optional: fall back to ''.
            year_tag = gameinfo.find('span')
            year = year_tag.text if year_tag is not None else ''
            # Selector string reproduced verbatim from the original code.
            description_tag = row.find('p', class_='smallefont dull')
            description = description_tag.text.strip() if description_tag is not None else ''

            href = anchor.get('href')
            link = url_main + href
            # The id is whatever follows '/boardgame/' up to the next '/'.
            # partition() also copes with an href that has no further slash
            # (slicing with str.find() == -1 would drop the last digit).
            game_id = str(href)[id_offset:].partition('/')[0]

            rating_cells = [cell.text.strip()
                            for cell in row.find_all('td', class_='collection_bggrating')]

            records.append({'rank': rank, 'title': title, 'game_id': game_id,
                            'description': description, 'year': year, 'link': link,
                            'geek_rating': rating_cells[0],
                            'avg_rating': rating_cells[1],
                            'num_votes': rating_cells[2]})

    return pd.DataFrame(records)

In [None]:
def get_bgg_categories():
    '''
    Download the list of board game categories from boardgamegeek.com.
    -----
    Returns
    -----
    pandas.DataFrame with one row per link found inside the page's
    'forum_table' table and the columns category (the link text) and
    link (absolute URL). The index is a plain 0..N-1 range.
    -----
    Raises
    -----
    AttributeError if the page contains no table with class 'forum_table'.
    '''
    url_main = 'https://boardgamegeek.com'
    url_cat = 'https://boardgamegeek.com/browse/boardgamecategory'
    # NOTE: `re` is this file's alias for the requests library
    # (`import requests as re`), not the stdlib regex module.
    r = re.get(url_cat)
    soup = BeautifulSoup(r.text)
    table = soup.find('table', class_='forum_table')

    # href=True skips anchors without a target, which would otherwise make
    # the URL concatenation fail with a TypeError. Building the frame from a
    # list avoids one pd.concat per row and the duplicated index it produced.
    records = [{'category': anchor.text, 'link': url_main + anchor.get('href')}
               for anchor in table.find_all('a', href=True)]
    return pd.DataFrame(records)

In [None]:
# strptime pattern of the marketplace dates. %z (rather than a literal
# "+0000") yields timezone-aware datetimes, so the derived Unix timestamps do
# not depend on the time zone of the machine running the scraper.
_BGG_DATE_FORMAT = '%a, %d %b %Y %H:%M:%S %z'

# (output column, XML tag) pairs read directly below <boardgame>.
_BGG_BASIC_FIELDS = (
    ('year_published', 'yearpublished'),
    ('minplayers', 'minplayers'),
    ('maxplayers', 'maxplayers'),
    ('minplaytime', 'minplaytime'),
    ('maxplaytime', 'maxplaytime'),
    ('age', 'age'),
)

# (output column, XML tag) pairs read below <statistics>.
_BGG_STAT_FIELDS = (
    ('users_rated', 'usersrated'),
    ('average_rating', 'average'),
    ('bayes_average_rating', 'bayesaverage'),
    ('median', 'median'),
    ('stddev', 'stddev'),
    ('owned', 'owned'),
    ('trading', 'trading'),
    ('wishing', 'wishing'),
    ('num_of_comments', 'numcomments'),
    ('num_of_weights', 'numweights'),
    ('average_weight', 'averageweight'),
)

# (output column, XML tag) pairs whose every occurrence is collected in a list.
_BGG_LIST_FIELDS = (
    ('publishers', 'boardgamepublisher'),
    ('honors', 'boardgamehonor'),
    ('expansions', 'boardgameexpansion'),
    ('accessories', 'boardgameaccessory'),
    ('artists', 'boardgameartist'),
    ('mechanics', 'boardgamemechanic'),
    ('category', 'boardgamecategory'),
    ('designers', 'boardgamedesigner'),
    ('graphic_designers', 'boardgamegraphicdesigner'),
    ('subdomains', 'boardgamesubdomain'),
    ('implementations', 'boardgameimplementation'),
)


def _bgg_child_text(parent, tag, **attrs):
    '''Return the text of the first `tag` under `parent`, or '' if either is missing.'''
    if parent is None:
        return ''
    node = parent.find(tag, **attrs)
    return node.text if node is not None else ''


def _bgg_all_texts(parent, tag):
    '''Return the texts of every `tag` element under `parent` as a list.'''
    return [node.text for node in parent.find_all(tag)]


def _bgg_parse_listings(container, sold):
    '''
    Convert the <listing> children of a marketplace element to a dict keyed by position.

    sold=True adds the sale timestamp and the number of seconds between
    listing and sale; sold=False adds the listing timestamp instead.
    As in the original code the result is all-or-nothing: one listing that
    cannot be parsed makes the whole result an empty dict.
    '''
    if container is None:
        return {}
    parsed = {}
    try:
        for position, listing in enumerate(container.find_all('listing')):
            listdate = datetime.strptime(listing.find('listdate').text, _BGG_DATE_FORMAT)
            if sold:
                saledate = datetime.strptime(listing.find('saledate').text, _BGG_DATE_FORMAT)
                delta = saledate - listdate
                entry = {'unix_saledate': int(saledate.timestamp()),
                         'saletime': delta.days * 24 * 60 * 60 + delta.seconds}
            else:
                entry = {'unix_listdate': int(listdate.timestamp())}
            price = listing.find('price')
            entry['price'] = price.text
            entry['currency'] = price.get('currency')
            entry['condition'] = listing.find('condition').text
            parsed[position] = entry
    except (AttributeError, ValueError):
        # AttributeError: an expected child element is missing;
        # ValueError: a date does not match _BGG_DATE_FORMAT.
        return {}
    return parsed


def get_api_bgg_game_data(unique_ids):
    '''
    Retrieve aggregated game information via the boardgamegeek.com XML API.

    ----------

    unique_ids - iterable of unique board game ids; strings or integers are
    accepted (it is best to feed no more than 50 ids per call). An empty
    iterable returns an empty DataFrame without contacting the API.

    ----------
    Returns a pandas.DataFrame with one row per <boardgame> element of the
    response and a plain 0..N-1 index. Scalar fields hold the element text
    ('' when the element is missing); publishers, honors, expansions, ...
    hold lists of texts; ranks, suggested_numplayers, comments,
    marketplace_history and marketplace_listings hold dicts.
    marketplace_history / marketplace_listings timestamps are Unix seconds
    computed from the UTC offset given in the API dates.

    ----------
    Using API BoardGameGeek:
    https://api.geekdo.com/xmlapi/boardgame/37111?stats=1&pricehistory=1&marketplace=1&comments=1

    base - https://api.geekdo.com/xmlapi/boardgame
    game - /37111 - gameid
    params - ?stats=1&pricehistory=1&marketplace=1&comments=1
    comments: Show brief user comments on games (set it to 1, absent by default)
    stats: Include game statistics (set it to 1, absent by default)
    historical: Include historical game statistics (set it to 1, absent by default) - Use from/end parameters to set starting and ending dates. Returns all data starting from 2006-03-18.
    from: Set the start date to include historical data (format: YYYY-MM-DD, absent by default )
    to: Set the end date to include historical data (format: YYYY-MM-DD, absent by default )
    pricehistory: retrieve the marketplace history for this item (set it to 1, absent by default)
    marketplace: retrieve the current marketplace listings (set it to 1, absent by default)
    '''
    ids = [str(game_id) for game_id in unique_ids]
    if not ids:
        return pd.DataFrame()

    api_boardgame = 'https://api.geekdo.com/xmlapi/boardgame/'
    api_params = '?stats=1&pricehistory=1&marketplace=1&comments=1'
    # NOTE: `re` is this file's alias for the requests library
    # (`import requests as re`), not the stdlib regex module.
    r = re.get(api_boardgame + ','.join(ids) + api_params)
    soup = BeautifulSoup(r.text, 'xml')

    records = []
    for bg in soup.find_all('boardgame'):
        # The insertion order below defines the column order of the result.
        game = {'boardgame_id': bg.get('objectid'),
                'title': _bgg_child_text(bg, 'name', primary='true')}

        #     get basic info
        for column, tag in _BGG_BASIC_FIELDS:
            game[column] = _bgg_child_text(bg, tag)

        #     get statistics info
        stats = bg.find('statistics')
        for column, tag in _BGG_STAT_FIELDS:
            game[column] = _bgg_child_text(stats, tag)
        game['ranks'] = ({} if stats is None else
                         {rank.get('friendlyname'): rank.get('value')
                          for rank in stats.find_all('rank')})

        # The first publisher listed is reported as the main one.
        game['main_publisher'] = _bgg_child_text(bg, 'boardgamepublisher')
        game['description'] = _bgg_child_text(bg, 'description')
        for column, tag in _BGG_LIST_FIELDS:
            game[column] = _bgg_all_texts(bg, tag)

        #     get info about voting
        # Only the first <poll> element is read.
        # NOTE(review): presumably the suggested-number-of-players poll; confirm.
        suggested_numplayers = {}
        poll = bg.find('poll')
        if poll is not None:
            for results in poll.find_all('results'):
                suggested_numplayers[results.get('numplayers')] = {
                    result.get('value'): result.get('numvotes')
                    for result in results.find_all('result')}
        game['suggested_numplayers'] = suggested_numplayers

        game['podcast_episodes'] = _bgg_all_texts(bg, 'boardgamepodcastepisode')

        #     get all short comments
        # Keyed by username, so a later comment of the same user replaces an earlier one.
        game['comments'] = {comment.get('username'): comment.text
                            for comment in bg.find_all('comment')}

        #     get historical sales data
        game['marketplace_history'] = _bgg_parse_listings(bg.find('marketplacehistory'), sold=True)
        #     get real sales data
        game['marketplace_listings'] = _bgg_parse_listings(bg.find('marketplacelistings'), sold=False)

        records.append(game)

    # One DataFrame construction instead of a pd.concat per game.
    return pd.DataFrame(records)

In [None]:
def get_user_ratings(nickname, sleep=True):
    '''
    Download every rated item of a boardgamegeek.com user's collection.

    ----------

    nickname - boardgamegeek.com username.
    sleep - when True (default) the collection URL is requested once, the
    function waits 14.66 seconds, and only then performs the request whose
    response is parsed. A 0.33 second pause precedes the parsed request in
    both modes.

    ----------
    Returns a pandas.DataFrame with one row per <item> of the response, a
    plain 0..N-1 index and the columns nickname, title, boardgame_id,
    rating, num_of_plays, comment, own, prevowned, fortrade, want,
    wanttoplay, wanttobuy, wishlist, preordered, last_modified.
    Missing text elements give '', missing attributes give None.
    '''
    url = 'https://api.geekdo.com/xmlapi/collection/' + nickname + '?rated=1'

    # NOTE: `re` is this file's alias for the requests library
    # (`import requests as re`), not the stdlib regex module.
    if sleep:
        # The response of this first request is deliberately discarded.
        # NOTE(review): presumably it only triggers preparation of the
        # collection on the server side - confirm against the API docs.
        re.get(url)
        time.sleep(14.66)
    time.sleep(0.33)
    r = re.get(url)
    soup = BeautifulSoup(r.text, features="xml")

    def text_of(parent, tag):
        # Text of the first `tag` below `parent`, '' when it is absent.
        node = parent.find(tag)
        return node.text if node is not None else ''

    # (output column, attribute of <status>) pairs, in output order.
    status_fields = (('own', 'own'), ('prevowned', 'prevowned'),
                     ('fortrade', 'fortrade'), ('want', 'want'),
                     ('wanttoplay', 'wanttoplay'), ('wanttobuy', 'wanttobuy'),
                     ('wishlist', 'wishlist'), ('preordered', 'preordered'),
                     ('last_modified', 'lastmodified'))

    records = []
    for item in soup.find_all('item'):
        stats = item.find('stats')
        rating_node = stats.find('rating') if stats is not None else None
        rating = rating_node.get('value') if rating_node is not None else ''

        vote = {'nickname': nickname,
                'title': text_of(item, 'name'),
                'boardgame_id': item.get('objectid'),
                'rating': rating,
                'num_of_plays': text_of(item, 'numplays'),
                'comment': text_of(item, 'comment')}

        # An item without <status> yields None flags instead of aborting
        # the whole download.
        status = item.find('status')
        for column, attribute in status_fields:
            vote[column] = status.get(attribute) if status is not None else None

        records.append(vote)

    # One DataFrame construction instead of a pd.concat per item.
    return pd.DataFrame(records)