# Capstone Project 2

## Building the dataset

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from time import sleep
import re
import time
from tqdm import tqdm

### Building a dataframe with the 50 most rated boardgames (code found on the Internet) 

In [2]:
def req(msg, slp=0.2):
    """Make fault tolerant BGG server requests."""
    # Sleep to make sure you are not pinging the server to frequently
    sleep(slp)
    # Keep trying requests until status-code is 200
    status_code = 500
    while status_code != 200:
        sleep(slp)
        try:
            r = requests.get(msg)
            status_code = r.status_code
            # if status_code != 200:
                # print("Server Error! Response Code %i. Retrying..." % (r.status_code))
        except:
            # print("An exception has occurred, probably a momentory loss of connection. Waiting three seconds...")
            sleep(3)
    return r

In [3]:
def request(msg, slp=1):
    '''A wrapper to make robust https requests.'''
    status_code = 500  # Want to get a status-code of 200
    while status_code != 200:
        sleep(slp)  # Don't ping the server too often
        try:
            r = requests.get(msg)
            status_code = r.status_code
            if status_code != 200:
                print("Server Error! Response Code %i. Retrying..." % (r.status_code))
        except:
            print("An exception has occurred, probably a momentory loss of connection. Waiting one seconds...")
            sleep(1)
    return r

In [4]:
# Get full HTML for a specific page in the full listing of boardgames sorted by 
r = request("https://boardgamegeek.com/browse/boardgame/page/%i?sort=numvoters&sortdir=desc" % (1,))
soup = BeautifulSoup(r.text, "html.parser")    
    
# Get rows for the table listing all the games on this page
table = soup.find_all("tr", attrs={"id": "row_"})  # Get list of all the rows (tags) in the list of games on this page
df = pd.DataFrame(columns=["id", "name", "nrate", "pic_url"], index=range(len(table)))  # DF to hold this pages results
    
# Loop through each row and pull out the info for that game
for idx, row in enumerate(table):
    # Row may or may not start with a "boardgame rank" link, if YES then strip it
    links = row.find_all("a")
    if "name" in links[0].attrs.keys():
        del links[0]
    gamelink = links[1]  # Get the relative URL for the specific game
    gameid = int(gamelink["href"].split("/")[2])  # Get the game ID by parsing the relative URL
    gamename = gamelink.contents[0]  # Get the actual name of the game as the link contents
    imlink = links[0]  # Get the URL for the game thumbnail
    thumbnail = imlink.contents[0]["src"]

    ratings_str = row.find_all("td", attrs={"class": "collection_bggrating"})[2].contents[0]
    nratings = int("".join(ratings_str.split()))

    df.iloc[idx, :] = [gameid, gamename, nratings,thumbnail]

sleep(2) # Keep the BGG server happy.

# Prepare a "# of FULL pages of ratings" column to track # API calls needed
df["nfullpages"] = (df["nrate"]-50).apply(round, ndigits=-2)/100  # Round DOWN to nearest 100

# 50 most nrated game
df=df.iloc[0:50]
del df['pic_url']

In [5]:
print(df.shape)
df

(50, 4)


Unnamed: 0,id,name,nrate,nfullpages
0,13,Catan,87364,873.0
1,822,Carcassonne,87035,870.0
2,30549,Pandemic,85844,858.0
3,68448,7 Wonders,71104,711.0
4,36218,Dominion,69600,696.0
5,9209,Ticket to Ride,61886,618.0
6,31260,Agricola,58028,580.0
7,3076,Puerto Rico,57424,574.0
8,40692,Small World,54692,546.0
9,178900,Codenames,53435,534.0


### Building a dataframe with the reviews, rates and some information on the 50 most rated boardgames

In [6]:
def game_info (soup,ID):
    """Get the boardgame information"""
    name=soup('name')[0]["value"]
    year=soup('yearpublished')[0]["value"]
    min_play=soup('minplayers')[0]["value"]
    max_play=soup('maxplayers')[0]["value"]
    min_time=soup('minplaytime')[0]["value"]
    max_time=soup('maxplaytime')[0]["value"]
    min_age=soup('minage')[0]["value"]

    Category=[]
    for a in range(0,10):
        if soup('link')[a]['type']=="boardgamecategory":   
            Category.append(soup('link')[a]["value"])
    category=Category[0]
    return name,year,min_play,max_play,min_time,max_time,min_age,category

In [7]:
def game_rating_reviews (soup,review,username,rating):
    """Get the boardgames rates and reviews"""
    for nb in range(0,99):
        review.append(soup('comment')[nb]["value"])
        username.append(soup('comment')[nb]["username"])
        rate.append(soup('comment')[nb]["rating"])
    return review,username,rate

In [None]:
# transform the column with the id as a object column and the column with the number of pages as an integer column
df.id=df.id.astype(str)
df.nfullpages=df.nfullpages.astype(int)
# add a progression bar
for i in tqdm(range(20)):
    # Go through the 30 boardgames
    for a in range(0,50):
        nb_rate=df.nrate[a]
        ID=df.id[a]
        nb_page=df.nfullpages[a]
        # Go through the n pages of reviews
        for b in range (0,nb_page):
            b_str=str(b)
            # Access to each of the boardgame information page by page
            url="https://www.boardgamegeek.com/xmlapi2/thing?id="+ID+"&page="+b_str+"&ratingcomments=1"
            r=requests.get(url)
            soup=BeautifulSoup(r.text)
            # Check if the connection works
            if r.status_code==200:
                review=[]
                username=[]
                rate=[]
                # Get the boardgames information
                name,year,min_play,max_play,min_time,max_time,min_age,category=game_info(soup,ID)
                # Get the boardgames rates & reviews
                review,username,rate=game_rating_reviews(soup,review,username,rate)
                # Building the dataframe for each boardgame
                data={'id':ID,'name':name,'year':year,'min_play':min_play,'max_play':max_play,'min_time':min_time,
                     'max_time':max_time,'min_age':min_age,'category':category,'nb_rate':nb_rate,'username':username,
                     'review':review,'rate':rate}
                df_bgg_onegame=pd.DataFrame(data)
                # Drop the raws without any review
                df_bgg_onegame=df_bgg_onegame.drop(df_bgg_onegame[df_bgg_onegame.review==''].index) 
                # Concatenate the dataframes
                if a==0 & b=='0':
                    df_bgg=df_bgg_onegame.copy()
                else:
                    df_bgg=df_bgg.append(df_bgg_onegame).reset_index(drop=True)
    time.sleep(3)

In [10]:
# Saving the dataset in a cvs format
df_bgg.to_csv("bgg_review.csv", index=False, encoding="utf-8")
print(df_bgg.shape)
df_bgg

(304910, 13)


Unnamed: 0,id,name,year,min_play,max_play,min_time,max_time,min_age,category,nb_rate,username,review,rate
0,13,Catan,1995,3,4,60,120,10,Negotiation,87205,Deleted010518,2008-04-05 \r\r\r\r\n\r\r\r\r\nDave Loved the ...,7.2
1,13,Catan,1995,3,4,60,120,10,Negotiation,87205,fliptrackr,Like this game (even though I don't own it). ...,7.2
2,13,Catan,1995,3,4,60,120,10,Negotiation,87205,ShirKhan,"Classic. Besides that, fairly random and often...",7.2
3,13,Catan,1995,3,4,60,120,10,Negotiation,87205,bkunes,"Enjoyable game, challenge is getting at least ...",7.2
4,13,Catan,1995,3,4,60,120,10,Negotiation,87205,BenjoBaker,What can I say? If you haven't played Settler...,7.2
5,13,Catan,1995,3,4,60,120,10,Negotiation,87205,fformighieri,Esse jogo é uma excelente maneira de introduzi...,7.2
6,13,Catan,1995,3,4,60,120,10,Negotiation,87205,Tibetgeek,"First Euro-style strategy game. It was fun, go...",7.2
7,13,Catan,1995,3,4,60,120,10,Negotiation,87205,Podicle,"Haven't played this for a while, mainly becaus...",7.2
8,13,Catan,1995,3,4,60,120,10,Negotiation,87205,ividdythou,"So much hate for this game floating around, I ...",7.2
9,13,Catan,1995,3,4,60,120,10,Negotiation,87205,GamerVP,One of the games where I ended up with most of...,7.2
