# Web Scraping and Data Collection

The dataset consists primarily of user-submitted data on factors such as number of players, playtime, language proficiency (how difficult it is to understand the instructions), and age recommendations. The goal of the project is to use this data to build a recommender system that recommends similar games. Stretch goals include being able to choose specific characteristics (such as game category) in the search.

In [15]:
# Import libraries
import pandas as pd
import numpy as np
import requests
import time
import xmltodict
import os

# Web Scraping
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import FirefoxOptions

## Part 1 - ID and Rankings
|Feature|Type|Dataset|Description|
|---|---|---|---|
|id|int|ranked_data.csv|Id of the board game.|
|name|object|ranked_data.csv|Name of the board game.|
|collection_rank|int|ranked_data.csv|Ranking of the board game.|
|geek_rating|float|ranked_data.csv|Bayesian averaged rating, reduces influence of individual ratings.|
|avg_rating|float|ranked_data.csv|Uses user input ratings to give an average score.|
|num_voters|int|ranked_data.csv|Number of people who gave a user rating.|
|price|object|ranked_data.csv|Price of the game according to GeekMarket.|

In [3]:
# Function to take in all data from a page and put it into a dataframe
def _strip_cell(text):
    """Return *text* without the tabs and newlines that pad a table cell."""
    return text.strip('\t\n')


def page_to_data(soup):
    """Collect every game row of a parsed browse page into a dataframe.

    Parameters
    ----------
    soup : parsed page object
        Must support ``find_all(id='row_')``; each returned row must support
        the ``find`` / ``find_all`` lookups used below.

    Returns
    -------
    pandas.DataFrame
        One row per game with the columns 'id', 'name', 'collection_rank',
        'geek_rating', 'avg_rating', 'num_voters' and 'price'. All values are
        kept as the text found on the page. A page without game rows yields
        an empty dataframe that still has these columns.
    """
    table_index = ['id', 'name', 'collection_rank', 'geek_rating', 'avg_rating', 'num_voters', 'price']

    # Rows are gathered in a plain list and turned into a dataframe once at
    # the end; DataFrame.append was removed in pandas 2.0 and copied the
    # whole frame on every call.
    rows = []

    for game in soup.find_all(id='row_'):
        data = {}

        # ID: third '/'-separated piece of the first link's href
        first_link = game.find('a', href=True)
        data['id'] = first_link.get('href').split('/')[2]

        # Name
        name_link = game.find('a', {'class': 'primary'})
        data['name'] = name_link.contents[0]

        # Collection Rank
        rank_cell = game.find('td', {'class': 'collection_rank'})
        data['collection_rank'] = _strip_cell(rank_cell.contents[2])

        # Geek Rating, Average Rating, Number of Voters share one cell class
        # and are read in that order
        rating_cells = game.find_all('td', {'class': 'collection_bggrating'})
        data['geek_rating'] = _strip_cell(rating_cells[0].contents[0])
        data['avg_rating'] = _strip_cell(rating_cells[1].contents[0])
        data['num_voters'] = _strip_cell(rating_cells[2].contents[0])

        # Price: not every row has one, so a failed lookup is expected
        try:
            price_link = game.find('a', {'class': 'ulprice'})
            data['price'] = price_link.contents[1].contents[0]
        except (AttributeError, IndexError, TypeError):
            data['price'] = '(unavailable)'

        rows.append(data)

    return pd.DataFrame(rows, columns=table_index)

In [25]:
def ranked_scrape(pages=50, driver_path='/home/clifford/Documents/geckodriver',
                  out_path='../data/ranked_data.csv', delay=8):
    """Scrape the ranked browse pages and export the combined table to csv.

    Parameters
    ----------
    pages : int
        Number of browse pages to visit, starting at page 1. The default of
        50 corresponds to 5000 games according to the original notes.
    driver_path : str
        Location of the geckodriver executable handed to Firefox.
    out_path : str
        Destination of the exported csv.
    delay : int or float
        Seconds to wait after each page. The original notes state that
        robots.txt asks for 5s and that 8 is used as a buffer.

    Returns
    -------
    None
        The result is only written to ``out_path``.
    """
    base_url = 'https://boardgamegeek.com/browse/boardgame/page/'
    frames = []

    # One browser session serves every page. quit() in the finally block
    # shuts the browser and its driver down even if a page fails midway.
    # NOTE(review): `executable_path` is kept from the original call; newer
    # Selenium releases may expect a Service object instead - confirm against
    # the installed version.
    fo = FirefoxOptions()
    web = webdriver.Firefox(executable_path=driver_path, options=fo)
    try:
        for i in range(1, pages + 1):
            # Use selenium to load the page and scroll to the bottom
            web.get(base_url + str(i))
            web.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            page = BeautifulSoup(web.page_source, 'lxml')

            # Read the page's rows
            frames.append(page_to_data(page))

            # Wait between pages
            time.sleep(delay)
    finally:
        web.quit()

    # Combine once at the end; DataFrame.append was removed in pandas 2.0
    games = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Export to csv
    games.to_csv(out_path)

## Part 2 - Categorical Data
|Feature|Type|Dataset|Description|
|---|---|---|---|
|id|int|cat_data.csv|Id of the game.|
|name|object|cat_data.csv|Name of the game.| 
|year|int|cat_data.csv|Year game was published.| 
|min/max_players|int|cat_data.csv|Minimum and maximum players allowed.| 
|playtime|int|cat_data.csv|Estimated game playtime.| 
|min/max_time|int|cat_data.csv|Minimum and maximum estimated game playtime.| 
|min_age|int|cat_data.csv|Minimum recommended age.| 
|cat_#|object|cat_data.csv|Each category signifies a category or aspect of the game.|
|categories|object|cat_data.csv|Contains a list of all of the 5 categories attributed to the game.|

In [9]:
# Code to turn parsed data into understandable dictionary of data
def _as_list(node):
    """Wrap a lone parsed element in a list; lists are returned unchanged."""
    return node if isinstance(node, list) else [node]


def parse_to_data(raw):
    """Flatten one parsed game record into a single-level dictionary.

    Parameters
    ----------
    raw : dict
        Parsed document whose game record sits at ``raw['items']['item']``.

    Returns
    -------
    dict
        Keys, in order: 'id', 'name' (only when a name value is present),
        'year', 'min_players', 'max_players', 'playtime', 'min_time',
        'max_time', 'min_age', then numbered 'cat_N' / 'mech_N' / 'fam_N'
        entries for category, mechanic and family links, followed by the
        poll columns 'best_players_N', 'best_age_<value>' and
        'language_prof_<level>'. Values are copied as found in ``raw``.

    Raises
    ------
    KeyError, TypeError, IndexError
        When an expected field (other than the name) is missing or has an
        unexpected shape.
    """
    item = raw['items']['item']
    data = {'id': item['@id']}

    # The name is either a list of name entries (first one is used) or a
    # single entry; leave the key out if neither shape carries a value.
    name = item.get('name')
    primary = name[0] if isinstance(name, list) and name else name
    if isinstance(primary, dict) and '@value' in primary:
        data['name'] = primary['@value']

    # Year, player counts, playtimes and minimum age
    simple_fields = (
        ('year', 'yearpublished'),
        ('min_players', 'minplayers'),
        ('max_players', 'maxplayers'),
        ('playtime', 'playingtime'),
        ('min_time', 'minplaytime'),
        ('max_time', 'maxplaytime'),
        ('min_age', 'minage'),
    )
    for column, tag in simple_fields:
        data[column] = item[tag]['@value']

    # Board game categories, mechanics and families. Each kind gets its own
    # counter so numbering starts at 1 and has no gaps regardless of where
    # the links appear in the list. A lone link is parsed as a dict rather
    # than a list, hence the normalisation.
    prefixes = {
        'boardgamecategory': 'cat_',
        'boardgamemechanic': 'mech_',
        'boardgamefamily': 'fam_',
    }
    counts = dict.fromkeys(prefixes.values(), 0)
    for link in _as_list(item['link']):
        prefix = prefixes.get(link['@type'])
        if prefix is None:
            continue
        counts[prefix] += 1
        data[prefix + str(counts[prefix])] = link['@value']

    polls = item['poll']

    # User poll for suggested number of players: the vote count of the first
    # option listed under each results entry
    for i, entry in enumerate(_as_list(polls[0]['results'])):
        data['best_players_' + str(i + 1)] = _as_list(entry['result'])[0]['@numvotes']

    # User poll for suggested age
    for vote in _as_list(polls[1]['results']['result']):
        data['best_age_' + str(vote['@value'])] = vote['@numvotes']

    # User poll for language dependence
    for vote in _as_list(polls[2]['results']['result']):
        data['language_prof_' + str(vote['@level'])] = vote['@numvotes']

    return data

In [12]:
# Function to scrape all ids given for categorical data
def id_scrapes(ids):
    """Fetch and flatten the categorical data for every id in *ids*.

    Parameters
    ----------
    ids : iterable
        Game ids; each is appended to the API url as text.

    Returns
    -------
    pandas.DataFrame
        One row per id whose record has type 'boardgame'. The columns start
        with 'id', 'name', 'year', 'min_players', 'max_players', 'playtime',
        'min_time', 'max_time', 'min_age', followed by any further keys the
        parsed records carry, in order of first appearance. With no ids the
        dataframe is empty but keeps the nine base columns.

    Notes
    -----
    An id whose request, parsing or flattening fails is reported with
    ``print`` and skipped so that one bad record does not end the run.
    """
    base_columns = ['id', 'name', 'year', 'min_players', 'max_players',
                    'playtime', 'min_time', 'max_time', 'min_age']
    base_url = 'https://www.boardgamegeek.com/xmlapi2/thing?id='

    # Rows are collected in a list and combined once at the end;
    # DataFrame.append was removed in pandas 2.0.
    rows = []

    for game_id in ids:
        try:
            # The timeout keeps a stalled connection from hanging the run
            res = requests.get(base_url + str(game_id), timeout=30)
            res.raise_for_status()
            parsed = xmltodict.parse(res.text)
            if parsed['items']['item']['@type'] == 'boardgame':
                rows.append(parse_to_data(parsed))
        except Exception as err:
            # Exception (not a bare except) so Ctrl-C still stops the scrape
            print(f'Skipping id {game_id}: {err!r}')

        # Sleep after every request, whatever its outcome
        time.sleep(8)

    # Base columns first, then every extra key in order of first appearance
    columns = list(base_columns)
    seen = set(columns)
    for row in rows:
        for key in row:
            if key not in seen:
                seen.add(key)
                columns.append(key)

    return pd.DataFrame(rows, columns=columns)

In [31]:
# Load the ranked table and keep the ids of its first 2000 rows
all_id = pd.read_csv('../data/ranked_data.csv', index_col=0)
ids = all_id['id'].iloc[:2000]

# The categorical scrape is left commented out; uncomment both lines to run it
# cat_df = id_scrapes(ids)
# cat_df.to_csv('../data/cat_data.csv')

This part of the project proved to be finicky and required multiple passes to complete. To handle this, a separate notebook titled "1.5_Scrape_Testing" was used.

## Part 3 - User Ratings
|Feature|Type|Dataset|Description|
|---|---|---|---|
|username|object|user_ratings.csv|Name of the user reviewer.|
|rating|int|user_ratings.csv|Rating from 1 to 10.| 
|value|object|user_ratings.csv|Text of the user's comment/review (for NLP).|

In [2]:
# Function to turn web data into a dataframe with all comments in a page
def comments_into_list(comments):
    """Turn the parsed comments of one page into a dataframe.

    Parameters
    ----------
    comments : list of dict, or dict
        Parsed comment entries. A page holding a single comment is parsed as
        one dict instead of a list, so a lone dict is treated as a
        one-element list.

    Returns
    -------
    pandas.DataFrame
        Columns 'id', 'username', 'rating', 'value', one row per comment.
        'id' is left empty for the caller to fill in. An attribute missing
        from a comment is left empty in that row.
    """
    rating_index = ['id', 'username', 'rating', 'value']

    if isinstance(comments, dict):
        comments = [comments]

    # Rows are built in a list and converted once; DataFrame.append was
    # removed in pandas 2.0.
    rows = []
    for comment in comments:
        # An entry that is not a mapping contributes an empty row
        attrs = comment if isinstance(comment, dict) else {}
        rows.append({
            field: attrs['@' + field]
            for field in rating_index[1:]
            if '@' + field in attrs
        })

    return pd.DataFrame(rows, columns=rating_index)

def list_into_df(game_id, parsed_text):
    """Build the comment dataframe for one parsed page and tag it with the game id.

    The comment entries are read from
    ``parsed_text['items']['item']['comments']['comment']``, converted with
    ``comments_into_list`` and the resulting 'id' column is set to *game_id*.
    """
    item = parsed_text['items']['item']
    # Per the original note, a page generally holds 100 comments until the last one
    frame = comments_into_list(item['comments']['comment'])
    frame['id'] = game_id
    return frame

# Function to get one page of comments
def page_ratings(game_id, page_num):
    """Request one page of rating comments for a game and return it parsed.

    Returns the dictionary produced by ``xmltodict.parse`` on the response text.
    """
    endpoint = 'https://www.boardgamegeek.com/xmlapi2/thing?id='
    response = requests.get(f'{endpoint}{game_id}&ratingcomments=1&page={page_num}')
    return xmltodict.parse(response.text)

# Function to get all ratings in a page
def all_page_scrape(game_id):
    """Collect every page of rating comments for one game.

    Pages are requested in order starting at 1 until a page's 'comments'
    section no longer contains a 'comment' entry.

    Parameters
    ----------
    game_id
        Id passed on to ``page_ratings`` and ``list_into_df``.

    Returns
    -------
    pandas.DataFrame
        All comment rows of the game with a fresh 0..n-1 index; an empty
        dataframe when the first page already has no comments.
    """
    page = 1
    frames = []

    while True:
        # Each page is requested exactly once and the same response is used
        # both for the stop check and for the data, so every request is
        # followed by the full delay.
        parsed = page_ratings(game_id, page)
        if 'comment' not in parsed['items']['item']['comments']:
            break

        frames.append(list_into_df(game_id, parsed))
        page += 1
        time.sleep(8)

    # Combine once at the end; DataFrame.append was removed in pandas 2.0
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

# Function to get scraped ratings of a game into a csv
def scrape_to_csv(game_id_list):
    """Scrape the ratings of each game and write them to one csv per game.

    Parameters
    ----------
    game_id_list : iterable
        Game ids; each produces ``./game_ratings/<id>.csv``.

    Raises
    ------
    OSError
        If the output directory cannot be created.
    """
    out_dir = './game_ratings'

    # exist_ok covers the "already there" case without hiding real failures
    # such as missing permissions
    os.makedirs(out_dir, exist_ok=True)

    # Scrape and export a csv for desired ids
    for game_id in game_id_list:
        rating_df = all_page_scrape(game_id)
        rating_df.to_csv(f'{out_dir}/{game_id}.csv')

# This code is purely decorative as the actual scraping was done remotely.
# all_id = pd.read_csv('./ranked_data.csv', index_col=0)
# ids = all_id['id']
# ids = ids[:2025]
# scrape_to_csv(ids)

The scraping for this section was done using two separate AWS Cloud Services in order to gather the user ratings.