In [None]:
from bs4 import BeautifulSoup
import requests 
import pandas as pd
from IPython.display import clear_output
import time
import os

In [None]:
# Everything is read from / written to the "data" folder next to the notebook.
BASE_DIR = os.getcwd()
# os.path.join picks the right separator for the current OS; hard-coded "\\"
# only works on Windows and yields one odd file name everywhere else.
links_dir = os.path.join(BASE_DIR, "data", "game_links.txt")
csv_dir = os.path.join(BASE_DIR, "data", "bgg_csv100.csv")

###UNCOMMENT IF USING PREVIOUSLY EXTRACTED LINKS
#with open(links_dir) as f:
#    lines = f.read()
#    game_links=lines.split(',')

# Paginated "browse all board games" listing; the page number is appended to it.
url_page="https://boardgamegeek.com/browse/boardgame/page/"
# Site root; the relative game links collected from the listing are appended to it.
url_root="https://boardgamegeek.com"

# Header sent with the HTTP requests so they identify as a regular browser.
user_agent = {'User-agent': 'Mozilla/5.0'}

We initially thought that we were supposed to collect information about 50k games, so the solution here doesn't use Selenium because of how long that would take.

The process:
First we extract the links for each game via "extract_links" function (with the ability to decide the start and finish page).

Then we "extract soups" via the "extract_soups" function by going to every single link we've collected and getting a response through the requests.get() function. We take the response and convert it into a readable string via Beautiful Soup. In BGG's case what we receive is mostly JavaScript code.

Then, through "extract_data_into_dataframe", we first disassemble the JavaScript block in order to eliminate the unnecessary parts (parts that could still contain potential keywords but without any useful info). We then go through the chunks of code and keep looking for our very specific keywords that mark exactly the place where our information resides. We then clean that string, and the result is the information we were looking for.
This function, after collecting each bit of information into a separate list, combines them all into a single dataframe. Then we save the dataframe into a csv file, completing the process.

The "main" cell is simply an automation of the whole process, but each step can be called upon individually depending on the need. For example, if we've already collected the links, then we can skip that part altogether by supplying the links manually (this requires uncommenting the loading code in the 2nd code block).

In [None]:
###GATHERS LINKS OF EACH GAME

def extract_links(page_start,page_finish,timeout=30):
    """Collect the relative link of every game listed on a range of BGG browse pages.

    Parameters
    ----------
    page_start : int
        First listing page to read (inclusive).
    page_finish : int
        Last listing page to read (inclusive).
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up, so a stalled
        connection cannot hang the notebook forever.

    Returns
    -------
    list of str
        The ``href`` of each game's primary link, in listing order.

    Raises
    ------
    requests.HTTPError
        If a listing page answers with an error status (e.g. rate limiting).
    RuntimeError
        If a listing page does not contain the expected games table.
    """
    num_of_links = ((page_finish-page_start)+1)*100 #there are 100 game links per BGG page
    game_links = []

    for page_num in range(page_start, page_finish+1):
        next_page=url_page+str(page_num)
        response = requests.get(next_page,headers=user_agent,timeout=timeout)
        # fail right here with the HTTP status instead of later on a missing table
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        main_table = soup.find("table", attrs={"class":"collection_table"})
        if main_table is None:
            raise RuntimeError(f"no collection table found on {next_page}")
        game_rows = main_table.find_all("tr", attrs={"id":"row_"})

        #finding and collecting the actual links
        for game in game_rows:
            game_links.append(game.find("a", attrs={"class":"primary"})['href'])

        # replace the previous progress line instead of stacking a new one per page
        clear_output(wait=True)
        print(f"{len(game_links)}/{num_of_links} links collected")

    return game_links

In [None]:
###SAVES LINKS IN links_dir

def save_links(game_links, path=None):
    """Write the collected game links to a text file as one comma-separated line.

    Parameters
    ----------
    game_links : list of str
        Links to store, in order. An empty list produces an empty file.
    path : str, optional
        File to write. Defaults to the notebook-level ``links_dir``.
    """
    target = links_dir if path is None else path
    with open(target, "w") as f:
        # join puts a comma between every pair of links; comparing each link with
        # the first one (as before) dropped the comma whenever a link repeated it
        f.write(','.join(game_links))

In [None]:
###PRINTS CURRENT PROGRESS OF SOUP EXTRACTION

#since extraction can take many hours for over 50k pages, for the sake of convenience we've added a timer and progress updates

def soup_progress(total, len_soups,len_links):
    """Print one progress line: soups done, elapsed time as HH:MM:SS and percent done.

    total is the elapsed time in seconds, len_soups the number of pages fetched
    so far and len_links the number of pages to fetch overall.
    """
    def two_digits(value):
        # 12 stays "12", a single digit such as 7 becomes "07"
        return f"{value}" if value>9 else f"0{value}"

    clock = ":".join(
        two_digits(part)
        for part in (
            int((total/60)/60),  # hours
            int((total/60)%60),  # minutes
            int(total % 60),     # seconds
        )
    )

    # keep two decimals, then back to float so 50.00 is shown as 50.0
    percent = float(f"{(len_soups/len_links)*100:.2f}")

    # overwrite the previous progress line rather than appending a new one
    clear_output(wait=True)

    print(f"{len_soups}/{len_links} soups, {clock} time passed, {percent}%")

In [None]:
###GOES TO LINKS AND CREATES SOUPS TO EXTRACT DATA FROM 

def extract_soups(game_links, timeout=30):
    """Download every game page and parse it into a BeautifulSoup object.

    Parameters
    ----------
    game_links : list of str
        Relative game links; each one is appended to ``url_root``.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up, so one stalled
        connection cannot block a run that may take hours.

    Returns
    -------
    list of BeautifulSoup
        One parsed page per link, in the same order as ``game_links``. The HTTP
        status is not checked here; whatever body came back is parsed as is.
    """
    soups=[]
    total=0

    for link in game_links:
        start=time.time() #Starting the timer for soup_progress

        next_page=url_root+link
        # send the same User-agent header as extract_links does
        response = requests.get(next_page,headers=user_agent,timeout=timeout)
        soups.append(BeautifulSoup(response.content,"html.parser"))

        total+=time.time()-start #Total download/parse time elapsed so far
        soup_progress(total, len(soups), len(game_links))

    return soups

In [None]:
#GETS MAX SCRIPT WHERE WE CAN FIND THE USEFUL INFO + CREATES A DATAFRAME FROM THE DATA

def extract_data_into_dataframe(soups):
    """Pull the game attributes out of each parsed game page into one DataFrame.

    For every soup the longest <script> element is taken, its text is split on
    commas, and each chunk is searched for a fixed set of key markers. All values
    are kept as strings; ``'NULL'`` marks a value that is zero/placeholder or
    that was not found at all.

    One record is built per game and only the first match of each key is kept,
    so every row stays aligned with its game name even when a key is missing or
    repeated on some page.

    Parameters
    ----------
    soups : list
        Parsed pages as returned by ``extract_soups``.

    Returns
    -------
    pandas.DataFrame
        One row per successfully read page, with the columns Name, Rank, Rating,
        #_of_Ratings, #_of_Comments, Year, Min_Players, Max_Players,
        Best_Players, Min_Time, Max_Time, Min_Age, Complexity, Copies Owned,
        Type, Designer, Artist and Publisher.

    Notes
    -----
    Pages without any <script>, or whose longest <script> has no text, are
    skipped and counted as errors. A summary is printed: the error count and,
    per field, on how many pages that field was found.
    """
    games = []
    found = dict.fromkeys(_REPORT_ORDER, 0)
    error_counter = 0

    for soup in soups:
        scripts = soup("script")
        if len(scripts) == 0:  # broken/unexpected page
            error_counter += 1
            continue

        # The data block is the longest script; max() keeps the first one on ties.
        payload = max(scripts, key=lambda tag: len(tag.get_text())).string
        if payload is None:  # nothing usable inside the script tag
            error_counter += 1
            continue

        game = _parse_game_payload(payload)
        game["names"] = _page_name(soup)
        for key in game:
            found[key] += 1
        games.append(game)

    # replace whatever progress output is still on screen with the summary
    clear_output(wait=True)
    report = "\n".join(f"{key}={found[key]}" for key in _REPORT_ORDER)
    print(f"errors={error_counter}\n\n{report}")

    df = pd.DataFrame({
        column: [game.get(key, 'NULL') for game in games]
        for column, key in _OUTPUT_COLUMNS
    })

    return df


# (record key, marker searched in the raw chunk, label removed once the quotes
#  are stripped, whether a value of '0' means "unknown")
_SCALAR_FIELDS = (
    ("years", 'yearpublished":', 'yearpublished:', True),
    ("min_players", 'minplayers":', 'minplayers:', True),
    ("max_players", 'maxplayers":', 'maxplayers:', False),  # '0' is resolved after the scan
    ("min_times", 'minplaytime":', 'minplaytime:', True),
    ("max_times", 'maxplaytime":', 'maxplaytime:', True),
    ("min_ages", 'minage":', 'minage:', True),
    ("weights", 'avgweight":', 'avgweight:', True),
    ("ranks", 'rank":', 'rank:', True),
    ("owned", 'numowned', 'numowned:', False),
    ("ratings", '"average":', 'average:', True),
    ("num_of_ratings", '"stats":', 'stats:{usersrated:', False),
    ("num_of_comments", '"numcomments"', 'numcomments:', False),
)

# (DataFrame column, record key) in output order
_OUTPUT_COLUMNS = (
    ("Name", "names"),
    ("Rank", "ranks"),
    ("Rating", "ratings"),
    ("#_of_Ratings", "num_of_ratings"),
    ("#_of_Comments", "num_of_comments"),
    ("Year", "years"),
    ("Min_Players", "min_players"),
    ("Max_Players", "max_players"),
    ("Best_Players", "best_num_players"),
    ("Min_Time", "min_times"),
    ("Max_Time", "max_times"),
    ("Min_Age", "min_ages"),
    ("Complexity", "weights"),
    ("Copies Owned", "owned"),
    ("Type", "types"),
    ("Designer", "designers"),
    ("Artist", "artists"),
    ("Publisher", "publishers"),
)

# order of the lines in the printed summary
_REPORT_ORDER = (
    "years", "min_players", "max_players", "min_times", "max_times", "min_ages",
    "weights", "ranks", "designers", "artists", "publishers", "owned",
    "best_num_players", "names", "ratings", "types", "num_of_ratings",
    "num_of_comments",
)


def _page_name(soup):
    """Return the game name taken from the page <title>, or 'NULL' without one."""
    title = soup.find("title")
    text = None if title is None else title.string
    if text is None:
        return 'NULL'
    return text.replace(" | Board Game | BoardGameGeek", "").strip()


def _first_entry(chunk, filled, filled_label, empty_label, placeholder=None, drop=None):
    """Clean a chunk that opens a list: first entry's value, or 'NULL' if the list is empty."""
    text = chunk.replace('"', "")
    if filled:
        text = text.replace(filled_label, "")
        if drop is not None:
            text = text.replace(drop, "")
    else:
        text = text.replace(empty_label, "NULL")
    text = text.strip()
    # site placeholders such as "(Uncredited)" carry no information
    return 'NULL' if text == placeholder else text


def _parse_game_payload(payload):
    """Scan the comma-separated chunks of one data script; keep the first hit per field."""
    game = {}

    for chunk in payload.split(','):
        for key, marker, label, zero_is_null in _SCALAR_FIELDS:
            if marker in chunk and key not in game:
                value = chunk.replace('"', "").replace(label, "").strip()
                game[key] = 'NULL' if zero_is_null and value == '0' else value

        if 'links":{' in chunk and "designers" not in game:
            game["designers"] = _first_entry(
                chunk,
                '"links":{"boardgamedesigner":[{' in chunk,
                'links:{boardgamedesigner:[{name:',
                'links:{boardgamedesigner:[]',
                placeholder='(Uncredited)',
            )

        if '"polls":' in chunk and "best_num_players" not in game:
            game["best_num_players"] = _first_entry(
                chunk,
                '"polls":{"userplayers":{"best":[{' in chunk,
                'polls:{userplayers:{best:[{min:',
                'polls:{userplayers:{best:[]',
            )

        if '"boardgamesubdomain":[' in chunk and "types" not in game:
            game["types"] = _first_entry(
                chunk,
                '"boardgamesubdomain":[{' in chunk,
                'boardgamesubdomain:[{name:',
                'boardgamesubdomain:[]',
                drop=' Games',  # "Strategy Games" -> "Strategy"
            )

        if 'boardgameartist":[' in chunk and "artists" not in game:
            game["artists"] = _first_entry(
                chunk,
                'boardgameartist":[{' in chunk,
                'boardgameartist:[{name:',
                'boardgameartist:[]',
                placeholder='(Uncredited)',
            )

        if 'boardgamepublisher":[' in chunk and "publishers" not in game:
            game["publishers"] = _first_entry(
                chunk,
                True,
                'boardgamepublisher:[{name:',
                '',
                placeholder='(Unknown)',
            )

    # A maximum of 0 means "not specified": fall back to this game's minimum
    # (which is itself 'NULL' when unknown).
    if game.get("max_players") == '0':
        game["max_players"] = game.get("min_players", 'NULL')

    return game

In [None]:
def save_csv(df, csv_dir):
    """Write the dataframe to the CSV file at csv_dir (the index is written as the first column)."""
    df.to_csv(path_or_buf=csv_dir)

In [None]:
############--MAIN--##############
# Runs the whole pipeline in order; every step feeds its result to the next one.

###STEP ONE, LINK GATHERING
game_links = extract_links(page_start=1,page_finish=1) ##comment this line out if game_links was already loaded from links_dir in the setup cell

###STEP TWO, SAVING THE LINKS
save_links(game_links)
     
###STEP THREE, SOUP GATHERING
soups = extract_soups(game_links)

###STEP FOUR, DATA GATHERING+DATAFRAME CREATION
df = extract_data_into_dataframe(soups)

###STEP FIVE, SAVING DATAFRAME INTO CSV
save_csv(df, csv_dir)

In [None]:
# Show the dataframe built by the main cell above as this cell's output.
df