In [1]:
import requests as re
from bs4 import BeautifulSoup
import pandas as pd
import json
from datetime import datetime
import time
from tqdm.notebook import tqdm_notebook

In [5]:
# Endpoint and pacing used by get_user_ratings.
_COLLECTION_URL = 'https://api.geekdo.com/xmlapi/collection/'
_QUEUE_WAIT_SECONDS = 14.66     # pause after the throw-away first request (sleep=True only)
_REQUEST_PAUSE_SECONDS = 0.33   # pause before every request whose body is parsed
# Attributes copied verbatim from each item's <status> element, in output column order.
_STATUS_FLAGS = ('own', 'prevowned', 'fortrade', 'want', 'wanttoplay',
                 'wanttobuy', 'wishlist', 'preordered')


def _child_text(item, tag):
    '''Return the text of the first <tag> found under item, or '' when there is none.'''
    node = item.find(tag)
    return node.text if node is not None else ''


def _rating_value(item):
    '''Return the `value` attribute of <stats><rating>, or '' when either tag is missing.'''
    stats = item.find('stats')
    rating = stats.find('rating') if stats is not None else None
    return rating.get('value') if rating is not None else ''


def _parse_collection_item(item, nickname):
    '''Convert one <item> element into a flat dict (one output row).'''
    status = item.find('status')

    def status_attr(name):
        # An item without a <status> element yields None for every flag
        # instead of raising AttributeError and aborting the whole user.
        return status.get(name) if status is not None else None

    record = {
        'nickname': nickname,
        'title': _child_text(item, 'name'),
        'boardgame_id': item.get('objectid'),
        'rating': _rating_value(item),
        'num_of_plays': _child_text(item, 'numplays'),
        'comment': _child_text(item, 'comment'),
    }
    for flag in _STATUS_FLAGS:
        record[flag] = status_attr(flag)
    record['last_modified'] = status_attr('lastmodified')
    return record


def get_user_ratings(nickname, sleep=True):
    '''
    This function takes the boardgamegeek.com username as input
    and returns a pandas DataFrame with all of the user's scores.

    Parameters
    ----------
    nickname : str
        User name; it is percent-encoded before being placed in the URL path.
    sleep : bool, default True
        When True, one extra request is sent first and its response is
        discarded, followed by a 14.66 s pause, before the request that is
        actually parsed.
        NOTE(review): presumably this gives the server time to prepare the
        collection export -- confirm against the API's behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per <item> in the response, with columns nickname, title,
        boardgame_id, rating, num_of_plays, comment, own, prevowned, fortrade,
        want, wanttoplay, wanttobuy, wishlist, preordered, last_modified.
        Values are the raw strings from the XML; missing text fields are ''
        and missing status attributes are None. A response without any
        <item> yields an empty DataFrame with no columns.

    Notes
    -----
    The HTTP status is not inspected and no timeout is set, so an error
    response simply produces an empty frame.
    '''
    from urllib.parse import quote  # local import: keeps this cell self-contained

    # `re` is the requests module here (imported as `re` at the top of the notebook).
    url = _COLLECTION_URL + quote(nickname, safe='') + '?rated=1'

    if sleep:
        re.get(url)  # response intentionally ignored
        time.sleep(_QUEUE_WAIT_SECONDS)
    time.sleep(_REQUEST_PAUSE_SECONDS)
    response = re.get(url)

    soup = BeautifulSoup(response.text, features="xml")
    # Build every row first and create the frame once; concatenating inside the
    # loop is quadratic and leaves every row with index 0.
    records = [_parse_collection_item(item, nickname) for item in soup.find_all('item')]
    return pd.DataFrame(records)

In [61]:
%%time
checked = pd.read_csv('checked_nicknames.csv')
nicknames = pd.read_csv('bgg_users_full.csv')
nicknames = list(set(nicknames.nickname) - set(checked.nicknames))
rates = pd.DataFrame()
test_nn = nicknames[:5164]
print(f'Checked {len(checked)}, in progress {len(test_nn)}, left {len(nicknames)-len(test_nn)} nicknames')
for nn in tqdm_notebook(test_nn):
    get_user_ratings(nn, sleep=False)
for i, nn in tqdm_notebook(enumerate(test_nn)):
    rates = pd.concat([rates,get_user_ratings(nn, sleep=False)])
#     if (i+1)%100==0: 
#         rates.to_csv('ratings_tmp.csv', index=False)
        
print(f'{len(rates)} rows added')
checked = pd.concat([checked,pd.DataFrame(test_nn, columns=['nicknames'])])
dt = datetime.now()
rates.to_csv(f'bgg_ratings_{dt.year}_{dt.month}_{dt.day}_{dt.hour}_{dt.minute}_{dt.second}_{len(test_nn)}.csv', index=False)
checked.to_csv(f'checked_nicknames.csv', index=False)

Checked 1664526, in progress 5164, left 0 nicknames


  0%|          | 0/5164 [00:00<?, ?it/s]

0it [00:00, ?it/s]

49827 rows added
CPU times: user 4min 31s, sys: 18.2 s, total: 4min 49s
Wall time: 1h 49min 17s
