# Steam database

Build a database by scraping app and user info using the Steam Web API.

In [1]:
#import libraries
import random as rnd
import numpy as np
import pandas as pd
import requests
import pickle
import time

rnd.seed(8524)

In [2]:
# Download the full app list from the Steam Web API
response = requests.get("https://api.steampowered.com/ISteamApps/GetAppList/v2/")

# Build the table straight from the JSON payload; app ids are kept as strings
# because they are later used as lookup keys into the API responses
steam_apps = pd.DataFrame(response.json()["applist"]["apps"]).astype({"appid": str})

steam_apps.shape

(213616, 2)

In [3]:
#steam_apps = steam_apps.drop(list(range(36))) # appear to be invalid (missing or test)
# todo: check that this works as intended
# Keep the first row for every app id, then renumber the rows from 0
steam_apps = steam_apps.drop_duplicates(subset=["appid"]).reset_index(drop=True)

steam_apps.shape

(213616, 2)

In [6]:
# load steam api key
# note: limited to 100k calls per day for steam web api

with open("steam_api_key.txt", 'r') as file:
    # Strip surrounding whitespace: a trailing newline in the key file would
    # otherwise be sent as part of the key in every request.
    steam_key = file.read().strip()

In [None]:
# get app general info
# todo: check if game already in database and collect missing apps

# Only load the pickled results when they are not already in memory, so that
# re-running this cell does not discard data collected in the current session.
if 'steam_app_info' not in globals():

    # NOTE: pickle.load can execute arbitrary code; only load files you created.
    with open('data/steam_app_info.obj', 'rb') as filehandler:
        steam_app_info = pickle.load(filehandler)

    #steam_app_info = {} # output: dict

# Resume position for the download loop. The original line had no value (a
# syntax error); 3799 is the value that was noted in its trailing comment.
batch_count = 3799

In [4]:
# Reload the app info collected so far; the context manager closes the file
# as soon as the data has been read.
with open('data/steam_app_info.obj', 'rb') as filehandler:
    steam_app_info = pickle.load(filehandler)

In [17]:
# Probe request: two app ids in one call, restricted to the price_overview section
parameters = dict(format='json', key=steam_key, appids='1245620,2959660', filters='price_overview')

# Fetch and decode in one step, then display the decoded dict
response = requests.get("http://store.steampowered.com/api/appdetails", params=parameters).json()
response

{'1245620': {'success': True,
  'data': {'price_overview': {'currency': 'EUR',
    'initial': 5999,
    'final': 5999,
    'discount_percent': 0,
    'initial_formatted': '',
    'final_formatted': '59,99€'}}},
 '2959660': {'success': True, 'data': []}}

In [44]:
# Resume position for the app-details download loop: the loop starts at
# row (batch_count - 1) of steam_apps["appid"] and increments this counter
# once per app, so set it manually before restarting an interrupted run.
batch_count = 3798

3798

In [41]:
# Download the store details of every app from the resume position onwards
# and persist the growing result dict after each successful call.
for app in steam_apps["appid"].iloc[(batch_count-1):]: #for testing: ['1245620']

    batch_count += 1

    # wait till next call after each batch. Limited to 200 calls per 3 min
    if batch_count % 200 == 0:
        time.sleep(180)

    parameters = {"format": 'json', "key": steam_key, "appids": app} # can only request one at a time

    # https, because the query string carries the API key
    response = requests.get("https://store.steampowered.com/api/appdetails",params=parameters)
    response = response.json()

    if not response[app]["success"]:
        print(f"{batch_count}: NO data retrieved for app {app}")
        continue

    steam_app_info[app] = response[app]["data"]

    # save/update pickled data file after each api call; the context manager
    # flushes and closes the file so an interrupt cannot leave it half-written
    # by a still-open handle
    with open('data/steam_app_info.obj','wb') as filehandler:
        pickle.dump(steam_app_info, filehandler)

    print(f"{batch_count}: data retrieved for app {app}")

KeyboardInterrupt: 

In [None]:
# get app review info
# todo: limit to games with app info retrieved and more than 40k reviews (influential games)

# Query-string parameters shared by every review request; 'cursor' starts at
# '*' and is replaced with the cursor returned by each response while paging.
parameters = dict(
    json=1,
    filter='recent',
    language='all',
    cursor='*',
    review_type='all',
    purchase_type='all',
    num_per_page='100',
    filter_offtopic_activity=0,
)

# Placeholder for the per-app review summary data
review_summaries = []

def prep_reviews_df(resp,appid):
    """Flatten one page of review records into a DataFrame.

    resp  -- list of review dicts from the API response; each one carries a
             nested "author" dict
    appid -- app id written into the "appid" column of every row

    Returns a frame with the review fields (minus "author"), followed by the
    author fields as their own columns, followed by "appid".
    """
    # one row of author fields per review, aligned by position
    authors = pd.DataFrame([review["author"] for review in resp])

    # swap the nested author column for the flattened author columns
    flat = pd.DataFrame(resp).drop(columns=['author']).join(authors)

    return flat.assign(appid=[appid] * len(flat))

# Page through the reviews of every app and collect them in two tables:
#   review_summaries -- one row per app (query summary of the first page)
#   reviews_all      -- one row per review, across all apps
# Frames are gathered in lists and concatenated once at the end, so data from
# earlier apps is kept instead of being overwritten by the next app.
summary_frames = []
review_frames = []

for this_app in steam_apps["appid"]: # for testing: ['2717080']

    # restart paging for every app; without this the cursor left over from the
    # previous app would be sent with the first request of the next one
    parameters["cursor"] = '*'
    seen_cursors = set()

    while True:
        # get one page of reviews (page size is set by parameters['num_per_page'])
        response = requests.get("https://store.steampowered.com/appreviews/" + str(this_app), params=parameters)
        response = response.json() # convert to dict

        if response["query_summary"]["num_reviews"] == 0:
            break # exit when no more reviews to retrieve for title

        if parameters["cursor"] == '*': # first page of this app
            summary = pd.DataFrame(response["query_summary"],index=[0])
            summary = summary.drop(columns=['num_reviews'])
            summary["appid"] = this_app
            summary_frames.append(summary)

        review_frames.append(prep_reviews_df(response["reviews"],this_app))

        # update cursor for next API request (requests URL-encodes it);
        # stop if the API hands back a cursor already used, to avoid looping forever
        next_cursor = response["cursor"]
        if next_cursor in seen_cursors:
            break
        seen_cursors.add(next_cursor)
        parameters["cursor"] = next_cursor

review_summaries = pd.concat(summary_frames, ignore_index=True) if summary_frames else pd.DataFrame()
reviews_all = pd.concat(review_frames, ignore_index=True) if review_frames else pd.DataFrame()

In [None]:
# Steam ids of all review authors collected above, each listed once
id_list = pd.unique(reviews_all["steamid"])

In [None]:
# get user summaries (only those who left at least one review for one app) - this is not really useful except maybe country code

# One DataFrame per API batch; concatenated once after the loop so that every
# batch is kept (previously each batch after the first replaced the result).
user_frames = []

for x in range(0, len(id_list), 100): # can only request 100 ids at a time

    # slicing past the end is safe, so the last (shorter) batch needs no special case
    id_request = id_list[x:x+100]

    id_request_str = ",".join(id_request) # format id list for api call

    parameters = {"key": steam_key, "steamids": id_request_str}

    # https, because the query string carries the API key
    response = requests.get("https://api.steampowered.com/ISteamUser/GetPlayerSummaries/v0002",params=parameters)
    response = response.json()

    user_frames.append(pd.DataFrame(response["response"]["players"]))

# ignore_index renumbers the rows from 0 (replaces the former reset_index call)
users_all = pd.concat(user_frames, ignore_index=True) if user_frames else pd.DataFrame()

Note for generating random ids (see steam doc):

    id = rnd.randrange(1,1000000)
    id64 = id*2 + 76561197960265728 + 1

In [None]:
# get user games owned

# Results are gathered per user and concatenated once after the loop so that
# every user is kept (previously each user after the first replaced the result).
# note: the game count is different from the info from the review api request and what is listed on the steam community profile. Possibly because some apps are not games (may need to filter)
games_count_frames = []
games_frames = []

for steam_id in id_list: # for testing: ['76561198110169699']

    parameters = {"key": steam_key, "steamid": steam_id, "include_played_free_games": True}

    # https, because the query string carries the API key
    response = requests.get("https://api.steampowered.com/IPlayerService/GetOwnedGames/v0001",params=parameters)
    response = response.json()

    owned = response["response"]
    if not owned: # empty response: info could not be retrieved (non-public profile)
        continue

    games_count_frames.append(
        pd.DataFrame({"steamid": steam_id, "game_count": owned["game_count"]}, index=[0])
    )

    # tolerate a response that reports a count but carries no "games" list
    games_df = pd.DataFrame(owned.get("games", []))
    games_df["steamid"] = [steam_id]*len(games_df)
    games_frames.append(games_df)

# ignore_index renumbers the rows from 0 (replaces the former reset_index calls)
user_games_count = pd.concat(games_count_frames, ignore_index=True) if games_count_frames else pd.DataFrame()
user_games_all = pd.concat(games_frames, ignore_index=True) if games_frames else pd.DataFrame()

Optional other info that could be useful:
* player achievements (hassle to get from web api since every game needs to be requested)
* player number of friends