In [3]:
import csv
import datetime as dt
import json
import os
import statistics
import time

# third-party imports
import numpy as np
import pandas as pd
import requests

# customisations - let tables show up to 100 columns before truncating.
# The fully-qualified key is used because the bare pattern "max_columns" can
# match more than one pandas option, which makes set_option raise OptionError.
pd.set_option("display.max_columns", 100)


def get_request(url, parameters=None):
    """Return json-formatted response of a get request using optional parameters.
    
    The request is repeated until it succeeds: after an SSL error the function
    waits 5 seconds, and after an unsuccessful response it waits 10 seconds.
    There is no retry limit, so a permanently failing url never returns.
    
    Parameters
    ----------
    url : string
    parameters : {'parameter': 'value'}
        parameters to pass as part of get request
    
    Returns
    -------
    json_data
        json-formatted response (dict-like)
    """
    # loop rather than recurse so a long outage cannot exhaust the call stack
    while True:
        try:
            response = requests.get(url=url, params=parameters)
        except requests.exceptions.SSLError as s:
            # SSLError must be qualified: the bare name is not imported here
            print('SSL Error:', s)
            
            for i in range(5, 0, -1):
                print('\rWaiting... ({})'.format(i), end='')
                time.sleep(1)
            print('\rRetrying.' + ' '*10)
            
            continue
        
        # a requests Response is truthy only when the request succeeded
        if response:
            return response.json()
        
        # an unsuccessful response presumably means too many requests
        # (TODO confirm against the API's rate limits). Wait and try again
        print('No response, waiting 10 seconds...')
        time.sleep(10)
        print('Retrying.')

def get_app_data(start, stop, parser, pause, apps=None):
    """Return list of app data generated from parser.
    
    start : first row position to process (inclusive)
    stop : row position to stop at (exclusive)
    parser : function to handle request, called as parser(appid, name)
    pause : seconds to sleep after each parser call
    apps : optional dataframe with 'appid' and 'name' columns to read rows
        from; when omitted the module-level app_list is used
    
    returns: list holding one parser result per row, in row order
    """
    # fall back to the notebook-level dataframe so existing callers keep working
    source = app_list if apps is None else apps
    
    app_data = []
    
    # iterate through each row of the dataframe, confined by start and stop
    for index, row in source[start:stop].iterrows():
        print('Current index: {}'.format(index), end='\r')
        
        # retrieve app data for a row, handled by supplied parser, and append to list
        app_data.append(parser(row['appid'], row['name']))
        
        time.sleep(pause) # prevent overloading api with requests
    
    return app_data


def process_batches(parser, app_list, download_path, data_filename, index_filename,
                    columns, begin=0, end=-1, batchsize=100, pause=1):
    """Process app data in batches, writing directly to file.
    
    After each batch the rows are appended to the data file and the index file
    is overwritten with the position the next run should start from, so an
    interrupted run loses at most one batch.
    
    parser : custom function to format request
    app_list : dataframe of appid and name
    download_path : path to store data
    data_filename : filename to save app data
    index_filename : filename to store highest index written
    columns : column names for file
    
    Keyword arguments:
    
    begin : starting index (get from index_filename, default 0)
    end : index to finish, exclusive (defaults to end of app_list)
    batchsize : number of apps to write in each batch (default 100)
    pause : time to wait after each api request (default 1)
    
    returns: none
    
    NOTE(review): rows are fetched with get_app_data(start, stop, parser, pause),
    which is not handed this function's app_list argument; app_list is only
    used here to work out the default end.
    """
    print('Starting at index {}:\n'.format(begin))
    
    # by default, process all apps in app_list. The end is exclusive, so
    # len(app_list) already covers the last row; one more would add an empty
    # batch and store an index past the end of the data
    if end == -1:
        end = len(app_list)
    
    # both output paths are the same for every batch
    data_path = os.path.join(download_path, data_filename)
    idx_path = os.path.join(download_path, index_filename)
    
    # first index of each batch; the last batch may be shorter than batchsize
    batch_starts = range(begin, end, batchsize)
    total_batches = len(batch_starts)
    
    apps_written = 0
    batch_times = []
    
    for i, start in enumerate(batch_starts):
        start_time = time.time()
        
        stop = min(start + batchsize, end)
        
        app_data = get_app_data(start, stop, parser, pause)
        
        # writing app data to file; keys not listed in columns are dropped
        with open(data_path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=columns, extrasaction='ignore')
            
            # short countdown so the user knows not to interrupt mid-write
            for j in range(3, 0, -1):
                print("\rAbout to write data, don't stop script! ({})".format(j), end='')
                time.sleep(0.5)
            
            writer.writerows(app_data)
            print('\rExported lines {}-{} to {}.'.format(start, stop-1, data_filename), end=' ')
            
        apps_written += len(app_data)
        
        # writing last index to file, only after the batch itself is on disk
        with open(idx_path, 'w') as f:
            print(stop, file=f)
            
        # logging time taken
        time_taken = time.time() - start_time
        
        batch_times.append(time_taken)
        mean_time = statistics.mean(batch_times)
        
        # estimate from the running average and the batches still to do
        est_remaining = (total_batches - i - 1) * mean_time
        
        remaining_td = dt.timedelta(seconds=round(est_remaining))
        time_td = dt.timedelta(seconds=round(time_taken))
        mean_td = dt.timedelta(seconds=round(mean_time))
        
        print('Batch {} time: {} (avg: {}, remaining: {})'.format(i, time_td, mean_td, remaining_td))
            
    print('\nProcessing batches complete. {} apps written'.format(apps_written))

    
def reset_index(download_path, index_filename):
    """Overwrite the index file so that it holds a single line containing 0."""
    index_file = os.path.join(download_path, index_filename)
    
    # 'w' truncates any existing file before the new value is written
    with open(index_file, 'w') as handle:
        handle.write('0\n')
        

def get_index(download_path, index_filename):
    """Read the stored index from file; a missing file counts as index 0."""
    index_file = os.path.join(download_path, index_filename)
    
    try:
        with open(index_file, 'r') as handle:
            # the index is kept on the first line of the file
            return int(handle.readline())
    except FileNotFoundError:
        return 0


def prepare_data_file(download_path, filename, index, columns):
    """Create file and write headers if index is 0.
    
    download_path : directory holding the data file
    filename : name of the data file
    index : index the download will start from; any value other than 0
        leaves the file untouched
    columns : column names written as the header row
    
    returns: none
    """
    if index != 0:
        return
    
    rel_path = os.path.join(download_path, filename)
    
    # 'w' discards any existing contents. The encoding is set explicitly so the
    # header is written as utf-8 regardless of the platform default.
    with open(rel_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=columns)
        writer.writeheader()
            
            
def parse_steam_request(appid, name):
    """Parser for the Steam Store appdetails endpoint.
    
    Returns : the 'data' entry of the response for appid when the response
    marks the lookup as successful, otherwise a minimal dict holding only
    the supplied name and appid.
    """
    endpoint = "http://store.steampowered.com/api/appdetails/"
    
    response = get_request(endpoint, parameters={"appids": appid})
    
    # the response is keyed by the appid as a string
    entry = response[str(appid)]
    
    if not entry['success']:
        return {'name': name, 'steam_appid': appid}
    
    return entry['data']


def parse_steamspy_request(appid, name):
    """Request appdetails for appid from the SteamSpy API and return the response.
    
    name is accepted so the signature matches the other parsers; it is not used.
    """
    query = {"request": "appdetails", "appid": appid}
    
    return get_request("https://steamspy.com/api.php", query)

In [4]:
# Load the app list from a previously saved CSV to save re-running api calls.
# get_app_data reads the 'appid' and 'name' values from each row of this dataframe.
app_list = pd.read_csv('Data Files/steam_app_list.csv')

In [5]:
# Output directory and file names for the SteamSpy download
download_path = 'Data Files'
steamspy_data = 'steamspy_data.csv'
steamspy_index = 'steamspy_index.txt'

# Columns written to the data file; process_batches builds its DictWriter with
# extrasaction='ignore', so any other keys in a parsed record are dropped
steamspy_columns = [
    'appid', 'name', 'developer', 'publisher', 'score_rank', 'positive',
    'negative', 'userscore', 'owners', 'average_forever', 'average_2weeks',
    'median_forever', 'median_2weeks', 'price', 'initialprice', 'discount',
    'languages', 'genre', 'ccu', 'tags'
]

# NOTE(review): this writes 0 to the index file, so get_index below always
# returns 0 and every run of this cell starts over and overwrites the data
# file. Remove this call to continue from the index saved by an earlier run.
reset_index(download_path, steamspy_index)
index = get_index(download_path, steamspy_index)

# Wipe data file if index is 0
prepare_data_file(download_path, steamspy_data, index, steamspy_columns)

# Fetch every app from index to the end of app_list, written 100 apps at a
# time, sleeping 0.3 seconds after each request
process_batches(
    parser=parse_steamspy_request,
    app_list=app_list,
    download_path=download_path, 
    data_filename=steamspy_data,
    index_filename=steamspy_index,
    columns=steamspy_columns,
    begin=index,
    end=len(app_list),
    batchsize=100,
    pause=0.3
)

Starting at index 0:

Exported lines 0-99 to steamspy_data.csv. Batch 0 time: 0:00:43 (avg: 0:00:43, remaining: 4:22:57)
Exported lines 100-199 to steamspy_data.csv. Batch 1 time: 0:00:42 (avg: 0:00:43, remaining: 4:20:27)
Exported lines 200-299 to steamspy_data.csv. Batch 2 time: 0:00:43 (avg: 0:00:43, remaining: 4:19:39)
Exported lines 300-399 to steamspy_data.csv. Batch 3 time: 0:00:43 (avg: 0:00:43, remaining: 4:19:48)
Exported lines 400-499 to steamspy_data.csv. Batch 4 time: 0:00:46 (avg: 0:00:43, remaining: 4:23:07)
Exported lines 500-599 to steamspy_data.csv. Batch 5 time: 0:00:42 (avg: 0:00:43, remaining: 4:21:06)
Exported lines 600-699 to steamspy_data.csv. Batch 6 time: 0:00:42 (avg: 0:00:43, remaining: 4:19:27)
Exported lines 700-799 to steamspy_data.csv. Batch 7 time: 0:00:43 (avg: 0:00:43, remaining: 4:18:33)
Exported lines 800-899 to steamspy_data.csv. Batch 8 time: 0:00:42 (avg: 0:00:43, remaining: 4:17:24)
Exported lines 900-999 to steamspy_data.csv. Batch 9 time: 0:00

Exported lines 7800-7899 to steamspy_data.csv. Batch 78 time: 0:00:42 (avg: 0:00:42, remaining: 3:23:07)
Exported lines 7900-7999 to steamspy_data.csv. Batch 79 time: 0:00:42 (avg: 0:00:42, remaining: 3:22:24)
Exported lines 8000-8099 to steamspy_data.csv. Batch 80 time: 0:00:42 (avg: 0:00:42, remaining: 3:21:40)
Exported lines 8100-8199 to steamspy_data.csv. Batch 81 time: 0:00:42 (avg: 0:00:42, remaining: 3:20:56)
Exported lines 8200-8299 to steamspy_data.csv. Batch 82 time: 0:00:42 (avg: 0:00:42, remaining: 3:20:11)
Exported lines 8300-8399 to steamspy_data.csv. Batch 83 time: 0:00:41 (avg: 0:00:42, remaining: 3:19:27)
Exported lines 8400-8499 to steamspy_data.csv. Batch 84 time: 0:00:47 (avg: 0:00:42, remaining: 3:19:01)
Exported lines 8500-8599 to steamspy_data.csv. Batch 85 time: 0:00:42 (avg: 0:00:42, remaining: 3:18:18)
Exported lines 8600-8699 to steamspy_data.csv. Batch 86 time: 0:00:43 (avg: 0:00:42, remaining: 3:17:37)
Exported lines 8700-8799 to steamspy_data.csv. Batch 87

Exported lines 15500-15599 to steamspy_data.csv. Batch 155 time: 0:00:43 (avg: 0:00:42, remaining: 2:28:32)
Exported lines 15600-15699 to steamspy_data.csv. Batch 156 time: 0:00:42 (avg: 0:00:42, remaining: 2:27:50)
Exported lines 15700-15799 to steamspy_data.csv. Batch 157 time: 0:00:42 (avg: 0:00:42, remaining: 2:27:08)
Exported lines 15800-15899 to steamspy_data.csv. Batch 158 time: 0:00:42 (avg: 0:00:42, remaining: 2:26:26)
Exported lines 15900-15999 to steamspy_data.csv. Batch 159 time: 0:00:43 (avg: 0:00:42, remaining: 2:25:45)
Exported lines 16000-16099 to steamspy_data.csv. Batch 160 time: 0:00:42 (avg: 0:00:42, remaining: 2:25:03)
Exported lines 16100-16199 to steamspy_data.csv. Batch 161 time: 0:00:42 (avg: 0:00:42, remaining: 2:24:22)
Exported lines 16200-16299 to steamspy_data.csv. Batch 162 time: 0:00:43 (avg: 0:00:42, remaining: 2:23:41)
Exported lines 16300-16399 to steamspy_data.csv. Batch 163 time: 0:00:43 (avg: 0:00:42, remaining: 2:22:59)
Exported lines 16400-16499 t

Exported lines 23100-23199 to steamspy_data.csv. Batch 231 time: 0:00:41 (avg: 0:00:42, remaining: 1:35:12)
Exported lines 23200-23299 to steamspy_data.csv. Batch 232 time: 0:00:41 (avg: 0:00:42, remaining: 1:34:29)
Exported lines 23300-23399 to steamspy_data.csv. Batch 233 time: 0:00:41 (avg: 0:00:42, remaining: 1:33:47)
Exported lines 23400-23499 to steamspy_data.csv. Batch 234 time: 0:00:42 (avg: 0:00:42, remaining: 1:33:05)
Exported lines 23500-23599 to steamspy_data.csv. Batch 235 time: 0:00:41 (avg: 0:00:42, remaining: 1:32:22)
Exported lines 23600-23699 to steamspy_data.csv. Batch 236 time: 0:00:42 (avg: 0:00:42, remaining: 1:31:40)
Exported lines 23700-23799 to steamspy_data.csv. Batch 237 time: 0:00:41 (avg: 0:00:42, remaining: 1:30:58)
Exported lines 23800-23899 to steamspy_data.csv. Batch 238 time: 0:00:42 (avg: 0:00:42, remaining: 1:30:16)
Exported lines 23900-23999 to steamspy_data.csv. Batch 239 time: 0:00:41 (avg: 0:00:42, remaining: 1:29:33)
Exported lines 24000-24099 t

Exported lines 30700-30799 to steamspy_data.csv. Batch 307 time: 0:00:41 (avg: 0:00:42, remaining: 0:41:49)
Exported lines 30800-30899 to steamspy_data.csv. Batch 308 time: 0:00:41 (avg: 0:00:42, remaining: 0:41:08)
Exported lines 30900-30999 to steamspy_data.csv. Batch 309 time: 0:00:41 (avg: 0:00:42, remaining: 0:40:26)
Exported lines 31000-31099 to steamspy_data.csv. Batch 310 time: 0:00:41 (avg: 0:00:42, remaining: 0:39:44)
Exported lines 31100-31199 to steamspy_data.csv. Batch 311 time: 0:00:41 (avg: 0:00:42, remaining: 0:39:02)
Exported lines 31200-31299 to steamspy_data.csv. Batch 312 time: 0:00:41 (avg: 0:00:42, remaining: 0:38:20)
Exported lines 31300-31399 to steamspy_data.csv. Batch 313 time: 0:00:41 (avg: 0:00:42, remaining: 0:37:38)
Exported lines 31400-31499 to steamspy_data.csv. Batch 314 time: 0:00:41 (avg: 0:00:42, remaining: 0:36:56)
Exported lines 31500-31599 to steamspy_data.csv. Batch 315 time: 0:00:41 (avg: 0:00:42, remaining: 0:36:14)
Exported lines 31600-31699 t