In [57]:
# Imports 

import csv
import datetime as dt
import json
import os
import statistics
import time
import sys
import numpy as np
import pandas as pd
import requests
import datetime

# Show up to 100 columns when displaying wide DataFrames.
# The fully qualified key is required: on newer pandas the bare "max_columns"
# pattern also matches "styler.render.max_columns", so set_option raises
# OptionError ("Pattern matched multiple keys").
pd.set_option("display.max_columns", 100)

Pull 30,000 rows of app data from the SteamSpy API and the Steam store API (store.steampowered.com).

In [87]:
# Output locations. The helper functions below join each file name onto
# `download_path`.
download_path = 'Capstone-project'

# SteamSpy download: CSV data file and the file holding the last written index.
steamspy_data = 'steamspy_data.csv'
steamspy_index = 'steamspy_index.txt'

# Steam store download: CSV data file and the file holding the last written index.
steam_app_data = 'steampowered_app_data.csv'
steam_index = 'steampowered_index.txt'

# Columns written to the SteamSpy CSV, in output order.
steamspy_columns = [
    # identity
    'appid', 'name', 'developer', 'publisher',
    # ratings
    'score_rank', 'positive', 'negative', 'userscore',
    # ownership and play time
    'owners', 'average_forever', 'average_2weeks',
    'median_forever', 'median_2weeks',
    # pricing
    'price', 'initialprice', 'discount',
    # descriptive fields
    'languages', 'genre', 'ccu', 'tags',
]

# Columns written to the Steam store CSV, in output order.
steam_columns = [
    # identity and basic flags
    'type', 'name', 'steam_appid', 'required_age', 'is_free',
    'controller_support', 'dlc',
    # descriptions
    'detailed_description', 'about_the_game', 'short_description', 'fullgame',
    # languages and system requirements
    'supported_languages', 'pc_requirements', 'mac_requirements',
    'linux_requirements',
    # people and purchasing
    'developers', 'publishers', 'demos', 'price_overview', 'packages',
    'package_groups',
    # platform, reception and classification
    'platforms', 'metacritic', 'reviews', 'categories', 'genres',
    'recommendations', 'achievements', 'release_date', 'content_descriptors',
]

In [49]:
## request
def get_request(url, parameters=None):
    """Return the decoded JSON body of a GET request, retrying until it succeeds.

    Parameters
    ----------
    url : str
        Address to request.
    parameters : dict, optional
        Query-string parameters handed to ``requests.get`` as ``params``.

    Returns
    -------
    Whatever ``response.json()`` produces for the first truthy response.

    Notes
    -----
    * An SSL error triggers a 5 second countdown and a retry.
    * A falsy response (requests treats error status codes as falsy) triggers
      a 10 second wait and a retry.
    * There is no retry limit: the call blocks until a request succeeds.
      Retries run in a loop rather than by recursion, so a long outage cannot
      exhaust the interpreter's recursion limit.
    """
    while True:
        try:
            response = requests.get(url=url, params=parameters)
        # Referenced through the already-imported `requests` package; a bare
        # `SSLError` is not defined in this notebook and would raise NameError.
        except requests.exceptions.SSLError as s:
            print('SSL Error:', s)

            for i in range(5, 0, -1):
                print('\rWaiting... ({})'.format(i), end='')
                time.sleep(1)
            print('\rRetrying.' + ' '*10)
            continue

        if response:
            return response.json()

        print('No response, waiting 10 seconds...')
        time.sleep(10)
        print('Retrying.')

In [50]:
def reset_index(download_path, index_filename):
    """Overwrite the index file so that it holds a single line containing 0.

    The file is ``index_filename`` inside ``download_path``; it is created if
    missing and truncated if present.
    """
    index_path = os.path.join(download_path, index_filename)
    with open(index_path, 'w') as index_file:
        index_file.write('0\n')
        

def get_index(download_path, index_filename):
    """Read the stored index from ``index_filename`` inside ``download_path``.

    Returns the integer parsed from the first line of the file, or 0 when the
    file does not exist. A first line that is not an integer raises ValueError.
    """
    index_path = os.path.join(download_path, index_filename)

    try:
        index_file = open(index_path, 'r')
    except FileNotFoundError:
        # No index stored yet: start from the beginning.
        return 0

    with index_file:
        return int(index_file.readline())


def prepare_data_file(download_path, filename, index, columns):
    """Create (or truncate) the data file and write the CSV header row.

    Nothing happens unless ``index`` is 0, so an existing download is only
    wiped when starting from the beginning.
    """
    if index != 0:
        return

    data_path = os.path.join(download_path, filename)
    with open(data_path, 'w', newline='') as data_file:
        csv.DictWriter(data_file, fieldnames=columns).writeheader()
            

def get_app_data(start, stop, parser, pause, app_list=None):
    """Return list of app data generated from parser.

    Parameters
    ----------
    start, stop : int
        Positional bounds (stop exclusive) of the rows to process.
    parser : callable
        Function called as ``parser(appid, name)`` for every row; its return
        value is appended to the result unchanged.
    pause : float
        Seconds to sleep after each row, to avoid overloading the API.
    app_list : DataFrame, optional
        Table with 'appid' and 'name' columns. When omitted, the module-level
        ``app_list`` is used, which keeps existing four-argument calls working.

    Returns
    -------
    list
        One parser result per processed row, in row order.
    """
    if app_list is None:
        # Backward-compatible fallback to the notebook-level variable.
        try:
            app_list = globals()['app_list']
        except KeyError:
            raise NameError("name 'app_list' is not defined") from None

    app_data = []

    # Slicing a DataFrame with integers is positional, so report the position
    # too. The frame's own labels may be duplicated or out of order after
    # concatenating and sorting pages, which makes them misleading as progress.
    batch = app_list[start:stop]
    for position, (_, row) in enumerate(batch.iterrows(), start=start):
        print('Current index: {}'.format(position), end='\r')

        # retrieve app data for the row, handled by the supplied parser
        data = parser(row['appid'], row['name'])
        app_data.append(data)

        time.sleep(pause) # prevent overloading api with requests

    return app_data

In [81]:
### The main function called to process data. This breaks the job up into batches
### and then grabs the data for each batch using the get_app_data function.
def process_batches(parser, app_list, download_path, data_filename, index_filename,
                    columns, begin=0, end=-1, batchsize=100, pause=1):
    """Download app data in batches, appending each batch to a CSV file.

    For every batch the rows returned by ``get_app_data`` are written twice:
    to a per-batch backup file under ``<download_path>/tmp/`` and to the main
    data file. The exclusive end position of the batch is then stored in the
    index file.

    Parameters
    ----------
    parser : callable
        Passed through to ``get_app_data``.
    app_list : sized collection
        Used here only for its length. ``get_app_data`` is called with
        ``(start, stop, parser, pause)`` and looks up the app table itself.
    download_path : str
        Directory that receives the data file, the index file and ``tmp/``.
    data_filename, index_filename : str
        File names inside ``download_path``.
    columns : list of str
        CSV field names; keys not listed are ignored when writing.
    begin : int
        Position of the first row to process.
    end : int
        Exclusive end position; -1 (the default) means "through the last row".
        Values beyond the end of ``app_list`` are clamped.
    batchsize : int
        Number of rows per batch.
    pause : float
        Seconds to wait between requests, passed to ``get_app_data``.
    """
    print('Starting at index {}:\n'.format(begin))
    # One timestamp per run keeps this run's backup files grouped together.
    run_stamp = dt.datetime.now().strftime("%Y%m%d_%H%M%S")

    # By default process all apps in app_list. `end` is exclusive, so the last
    # valid value is len(app_list); anything larger would add an empty batch
    # and store an index past the end of the list.
    total = len(app_list)
    if end == -1 or end > total:
        end = total

    # batch boundaries: begin, begin + batchsize, ..., end
    bounds = list(range(begin, end, batchsize)) + [end]

    rel_path = os.path.join(download_path, data_filename)
    idx_path = os.path.join(download_path, index_filename)
    tmp_dir = os.path.join(download_path, 'tmp')
    # Opening a file does not create missing parent directories.
    os.makedirs(tmp_dir, exist_ok=True)

    apps_written = 0
    batch_times = []

    for i in range(len(bounds) - 1):
        start_time = time.time()

        start = bounds[i]
        stop = bounds[i + 1]

        app_data = get_app_data(start, stop, parser, pause)

        # per-batch backup copy, named after the data file, run stamp and batch number
        temp_path = os.path.join(tmp_dir, "{}.{}_{}".format(data_filename, run_stamp, i))

        print("\rWriting temp data")
        with open(temp_path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=columns, extrasaction='ignore')
            writer.writerows(app_data)

        # writing app data to file
        with open(rel_path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=columns, extrasaction='ignore')

            # short countdown so the user knows not to interrupt mid-write
            for j in range(3, 0, -1):
                print("\rAbout to write data, don't stop script! ({})".format(j), end='')
                time.sleep(0.5)

            writer.writerows(app_data)
            print('\rExported lines {}-{} to {}.'.format(start, stop-1, data_filename), end=' ')

        apps_written += len(app_data)

        # writing last index to file
        with open(idx_path, 'w') as f:
            print(stop, file=f)

        # logging time taken
        time_taken = time.time() - start_time
        batch_times.append(time_taken)
        mean_time = statistics.mean(batch_times)

        # batches still to run after this one, times the running average
        est_remaining = (len(bounds) - i - 2) * mean_time

        remaining_td = dt.timedelta(seconds=round(est_remaining))
        time_td = dt.timedelta(seconds=round(time_taken))
        mean_td = dt.timedelta(seconds=round(mean_time))

        print('Batch {} time: {} (avg: {}, remaining: {})'.format(i, time_td, mean_td, remaining_td))

    print('\nProcessing batches complete. {} apps written'.format(apps_written))

In [66]:
### Functions we use to parse data from steampowered and steamspy.
### Each function pulls a single game's information based on the app id passed to it.
def parse_steam_request(appid, name):
    """Fetch the Steam store 'appdetails' record for one app.

    Parameters
    ----------
    appid : int
        Steam application id; the response is looked up under ``str(appid)``.
    name : str
        App name, used only to build the fallback record.

    Returns
    -------
    dict
        The ``data`` payload when the response reports success. Otherwise a
        minimal ``{'name': name, 'steam_appid': appid}`` record, which is also
        returned when the response is empty, is not a mapping, or lacks the
        app id or the ``data`` key.
    """
    # https instead of plain http so the request is not sent unencrypted
    url = "https://store.steampowered.com/api/appdetails/"
    parameters = {"appids": appid}

    json_data = get_request(url, parameters=parameters)

    # Guard every level: a null or malformed body must not abort the batch.
    json_app_data = json_data.get(str(appid)) if isinstance(json_data, dict) else None

    if isinstance(json_app_data, dict) and json_app_data.get('success') and 'data' in json_app_data:
        return json_app_data['data']

    return {'name': name, 'steam_appid': appid}

def parse_steamspy_request(appid, name):
    """Fetch the SteamSpy 'appdetails' record for a single app.

    ``name`` is not used; it is accepted so this function has the same
    ``(appid, name)`` signature as the other parser. The JSON data returned
    by ``get_request`` is passed back unchanged.
    """
    return get_request(
        "https://steamspy.com/api.php",
        {"request": "appdetails", "appid": appid},
    )

In [None]:
# Pull the app list directly from SteamSpy, one page at a time.
# Only appid and game name are kept; each page is also saved to its own CSV
# so the list can be reloaded later without hitting the API again.
url = "https://steamspy.com/api.php"

page = 0
max_pages = 30
page_frames = []  # one DataFrame per page, concatenated once after the loop
while page < max_pages:
    parameters = {"request": "all", "page": page}
    print("getting page {}?request=all&page={}".format(url, page))
    json_data = get_request(url, parameters=parameters)
    if json_data:
        steam_spy_all = pd.DataFrame.from_dict(json_data, orient='index')
        tmp_app_list = steam_spy_all[['appid', 'name']].sort_values('appid').reset_index(drop=True)
        tmp_app_list.to_csv('app_list_page_{}.csv'.format(page), index=False)
        page_frames.append(tmp_app_list)
    else:
        print("no data return")
    page = page + 1
    # NOTE(review): the 60 second gap presumably respects SteamSpy's rate limit
    # for "all" requests - confirm against the API docs. No wait is needed
    # after the final page.
    if page < max_pages:
        time.sleep(60)

# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# is quadratic. Build the table once; ignore_index gives unique row labels.
if page_frames:
    app_list = pd.concat(page_frames, ignore_index=True)
else:
    app_list = pd.DataFrame(columns=['appid', 'name'])

In [67]:
# Pull the app list from previously saved CSV files instead of regenerating it.
# NOTE(review): absolute, machine-specific path - consider making this
# configurable or relative to the project directory.
csvpath="/Users/zig/Downloads//Project/csvfiles_backup/"
count = 0
page_frames = []  # one DataFrame per CSV file, concatenated once after the loop
for filename in sorted(os.listdir(csvpath)):
    if "app_list_page" in filename:
        page_frames.append(pd.read_csv(os.path.join(csvpath, filename)))
        count = count + 1

# DataFrame.append was removed in pandas 2.0; concatenate once instead.
if page_frames:
    app_list = pd.concat(page_frames, ignore_index=True)
else:
    app_list = pd.DataFrame(columns=['appid', 'name'])

print("Read {} csv files".format(count))
print("Number of rows  in app_list: {}".format(app_list.shape))

# Sort by appid and renumber the rows so that labels match positions.
app_list = app_list.sort_values(by='appid').reset_index(drop=True)

Read 30 csv files
Number of rows  in app_list: (30000, 2)


In [None]:
# Force a fresh download: overwrite the stored index with 0, then read it back.
# To resume an interrupted run instead, skip the reset_index call.
reset_index(download_path, steamspy_index)
index = get_index(download_path, steamspy_index)

# Starting from index 0: create or wipe the data file and write the CSV header.
# Without this the CSV has no header row, and a rerun from 0 appends duplicate
# rows to the old file.
prepare_data_file(download_path, steamspy_data, index, steamspy_columns)

### get steamspy data
process_batches(
    parser=parse_steamspy_request,
    app_list=app_list,
    download_path=download_path,
    data_filename=steamspy_data,
    index_filename=steamspy_index,
    columns=steamspy_columns,
    begin=index,
    batchsize=100,
    pause=0.1,
)

In [None]:
# Force a fresh download: overwrite the stored index with 0, then read it back.
# To resume an interrupted run instead, skip the reset_index call.
reset_index(download_path, steam_index)
index = get_index(download_path, steam_index)

# Starting from index 0: create or wipe the data file and write the CSV header.
# Without this the CSV has no header row, and a rerun from 0 appends duplicate
# rows to the old file.
prepare_data_file(download_path, steam_app_data, index, steam_columns)

### get steampowered data
process_batches(
    parser=parse_steam_request,
    app_list=app_list,
    download_path=download_path,
    data_filename=steam_app_data,
    index_filename=steam_index,
    columns=steam_columns,
    begin=index,
    batchsize=100,
    pause=0.6,
)

Starting at index 0:

Writing temp data
Exported lines 0-99 to steampowered_app_data.csv. Batch 0 time: 0:01:42 (avg: 0:01:42, remaining: 8:29:28)
No response, waiting 10 seconds...
Retrying.
No response, waiting 10 seconds...
Retrying.
No response, waiting 10 seconds...
Retrying.
No response, waiting 10 seconds...
Retrying.
No response, waiting 10 seconds...
Retrying.
No response, waiting 10 seconds...
Retrying.
Writing temp data
Exported lines 100-199 to steampowered_app_data.csv. Batch 1 time: 0:02:42 (avg: 0:02:12, remaining: 10:57:32)
Writing temp data
Exported lines 200-299 to steampowered_app_data.csv. Batch 2 time: 0:01:35 (avg: 0:02:00, remaining: 9:54:01)
No response, waiting 10 seconds...
Retrying.
No response, waiting 10 seconds...
Retrying.
No response, waiting 10 seconds...
Retrying.
No response, waiting 10 seconds...
Retrying.
No response, waiting 10 seconds...
Retrying.
No response, waiting 10 seconds...
Retrying.
No response, waiting 10 seconds...
Retrying.
No response