# Archivo para llamar a las distintas APIs con información de Steam

No me da la vida para hacerlo así que hago copiar pegar del siguiente enlace:
https://nik-davis.github.io/posts/2019/steam-data-collection/

In [2]:
import numpy as np
import pandas as pd
import requests

import csv
import datetime as dt
import time
import os
import statistics
import json
from typing import Dict, List

### Aquí se saca y asigna el ID para cada nombre de juego

In [3]:
def get_request(url, parameters=None):
    
    try:
        response = requests.get(url=url, params=parameters)
    except SSLError as s:
        print('SSL Error:', s)
        
        for i in range(5, 0, -1):
            print('\rWaiting... ({})'.format(i), end='')
            time.sleep(1)
        print('\rRetrying.' + ' '*10)
        
        # recusively try again
        return get_request(url, parameters)
    
    if response:
        return response.json()
    else:
        # response is none usually means too many requests. Wait and try again 
        print('No response, waiting 10 seconds...')
        time.sleep(10)
        print('Retrying.')
        return get_request(url, parameters)

In [4]:
url = "https://steamspy.com/api.php"
parameters = {"request": "all"}

# request 'all' from steam spy and parse into dataframe
json_data = get_request(url, parameters=parameters)
steam_spy_all = pd.DataFrame.from_dict(json_data, orient='index')

# generate sorted app_list from steamspy data
app_list = steam_spy_all[['appid', 'name']].sort_values('appid').reset_index(drop=True)

# export disabled to keep consistency across download sessions
app_list.to_csv('data/app_list.csv', index=False)

# instead read from stored csv
app_list = pd.read_csv('data/app_list.csv')

# display first few rows
app_list.head()

Unnamed: 0,appid,name
0,10,Counter-Strike
1,30,Day of Defeat
2,40,Deathmatch Classic
3,50,Half-Life: Opposing Force
4,70,Half-Life


### Aquí ?

In [5]:
def get_app_data(start, stop, parser, pause):
    """Return list of app data generated from parser.
    
    parser : function to handle request
    """
    app_data = []
    
    # iterate through each row of app_list, confined by start and stop
    for index, row in app_list[start:stop].iterrows():
        print('Current index: {}'.format(index), end='\r')
        
        appid = row['appid']
        name = row['name']

        # retrive app data for a row, handled by supplied parser, and append to list
        data = parser(appid, name)
        app_data.append(data)

        time.sleep(pause) # prevent overloading api with requests
    
    return app_data


def process_batches(parser, app_list, download_path, data_filename, index_filename,
                    columns, begin=0, end=-1, batchsize=100, pause=1):
    """Process app data in batches, writing directly to file.
    
    parser : custom function to format request
    app_list : dataframe of appid and name
    download_path : path to store data
    data_filename : filename to save app data
    index_filename : filename to store highest index written
    columns : column names for file
    
    Keyword arguments:
    
    begin : starting index (get from index_filename, default 0)
    end : index to finish (defaults to end of app_list)
    batchsize : number of apps to write in each batch (default 100)
    pause : time to wait after each api request (defualt 1)
    
    returns: none
    """
    print('Starting at index {}:\n'.format(begin))
    
    # by default, process all apps in app_list
    if end == -1:
        end = len(app_list) + 1
    
    # generate array of batch begin and end points
    batches = np.arange(begin, end, batchsize)
    batches = np.append(batches, end)
    
    apps_written = 0
    batch_times = []
    
    for i in range(len(batches) - 1):
        start_time = time.time()
        
        start = batches[i]
        stop = batches[i+1]
        
        app_data = get_app_data(start, stop, parser, pause)
        
        rel_path = os.path.join(download_path, data_filename)
        
        # writing app data to file
        with open(rel_path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=columns, extrasaction='ignore')
            
            for j in range(3,0,-1):
                print("\rAbout to write data, don't stop script! ({})".format(j), end='')
                time.sleep(0.5)
            
            writer.writerows(app_data)
            print('\rExported lines {}-{} to {}.'.format(start, stop-1, data_filename), end=' ')
            
        apps_written += len(app_data)
        
        idx_path = os.path.join(download_path, index_filename)
        
        # writing last index to file
        with open(idx_path, 'w') as f:
            index = stop
            print(index, file=f)
            
        # logging time taken
        end_time = time.time()
        time_taken = end_time - start_time
        
        batch_times.append(time_taken)
        mean_time = statistics.mean(batch_times)
        
        est_remaining = (len(batches) - i - 2) * mean_time
        
        remaining_td = dt.timedelta(seconds=round(est_remaining))
        time_td = dt.timedelta(seconds=round(time_taken))
        mean_td = dt.timedelta(seconds=round(mean_time))
        
        print('Batch {} time: {} (avg: {}, remaining: {})'.format(i, time_td, mean_td, remaining_td))
            
    print('\nProcessing batches complete. {} apps written'.format(apps_written))

In [6]:
def reset_index(download_path, index_filename):
    """Reset index in file to 0."""
    rel_path = os.path.join(download_path, index_filename)
    
    with open(rel_path, 'w') as f:
        print(0, file=f)
        

def get_index(download_path, index_filename):
    """Retrieve index from file, returning 0 if file not found."""
    try:
        rel_path = os.path.join(download_path, index_filename)

        with open(rel_path, 'r') as f:
            index = int(f.readline())
    
    except FileNotFoundError:
        index = 0
        
    return index


def prepare_data_file(download_path, filename, index, columns):
    """Create file and write headers if index is 0."""
    if index == 0:
        rel_path = os.path.join(download_path, filename)

        with open(rel_path, 'w', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=columns)
            writer.writeheader()

### Aquí saco la información de la API de Steam

Aquí ya hago modficaciones para quitar todas las columnas que no me interesan.
En mi primera tentativa esto formaba parte del proceso de limpieza del dataset, pero así a lo mejor puedo generarlo ya ligeremente 'limpio'.
De esta API nos interesa la columna "release_date" (fecha de lanzamiento) que es una de las principales que usaré en el EDA

In [7]:
def parse_steam_request(appid, name):
    """Unique parser to handle data from Steam Store API.
    
    Returns : json formatted data (dict-like)
    """
    url = "http://store.steampowered.com/api/appdetails/"
    parameters = {"appids": appid}
    
    json_data = get_request(url, parameters=parameters)
    json_app_data = json_data[str(appid)]
    
    if json_app_data['success']:
        data = json_app_data['data']
    else:
        data = {'name': name, 'steam_appid': appid}
        
    return data


# Set file parameters
download_path = r'C:\Users\adria\Bootcamp\EDA_The_Bridge\data\download'
steam_app_data = 'steam_app_data.csv'
steam_index = 'steam_index.txt'

steam_columns = [
    'type', 'name', 'steam_appid', 'is_free', 'developers', 'publishers', 'release_date'
]

# Overwrites last index for demonstration (would usually store highest index so can continue across sessions)
reset_index(download_path, steam_index)

# Retrieve last index downloaded from file
index = get_index(download_path, steam_index)

# Wipe or create data file and write headers if index is 0
prepare_data_file(download_path, steam_app_data, index, steam_columns)

# Set end and chunksize for demonstration - remove to run through entire app list
process_batches(
    parser=parse_steam_request,
    app_list=app_list,
    download_path=download_path,
    data_filename=steam_app_data,
    index_filename=steam_index,
    columns=steam_columns,
    begin=index,
    end=10000,
    batchsize=5
)

Starting at index 0:

Exported lines 0-4 to steam_app_data.csv. Batch 0 time: 0:00:11 (avg: 0:00:11, remaining: 5:51:01)
Exported lines 5-9 to steam_app_data.csv. Batch 1 time: 0:00:10 (avg: 0:00:10, remaining: 5:45:13)
Exported lines 10-14 to steam_app_data.csv. Batch 2 time: 0:00:11 (avg: 0:00:10, remaining: 5:48:47)
Exported lines 15-19 to steam_app_data.csv. Batch 3 time: 0:00:11 (avg: 0:00:11, remaining: 5:52:44)
Exported lines 20-24 to steam_app_data.csv. Batch 4 time: 0:00:11 (avg: 0:00:11, remaining: 5:52:11)
Exported lines 25-29 to steam_app_data.csv. Batch 5 time: 0:00:11 (avg: 0:00:11, remaining: 5:52:26)
Exported lines 30-34 to steam_app_data.csv. Batch 6 time: 0:00:10 (avg: 0:00:11, remaining: 5:51:39)
Exported lines 35-39 to steam_app_data.csv. Batch 7 time: 0:00:11 (avg: 0:00:11, remaining: 5:51:35)
Exported lines 40-44 to steam_app_data.csv. Batch 8 time: 0:00:11 (avg: 0:00:11, remaining: 5:51:16)
Exported lines 45-49 to steam_app_data.csv. Batch 9 time: 0:00:11 (avg: 0

In [8]:
pd.read_csv(r'C:\Users\adria\Bootcamp\EDA_The_Bridge\data\download\steam_app_data.csv')

Unnamed: 0,type,name,steam_appid,is_free,developers,publishers,release_date
0,game,Counter-Strike,10,False,['Valve'],['Valve'],"{'coming_soon': False, 'date': '1 Nov, 2000'}"
1,game,Day of Defeat,30,False,['Valve'],['Valve'],"{'coming_soon': False, 'date': 'May 1, 2003'}"
2,game,Deathmatch Classic,40,False,['Valve'],['Valve'],"{'coming_soon': False, 'date': '1 Jun, 2001'}"
3,game,Half-Life: Opposing Force,50,False,['Gearbox Software'],['Valve'],"{'coming_soon': False, 'date': '1 Nov, 1999'}"
4,game,Half-Life,70,False,['Valve'],['Valve'],"{'coming_soon': False, 'date': '19 Nov, 1998'}"
...,...,...,...,...,...,...,...
995,game,Black Myth: Wukong Benchmark Tool,3132990,True,['Game Science'],['Game Science'],"{'coming_soon': False, 'date': '12 Aug, 2024'}"
996,game,WEBFISHING,3146520,False,['lamedeveloper'],['lamedeveloper'],"{'coming_soon': False, 'date': '11 Oct, 2024'}"
997,game,Schedule I,3164500,False,['TVGS'],['TVGS'],"{'coming_soon': False, 'date': '24 Mar, 2025'}"
998,game,Grand Theft Auto V Enhanced,3240220,False,['Rockstar North'],['Rockstar Games'],"{'coming_soon': False, 'date': '4 Mar, 2025'}"


### Aquí sacamos la info de SteamSpy

De esta API nos interesa la columna "tags" (etiquetas) que es una de las principales para el EDA

In [53]:
def parse_steamspy_request(appid, name):
    """Parser to handle SteamSpy API data."""
    url = "https://steamspy.com/api.php"
    parameters = {"request": "appdetails", "appid": appid}
    
    json_data = get_request(url, parameters)
    return json_data


# set files and columns
download_path = r'C:\Users\adria\Bootcamp\EDA_The_Bridge\data\download'
steamspy_data = 'steamspy_data.csv'
steamspy_index = 'steamspy_index.txt'

steamspy_columns = [
    'appid', 'name', 'positive', 'negative', 'owners', 'initialprice', 'genre', 'tags'
]

reset_index(download_path, steamspy_index)
index = get_index(download_path, steamspy_index)

# Wipe data file if index is 0
prepare_data_file(download_path, steamspy_data, index, steamspy_columns)

process_batches(
    parser=parse_steamspy_request,
    app_list=app_list,
    download_path=download_path, 
    data_filename=steamspy_data,
    index_filename=steamspy_index,
    columns=steamspy_columns,
    begin=index,
    #end=20,
    batchsize=5,
    pause=1
)

Starting at index 0:

Exported lines 0-4 to steamspy_data.csv. Batch 0 time: 0:00:08 (avg: 0:00:08, remaining: 0:27:56)
Exported lines 5-9 to steamspy_data.csv. Batch 1 time: 0:00:09 (avg: 0:00:08, remaining: 0:28:01)
Exported lines 10-14 to steamspy_data.csv. Batch 2 time: 0:00:09 (avg: 0:00:09, remaining: 0:28:09)
Exported lines 15-19 to steamspy_data.csv. Batch 3 time: 0:00:09 (avg: 0:00:09, remaining: 0:27:59)
Exported lines 20-24 to steamspy_data.csv. Batch 4 time: 0:00:09 (avg: 0:00:09, remaining: 0:27:55)
Exported lines 25-29 to steamspy_data.csv. Batch 5 time: 0:00:09 (avg: 0:00:09, remaining: 0:27:49)
Exported lines 30-34 to steamspy_data.csv. Batch 6 time: 0:00:09 (avg: 0:00:09, remaining: 0:27:41)
Exported lines 35-39 to steamspy_data.csv. Batch 7 time: 0:00:09 (avg: 0:00:09, remaining: 0:27:35)
Exported lines 40-44 to steamspy_data.csv. Batch 8 time: 0:00:09 (avg: 0:00:09, remaining: 0:27:43)
Exported lines 45-49 to steamspy_data.csv. Batch 9 time: 0:00:09 (avg: 0:00:09, re

In [54]:
pd.read_csv(r'C:\Users\adria\Bootcamp\EDA_The_Bridge\data\download\steamspy_data.csv')

Unnamed: 0,appid,name,positive,negative,owners,initialprice,genre,tags
0,10,Counter-Strike,243818,6427,"10,000,000 .. 20,000,000",999,Action,"{'Action': 5504, 'FPS': 4929, 'Multiplayer': 3..."
1,30,Day of Defeat,6414,688,"5,000,000 .. 10,000,000",499,Action,"{'FPS': 804, 'World War II': 272, 'Multiplayer..."
2,40,Deathmatch Classic,2618,545,"5,000,000 .. 10,000,000",499,Action,"{'Action': 638, 'FPS': 155, 'Classic': 119, 'M..."
3,50,Half-Life: Opposing Force,24363,1198,"2,000,000 .. 5,000,000",499,Action,"{'FPS': 934, 'Action': 362, 'Classic': 291, 'S..."
4,70,Half-Life,143086,5135,"10,000,000 .. 20,000,000",999,Action,"{'FPS': 2509, 'Classic': 1998, ""1990's"": 1984,..."
...,...,...,...,...,...,...,...,...
995,3132990,Black Myth: Wukong Benchmark Tool,4485,368,"2,000,000 .. 5,000,000",0,"Free To Play, Utilities","{'Free to Play': 185, 'Utilities': 138, 'Bench..."
996,3146520,WEBFISHING,64473,1337,"1,000,000 .. 2,000,000",499,"Casual, Indie, Massively Multiplayer","{'Fishing': 395, 'Multiplayer': 349, 'Casual':..."
997,3164500,Schedule I,200803,3238,"10,000,000 .. 20,000,000",1999,"Action, Indie, Simulation, Strategy, Early Access","{'Simulation': 893, 'Co-op': 825, 'Crime': 804..."
998,3240220,Grand Theft Auto V Enhanced,38595,17286,"5,000,000 .. 10,000,000",4499,"Action, Adventure, Racing","{'Open World': 378, 'Action': 352, 'Multiplaye..."
