Data Extraction

In [1]:
#Import libraries/modules
import json
import time
from json.decoder import JSONDecodeError

import numpy as np
import pandas as pd
import requests

In [2]:
#Cap how many columns and rows pandas renders when a dataframe is displayed
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 100

 Part 1 - Steam App IDs

In [3]:
#Define Function for making API requests
def get_request(url, parameters=None, max_retries=None):
    """Send a GET request and return the decoded JSON body, retrying on failure.

    Args:
        url: Address to request.
        parameters: Optional mapping sent as query-string parameters.
        max_retries: Number of retries allowed after a failed attempt.
            ``None`` (the default) keeps retrying indefinitely.

    Returns:
        The parsed JSON payload, or ``None`` when the body is not valid JSON
        or the retry budget has been used up.
    """
    retries_used = 0
    #loop instead of recursing so a long outage cannot exhaust the call stack
    while True:
        try:
            response = requests.get(url=url, params=parameters)
        except requests.exceptions.RequestException:
            #the request itself failed (no response object to inspect)
            response = None

        #a Response is falsy when its status code signals an error (4xx/5xx)
        if response is not None and response:
            try:
                return response.json()
            except ValueError:
                #ValueError covers both the stdlib and simplejson decode errors
                print (f'Error decoding JSON for URL: {url}, Parameters: {parameters}')
                return None

        if max_retries is not None and retries_used >= max_retries:
            print(f'Giving up on URL: {url}, Parameters: {parameters}')
            return None
        retries_used += 1

        if response is None:
            #5 second count down to try again
            for i in range(5, 0, -1):
                print('\rWaiting... ({})'.format(i), end='')
                time.sleep(1)
        else:
            #error status: wait 10 seconds
            print('No response, waiting 10 seconds')
            time.sleep(10)
        print('Retrying')

In [None]:
#Steam API endpoint queried for the app list
url = "https://api.steampowered.com/ISteamApps/GetAppList/v2/"

#Fetch the payload and pull out the list of app records
json_data = get_request(url)
app_list_data = json_data['applist']['apps']

#Build the dataframe in one chain: keep appid/name, order by appid, renumber the index
app_list_df = (
    pd.DataFrame(app_list_data)
    .loc[:, ['appid', 'name']]
    .sort_values('appid')
    .reset_index(drop=True)
)

#Save the app list to disk
app_list_df.to_csv('app_list.csv', index=False)

#Preview the first rows
app_list_df.head()

Part 2 - Steam App Data

In [None]:
#load the saved app list back from disk
df = pd.read_csv(filepath_or_buffer="app_list.csv")

In [None]:
#get list of appids
#tolist() converts the whole column at once instead of appending value by value
app_ids = df["appid"].tolist()

print(len(app_ids))

In [None]:
#df of data already extracted
#on a first run there is no earlier extract on disk, so start from an empty frame
#that still exposes the steam_appid column read by the following cells
try:
    df2 = pd.read_csv("steam_app_data.csv")
except FileNotFoundError:
    print("steam_app_data.csv not found, starting from an empty dataset")
    df2 = pd.DataFrame(columns=["steam_appid"])

In [None]:
#get app ids that already have data
#tolist() converts the whole column at once instead of appending value by value
app_ids_done = df2["steam_appid"].tolist()

print(len(app_ids_done))

In [None]:
#get list of ids that still need to have data extracted
#by keeping the ids that are not in the dataframe with app data
#membership is tested against a set so the scan stays linear in the number of ids
#(testing against the list rescans it for every id)
app_ids_done_set = set(app_ids_done)
app_ids_to_process = [i for i in app_ids if i not in app_ids_done_set]

print(len(app_ids_to_process))

In [None]:
#function to get app data from Steam API
def get_app_data(app_ids_to_process):
    """Request the store details of each app id, one request per id.

    Returns a list with one entry per id, in input order: the ``'data'``
    mapping found under the id's key in the response, or an empty dict when
    the response is falsy or carries no such entry.
    """
    endpoint = "http://store.steampowered.com/api/appdetails/"
    collected = []
    for app_id in app_ids_to_process:
        payload = get_request(endpoint, {"appids": app_id})
        #the response is keyed by the app id as a string
        details = payload.get(str(app_id), {}).get('data', {}) if payload else {}
        collected.append(details)
    return collected

In [None]:
#walk the pending ids 100 at a time; each batch of responses becomes one dataframe in dfs
dfs = []
for i in range(0, len(app_ids_to_process), 100):
    batch_ids = app_ids_to_process[i:i+100]
    batch_data = get_app_data(batch_ids)
    #an empty result produces no dataframe and no progress message
    if not batch_data:
        continue
    batch_df = pd.DataFrame(batch_data)
    dfs.append(batch_df)
    print(f"Batch {i//100 + 1} completed.")
print("finished")

In [None]:
#concatenating batch dataframes into larger one
#pd.concat raises ValueError on an empty list, which happens when no ids were left
#to process; fall back to an empty frame that still has the steam_appid column
if dfs:
    df_new = pd.concat(dfs, ignore_index=True)
else:
    df_new = pd.DataFrame(columns=['steam_appid'])

In [None]:
#keep only the rows that carry a steam_appid
df_new = df_new[df_new['steam_appid'].notna()]

In [None]:
#merge data frame of newly extracted data with priorly extracted one
#df2 is the previous extract read from steam_app_data.csv earlier in this part
#(df1 is not defined until Part 4, so using it here fails on a fresh kernel)
merged_df = pd.concat([df2, df_new], ignore_index=True)

In [None]:
#write the combined extract to disk without the index column
merged_df.to_csv(path_or_buf='steam_app_data.csv', index=False)

In [None]:
#load the combined extract back from disk
df2 = pd.read_csv(filepath_or_buffer="steam_app_data.csv")

In [None]:
#keep the first row for every steam_appid and discard later repeats
df2 = df2.loc[~df2.duplicated(subset=['steam_appid'])]

In [None]:
#tally the rows whose type is 'game'
game_count = df2['type'].eq('game').sum()
print(game_count)

In [None]:
#remove every row whose type is anything other than "game"
df2 = df2.drop(index=df2.index[df2['type'] != "game"])

In [None]:
#overwrite the extract on disk with the filtered rows, without the index column
df2.to_csv(path_or_buf='steam_app_data.csv', index=False)

Part 3 - Steam Spy Data

In [None]:
#load the app data extract from disk
df2 = pd.read_csv(filepath_or_buffer="steam_app_data.csv")

In [None]:
#get list of appids
#tolist() converts the whole column at once instead of appending value by value
new_app_ids = df2["steam_appid"].tolist()

print(len(new_app_ids))

In [None]:
#function for getting steam spy data
def get_sspy_app_data(new_app_ids):
    """Request the Steam Spy details of each app id, one request per id.

    Returns a list with one entry per id, in input order: the response as
    returned by ``get_request``, or an empty dict when that response is falsy.
    """
    endpoint = "https://steamspy.com/api.php?request=appdetails"
    return [
        get_request(endpoint, {"appid": app_id}) or {}
        for app_id in new_app_ids
    ]

In [None]:
#walk the app ids 100 at a time; each batch of Steam Spy responses becomes one dataframe in dfs
dfs = []
for i in range(0, len(new_app_ids), 100):
    batch_ids = new_app_ids[i:i+100]
    batch_data = get_sspy_app_data(batch_ids)
    #an empty result produces no dataframe and no progress message
    if not batch_data:
        continue
    batch_df = pd.DataFrame(batch_data)
    dfs.append(batch_df)
    print(f"Batch {i//100 + 1} completed.")
print("finished")

In [None]:
#stack the per-batch dataframes into a single frame with a fresh index
df_new2 = pd.concat(objs=dfs, ignore_index=True)

In [None]:
#write the Steam Spy data to disk without the index column
df_new2.to_csv(path_or_buf='steam_spy_data.csv', index=False)

Part 4 - Merging Data

In [None]:
#load both extracts: app data into df1, Steam Spy data into df2
df1, df2 = (pd.read_csv(path) for path in ("steam_app_data.csv", "steam_spy_data.csv"))

In [None]:
#left-join the Steam Spy rows onto the app data, matching steam_appid to appid
merged_df = df1.merge(df2, how='left', left_on='steam_appid', right_on='appid')

In [None]:
#remove the columns judged redundant, time sensitive or irrelevant
cleaner_merged = merged_df.drop(
    columns=[
        "languages", "genre", "price_overview", "developer", "publisher",
        "name_y", "appid", "alternate_appid", "ccu", "score_rank", "fullgame",
        "average_2weeks", "median_2weeks", "price", "discount", "userscore",
    ]
)

In [None]:
#write the cleaned, merged dataset to disk without the index column
cleaner_merged.to_csv(path_or_buf='merged_data.csv', index=False)