# Build a scraper to collect Steam App store data

The current dataset collected on Steam games was a but outdated.
So instead we will scrape a new data set for analysis.
The scraping method used here was borrowed from Nik Davis' GitHub (https://nik-davis.github.io/posts/2019/steam-data-collection/)
I have modified parts of the method to suit my needs here.

In [1]:
# standard library imports
import csv
import datetime as dt
import json
import os
import statistics
from time import sleep, time
import requests
import pandas as pd
import numpy as np
from random import randint
from bs4 import BeautifulSoup
import pickle

In [2]:
response = requests.get('https://api.steampowered.com/ISteamApps/GetAppList/v2/?')
applist = response.json()

In [3]:
steam_app_ids = pd.DataFrame(applist['applist']['apps'])
steam_app_ids.to_pickle('steam_app_list.p')
steam_app_ids['appid'].head()

0    562398
1    562399
2    562410
3    562420
4    562430
Name: appid, dtype: int64

In [4]:
# steam apps games scrape
def get_request(url, parameters=None):
    # return json format data about apps in steam app store
    try:
        response = requests.get(url=url, params=parameters)
    except SSLError as s:
        print('SSL Error:', s)
        
        sleep(randint(1,3))
        print('\rRetrying.' + ' '*10)
        
        # recursively try again
        return get_request(url, parameters)
    
    if response:
        return response.json()
    else:
        # response is none usually means too many requests. Wait and try again 
        print('No response, waiting 10 seconds...')
        sleep(10)
        print('Retrying.')
        return get_request(url, parameters)


In [5]:
def app_parser(app_id):
    url = "http://store.steampowered.com/api/appdetails/"
    parameters = {"appids":  app_id}
    json_data = requests.get(url, params= parameters).json()
    json_app_data = json_data[str(app_id)]
    if json_app_data['success']:
        app_data = json_app_data['data']
        details = ['type','steam_appid','name','release_date','reviews','developers','publishers','metacritic','platforms','genres','required_age']
        app_details = []
        for detail in details:
            try:
                app_details.append(app_data[detail])
            except:
                app_details.append(np.NaN)
                pass
        try:    
            app_details[3] = app_details[3]['date']
        except:
            pass
        try:
            app_details[4] = BeautifulSoup(app_details[4]).get_text() 
        except:
            pass
        try:    
            app_details[7] = app_details[7]['score']
        except:
            pass
        try:    
            app_details[9] = [x['description'] for x in app_details[9]]
        except:
            pass
    else:
        app_details = [np.NaN, np.NaN, np.NaN,np.NaN, np.NaN,np.NaN, np.NaN, np.NaN,np.NaN, np.NaN, np.NaN]

    return app_details


In [None]:
import csv
import pickle

columns = ['type','steam_appid','name','release_date','reviews','developers','publishers','metacritic','platforms','genres','required_age']
details = []
csv_file = open('steam_apps_dataset.csv', 'w', encoding = 'utf-8')
csv_writer= csv.writer(csv_file, delimiter= ',')

for ids in steam_app_ids['appid'][0:10000]:
    apps_written = 0
    row = app_parser(ids)
    csv_writer.writerow(row)
    details.append(row)
    apps_written += 1
    sleep(randint(1,3))
    if len(details) % 1000 == 0.0:
        print(f"About to pickle data, don't stop script!")     
        with open('steam_data_list.p','wb') as f:
            pickle.dump(details,f)
        print(f'Completed {apps_written} app details') 
        sleep(2)       
    else:    
        sleep(1)

csv_file.close     
details    


In [14]:
pickledeets = pickle.load(open('steam_data_list.p', 'rb'))
deets = pd.DataFrame(pickledeets, columns = columns)
deets['metacritic'].max

<bound method NDFrame._add_numeric_operations.<locals>.max of 0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
        ..
9995   NaN
9996   NaN
9997   NaN
9998   NaN
9999   NaN
Name: metacritic, Length: 10000, dtype: float64>

In [22]:
columns = ['type','steam_appid','name','release_date','reviews','developers','publishers','metacritic','platforms','genres','required_age']
details = []
csv_file = open('steam_apps_dataset4.csv', 'w', encoding = 'utf-8')
csv_writer= csv.writer(csv_file, delimiter= ',')
apps_written = 0
for ids in steam_app_ids['appid'][22001:40000]:
    row = app_parser(ids)
    csv_writer.writerow(row)
    details.append(row)
    apps_written += 1
    sleep(randint(1,3))
    if len(details) % 1000 == 0.0:
        print(f"About to pickle data, don't stop script!")     
        with open('steam_data_list4.p','wb') as f:
            pickle.dump(details,f)
        print(f'Completed {apps_written} app details') 
        sleep(2)       
    else:    
        sleep(1)

csv_file.close     
details    


About to pickle data, don't stop script!
Completed 1000 app details
About to pickle data, don't stop script!
Completed 2000 app details
About to pickle data, don't stop script!
Completed 3000 app details
About to pickle data, don't stop script!
Completed 4000 app details
About to pickle data, don't stop script!
Completed 5000 app details
About to pickle data, don't stop script!
Completed 6000 app details
About to pickle data, don't stop script!
Completed 7000 app details
About to pickle data, don't stop script!
Completed 8000 app details
About to pickle data, don't stop script!
Completed 9000 app details
About to pickle data, don't stop script!
Completed 10000 app details
About to pickle data, don't stop script!
Completed 11000 app details
About to pickle data, don't stop script!
Completed 12000 app details
About to pickle data, don't stop script!
Completed 13000 app details
About to pickle data, don't stop script!
Completed 14000 app details
About to pickle data, don't stop script!
Co

[['episode',
  530446,
  'Battle Disc: Agressive Opponent Dink!',
  '21 Oct, 2016',
  nan,
  nan,
  [''],
  nan,
  {'windows': True, 'mac': False, 'linux': False},
  nan,
  0],
 ['episode',
  530447,
  'Battle Disc: Mia Sows Discord!',
  '21 Oct, 2016',
  nan,
  nan,
  [''],
  nan,
  {'windows': True, 'mac': False, 'linux': False},
  nan,
  0],
 ['episode',
  530448,
  'Battle Disc: Reunion of the Rockson Brothers!',
  '21 Oct, 2016',
  nan,
  nan,
  [''],
  nan,
  {'windows': True, 'mac': False, 'linux': False},
  nan,
  0],
 ['episode',
  530449,
  'Battle Disc: Dignity of The Warrior!',
  '21 Oct, 2016',
  nan,
  nan,
  [''],
  nan,
  {'windows': True, 'mac': False, 'linux': False},
  nan,
  0],
 ['episode',
  530450,
  'Battle Disc: The Fight of Six Warriors!',
  '4 Nov, 2016',
  nan,
  nan,
  [''],
  nan,
  {'windows': True, 'mac': False, 'linux': False},
  nan,
  0],
 ['episode',
  530451,
  'Battle Disc: The Birth of Mega Warrior!',
  '4 Nov, 2016',
  nan,
  nan,
  [''],
  nan,


In [36]:
steam1 = pickle.load(open('steam_data_list.p', 'rb'))
steam2 = pickle.load(open('steam_data_list2.p', 'rb'))
steam3 = pickle.load(open('steam_data_list3.p', 'rb'))
steam4 = pickle.load(open('steam_data_list4.p', 'rb'))
steam_data = steam1+steam2+steam3+steam4

len(steam_data)
steam_data_scraped = pd.DataFrame(steam_data, columns = columns)

In [37]:
x = steam_data_scraped
#x.head()

In [51]:
x = x[x['metacritic'] > 0 ]
x.sort_values(by = 'metacritic', ascending = False, inplace = True)

with open('steam_scraped_data.p', 'wb') as f:
    pickle.dump(x,f)

In [59]:
x['steam_appid'].astype(int)
z = x[x['type'] == 'game']
z.shape

(1907, 11)

In [62]:
# separate the final tally of games with rating scores into three categories.
z.sort_values(by = 'metacritic', ascending = False, inplace = True)
steam_data_top635 =z[0:634]
steam_data_mid635 = z[635:1300]
steam_data_bottom635 = z[1300:]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [63]:
# Pickle scraped data
# pickle steam backup data.
with open('O_steam_data_scraped.p', 'wb') as f:
    pickle.dump(x,f)

with open('O_steam_top_635.p', 'wb') as f:
    pickle.dump(steam_data_top635,f)

with open('O_steam_bottom_635.p', 'wb') as f:
    pickle.dump(steam_data_bottom635,f)

with open('O_steam_data_middle635.p', 'wb') as f:
    pickle.dump(steam_data_mid635,f)

In [6]:
steam_full = pickle.load(open('O_steam_data_scraped.p', 'rb'))
list_of_ids = steam_full['steam_appid'].tolist()

In [7]:
with open('O_steam_ids.p', 'wb') as f:
    pickle.dump(list_of_ids,f)