# Predicting STEAM User Engagement

# ACQUIRING DATA
## STEAM WEB AND STEAMSPY API SCRAPING

Based on code from Nik Davis
URL: https://nik-davis.github.io/posts/2019/steam-data-collection/

This code is a simplified version of his code, with reduced complexity but with the added feature of repairing missing entries in the data files. This code also does its best to prevent data loss by creating copies of the data files before writing, allowing quick and easy recovery.

In [1]:
# Imports

import csv
import datetime as dt
import json
import os
import statistics
import time
import sys
import numpy as np
import pandas as pd
import requests
import datetime
import shutil

Set up where we will write files, determine the filenames, and choose which columns we will save into our CSV files from the data returned by the steampowered and steamspy APIs.

In [219]:
# Output locations and CSV schemas for the scraped data.

# Directory that holds the scraped data files.
download_path = '/Capstone-project/data/data_scraped'
# Sub-directory (with trailing slash) used for the app list CSV files.
csvpath = download_path + '/csvfiles/'

# File names of the two detail CSV files.
steamspy_data = 'steamspy_data.csv'
steampowered_data = 'steampowered_data.csv'

### Fields we keep from steamspy's appdetail data
steamspy_columns = [
    'appid',
    'name',
    'developer',
    'publisher',
    'score_rank',
    'positive',
    'negative',
    'userscore',
    'owners',
    'average_forever',
    'average_2weeks',
    'median_forever',
    'median_2weeks',
    'price',
    'initialprice',
    'discount',
    'languages',
    'genre',
    'ccu',
    'tags',
]

### Fields we keep from steampowered's appdetail data
steampowered_columns = [
    'type',
    'name',
    'steam_appid',
    'required_age',
    'is_free',
    'controller_support',
    'dlc',
    'detailed_description',
    'about_the_game',
    'short_description',
    'fullgame',
    'supported_languages',
    'pc_requirements',
    'mac_requirements',
    'linux_requirements',
    'developers',
    'publishers',
    'demos',
    'price_overview',
    'packages',
    'package_groups',
    'platforms',
    'metacritic',
    'reviews',
    'categories',
    'genres',
    'recommendations',
    'achievements',
    'release_date',
    'content_descriptors',
]

get_request():  wrapper for requests module.  Adds retry logic. 

parse_steampowered_request():  get info about a game from steampowered when supplied an appid

parse_steamspy_request():   get info about a game from steamspy when supplied an appid

get_data():  our main function.  We use this to get data about all the games we have in the app_list variable

In [220]:
### We will use this a lot to pull data from each page.
def get_request(url, parameters=None, sleeptime=30, timeout=60):
    """Fetch ``url`` and return the decoded JSON body, retrying until it works.

    A request counts as failed when ``requests`` raises (connection error,
    timeout, ...) or when the response is falsy, which for a ``requests``
    response means an HTTP error status. After a failure the function
    sleeps ``sleeptime`` seconds and tries again, with no retry limit.

    Args:
        url: address to request.
        parameters: optional dict of query-string parameters.
        sleeptime: seconds to wait between attempts.
        timeout: seconds to wait on the server before the attempt is
            treated as failed.

    Returns:
        Whatever ``response.json()`` decodes from the successful response.

    Raises:
        ValueError: if a successful response does not contain valid JSON.
    """
    ### Loop instead of recursing so a long outage cannot exhaust the stack.
    while True:
        try:
            ### This is doing all the work
            response = requests.get(url=url, params=parameters, timeout=timeout)
        except requests.exceptions.RequestException:
            print("\nerror pulling data from {}\n".format(url))
            ### Make sure the check below sees a failed attempt instead of
            ### an unbound variable.
            response = None

        if response:
            return response.json()

        ### Sometimes we poll the site too often and it won't return data,
        ### so back off before trying again.
        print('\nNo response, waiting {} seconds\n'.format(sleeptime))
        time.sleep(sleeptime)
        print('\nRetrying.\n')
    
### Look up one game on the steampowered store API.
### The game is selected by the appid that is passed in.
def parse_steampowered_request(appid, name):
    """Return the steampowered app details for a single game.

    Args:
        appid: id of the game to look up.
        name: name of the game; only used for the fallback record.

    Returns:
        The ``data`` entry of the response when the lookup reports
        success, otherwise a minimal dict holding just ``name`` and
        ``steam_appid``.
    """
    payload = get_request(
        "http://store.steampowered.com/api/appdetails/",
        parameters={"appids": appid},
    )
    ### The response is keyed by the appid in string form.
    app_entry = payload[str(appid)]

    ### No details available: hand back the little we already know.
    if not app_entry['success']:
        return {'name': name, 'steam_appid': appid}

    return app_entry['data']

### Look up one game on the steamspy API.
### The game is selected by the appid that is passed in.
def parse_steamspy_request(appid, name):
    """Return the steamspy ``appdetails`` record for a single game.

    Args:
        appid: id of the game to look up.
        name: name of the game; accepted to mirror
            parse_steampowered_request but not used here.

    Returns:
        The decoded JSON response, passed through unchanged.
    """
    ### "appdetails" is the request type; the appid selects the game.
    query = {"request": "appdetails", "appid": appid}
    ### The response is returned as-is, with no unwrapping step.
    return get_request("https://steamspy.com/api.php", query)


### Helper for get_data: back up one CSV file and report which appids it holds.
def _prepare_csv(csv_file, backup_file, id_column, columns):
    """Back up ``csv_file`` and return the set of appids already stored in it.

    If the file exists it is copied to ``backup_file`` and the values of
    ``id_column`` are returned. If it does not exist (first run) it is
    created with a header row built from ``columns`` and an empty set is
    returned.
    """
    if os.path.isfile(csv_file):
        ### Only back up when there is something to back up; copying a
        ### missing file would abort the very first run.
        shutil.copy2(csv_file, backup_file)
        ### Only the id column is needed, so skip converting the other columns.
        known = pd.read_csv(csv_file, usecols=[id_column])[id_column]
        return set(known.dropna().tolist())

    ### if this is our first run, create the CSV file and write a header.
    print("First Run: creating {}".format(csv_file))
    ### newline='' is what the csv module asks for; utf-8 matches the
    ### encoding used when rows are appended later.
    with open(csv_file, 'w', encoding='utf-8', newline='') as f:
        csv.DictWriter(f, fieldnames=columns, extrasaction='ignore').writeheader()
    return set()


### This function will read in the csv files that exist.
### if CSV files do not exist, we generate CSV files with header info
### For each app_id we have in app_list
###     check if we have data in our steampowered csv.  If we don't download
###     check if we have data in our steamspy csv. If we don't download
def get_data(app_list):
    """Download details for every game in ``app_list`` that is not stored yet.

    Existing CSV files are first copied to timestamped backups. Then, for
    each row of ``app_list``, the steampowered and steamspy details are
    fetched and appended to their CSV file unless that appid is already
    present, which lets an interrupted run be resumed and gaps be filled.

    Args:
        app_list: DataFrame with an ``appid`` and a ``name`` column.

    Returns:
        None. The results are written to the CSV files under download_path.
    """
    ### The timestamp makes every backup file name unique.
    now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    current_steampowered_file = '{}/{}'.format(download_path, steampowered_data)
    current_steamspy_file = '{}/{}'.format(download_path, steamspy_data)

    ### Back up the current files (when present) and collect the appids
    ### they already contain. Sets give fast membership tests in the loop.
    known_steampowered_appid = _prepare_csv(
        current_steampowered_file,
        '{}.{}'.format(current_steampowered_file, now),
        'steam_appid',
        steampowered_columns,
    )
    known_steamspy_appid = _prepare_csv(
        current_steamspy_file,
        '{}.{}'.format(current_steamspy_file, now),
        'appid',
        steamspy_columns,
    )

    ### Open both files for appending. The with block guarantees they are
    ### flushed and closed even if a download fails half way through.
    with open(current_steampowered_file, 'a', encoding='utf-8', newline='') as fh_steampowered, \
            open(current_steamspy_file, 'a', encoding='utf-8', newline='') as fh_steamspy:
        fhcsv_steampowered = csv.DictWriter(
            fh_steampowered, fieldnames=steampowered_columns, extrasaction='ignore')
        fhcsv_steamspy = csv.DictWriter(
            fh_steamspy, fieldnames=steamspy_columns, extrasaction='ignore')

        ### counter is used to print a line every 100 appids.
        for counter, (index, row) in enumerate(app_list.iterrows()):
            ### Get app id and game name from app_list.
            appid = row['appid']
            name = row['name']

            if counter % 100 == 0:
                print("\nappid: {} - name {}".format(appid, name))
            else:
                ### lets the user know something is happening.
                print(".", end='')

            ### if we do not have info about the appid in our current CSV file, get from steampowered.
            ### This allows our code to heal gaps in the CSV files and to resume.
            if appid not in known_steampowered_appid:
                data = parse_steampowered_request(appid, name)
                fhcsv_steampowered.writerow(data)
                ### Flush per row so an abrupt stop loses at most one record.
                fh_steampowered.flush()
                ### Remember it so a repeated appid is not downloaded twice.
                known_steampowered_appid.add(appid)

            ### if we do not have info about the appid in our current CSV file, get from steamspy.
            ### This allows our code to heal gaps in the CSV files and to resume.
            if appid not in known_steamspy_appid:
                data = parse_steamspy_request(appid, name)
                fhcsv_steamspy.writerow(data)
                fh_steamspy.flush()
                known_steamspy_appid.add(appid)

Get the list of games from steamspy.  We need the appid to get more data later.  Each page returns 1000 games, so by setting max_pages we can control how much data we ultimately use.

In [None]:
### Download the list of games straight from steamspy, keeping appid and name.
### The appids provide a filtered view of all the
### appids in the steam system.
url = "https://steamspy.com/api.php"
print("csvpath: {}".format(csvpath))

### first page to request
page = 0
### number of pages to request.  Each page returns 1000 entries,
### so this controls how many entries we end up with.
max_pages = 5
while page < max_pages:
    ### request type "all" returns a well formatted, paginated list of games;
    ### "page" selects which page we get.
    parameters = dict(request="all", page=page)
    ### print so we know what is happening.
    print("getting page {}?request=all&page={}".format(url, page))
    json_data = get_request(url, parameters=parameters)
    if not json_data:
        ### this API sometimes caches bad data.  If we don't get any data
        ### inform the user.  They will have to rerun for that page.
        print("no data return")
    else:
        ### keep the appid and game name, ordered by appid.
        steam_spy_all = pd.DataFrame.from_dict(json_data, orient='index')
        tmp_app_list = steam_spy_all[['appid', 'name']]
        tmp_app_list = tmp_app_list.sort_values('appid')
        tmp_app_list = tmp_app_list.reset_index(drop=True)
        ### one csv file per page
        tmp_app_list.to_csv('{}/app_list_page_{}.csv'.format(csvpath, page), index=False)
    ### move on to the next page
    page += 1
    ### sleep 60 seconds as requested by the API documentation
    time.sleep(60)

The above code does not load the data into a variable.  We need to run the code below to do that.  This allows flexibility: we can manually adjust the CSV files between the two runs if needed.

In [223]:
print("csvpath: {}".format(csvpath))

### pull data from CSV files instead of regenerating data.
count = 0
### one dataFrame per page file; they are combined in a single step below
### because DataFrame.append is no longer available in current pandas.
page_frames = []
### loop through list of filenames
for filename in os.listdir(csvpath):
    ### match file name.
    if "app_list_page" in filename:
        full_path = "{}{}".format(csvpath, filename)
        ### save data to temp dataFrame
        tmp_list = pd.read_csv(full_path)
        page_frames.append(tmp_list)
        count = count + 1

if page_frames:
    ### keep each page's own row index, as the per-page append did
    app_list = pd.concat(page_frames)
else:
    ### no page files found: keep the expected columns so the sort below
    ### still works on the empty dataFrame
    app_list = pd.DataFrame(columns=['appid', 'name'])

### print info about number of pages
### print the shape (rows, columns) of the dataFrame
print("Read {} csv files".format(count))
print("Number of rows  in app_list: {}".format(app_list.shape))
app_list = app_list.sort_values(by='appid')

csvpath: /Users/zig/Downloads/Project//csvfiles/
Read 3 csv files
Number of rows  in app_list: (3000, 2)


Call get_data to start pulling data from steamspy and steampowered

In [None]:
### Call get_data to download the steampowered and steamspy details
### for every game listed in app_list.
get_data(app_list)


appid: 10 - name Counter-Strike
...................................................................................................
appid: 6800 - name Commandos: Behind Enemy Lines
..............................................................