In [29]:
"""
In this file, we will explore information in the steam API to undertand what type of data the API contains, and what features we'll be able to utilize.

This notebook collects and extracts structured game data from the Steam Store API and saves it to disk.

"""
#Features we may need:
#Game name
#Genre
#Date
#Player count
#Ranking
#Price
#Review count
#Review score

"\nIn this file, we will explore information in the steam API to undertand what type of data the API contains, and what features we'll be able to utilize.\n\nThis notebook collects and extracts structured game data from the Steam Store API and saves it to disk.\n\n"

In [30]:
import numpy as np
import pandas as pd
import os
from dotenv import load_dotenv

load_dotenv()

STEAM_API_KEY = os.getenv("STEAM_API_KEY")

print(STEAM_API_KEY[:5])


2C912


In [31]:
""" API Strategy
Initial data will be pulled from Steam Store endpoints that provide game metadata.
The Steam Web API key has been successfully loaded and will be used if required for
player or user-level endpoints."""

' API Strategy\nInitial data will be pulled from Steam Store endpoints that provide game metadata.\nThe Steam Web API key has been successfully loaded and will be used if required for\nplayer or user-level endpoints.'

In [40]:
import requests
import json

url = "https://store.steampowered.com/api/appdetails"
params = {
    "appids": 570,   # Dota 2
    "cc": "us",
    "l": "en"
}

response = requests.get(url, params=params)
data = response.json()

type(data)


dict

In [None]:
data.keys()

dict_keys(['570'])

In [None]:
app_id = "570"
app_data = data[app_id]

app_data.keys()

dict_keys(['success', 'data'])

In [None]:
game_data = app_data["data"]

In [None]:
type(game_data)

dict

In [None]:
game_data.keys()

dict_keys(['type', 'name', 'steam_appid', 'required_age', 'is_free', 'dlc', 'detailed_description', 'about_the_game', 'short_description', 'supported_languages', 'reviews', 'header_image', 'capsule_image', 'capsule_imagev5', 'website', 'pc_requirements', 'mac_requirements', 'linux_requirements', 'developers', 'publishers', 'packages', 'package_groups', 'platforms', 'metacritic', 'categories', 'genres', 'screenshots', 'movies', 'recommendations', 'release_date', 'support_info', 'background', 'background_raw', 'content_descriptors', 'ratings'])

In [None]:
#Features we will use from above: name; is_free; release_date; recommendations; metacritic_score; genre_list

game_data["name"]

'Dota 2'

In [None]:
game_data["is_free"]

True

In [None]:
game_data["release_date"]

{'coming_soon': False, 'date': 'Jul 9, 2013'}

In [None]:
game_data["release_date"]["date"]

'Jul 9, 2013'

In [None]:
game_data["recommendations"]

{'total': 14372}

In [None]:
game_data.get("recommendations")

{'total': 14372}

In [None]:
game_data["recommendations"]["total"]

14372

In [None]:
#we use .get because it handles key errors differently. If this feature was empty it would return None instead of keyerror the other way
game_data.get("metacritic")

{'score': 90,
 'url': 'https://www.metacritic.com/game/pc/dota-2?ftag=MCD-06-10aaa1f'}

In [None]:
game_data["genres"]

[{'id': '1', 'description': 'Action'},
 {'id': '2', 'description': 'Strategy'},
 {'id': '37', 'description': 'Free To Play'}]

In [None]:
# We do not want a string as our date in release_year so we will convert that to get only the year

release_date_str = game_data["release_date"]["date"]
release_year = int(release_date_str.split(",")[-1].strip())

print(release_year)

2013


In [None]:
dota_features = {
    "name": game_data["name"],
    "is_free": game_data["is_free"],
    "release_year": release_year,
    "recommendations": game_data["recommendations"]["total"],
    "metacritic_score": game_data["metacritic"]["score"],
    "genres": [g["description"] for g in game_data["genres"]]
}

In [None]:
for k, v in dota_features.items():
    print(f"{k}: {v}")


name: Dota 2
is_free: True
release_year: 2013
recommendations: 14372
metacritic_score: 90
genres: ['Action', 'Strategy', 'Free To Play']


In [None]:
"""
Now that we have parciced with Dota 2 to see how we can extract the data that we need and convert it to a useable format, we will create a function that can do this for any game data moving forward
"""
def extract_game_features(game_data):
    # Safely get release year
    release_date_str = game_data.get("release_date", {}).get("date")
    if release_date_str:
        try:
            release_year = int(release_date_str.split(",")[-1].strip())
        except:
            release_year = None

    # Safely get recommendations
    recommendations = game_data.get("recommendations", {}).get("total", None)

    # Safely get metacritic score
    metacritic_score = game_data.get("metacritic", {}).get("score", None)

    # Safely get genres
    genres = [g.get("description") for g in game_data.get("genres", [])]

    # Build features dict
    game_features = {
        "name": game_data.get("name"),
        "is_free": game_data.get("is_free"),
        "release_year": release_year,
        "recommendations": recommendations,
        "metacritic_score": metacritic_score,
        "genres": genres
    }
    
    return game_features

In [None]:
dota_features = extract_game_features(game_data)

print(dota_features)

{'name': 'Dota 2', 'is_free': True, 'release_year': 2013, 'recommendations': 14372, 'metacritic_score': 90, 'genres': ['Action', 'Strategy', 'Free To Play']}


In [None]:
"""
Now we understnad our pipeline for the moment is app id list -> fetch raw game data -> extract_game_features() -> store results -> dataframe

We will be extracting the top 10,000 games from here to have a large dataset
"""

import requests

#We are using the below URL to extract app_id because the steampowered url that contains a list of all games is no longer working
url = "https://games-popularity.com/swagger/api/top-sellers"
response = requests.get(url)
data = response.json()

print(type(data), len(data))
print(data.keys())

<class 'dict'> 2
dict_keys(['validTimeUtc', 'data'])


In [None]:
print(data["data"][:10])

[{'position': 1, 'gameName': 'Counter-Strike 2', 'steamId': '730'}, {'position': 2, 'gameName': 'Quarantine Zone: The Last Check', 'steamId': '3419520'}, {'position': 3, 'gameName': 'Steam Deck', 'steamId': '1675200'}, {'position': 4, 'gameName': 'ARC Raiders', 'steamId': '1808500'}, {'position': 5, 'gameName': 'PUBG: BATTLEGROUNDS', 'steamId': '578080'}, {'position': 6, 'gameName': 'Where Winds Meet', 'steamId': '3564740'}, {'position': 7, 'gameName': 'Dota 2', 'steamId': '570'}, {'position': 8, 'gameName': 'Warframe', 'steamId': '230410'}, {'position': 9, 'gameName': 'StarRupture', 'steamId': '1631270'}, {'position': 10, 'gameName': 'Apex Legends™', 'steamId': '1172470'}]


In [None]:
game_list = data["data"]

top_app_ids = [int(game["steamId"]) for game in game_list[:]]

print(top_app_ids[:5])

[730, 3419520, 1675200, 1808500, 578080]


In [None]:
"""
Since we know we have the top_app_ids list now, we will create a function that will fetch the game data from steam using the app id above
"""
def fetch_game_data(app_id, cc="us", lang="en"):
    url = "https://store.steampowered.com/api/appdetails"
    params = {
        "appids": app_id,
        "cc": cc,
        "l": lang
    }
    
    # Add headers to mimic a real browser
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/117.0.0.0 Safari/537.36"
    }
    
    try:
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status() #Riase http error for bad status
        result = response.json()
        
        #each response is a dictionary with the appid as top-level key
        app_data = result.get(str(app_id), {})  # top-level key is the app_id as string
        if app_data.get("success"):
            return app_data.get("data", None)
        else:
            return None
    
    except Exception as e:
        print(f"Error fetching app {app_id}: {e}")
        return None
    
print(fetch_game_data(570))

{'type': 'game', 'name': 'Dota 2', 'steam_appid': 570, 'required_age': 0, 'is_free': True, 'dlc': [1241930, 652720], 'detailed_description': "<strong>The most-played game on Steam.</strong><br>Every day, millions of players worldwide enter battle as one of over a hundred Dota heroes. And no matter if it's their 10th hour of play or 1,000th, there's always something new to discover. With regular updates that ensure a constant evolution of gameplay, features, and heroes, Dota 2 has truly taken on a life of its own.<br><br><strong>One Battlefield. Infinite Possibilities.</strong><br>When it comes to diversity of heroes, abilities, and powerful items, Dota boasts an endless array—no two games are the same. Any hero can fill multiple roles, and there's an abundance of items to help meet the needs of each game. Dota doesn't provide limitations on how to play, it empowers you to express your own style.<br><br><strong>All heroes are free.</strong><br>Competitive balance is Dota's crown jewel, 

In [36]:
#Next we will feed this into our pipeline
import time
import random

all_game_features = []

for app_id in top_app_ids:
    try:
        game_data = fetch_game_data(app_id)
        if game_data is None:
            continue
        
        features = extract_game_features(game_data)
        features["app_id"] = app_id  # store app_id explicitly
        all_game_features.append(features)
        
        time.sleep(random.uniform(1.0, 3.0))  # wait 0.5 - 3.0 seconds between requests
        
    except KeyError as ke:
        print(f"Skipping {app_id}: missing key {ke}")
    except Exception as e:
        print(f"Skipping {app_id}: {e}")
        
print(all_game_features[:5])

Skipping 1634050: cannot access local variable 'release_year' where it is not associated with a value
Skipping 242050: cannot access local variable 'release_year' where it is not associated with a value
Skipping 223850: cannot access local variable 'release_year' where it is not associated with a value
[{'name': 'Counter-Strike 2', 'is_free': True, 'release_year': 2012, 'recommendations': 4868989, 'metacritic_score': None, 'genres': ['Action', 'Free To Play'], 'app_id': 730}, {'name': 'Quarantine Zone: The Last Check', 'is_free': False, 'release_year': 2026, 'recommendations': 873, 'metacritic_score': None, 'genres': ['Indie', 'Simulation', 'Strategy'], 'app_id': 3419520}, {'name': 'Steam Deck', 'is_free': False, 'release_year': 2025, 'recommendations': None, 'metacritic_score': None, 'genres': [], 'app_id': 1675200}, {'name': 'ARC Raiders', 'is_free': False, 'release_year': 2025, 'recommendations': 215125, 'metacritic_score': None, 'genres': ['Action'], 'app_id': 1808500}, {'name': 'P

In [37]:
print(len(all_game_features))

1997


In [38]:
#Now we will turn this list all_game_features into a dataframe then export as a csv so we do not have to refetch the data every single time

import pandas as pd

# Convert list of dicts to DataFrame
df = pd.DataFrame(all_game_features)

# Quick check
print(df.head())
print(df.info())

                              name  is_free  release_year  recommendations  \
0                 Counter-Strike 2     True          2012        4868989.0   
1  Quarantine Zone: The Last Check    False          2026            873.0   
2                       Steam Deck    False          2025              NaN   
3                      ARC Raiders    False          2025         215125.0   
4              PUBG: BATTLEGROUNDS     True          2017        1754636.0   

   metacritic_score                                             genres  \
0               NaN                             [Action, Free To Play]   
1               NaN                      [Indie, Simulation, Strategy]   
2               NaN                                                 []   
3               NaN                                           [Action]   
4               NaN  [Action, Adventure, Massively Multiplayer, Fre...   

    app_id  
0      730  
1  3419520  
2  1675200  
3  1808500  
4   578080  
<class '

In [39]:
# Save CSV to disk
df.to_csv("/home/gagan/DataScienceProjects/SteamPopularityProject/data/steam_games_partial.csv", index=False)