
## Filters for game scraping by theme

I tried to use the code of this guy
https://github.com/mmmmmm44/steam_scraping_api/blob/main/scrape_categories/README.md
Could not really test much but it looks like it might be what we need
I limited the results and added some timers because steam was already flagging me....



|   “filter” value    | Description              
| “” (empty)          | The default value of the “filter” field, to display all games on Steam. Usually used with field “sorted_by”=”Released_DESC” to display all games on Steam from the latest release date to the earliest. |
| “popularnew”        | Corresponds to the “New & Trending” tab                                                                                
| “topsellers”        | Corresponds to the “Top Sellers” tab                                                                                   
| “globaltopsellers”  | Corresponds to the “Global Top Sellers” button next to the “See more:” phrase in the bottom of the list of “TopSellers” tab                                                                            |
| “popularcomingsoon” | Corresponds to the “Popular Upcoming” 

In [7]:
# Imports and Helper functions

from datetime import datetime
import time
import requests
import pickle
from pathlib import Path
import re
import random
import pandas as pd

In [2]:
params = {
    "filter": "topsellers",
    "hidef2p": 1,
    "page": 1,                # to control the page of the returned result, similar to what "cursor" does in scraping reviews of a game
    "json": 1
}

def get_search_results(params):
    req_sr = requests.get(
        "https://store.steampowered.com/search/results/",
        params=params)
    
    if req_sr.status_code != 200:
        print_log(f"Failed to get search results: {req_sr.status_code}")
        return {"items": []}
    
    try:
        search_results = req_sr.json()
    except Exception as e:
        print_log(f"Failed to parse search results: {e}")
        return {"items": []}
    
    return search_results
    
search_results = get_search_results(params)

In [3]:


def print_log(*args):
    print(f"[{str(datetime.now())[:-3]}] ", end="")
    print(*args)
    
def get_search_results(params):
    req_sr = requests.get(
        "https://store.steampowered.com/search/results/",
        params=params)
    
    if req_sr.status_code != 200:
        print_log(f"Failed to get search results: {req_sr.status_code}")
        return {"items": []}
    
    try:
        search_results = req_sr.json()
    except Exception as e:
        print_log(f"Failed to parse search results: {e}")
        return {"items": []}
    
    return search_results
    
def get_app_details(appid):
    while(True):
        if appid == None:
            print_log("App Id is None.")
            return {}

        appdetails_req = requests.get(
            "https://store.steampowered.com/api/appdetails/",
            params={"appids": appid, "cc": "hk", "l": "english"})        # change the countrycode to the region you are staying with
        
        if appdetails_req.status_code == 200:
            appdetails = appdetails_req.json()
            appdetails = appdetails[str(appid)]
            print_log(f"App Id: {appid} - {appdetails['success']}")
            break

        elif appdetails_req.status_code == 429:
            print_log(f'Too many requests. Sleep for 10 sec')
            time.sleep(10)
            continue

        elif appdetails_req.status_code == 403:
            print_log(f'Forbidden to access. Sleep for 5 min.')
            time.sleep(5 * 60)
            continue

        else:
            print_log("ERROR: status code:", appdetails_req.status_code)
            print_log(f"Error in App Id: {appid}.")
            appdetails = {}
            break

    return appdetails

def safe_sleep():
    time.sleep(random.uniform(1.2, 3.5))

In [4]:

# Main code

"""session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
})"""

execute_datetime = datetime.now()
number_of_results = 100    # number of results to retrieve for each category


    
search_result_folder_path = Path(f"search_results_{execute_datetime.strftime('%Y%m%d')}")
if not search_result_folder_path.exists():
    search_result_folder_path.mkdir()
    
# a list of filters
params_list = [
    {"filter": "topsellers"},
    {"filter": "globaltopsellers"},
    {"filter": "popularnew"},
    {"filter": "popularcommingsoon"},
    {"filter": "", "specials": 1}
]
page_list = list(range(1, 5))

params_sr_default = {
    "filter": "topsellers",
    "hidef2p": 1,
    "page": 1,            # page is used to go through different parts of the ranking. Each page contains 25 results
    "json": 1
}

for update_param in params_list:

    items_all = []
    if update_param["filter"]:
        filename = f"{update_param['filter']}_{execute_datetime.strftime('%Y%m%d')}.pkl"
        category = update_param["filter"]
    else:
        filename = f"specials_{execute_datetime.strftime('%Y%m%d')}.pkl"
        category = "specials"

    if (search_result_folder_path / filename).exists():
        print_log(f"File {filename} exists. Skip.")
        continue

    for page_no in page_list:
        param = params_sr_default.copy()
        param.update(update_param)
        param["page"] = page_no

        search_results = get_search_results(param)
        safe_sleep()
        print_log(search_results)

        if not search_results:
            continue
        # Limit results to {number_of_results}
        items = search_results.get("items", [])
        items = items[:number_of_results]

        # proprocessing search results to retrieve the appid of the game
        for item in items:
            try:
                item["appid"] = re.search(r"steam/\w+/(\d+)", item["logo"]).group(1)      # the URL can be steam/bundles/{appid} or steam/apps/{appid}
            except Exception as e:
                print_log(f"Failed to extract appid: {e}")
                item["appid"] = None

        # request for game information using appid
        for item in items:
            appid = item["appid"]
            appdetails = get_app_details(appid)
            safe_sleep()
            item["appdetail"] = appdetails
            # Add the category to each item
            item["category"] = category

        items_all.extend(items)

    # save the search results
    with open(search_result_folder_path / filename, "wb") as f:
        pickle.dump(items_all, f)
    print_log(f"Saved {filename}")

[2025-12-08 08:34:44.998] {'desc': '', 'items': [{'name': 'Steam Deck', 'logo': 'https://shared.fastly.steamstatic.com/store_item_assets/steam/apps/1675200/capsule_sm_120.jpg?t=1763066602'}, {'name': 'ARC Raiders', 'logo': 'https://shared.fastly.steamstatic.com/store_item_assets/steam/apps/1808500/542a243d39bd7ed791359be9e96c82f419b81475/capsule_sm_120.jpg?t=1764755475'}, {'name': 'HELLDIVERS™ 2', 'logo': 'https://shared.fastly.steamstatic.com/store_item_assets/steam/apps/553850/3dfb18846a0d2db277c70d5a9e1ecb98e74668b8/capsule_sm_120.jpg?t=1763568660'}, {'name': 'Clair Obscur: Expedition 33', 'logo': 'https://shared.fastly.steamstatic.com/store_item_assets/steam/apps/1903340/001d4a5d81e4bb9055b789240e78e04ef6e6da38/capsule_sm_120.jpg?t=1762765069'}, {'name': 'Warhammer 40,000: Darktide', 'logo': 'https://shared.fastly.steamstatic.com/store_item_assets/steam/apps/1361210/c71526085580f71cb915a9f1e1ae94bab347bd79/capsule_sm_120_alt_assets_19.jpg?t=1765057874'}, {'name': 'NBA 2K26', 'logo'

In [5]:
# Load all pickled search result files and infer category from filename
data = []

pkl_files = sorted(Path('.').glob('search_results_*/**/*.pkl'))
if not pkl_files:
    print("No pickle files found under 'search_results_*'. If you want to load a specific file, update the path here.")
else:
    for p in pkl_files:
        try:
            with open(p, 'rb') as f:
                items = pickle.load(f)
        except Exception as e:
            print(f"Failed to load {p}: {e}")
            continue

        # Infer category from filename (prefix before first underscore)
        stem = p.stem  # e.g., 'globaltopsellers_20251120'
        if '_' in stem:
            category = stem.split('_')[0]
            if category == '':
                category = 'specials'
        else:
            category = stem or 'unknown'

        # Assign category to each item if not present
        for it in items:
            if not isinstance(it, dict):
                continue
            it.setdefault('category', category)

        data.extend(items)
        print(f"Loaded {len(items)} items from {p.name} as category '{category}'")

print(f"Total items loaded: {len(data)}")

Loaded 100 items from globaltopsellers_20251208.pkl as category 'globaltopsellers'
Loaded 100 items from popularcommingsoon_20251208.pkl as category 'popularcommingsoon'
Loaded 100 items from popularnew_20251208.pkl as category 'popularnew'
Loaded 100 items from specials_20251208.pkl as category 'specials'
Loaded 100 items from topsellers_20251208.pkl as category 'topsellers'
Total items loaded: 500


In [8]:
# Extract key information into a clean DataFrame
def create_game_dataframe(data):
    """Convert raw scraped data into a clean, readable DataFrame"""
    games = []
    
    for item in data:
        try:
            appid = item.get("appid")
            appdetail = item.get("appdetail", {})
            filter_category = item.get("category", "Unknown")
            
            # Extract data safely with defaults
            if appdetail and appdetail.get("success"):
                detail_data = appdetail.get("data", {})
                
                # Extract genres - handle both string and dict formats
                genres = detail_data.get("genres", [])
                if genres and isinstance(genres[0], dict):
                    genre_str = ", ".join([g.get("description", "") for g in genres[:3]])
                else:
                    genre_str = ", ".join(genres[:3]) if genres else "N/A"

                # Publishers
                pubs = detail_data.get("publishers", [])
                if isinstance(pubs, list):
                    publishers = ", ".join(pubs) if pubs else "N/A"
                else:
                    publishers = str(pubs) if pubs else "N/A"

                # Categories (distinct from the search filter category)
                cats = detail_data.get("categories", [])
                if cats and isinstance(cats[0], dict):
                    categories = ", ".join([c.get("description", "") for c in cats[:4]])
                else:
                    categories = ", ".join(cats[:4]) if cats else "N/A"

                # Recommendations (total)
                rec = detail_data.get("recommendations", {})
                recommendations = rec.get("total") if isinstance(rec, dict) else (rec or "N/A")
                if recommendations is None:
                    recommendations = "N/A"

                # Required age
                required_age = detail_data.get("required_age", "N/A")

                game_info = {
                    "Filter": filter_category,
                    "App ID": appid,
                    "Name": item.get("name", "N/A"),
                    "Publishers": publishers,
                    "Price": item.get("price", "N/A"),
                    "Release Date": detail_data.get("release_date", {}).get("date", "N/A"),
                    "Genre": genre_str if genre_str else "N/A",
                    "Categories": categories,
                    "Recommendations": recommendations,
                    "Required Age": required_age,
                    "Metacritic Score": detail_data.get("metacritic", {}).get("score", "N/A") if detail_data.get("metacritic") else "N/A",
                }
                games.append(game_info)
        except Exception as e:
            print(f"Error processing item: {e}")
            continue
    
    return pd.DataFrame(games)

# Create and display the DataFrame
df = create_game_dataframe(data)
print(f"\nTotal Games: {len(df)}\n")
print(df.to_string(index=False))


# Optional: Display summary statistics
print("\n" + "="*80)
print("SUMMARY STATISTICS")
print("="*80)
print(f"Games with Metacritic Scores: {df['Metacritic Score'].apply(lambda x: x != 'N/A').sum()}")

# Show breakdown by filter
print("\nBreakdown by Filter:")
filter_counts = df['Filter'].value_counts()
for filter_name, count in filter_counts.items():
    print(f"  {filter_name}: {count} games")

# Save output into a CSV file
output_filename = f"games_data_{execute_datetime.strftime('%Y%m%d')}.csv"
df.to_csv(output_filename, index=False)
print(f"\nData saved to {output_filename}")


Total Games: 495

            Filter  App ID                                                  Name                                                 Publishers Price Release Date                                     Genre                                                                               Categories Recommendations Required Age Metacritic Score
  globaltopsellers 1808500                                           ARC Raiders                                             Embark Studios   N/A 30 Oct, 2025                                    Action                                                     Multi-player, PvP, Online PvP, Co-op          162643            0              N/A
  globaltopsellers 1675200                                            Steam Deck                                                        N/A   N/A 17 Jan, 2025                                       N/A                                                                                      N/A             N/A    

In [9]:
# Inspect loaded data structure and available fields
from collections import Counter

top_keys = Counter()
detail_keys = Counter()
num_items = len(data)

for it in data:
    if not isinstance(it, dict):
        continue
    for k in it.keys():
        top_keys[k] += 1
    appdetail = it.get('appdetail', {})
    if isinstance(appdetail, dict) and appdetail.get('success'):
        detail = appdetail.get('data', {})
        for k in detail.keys():
            detail_keys[k] += 1

print(f"Total loaded items: {num_items}\n")
print("Top-level keys and how many items have them:")
for k, c in top_keys.most_common():
    print(f"  {k}: {c}")

print("\nNested 'data' keys (in appdetail['data']) and counts:")
for k, c in detail_keys.most_common():
    print(f"  {k}: {c}")

# Show example values for common interesting fields
example_fields = [
    'developers', 'publishers', 'short_description', 'categories',
    'price_overview', 'recommendations', 'metacritic', 'header_image',
    'screenshots', 'languages', 'required_age', 'controller_support'
]

print("\nExample values for commonly useful fields:")
for field in example_fields:
    printed = False
    for it in data:
        appd = it.get('appdetail', {})
        if not (isinstance(appd, dict) and appd.get('success')):
            continue
        d = appd.get('data', {})
        if field in d and d[field]:
            print(f"\n-- {field} (from app id {it.get('appid')}):")
            val = d[field]
            # If it's a long list/dict, print a short summary
            if isinstance(val, list):
                print(f"    list with {len(val)} items; sample: {val[:3]}")
            elif isinstance(val, dict):
                print(f"    dict keys: {list(val.keys())}")
            else:
                print(f"    {val}")
            printed = True
            break
    if not printed:
        print(f"\n-- {field}: (no example found)")

# Suggest additional fields that could be added to the table
suggested = [
    'developers', 'publishers', 'short_description', 'price_overview',
    'recommendations', 'categories', 'metacritic', 'required_age'
]
print("\nSuggested additional columns for the DataFrame:")
print(', '.join(suggested))


Total loaded items: 500

Top-level keys and how many items have them:
  name: 500
  logo: 500
  appid: 500
  appdetail: 500
  category: 500

Nested 'data' keys (in appdetail['data']) and counts:
  type: 495
  name: 495
  steam_appid: 495
  required_age: 495
  is_free: 495
  detailed_description: 495
  about_the_game: 495
  short_description: 495
  header_image: 495
  capsule_image: 495
  capsule_imagev5: 495
  website: 495
  pc_requirements: 495
  mac_requirements: 495
  linux_requirements: 495
  package_groups: 495
  platforms: 495
  screenshots: 495
  release_date: 495
  support_info: 495
  background: 495
  background_raw: 495
  content_descriptors: 495
  ratings: 495
  packages: 494
  supported_languages: 490
  developers: 490
  publishers: 490
  price_overview: 490
  categories: 490
  genres: 490
  movies: 489
  recommendations: 488
  legal_notice: 412
  achievements: 402
  dlc: 348
  controller_support: 271
  metacritic: 195
  reviews: 165
  drm_notice: 106
  demos: 96
  ext_user