In [61]:
# Import required packages
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import json
import time

In [62]:
# Set up request
headers = {"User-Agent": "Mozilla/5.0"}
# NOTE(review): these cookies presumably pre-answer Steam's age check so that
# store pages are served directly -- confirm against the live site.
cookies = {
    "birthtime": "1004572800",
    "lastagecheckage": "1-November-2001"
}

# Seconds to wait for Steam before giving up; without a timeout a stalled
# connection would block the notebook indefinitely.
SEARCH_REQUEST_TIMEOUT = 10


def fetch_search_page(url):
    """Download a Steam search page, raising on network or HTTP errors.

    Args:
        url: Full URL of the search page to download.

    Returns:
        The `requests.Response` for the page.

    Raises:
        requests.RequestException: on timeout, connection failure, or a
            4xx/5xx status, so that an error page is never parsed as results.
    """
    response = requests.get(
        url, headers=headers, cookies=cookies, timeout=SEARCH_REQUEST_TIMEOUT
    )
    response.raise_for_status()
    return response


# Get soup from top sellers page
topsellers_search_url = "https://store.steampowered.com/search/?filter=topsellers"
topsellers_search_page = fetch_search_page(topsellers_search_url)
topsellers_soup = BeautifulSoup(topsellers_search_page.text, "html.parser")

# Get soup from indie page
indie_search_url = "https://store.steampowered.com/search/?tags=492&supportedlang=english&filter=topsellers&ndl=1"
indie_search_page = fetch_search_page(indie_search_url)
indie_soup = BeautifulSoup(indie_search_page.text, "html.parser")


In [63]:
# Each search hit is rendered as an <a class="search_result_row"> element;
# collect those rows from both result pages.
topsellers_results, indie_results = (
    page_soup.select('a.search_result_row')
    for page_soup in (topsellers_soup, indie_soup)
)

# Currency amount such as "$9.99", "£5" or "$1,299.00" (thousands separators
# must be consumed, otherwise "$1,299.00" would be read as "$1").
_PRICE_PATTERN = re.compile(r'[\$€£]\d+(?:,\d{3})*(?:\.\d{2})?')
# The free-to-play label is matched case-insensitively so that variants such
# as "Free To Play" are normalised too.
_FREE_TO_PLAY_PATTERN = re.compile(r'free to play', re.IGNORECASE)


def extract_game_data(soup_results):
    """Turn Steam search result rows into plain dictionaries.

    Args:
        soup_results: Iterable of result-row elements (as returned by
            `soup.select('a.search_result_row')`).

    Returns:
        A list of dicts with the keys "title", "url" and "price".
        - "url" has its query string removed.
        - "title" is "Unknown" when the row has no `.title` element.
        - "price" is the last currency amount found in the price block (the
          discounted price when an original and a final price are both shown),
          "$0.00" for free-to-play titles, the raw price text when no amount
          is recognised, or "Unknown" when there is no price text at all.
        Rows without an `href` attribute are skipped because nothing
        downstream can be fetched for them.
    """
    game_data = []

    for result in soup_results:
        href = result.get('href')
        if not href:
            continue
        url = href.split('?')[0]

        title_node = result.select_one('.title')
        title = title_node.text.strip() if title_node is not None else "Unknown"

        price_block = result.select_one('.search_price, .search_price_discount_combined')
        if price_block is not None:
            # Extract text and clean it
            price_text = price_block.get_text(separator=' ', strip=True)
            price_text = _FREE_TO_PLAY_PATTERN.sub('$0.00', price_text)
            prices = _PRICE_PATTERN.findall(price_text)
            price = prices[-1] if prices else price_text.strip() or "Unknown"
        else:
            price = "Unknown"

        game_data.append({
            "title": title,
            "url": url,
            "price": price
        })

    return game_data

In [64]:
# Convert both sets of result rows into lists of {"title", "url", "price"} dicts.
topsellers_data, indie_data = (
    extract_game_data(rows) for rows in (topsellers_results, indie_results)
)


In [65]:
# For each game, extract the tags
def extract_tags_from_game(url, timeout=10):
    """Return the tag names embedded in a Steam store page.

    The page is searched for an inline script that calls
    `InitAppTagModal(<number>, [ ... ])`; the JSON array passed as the second
    argument is decoded and the "name" of each entry is returned.

    Args:
        url: Store page URL of the game.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the scraping loop.

    Returns:
        A list of tag names, or an empty list when the page could not be
        fetched, returned an HTTP error, or contains no parsable tag data.
        Failures are reported with `print` rather than raised, so one bad
        page does not abort a whole scraping run.
    """
    try:
        game_page = requests.get(url, headers=headers, cookies=cookies, timeout=timeout)
        # Report rate-limit/error responses instead of silently parsing them.
        game_page.raise_for_status()
        soup = BeautifulSoup(game_page.text, "html.parser")
        decoder = json.JSONDecoder()
        for script in soup.find_all("script"):
            script_text = script.text
            if "InitAppTagModal" not in script_text:
                continue
            # Stop right before the opening bracket of the tag array...
            match = re.search(r'InitAppTagModal\(\s*\d+,\s*(?=\[)', script_text)
            if match:
                # ...and let the JSON decoder find the matching end. Unlike a
                # non-greedy regex capture this copes with an empty array and
                # with "}]" occurring inside a tag name.
                tag_data, _ = decoder.raw_decode(script_text, match.end())
                return [tag["name"] for tag in tag_data]
    except Exception as e:
        print(f"Error processing {url}: {e}")
    return []

In [66]:
# Fetch the store page of every top seller and attach its tag list to the
# game's record, pausing one second between requests.
for game in topsellers_data:
    print(f"Processing {game['title']} - {game['url']}")
    game["tags"] = extract_tags_from_game(game["url"])
    time.sleep(1)

Processing RuneScape: Dragonwilds - https://store.steampowered.com/app/1374490/RuneScape_Dragonwilds/
Processing Last Epoch - https://store.steampowered.com/app/899770/Last_Epoch/
Processing Counter-Strike 2 - https://store.steampowered.com/app/730/CounterStrike_2/
Processing Schedule I - https://store.steampowered.com/app/3164500/Schedule_I/
Processing Steam Deck - https://store.steampowered.com/app/1675200/Steam_Deck/
Processing Last Epoch - Woven Legend Supporter Pack - https://store.steampowered.com/app/3512600/Last_Epoch__Woven_Legend_Supporter_Pack/
Processing Marvel Rivals - https://store.steampowered.com/app/2767030/Marvel_Rivals/
Processing R.E.P.O. - https://store.steampowered.com/app/3241660/REPO/
Processing Baldur's Gate 3 - https://store.steampowered.com/app/1086940/Baldurs_Gate_3/
Processing Blue Prince - https://store.steampowered.com/app/1569580/Blue_Prince/
Processing Tempest Rising - https://store.steampowered.com/app/1486920/Tempest_Rising/
Processing The Elder Scrol

In [69]:
# Fetch the store page of every top indie game and attach its tag list to the
# game's record, pausing one second between requests.
for game in indie_data:
    print(f"Processing {game['title']} - {game['url']}")
    game["tags"] = extract_tags_from_game(game["url"])
    time.sleep(1)

Processing Last Epoch - https://store.steampowered.com/app/899770/Last_Epoch/
Processing Last Epoch - Woven Legend Supporter Pack - https://store.steampowered.com/app/3512600/Last_Epoch__Woven_Legend_Supporter_Pack/
Processing Blue Prince - https://store.steampowered.com/app/1569580/Blue_Prince/
Processing Mandragora: Whispers of the Witch Tree - https://store.steampowered.com/app/1721060/Mandragora_Whispers_of_the_Witch_Tree/
Processing Rust - https://store.steampowered.com/app/252490/Rust/
Processing Last Epoch - Woven Vanquisher Supporter Pack - https://store.steampowered.com/app/3512590/Last_Epoch__Woven_Vanquisher_Supporter_Pack/
Processing Drive Beyond Horizons - https://store.steampowered.com/app/2625420/Drive_Beyond_Horizons/
Processing DAVE THE DIVER - https://store.steampowered.com/app/1868140/DAVE_THE_DIVER/
Processing Enshrouded - https://store.steampowered.com/app/1203620/Enshrouded/
Processing DREDGE - https://store.steampowered.com/app/1562430/DREDGE/
Processing Phasmoph

In [None]:
# Clean up and simplify data
# Every game in the indie list carries the 'Indie' tag by construction, so it
# adds no information there; strip it.
for game in indie_data:
    game['tags'] = [tag for tag in game['tags'] if tag != 'Indie']

# Write the unsimplified data to CSV files (the indie list has already had
# its 'Indie' tag stripped at this point).
topsellers_df = pd.DataFrame(topsellers_data)
indie_df = pd.DataFrame(indie_data)
for frame, csv_path in (
    (topsellers_df, "steam_top_sellers_data.csv"),
    (indie_df, "steam_indie_data.csv"),
):
    frame.to_csv(csv_path, index=False)

# Maps a specific Steam tag (key) to the broader tag it is folded into (value).
# Keys must match the tag names exactly as Steam spells them; a misspelled key
# never matches and silently leaves that tag unsimplified.
tag_simplification_map = {
    "4 Player Local": "Co-op",
    "Online Co-Op": "Co-op",
    "Local Co-Op": "Co-op",
    "Co-op Campaign": "Co-op",
    "Local Multiplayer": "Co-op",
    "Split Screen": "Co-op",
    "Party Game": "Co-op",
    "Lore-Rich": "Story Rich",
    "Action Roguelike": "Roguelike",
    "Roguelite": "Roguelike", # For simplicity I am classifying these as roguelikes (I know they're different)
    "Job Simulator": "Simulation",
    "Immersive Sim": "Simulation",
    "Farming Sim": "Simulation",
    "Dating Sim": "Simulation",
    "Automobile Sim": "Simulation",
    "Life Sim": "Simulation",
    "Colony Sim": "Simulation",
    "Action Adventure": "Action",
    "Action-Adventure": "Action",  # hyphenated spelling of the same tag
    "Card Battler": "Card Game",
    "Nudity": "Sexual Content",
    "NSFW": "Sexual Content",
    "Great Soundtrack": "Soundtrack",
    "Tactical RPG": "Tactical",
    "Board Game": "Tabletop",
    "War": "Military",
    "Wargame": "Military",
    "World War II": "Military",
    "Cold War": "Military",
    "Cyberpunk": "Sci-Fi",
    "Dark Fantasy": "Fantasy",
    "Old School": "Retro",
    "Kickstarter": "Crowdfunded",
    "Psychological Horror": "Horror",
    "Survival Horror": "Horror",
    "Thriller": "Horror",
    "Side Scroller": "Platformer",
    "2D Platformer": "Platformer",
    "Base Building": "Building"
    # Add more mappings as needed
}

# Simplify tags in the data
def simplify_tags(tags, mapping):
    """Replace each tag by its simplified form and drop duplicates.

    Args:
        tags: Iterable of tag names.
        mapping: Dict from a specific tag to the broader tag replacing it;
            tags without an entry are kept unchanged.

    Returns:
        A list of the simplified tags in first-seen order. Deduplicating via
        `dict.fromkeys` (instead of `set`) keeps the order stable, so the
        exported CSV files do not change from one run to the next.
    """
    return list(dict.fromkeys(mapping.get(tag, tag) for tag in tags))


# Apply the simplification to both datasets
for dataset in (topsellers_data, indie_data):
    for game in dataset:
        game['tags'] = simplify_tags(game['tags'], tag_simplification_map)

In [71]:
# Top sellers change frequently: re-run this cell to overwrite the simplified
# CSV files with freshly scraped data.

topsellers_df = pd.DataFrame(topsellers_data)
indie_df = pd.DataFrame(indie_data)
for frame, csv_path in (
    (topsellers_df, "steam_top_sellers_data_simple.csv"),
    (indie_df, "steam_indie_data_simple.csv"),
):
    frame.to_csv(csv_path, index=False)

In [None]:
import plotly.express as px
import plotly.io as pio
import ast

# Make "browser" the default Plotly renderer for every figure shown after this
# point (per Plotly's renderer docs, figures then open in a web browser tab
# rather than inline in the notebook).
pio.renderers.default = "browser"

def plot_tags(df, title, type):
    """Plot how often each tag occurs across the games in `df`.

    Args:
        df: DataFrame with a 'tags' column. Each cell is either a list of tag
            names or the string form of such a list (which is what a CSV
            round-trip produces). The frame is not modified.
        title: Title shown on the figure.
        type: 0 draws a bar chart of all tags; any other value draws a pie
            chart of the 15 most frequent tags. (The name shadows the builtin
            but is kept so existing keyword callers keep working.)

    Returns:
        None. The figure is displayed with `fig.show()`.
    """
    def _as_tag_list(value):
        # Only strings need parsing; real lists (and missing values) pass
        # through, so the function can be called repeatedly on the same frame.
        return ast.literal_eval(value) if isinstance(value, str) else value

    # Work on a derived Series instead of overwriting df['tags'] in place.
    tags_per_game = df['tags'].apply(_as_tag_list)

    # One row per (game, tag); value_counts ignores the NaN produced by games
    # without tags.
    tag_counts = tags_per_game.explode().value_counts().reset_index()
    tag_counts.columns = ['tag', 'count']

    # Plot
    if type == 0:
        fig = px.bar(
            tag_counts,
            x = 'tag',
            y = 'count',
            title = title,
            labels = {'tag': 'Tag', 'count': 'Count'},
            color_discrete_sequence = px.colors.qualitative.Plotly
        )
        # Tilt the tag labels so long names stay readable on the x axis.
        fig.update_layout(xaxis_tickangle=45)
    else:
        fig = px.pie(
            tag_counts.head(15),
            names = 'tag',
            values = 'count',
            title = title,
            labels = {'tag': 'Tag', 'count': 'Count'},
            color_discrete_sequence = px.colors.qualitative.Plotly
        )

    fig.show()

In [76]:
# Load the simplified datasets back from disk and chart the tag frequencies
# of each one as a bar chart.
topsellers_df = pd.read_csv("steam_top_sellers_data_simple.csv")
indie_df = pd.read_csv("steam_indie_data_simple.csv")

for frame, chart_title in (
    (topsellers_df, "Top Sellers Tags"),
    (indie_df, "Top Indie Tags"),
):
    plot_tags(frame, chart_title, 0)