In [21]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
import requests
import re
import json
import polars as pl
from helper_functions import save_data_to_json

In [15]:
def get_steam_gamesIDs():
    url = f"http://api.steampowered.com/ISteamApps/GetAppList/v2/"
    response = requests.get(url)
    data = response.json()
    data['applist']['apps'] = [{**app, 'appid': str(app['appid'])} for app in data['applist']['apps']]
    return data['applist']["apps"]  # This is a list of dictionaries with 'appid' and 'name'

In [16]:
games_list = get_steam_gamesIDs()
print(len(games_list))

198677


In [17]:
def clean_games_list(games_list):
    # List of keywords to exclude
    excluded = ['test', 'client', 'server', 'soundtrack', 'demo']
    
    # RegEx pattern for checking European characters
    european_chars_pattern = re.compile(r'^[a-zA-Z0-9 \-\'!@#$%^&*()_+={}[\]|\\:;"<>,.?/~`€£±§]+$')
    
    # Filter the list by removing dictionaries whose 'name' is empty,
    # contains 'test', 'client', 'server', 'soundtrack', or non-European characters.
    filtered_games = [
        game for game in games_list 
        if game['name'] and all(exclude not in game['name'].lower() for exclude in excluded)
        and european_chars_pattern.match(game['name'])
    ]
    return filtered_games

In [18]:
filtered_games = clean_games_list(games_list)
print(len(filtered_games))

152351


In [19]:
filtered_games = sorted(filtered_games, key=lambda x: x['appid'])

In [20]:
file_path = 'data/jsons/SteamGames.json'
#save_data_to_json(filtered_games, file_path)

'data/jsons/SteamGames.json'

In [9]:
def scrape_steam_topsellers():
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

    url = 'https://store.steampowered.com/search/?category1=998&filter=topsellers&ndl=1'

    driver.get(url)
    
    time.sleep(3)
    
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        
        time.sleep(0.5)
        
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    
    game_links = driver.find_elements(By.CSS_SELECTOR, 'a.search_result_row')
    
    appids = []
    for link in game_links:
        href = link.get_attribute('href')
        parts = href.split('/')
        if 'app' in parts:
            appid_index = parts.index('app') + 1
            appid = parts[appid_index]
            appids.append(appid)
    
    driver.quit()

    # Path to the JSON file
    steam_games_path = 'data/jsons/SteamGames.json'

    # Load the SteamGames.json file
    with open(steam_games_path, 'r') as file:
        steam_games = json.load(file)

    # Convert the list of dictionaries to a Polars DataFrame
    steam_games_df = pl.DataFrame(steam_games)
    
    # Convert the DataFrame to a dictionary for fast lookups
    appid_to_name = steam_games_df.select(['appid', 'name']).to_dict(as_series=False)
    
    # Create a lookup dictionary
    lookup = dict(zip(appid_to_name['appid'], appid_to_name['name']))
    
    # Create the output list
    topseller_games = [{"appid": appid, "name": lookup[appid]} for appid in appids if appid in lookup]    
    return topseller_games

In [10]:
# topseller_games = scrape_steam_topsellers()
# len(topseller_games)

5228

In [11]:
# file_path = 'data/jsons/SteamTopSellers.json'
# save_data_to_json(topseller_games, file_path)

'data/jsons/SteamTopSellers.json'

In [12]:
def scrape_steam_mostplayed():
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

    url = 'https://store.steampowered.com/charts/mostplayed'
    driver.get(url)
    time.sleep(2)  # Give extra time for any late content to load

    # Scroll to the bottom of the page to ensure all content is loaded
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(0.5)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    games_data = []

    # Find the only table on the page
    table = driver.find_element(By.TAG_NAME, 'table')

    # Iterate through the table rows
    rows = table.find_elements(By.TAG_NAME, 'tr')

    for row in rows[1:]:  # Skip the header row
        columns = row.find_elements(By.TAG_NAME, 'td')

        if len(columns) < 6:
            continue

        rank = columns[1].text
        app_link_element = columns[2].find_element(By.TAG_NAME, 'a')
        app_link = app_link_element.get_attribute('href')
        name_element = app_link_element.find_element(By.XPATH, ".//div[last()]")
        name = name_element.text
        max_players_today = columns[4].text

        app_id = app_link.split('/')[4]

        game_info = {
            'rank': rank,
            'appid': app_id,
            'name': name,
            'max_players_today': max_players_today
        }
        games_data.append(game_info)

    driver.quit()

    return games_data

In [13]:
# mostplayed_games = scrape_steam_mostplayed()
# len(mostplayed_games)

100

In [14]:
# file_path = 'data/jsons/SteamMostPlayed.json'
# save_data_to_json(mostplayed_games, file_path)

'data/jsons/SteamMostPlayed.json'

In [33]:
def scrape_steam_monthlytopgames():
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

    urls = ["https://store.steampowered.com/charts/topnewreleases/top_january_2024",
            "https://store.steampowered.com/charts/topnewreleases/top_february_2024",
            "https://store.steampowered.com/charts/topnewreleases/top_march_2024"]
    month = ["January", "February", "March"]
    
    appids = []
    
    for i in range(len(urls)):
        driver.get(urls[i])
        time.sleep(2)  # Give extra time for any late content to load
    
        # Scroll to the bottom of the page to ensure all content is loaded
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(0.5)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        for i in range(1,21):
            
            a_tag = driver.find_element(By.XPATH, xpath)
            href = a_tag.get_attribute('href')
            appid = href.split('/')[4]
            appids.append(appid)
    
    # Load the SteamGames.json file
    with open(steam_games_path, 'r') as file:
        steam_games = json.load(file)

    # Convert the list of dictionaries to a Polars DataFrame
    steam_games_df = pl.DataFrame(steam_games)
    
    # Convert the DataFrame to a dictionary for fast lookups
    appid_to_name = steam_games_df.select(['appid', 'name']).to_dict(as_series=False)
    
    # Create a lookup dictionary
    lookup = dict(zip(appid_to_name['appid'], appid_to_name['name']))
    
    # Create the output list
    games_data = [{"appid": appid, "name": lookup[appid], "month": month[i]} for appid in appids if appid in lookup] 

    driver.quit()

    return games_data

In [34]:
monthlytopgames = scrape_steam_monthlytopgames()
len(monthlytopgames)

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"/html/body/div[1]/div[7]/div[6]/div[3]/div[3]/div/div/div/div/div/div/div/div[1]/div[2]/div[3]/div/div[2]/div[19]/div/div/div/div/div[1]/div/a"}
  (Session info: chrome=124.0.6367.119); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x0078C113+48259]
	(No symbol) [0x0071CA41]
	(No symbol) [0x00610A17]
	(No symbol) [0x00650BED]
	(No symbol) [0x00650C9B]
	(No symbol) [0x0068BC12]
	(No symbol) [0x00670DE4]
	(No symbol) [0x00689B9C]
	(No symbol) [0x00670B36]
	(No symbol) [0x0064570D]
	(No symbol) [0x006462CD]
	GetHandleVerifier [0x00A465A3+2908435]
	GetHandleVerifier [0x00A83BBB+3159851]
	GetHandleVerifier [0x008250CB+674875]
	GetHandleVerifier [0x0082B28C+699900]
	(No symbol) [0x00726244]
	(No symbol) [0x00722298]
	(No symbol) [0x0072242C]
	(No symbol) [0x00714BB0]
	BaseThreadInitThunk [0x756B7BA9+25]
	RtlInitializeExceptionChain [0x7734BE3B+107]
	RtlClearBits [0x7734BDBF+191]


In [None]:
file_path = 'data/jsons/SteamMonthlyTopGames.json'
save_data_to_json(monthlytopgames, file_path)