# Steam Scraping

## Setup

In [None]:
# Installs
%pip install selenium
%pip install pandas
%pip install gdown

In [19]:
# Save Installs
# Write the output of `pip freeze` to requirements.txt in the current working directory.
# Note: pip freeze lists every package in the environment, not only the three installed above.
%pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [26]:
# Imports
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
import pandas as pd
import os
import gdown

In [46]:
# Globals
# Steam store search page filtered to top sellers; this is the page the scraper opens.
URL = "https://store.steampowered.com/search/?filter=topsellers"
# Number of unique games to collect before the scraping loop stops.
TARGET_GAMES = 200
# Seconds passed to time.sleep() after the initial page load and after each scroll.
pause = 1
# CSV path the game list is written to, downloaded to, and read back from.
path_game_ids = "data/game_ids.csv"
# Google Drive share link that gdown downloads into path_game_ids.
link_game_ids = "https://drive.google.com/file/d/13dJcwSq05OMW5E0h0uZRTfnx0Pncu1I0/view"

## Game ID Scraping

### Selenium

In [41]:
# Start a Chrome session with the "--headless=new" flag; `driver` is the handle the
# scraping cell uses to load the page, run scripts and query elements.
# NOTE(review): no visible cell calls driver.quit() — close the browser once scraping is finished.
options = webdriver.ChromeOptions()
options.add_argument("--headless=new")
driver = webdriver.Chrome(options=options)

In [42]:
# Open the search page and wait (up to 10 s) until the results container is in the DOM.
driver.get(URL)
time.sleep(pause)
WebDriverWait(driver, 10).until(expected_conditions.presence_of_element_located((By.ID, "search_resultsRows")))

games = []        # one {"Game ID", "Game Title"} record per unique app, in page order
app_ids = []      # app IDs in the same order as `games`
seen_ids = set()  # constant-time duplicate check; rows are re-read in full after every scroll
max_stalled_scrolls = 3  # consecutive passes without a new game before giving up
stalled_scrolls = 0

# Collect rows, scroll to trigger loading of more results, repeat until the target
# is met or scrolling stops producing new games.
while len(games) < TARGET_GAMES:
	count_before = len(games)

	# Each pass sees every row currently on the page, so already-recorded IDs are skipped.
	rows = driver.find_elements(By.CSS_SELECTOR, "#search_resultsRows a[data-ds-appid]")
	for row in rows:
		app_id = row.get_attribute("data-ds-appid")
		if app_id in seen_ids:
			continue

		seen_ids.add(app_id)
		app_ids.append(app_id)
		title_elements = row.find_elements(By.CSS_SELECTOR, ".title")
		# Rows without a ".title" child are kept with an empty title.
		title = title_elements[0].text if title_elements else ""
		games.append({"Game ID": app_id, "Game Title": title})

		if len(games) >= TARGET_GAMES:
			break

	# Exit Conditions
	if len(games) >= TARGET_GAMES:
		print(f"[LOG] Target {len(games)} Reached")
		break

	if len(games) == count_before:
		# Nothing new appeared (possibly a slow load, or an empty first pass); retry a
		# few times before concluding that the list is exhausted.
		stalled_scrolls += 1
		if stalled_scrolls >= max_stalled_scrolls:
			print(f"[LOG] Reached App Max ({len(games)})")
			break
	else:
		stalled_scrolls = 0

	# Load More
	driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
	time.sleep(pause)
	print("[LOG] Loading More,", len(games))



[LOG] Loading More, 50
[LOG] Loading More, 100
[LOG] Loading More, 150
[LOG] Target 200 Reached


In [43]:
# Report how many games were collected, then print a plain-text preview of the table.
print(f"{len(games)} Games")
print(pd.DataFrame(games))

200 Games
     Game ID                                  Game Title
0    1675200                                  Steam Deck
1    1808500                                 ARC Raiders
2        730                            Counter-Strike 2
3    2767030                               Marvel Rivals
4    1085660                                   Destiny 2
..       ...                                         ...
195  3710910  Warhammer 40,000: Darktide - Arbites Class
196  2564960                   I'm on Observation Duty 8
197   976730           Halo: The Master Chief Collection
198  1430190                             Killing Floor 3
199  2444750                             Shape of Dreams

[200 rows x 2 columns]


### Save Data

In [44]:
# Turn the scraped records into a table and prepend a 1-based "Rank" column
# that follows the order in which the games were collected.
df = pd.DataFrame(games)
df.insert(0, "Rank", list(range(1, len(df) + 1)))

In [45]:
# Create the CSV's parent directory, derived from path_game_ids so the directory and the
# file path cannot drift apart, then write the table without the pandas index column.
out_dir = os.path.dirname(path_game_ids)
if out_dir:  # a bare filename has no directory component; os.makedirs("") would raise
	os.makedirs(out_dir, exist_ok=True)
df.to_csv(path_game_ids, index=False)

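## Download Game IDs

Alternative to running the scraper: fetch a shared copy of the game ID list from Google Drive. The file is written to the same `data/game_ids.csv` path used in the "Save Data" step, so it replaces any freshly scraped file.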


In [None]:
# Make sure the destination directory exists (derived from path_game_ids rather than
# hard-coded), then download the shared CSV from Google Drive.
# fuzzy=True lets gdown extract the file ID from a ".../file/d/<id>/view" share link.
out_dir = os.path.dirname(path_game_ids)
if out_dir:  # a bare filename has no directory component; os.makedirs("") would raise
	os.makedirs(out_dir, exist_ok=True)
gdown.download(link_game_ids, output=path_game_ids, fuzzy=True)

Downloading...
From: https://drive.google.com/uc?id=13dJcwSq05OMW5E0h0uZRTfnx0Pncu1I0
To: d:\Professional\Education\Colleges & Universities\CGU\Semesters\2025 08\IST 332 - Natural Language Processing\Group Project\group-project\data\game_ids.csv
100%|██████████| 6.56k/6.56k [00:00<00:00, 7.04MB/s]


'data/game_ids.csv'

### Load Game IDs

In [52]:
# Read the game list back from disk and (re)assign "Rank" as the 1-based row position.
df = pd.read_csv(path_game_ids)
df["Rank"] = list(range(1, len(df) + 1))
df

Unnamed: 0,Rank,Game ID,Game Title
0,1,1675200,Steam Deck
1,2,1808500,ARC Raiders
2,3,730,Counter-Strike 2
3,4,2767030,Marvel Rivals
4,5,1085660,Destiny 2
...,...,...,...
195,196,3710910,"Warhammer 40,000: Darktide - Arbites Class"
196,197,2564960,I'm on Observation Duty 8
197,198,976730,Halo: The Master Chief Collection
198,199,1430190,Killing Floor 3


# New Section