<a href="https://colab.research.google.com/github/503N-project-RC/data-processing/blob/main/game_rec_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Preparation

In [None]:
pip install steamspypi

Collecting steamspypi
  Downloading steamspypi-1.1.1-py3-none-any.whl.metadata (5.3 kB)
Downloading steamspypi-1.1.1-py3-none-any.whl (11 kB)
Installing collected packages: steamspypi
Successfully installed steamspypi-1.1.1


In [None]:
import steamspypi
import time
import csv
import json
import random
import numpy as np

# Part 1: Create games library

## SteamSpy API

### Fetch top 1000 most owned games

In [None]:
# Request page 0 of the 'all' listing from SteamSpy.
data_request = {'request': 'all', 'page': '0'}

all_games = steamspypi.download(data_request)

In [None]:
# Sanity check: how many games the request above returned.
len(all_games)

1000

### Take subset

In [None]:
# Keep only the first 100 entries, in the dict's existing order.
all_games = {app_id: info for app_id, info in list(all_games.items())[:100]}
len(all_games)

100

In [None]:
# Peek at the first (app id, record) pair to see which fields each game carries.
print(next(iter(all_games.items())))

('570', {'appid': 570, 'name': 'Dota 2', 'developer': 'Valve', 'publisher': 'Valve', 'score_rank': '', 'positive': 2001040, 'negative': 452471, 'userscore': 0, 'owners': '200,000,000 .. 500,000,000', 'average_forever': 44263, 'average_2weeks': 1368, 'median_forever': 926, 'median_2weeks': 699, 'price': '0', 'initialprice': '0', 'discount': '0', 'ccu': 454370})


### Drop "Wallpaper Engine" outlier game

In [None]:
# Find the first entry named "Wallpaper Engine" (if there is one) and remove
# just that entry from the subset.
outlier_id = next(
    (app_id for app_id, info in all_games.items() if info['name'] == "Wallpaper Engine"),
    None,
)
if outlier_id is not None:
    del all_games[outlier_id]

### For each game in subset, fetch more details

In [None]:
# Per-game detail records keyed by app id (stored as a string); starts empty.
all_games_detailed = {}

In [None]:
# Base payload for per-game 'appdetails' requests; the app id is added per game.
data_request = {'request': 'appdetails'}

In [None]:
# Enrich every game in the subset with one `appdetails` request per game and
# keep only the fields used later: name, rating ratio, price, genres and tags.
for game_id in all_games:
  data_request["appid"] = game_id
  data = steamspypi.download(data_request)

  # `list(...)` yields the tag names when `tags` is a dict and also tolerates a
  # missing, empty or list-valued `tags` field instead of failing on `.keys()`.
  tag_list = list(data.get("tags") or [])

  # A game with zero negative reviews would otherwise raise ZeroDivisionError;
  # treat a missing or zero count as 1, matching the original default.
  positive_reviews = data.get("positive", 1)
  negative_reviews = data.get("negative", 1) or 1

  # Split the comma-separated genre string, dropping empty fragments so a game
  # without genres gets [] rather than [''].
  genre_list = [g.strip() for g in (data.get("genre") or "").split(",") if g.strip()]

  all_games_detailed[str(game_id)] = {
      "name": data.get("name", ""),
      "rating_ratio": round(positive_reviews / negative_reviews, 3),
      "price": data.get("price", "0"),
      "genre": genre_list,
      "tags": tag_list
  }

  # Pause between requests — presumably to respect the SteamSpy API rate limit;
  # confirm the allowed poll rate against the API documentation.
  time.sleep(1)

In [None]:
# Display the collected per-game details.
all_games_detailed

{'570': {'name': 'Dota 2',
  'rating_ratio': 4.422,
  'price': '0',
  'genre': ['Action', 'Strategy', 'Free To Play'],
  'tags': ['Free to Play',
   'MOBA',
   'Multiplayer',
   'Strategy',
   'e-sports',
   'Team-Based',
   'Competitive',
   'Action',
   'Online Co-Op',
   'PvP',
   'Difficult',
   'Co-op',
   'RTS',
   'RPG',
   'Tower Defense',
   'Fantasy',
   'Character Customization',
   'Replay Value',
   'Action RPG',
   'Simulation']},
 '730': {'name': 'Counter-Strike: Global Offensive',
  'rating_ratio': 6.583,
  'price': '0',
  'genre': ['Action', 'Free To Play'],
  'tags': ['FPS',
   'Shooter',
   'Multiplayer',
   'Competitive',
   'Action',
   'Team-Based',
   'e-sports',
   'Tactical',
   'First-Person',
   'PvP',
   'Online Co-Op',
   'Co-op',
   'Strategy',
   'Military',
   'War',
   'Difficult',
   'Trading',
   'Realistic',
   'Fast-Paced',
   'Moddable']},
 '578080': {'name': 'PUBG: BATTLEGROUNDS',
  'rating_ratio': 1.454,
  'price': '0',
  'genre': ['Action', 'Adv

## Export data to CSV

In [None]:
csv_filename = "game_library_data.csv"
header = ["game_id", "name", "rating_ratio", "price", "genre", "tags"]

# One row per game; the genre and tag lists are flattened into
# comma-separated strings so each fits in a single CSV field.
with open(csv_filename, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(header)
    writer.writerows(
        [
            app_id,
            details["name"],
            details["rating_ratio"],
            details["price"],
            ", ".join(details["genre"]),
            ", ".join(details["tags"]),
        ]
        for app_id, details in all_games_detailed.items()
    )

# Part 2: Generate synthetic player-owned game libraries

### Get unique list of genres

In [None]:
# Collect every distinct (whitespace-trimmed) genre label across all games.
unique_genres = {
    label.strip()
    for details in all_games_detailed.values()
    for label in details["genre"]
}

# 'Early Access' and 'Free To Play' are excluded from the genre set.
unique_genres -= {'Early Access', 'Free To Play'}

unique_genres

{'Action',
 'Adventure',
 'Casual',
 'Indie',
 'Massively Multiplayer',
 'RPG',
 'Racing',
 'Simulation',
 'Sports',
 'Strategy'}

### Create genre-to-games mapping

In [None]:
# Start with an empty bucket per genre, then file each game under every one of
# its genres that is part of `unique_genres`.
genre_game_map = {label: [] for label in unique_genres}

for app_id, details in all_games_detailed.items():
    entry = (app_id, details["name"])
    for raw_label in details["genre"]:
        bucket = genre_game_map.get(raw_label.strip())
        if bucket is not None:
            bucket.append(entry)

genre_game_map

{'Massively Multiplayer': [('578080', 'PUBG: BATTLEGROUNDS'),
  ('1063730', 'New World: Aeternum'),
  ('1599340', 'Lost Ark'),
  ('236390', 'War Thunder'),
  ('252490', 'Rust'),
  ('346110', 'ARK: Survival Evolved'),
  ('238960', 'Path of Exile'),
  ('1203220', 'NARAKA: BLADEPOINT'),
  ('1097150', 'Fall Guys: Ultimate Knockout'),
  ('444090', 'Paladins'),
  ('552990', 'World of Warships'),
  ('438100', 'VRChat'),
  ('227940', 'Heroes & Generals'),
  ('386360', 'SMITE'),
  ('444200', 'World of Tanks Blitz'),
  ('755790', 'Ring of Elysium'),
  ('301520', 'Robocraft'),
  ('417910', 'Street Warriors Online'),
  ('433850', 'Z1 Battle Royale'),
  ('304050', 'Trove'),
  ('221100', 'DayZ')],
 'RPG': [('1623730', 'Palworld'),
  ('1063730', 'New World: Aeternum'),
  ('2358720', 'Black Myth: Wukong'),
  ('1599340', 'Lost Ark'),
  ('1245620', 'ELDEN RING'),
  ('105600', 'Terraria'),
  ('1086940', "Baldur's Gate 3"),
  ('252490', 'Rust'),
  ('346110', 'ARK: Survival Evolved'),
  ('892970', 'Valheim

### Count the number of games per genre

In [None]:
# Number of games filed under each genre.
genre_game_counts = {label: len(entries) for label, entries in genre_game_map.items()}

genre_game_counts

{'Massively Multiplayer': 21,
 'RPG': 30,
 'Action': 77,
 'Sports': 6,
 'Indie': 35,
 'Strategy': 15,
 'Simulation': 24,
 'Adventure': 46,
 'Casual': 12,
 'Racing': 1}

### Calculate the number of players per focused genre, in proportion to the game counts above

In [None]:
TOTAL_PLAYERS = 50000

total_genre_instances = sum(genre_game_counts.values())

# Allocate players to each genre in proportion to its share of all genre
# instances, giving every genre at least one player.
num_players_per_genre = {
    genre: max(1, round((count / total_genre_instances) * TOTAL_PLAYERS))
    for genre, count in genre_game_counts.items()
}

# Rounding and the one-player floor can leave the total slightly off TOTAL_PLAYERS.
difference = TOTAL_PLAYERS - sum(num_players_per_genre.values())

# Spread the difference one player at a time, starting with the largest genres.
genres_sorted = sorted(genre_game_counts, key=genre_game_counts.get, reverse=True)

step = 1 if difference > 0 else -1
remaining = abs(difference)
position = 0
consecutive_skips = 0

# When removing players, never take a genre below one player: clamping back to
# 1 afterwards would silently break the exact total. The loop stops early if a
# full pass finds no genre that can be adjusted (including an empty genre list).
while remaining > 0 and consecutive_skips < len(genres_sorted):
    genre = genres_sorted[position % len(genres_sorted)]
    position += 1
    if step < 0 and num_players_per_genre[genre] <= 1:
        consecutive_skips += 1
        continue
    num_players_per_genre[genre] += step
    remaining -= 1
    consecutive_skips = 0

print("Final number of players per genre:", num_players_per_genre)
print("Total players generated:", sum(num_players_per_genre.values()))

Final number of players per genre: {'Massively Multiplayer': 3933, 'RPG': 5618, 'Action': 14420, 'Sports': 1124, 'Indie': 6554, 'Strategy': 2809, 'Simulation': 4494, 'Adventure': 8614, 'Casual': 2247, 'Racing': 187}
Total players generated: 50000


### Generate the synthetic dataset
Iterate through each genre, generating the number of players computed above, each with a library focused on that genre.

In [None]:
# Seed both random sources used below so re-running this cell regenerates the
# same dataset.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

min_games_per_player = 3
max_games_per_player = 10
total_playtime_range = (100, 500)  # min, max total playtime per player

GENRE_GAME_SHARE = 0.7       # target fraction of a library drawn from the focused genre
GENRE_PLAYTIME_SHARE = 0.95  # fraction of total playtime assigned to those games

synthetic_players = {}

for genre, game_list in genre_game_map.items():
    num_players = num_players_per_genre[genre]
    game_ids_in_genre = [game[0] for game in game_list if game[0] in all_games_detailed]  # make sure game exists

    # Player ids continue from wherever the previous genre stopped.
    first_player_id = len(synthetic_players)
    for player_id in range(first_player_id, first_player_id + num_players):
        synthetic_players[player_id] = {"focused_genre": genre, "games": {}}

        # calculate how many games each player will own
        num_games = random.randint(min_games_per_player, max_games_per_player)
        num_genre_games = int(num_games * GENRE_GAME_SHARE)
        num_other_games = num_games - num_genre_games

        # NOTE(review): if the genre has fewer games than requested, the player
        # ends up with fewer than `num_games` games (possibly fewer than
        # `min_games_per_player`) — confirm this is intended.
        owned_genre_games = random.sample(game_ids_in_genre, min(num_genre_games, len(game_ids_in_genre)))

        # The remaining picks come from every game not already chosen, so this
        # pool can still contain games of the focused genre; the genre share is
        # therefore a lower bound rather than an exact value.
        already_owned = set(owned_genre_games)
        candidate_pool = [g_id for g_id in all_games_detailed if g_id not in already_owned]
        other_games = random.sample(candidate_pool, min(num_other_games, len(candidate_pool)))

        total_playtime = random.randint(*total_playtime_range)
        genre_playtime = int(total_playtime * GENRE_PLAYTIME_SHARE)
        other_playtime = total_playtime - genre_playtime

        # Split each playtime budget randomly across the chosen games. The
        # focused-genre split now has the same empty-selection guard as the
        # "other" split.
        genre_hours = np.random.dirichlet(np.ones(len(owned_genre_games))) * genre_playtime if owned_genre_games else []
        other_hours = np.random.dirichlet(np.ones(len(other_games))) * other_playtime if other_games else []

        # Store a copy of each game's details plus its hours; every owned game
        # gets at least one hour.
        library = synthetic_players[player_id]["games"]
        for game_id, hours in zip(owned_genre_games, genre_hours):
            library[game_id] = {**all_games_detailed[game_id], "hours": max(1, int(hours))}

        for game_id, hours in zip(other_games, other_hours):
            library[game_id] = {**all_games_detailed[game_id], "hours": max(1, int(hours))}

# save dataset as json file
with open("synthetic_training_data.json", "w") as f:
    json.dump(synthetic_players, f, indent=4)

print("Synthetic dataset generated successfully!")

Synthetic dataset generated successfully!


### Generate a list of owned games and played hours per player to run manual checks

In [None]:
filepath = "synthetic_training_data.json"

with open(filepath, 'r') as f:
    data = json.load(f)

if data:
    # For every player keep the focused genre and a list of
    # {game name, hours played} records, ordered from most to least played.
    processed_data = {
        player_id: {
            "focused_genre": player_info["focused_genre"],
            "games": [
                {"game_name": game["name"], "hours_played": game["hours"]}
                for game in sorted(
                    player_info["games"].values(),
                    key=lambda game: game["hours"],
                    reverse=True,
                )
            ],
        }
        for player_id, player_info in data.items()
    }

    with open("processed_player_data.json", "w") as f:
        json.dump(processed_data, f, indent=4)

    print("Processed data saved to 'processed_player_data.json'")

Processed data saved to 'processed_player_data.json'
