# CS171 Final Project — Data Pre-Processing Notebook

This notebook outlines all steps involved in obtaining, organizing, and preparing our dataset of gameplay screenshots for use in model training.

This notebook focuses on:

- Importing and organizing raw gameplay screenshots collected via Python script
- Mapping each video game title to its corresponding genre category
- Sorting images into genre subfolders inside the train and test folders (the validate folder is filled later by the split step)
- Creating genre-specific subfolders and cleaning filenames
- Splitting the training data into an additional validation set
- Applying basic preprocessing transformations to standardize image size and format for model input

In [1]:
import os
import shutil
import random


# GAME to GENRE MAPPING
#
# Titles are listed once per genre and then inverted into the flat
# {game title: genre} lookup that the rest of the notebook reads.
# Title strings are matched against filename prefixes, so spacing and
# punctuation matter (several titles intentionally contain a double space).

_titles_by_genre = {
    'shooter': [
        'Apex Legends',
        'Counter-strike  Global Offensive',
        'Counter-strike 2',
        'Destiny 2',
        'Enter the Gungeon',
        'Half-Life 2',
        'Hell Let Loose',
        'Insurgency Sandstorm',
        'Metal Gear Solid V  The Phantom Pain',
        'Remnant  From the Ashes',
        'Sea of Thieves',
        'The Finals',
        'Titanfall2',
        "Tom Clancy's Rainbow Six  Siege",
        'Fortnite',
        'Quake III Arena',
    ],
    'racing': [
        'Assetto Corsa',
        'Automobilista 2',
        'BeamNG.drive',
        'Dirt Rally 2',
        'Dirt Rally',
        'Wrc',
        'F-Zero GX',
        'Mario Kart World',
    ],
    'puzzle': [
        'Bad Rats',
        'Dr Mario',
        'Puyo Puyo Tetris',
        'Satisfactory',
        'Lumines Arise',
        'Tetris Effect',
    ],
    'strategy': [
        'Cities Skylines',
        'Civilization VI',
        'Company of Heroes 2',
        'Crying Suns',
        'Hearts of Iron IV',
        'Modded Minecraft',
        'Europa Universalis IV',
        'StarCraft II',
    ],
    'rpg': [
        'Dead Cells',
        'Dragon Quest XI',
        'Elden Ring',
        'Elite Dangerous',
        'Final Fantasy VII',
        'Pokemon Green',
        'Warframe',
        'Skyrim Special Edition',
        # this key contains a fullwidth colon (U+FF1A), not an ASCII ':'
        "The Legend of Zelda： Link's Awakening",
    ],
}

game_map = {
    title: label
    for label, titles in _titles_by_genre.items()
    for title in titles
}

# alphabetical list of the distinct genre labels
genres = sorted(set(game_map.values()))


# PATHS

# raw screenshots, as collected
src_train = "input_images/train"
src_test = "input_images/test"

# genre-sorted output tree
dst_train = "input_images/Images/train"
dst_test = "input_images/Images/test"
dst_val = "input_images/Images/validate"  # new folder


# CREATE GENRE FOLDERS

# one subfolder per genre under each of the three output splits
for genre in genres:
    for split_root in (dst_train, dst_test, dst_val):
        os.makedirs(os.path.join(split_root, genre), exist_ok=True)

# SORTING

# filenames that could not be matched to a game in game_map
unknown_files = []

def sort_folder(src_folder, dst_folder, mapping=None, unknown=None):
    """Copy screenshots from ``src_folder`` into per-genre subfolders of ``dst_folder``.

    Each image filename is expected to look like ``<game name>;;<anything>``.
    The text before the first ``;;`` is looked up in ``mapping`` to find the
    genre, and the file is copied (not moved) to ``dst_folder/<genre>/``.

    Parameters
    ----------
    src_folder : str
        Folder containing the raw screenshots.
    dst_folder : str
        Root of the sorted output; genre subfolders are created as needed.
    mapping : dict, optional
        ``{game title: genre}`` lookup. Defaults to the module-level ``game_map``.
    unknown : list, optional
        List that receives the filenames which could not be classified.
        Defaults to the module-level ``unknown_files``.

    Returns
    -------
    None
        Results are the copied files plus the entries appended to ``unknown``.

    Raises
    ------
    FileNotFoundError
        If ``src_folder`` does not exist (raised by ``os.listdir``).
    """
    # Fall back to the notebook globals only when the caller did not supply
    # their own, so existing two-argument calls behave as before.
    if mapping is None:
        mapping = game_map
    if unknown is None:
        unknown = unknown_files

    for filename in os.listdir(src_folder):

        # skip non-image files
        if not filename.lower().endswith((".jpg", ".jpeg", ".png")):
            continue

        # check filename structure
        if ";;" not in filename:
            unknown.append(filename)
            continue

        game_name = filename.split(";;")[0].strip()

        # Titles containing a colon may be written with either the ASCII ':'
        # or the fullwidth '：' (U+FF1A), both in filenames and in the mapping.
        # Try the name exactly as written first, then each colon variant, so a
        # match is found whichever form the mapping key uses.
        candidates = (
            game_name,
            game_name.replace("：", ":"),
            game_name.replace(":", "："),
        )
        genre = next((mapping[name] for name in candidates if name in mapping), None)

        if genre is None:
            unknown.append(filename)
            continue

        src_path = os.path.join(src_folder, filename)
        genre_dir = os.path.join(dst_folder, genre)

        # do not depend on the genre folders having been created beforehand
        os.makedirs(genre_dir, exist_ok=True)

        shutil.copy(src_path, os.path.join(genre_dir, filename))


# RUN SORTING

# Fail early, with the working directory in the message, when the raw
# screenshot folders are missing (paths above are relative to the cwd).
for required_dir in (src_train, src_test):
    if not os.path.isdir(required_dir):
        raise FileNotFoundError(
            f"Source folder not found: {required_dir!r} "
            f"(current working directory: {os.getcwd()!r})"
        )

sort_folder(src_train, dst_train)
sort_folder(src_test, dst_test)

print("DONE SORTING!")
print(f"Unknown filenames needing manual fix: {len(unknown_files)}")
for f in unknown_files[:20]:
    print("  -", f)
if len(unknown_files) > 20:
    print("  ... more not shown ...")

# VALIDATION SPLIT (20%)

VAL_RATIO = 0.2
SPLIT_SEED = 171  # fixed so the train/validate split is the same on every run

# Private RNG: reproducible without touching the global `random` state.
split_rng = random.Random(SPLIT_SEED)

for genre in genres:
    genre_folder = os.path.join(dst_train, genre)
    val_folder = os.path.join(dst_val, genre)

    # sort_folder copies every source image into train on each run, so any
    # files left in validate by an earlier run would also exist in train.
    # Merge validate back into train first so the two sets never overlap.
    for f in os.listdir(val_folder):
        shutil.move(
            os.path.join(val_folder, f),
            os.path.join(genre_folder, f)
        )

    # os.listdir order is arbitrary; sort so the seeded shuffle is deterministic
    files = sorted(os.listdir(genre_folder))

    split_rng.shuffle(files)
    val_size = int(len(files) * VAL_RATIO)

    for f in files[:val_size]:
        shutil.move(
            os.path.join(genre_folder, f),
            os.path.join(val_folder, f)
        )

print("Validation set created!")


FileNotFoundError: [Errno 2] No such file or directory: 'input_images/train'

In [None]:
# Image preprocessing pipeline: resize -> tensor -> per-channel normalize
# NOTE(review): `transforms` is not imported in any cell shown here —
# presumably `torchvision.transforms`; confirm and import it before running.
preprocessing_steps = [
    transforms.Resize((100, 100)),  # every image becomes 100x100
    transforms.ToTensor(),
    # one mean/std entry per RGB channel
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
]
transform = transforms.Compose(preprocessing_steps)

The images used in this project were collected using a Python script written by one of the project partners, which automatically captured gameplay screenshots from a variety of video games. The raw images carried no genre label; the only metadata was the game name embedded at the start of each filename (the text before the `;;` separator). In this notebook, we describe how the screenshots were organized, labeled, cleaned, and split into training, testing, and validation sets so they could be used for genre classification.