# Exploratory Data Analysis

## Dataset Overview

In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import kagglehub
import shutil

In [2]:
# Download the dataset; `path` is the local directory kagglehub returns for it.
path = kagglehub.dataset_download("fronkongames/steam-games-dataset")
target_path = "../data/raw/"
os.makedirs(target_path, exist_ok=True)

file_path = os.path.join(target_path, "games.csv")

# Copy (rather than move) the CSV into the project under a stable name.
# Moving the file would empty kagglehub's download directory, so a later
# "Restart & Run All" could find no CSV there; copying keeps this cell idempotent.
source_csv = next((name for name in os.listdir(path) if name.endswith(".csv")), None)
if source_csv is not None:
    shutil.copy2(os.path.join(path, source_csv), file_path)
elif not os.path.exists(file_path):
    # No CSV in the download directory and no earlier copy to fall back on:
    # fail here instead of letting pd.read_csv fail later with a less clear error.
    raise FileNotFoundError(
        f"No .csv file found in {path!r} and {file_path!r} does not exist"
    )

In [3]:
# Column labels applied to games.csv, listed in the order they are assigned.
cols = [
    'AppID', 'Name', 'Release date', 'Estimated owners', 'Peak CCU',
    'Required age', 'Price', 'Discount', 'DLC count', 'About the game',
    'Supported languages', 'Full audio languages', 'Reviews',
    'Header image', 'Website', 'Support url', 'Support email', 'Windows',
    'Mac', 'Linux', 'Metacritic score', 'Metacritic url', 'User score',
    'Positive', 'Negative', 'Score rank', 'Achievements', 'Recommendations',
    'Notes', 'Average playtime forever', 'Average playtime two weeks',
    'Median playtime forever', 'Median playtime two weeks', 'Developers',
    'Publishers', 'Categories', 'Genres', 'Tags', 'Screenshots', 'Movies',
]

# Read games.csv: skip the file's first row, treat nothing as a header or an
# index column, then attach the labels defined above.
games = pd.read_csv(file_path, skiprows=1, header=None, index_col=False)
games.columns = cols
games.head()

Unnamed: 0,AppID,Name,Release date,Estimated owners,Peak CCU,Required age,Price,Discount,DLC count,About the game,...,Average playtime two weeks,Median playtime forever,Median playtime two weeks,Developers,Publishers,Categories,Genres,Tags,Screenshots,Movies
0,20200,Galactic Bowling,"Oct 21, 2008",0 - 20000,0,0,19.99,0,0,Galactic Bowling is an exaggerated and stylize...,...,0,0,0,Perpetual FX Creative,Perpetual FX Creative,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","Indie,Casual,Sports,Bowling",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
1,655370,Train Bandit,"Oct 12, 2017",0 - 20000,0,0,0.99,0,0,THE LAW!! Looks to be a showdown atop a train....,...,0,0,0,Rusty Moyher,Wild Rooster,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
2,1732930,Jolt Project,"Nov 17, 2021",0 - 20000,0,0,4.99,0,0,Jolt Project: The army now has a new robotics ...,...,0,0,0,Campião Games,Campião Games,Single-player,"Action,Adventure,Indie,Strategy",,https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
3,1355720,Henosis™,"Jul 23, 2020",0 - 20000,0,0,5.99,0,0,HENOSIS™ is a mysterious 2D Platform Puzzler w...,...,0,0,0,Odd Critter Games,Odd Critter Games,"Single-player,Full controller support","Adventure,Casual,Indie","2D Platformer,Atmospheric,Surreal,Mystery,Puzz...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
4,1139950,Two Weeks in Painland,"Feb 3, 2020",0 - 20000,0,0,0.0,0,0,ABOUT THE GAME Play as a hacker who has arrang...,...,0,0,0,Unusual Games,Unusual Games,"Single-player,Steam Achievements","Adventure,Indie","Indie,Adventure,Nudity,Violent,Sexual Content,...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...


In [4]:
# Parse 'Release date' into datetimes. format='mixed' infers the format per value,
# and errors='coerce' turns anything unparseable into NaT instead of raising.
parsed_release = pd.to_datetime(games['Release date'], errors='coerce', format='mixed')
games['Release date'] = parsed_release

In [5]:
# Inspect missing values: absolute count and share of rows, per column.
missing_values = games.isnull().sum()
missing_percentage = (missing_values / len(games)) * 100

# Most incomplete columns first; columns without any gaps are left out.
missing_df = (
    pd.DataFrame({'Missing Values': missing_values, 'Percentage': missing_percentage})
    .sort_values(by='Missing Values', ascending=False)
    .loc[lambda frame: frame["Missing Values"] > 0]
)
missing_df

Unnamed: 0,Missing Values,Percentage
Score rank,97366,99.95483
Metacritic url,93457,95.941895
Reviews,87285,89.60579
Notes,81937,84.115594
Website,54673,56.126681
Support url,51510,52.879581
Tags,29763,30.554358
Support email,16035,16.461349
Movies,7891,8.100811
Categories,5913,6.070219


In [None]:
# Heatmap of the null mask, transposed so each feature is a row and each sample a column.
fig, ax = plt.subplots(figsize=(16, 10))
sns.heatmap(games.isna().T, cmap='viridis', cbar=False, ax=ax)

ax.set_xlabel('Samples (Rows)')
ax.set_ylabel('Features (Columns)')
ax.set_title('Missing Values Heatmap')

plt.show()

In [19]:
# Columns excluded from the cleaned frame. 'Estimated owners' is dropped here and
# re-added below as two integer bound columns.
cols_to_drop = ['Score rank', 'Metacritic url', 'Reviews', 'Notes', 'Header image', 'Website', 
                'Support url', 'Discount', 'Support email', 'Screenshots', 'Movies', 'Estimated owners']

# Build a new frame (the raw `games` frame is left untouched) without the unused
# columns and without rows that have no name.
cleaned_games = games.drop(columns=cols_to_drop).dropna(subset=['Name'])

# Fill gaps only in text columns. A blanket fillna("Unknown") would also apply to
# 'Release date' (NaT wherever date parsing was coerced) and to any numeric gap,
# turning those columns into mixed-type object columns.
text_cols = cleaned_games.select_dtypes(exclude=['number', 'datetime', 'bool']).columns
cleaned_games = cleaned_games.fillna({col: "Unknown" for col in text_cols})

# Split the "min - max" owners range into two integer columns, using only the rows
# that survived the cleaning above.
owner_bounds = (
    games.loc[cleaned_games.index, 'Estimated owners']
    .str.replace(',', '')
    .str.split(' - ', expand=True)
    .astype(int)
)
cleaned_games[['Estimated owners min', 'Estimated owners max']] = owner_bounds

cleaned_games.head()

Unnamed: 0,AppID,Name,Release date,Peak CCU,Required age,Price,DLC count,About the game,Supported languages,Full audio languages,...,Average playtime two weeks,Median playtime forever,Median playtime two weeks,Developers,Publishers,Categories,Genres,Tags,Estimated owners min,Estimated owners max
0,20200,Galactic Bowling,2008-10-21,0,0,19.99,0,Galactic Bowling is an exaggerated and stylize...,['English'],[],...,0,0,0,Perpetual FX Creative,Perpetual FX Creative,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","Indie,Casual,Sports,Bowling",0,20000
1,655370,Train Bandit,2017-10-12,0,0,0.99,0,THE LAW!! Looks to be a showdown atop a train....,"['English', 'French', 'Italian', 'German', 'Sp...",[],...,0,0,0,Rusty Moyher,Wild Rooster,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc...",0,20000
2,1732930,Jolt Project,2021-11-17,0,0,4.99,0,Jolt Project: The army now has a new robotics ...,"['English', 'Portuguese - Brazil']",[],...,0,0,0,Campião Games,Campião Games,Single-player,"Action,Adventure,Indie,Strategy",Unknown,0,20000
3,1355720,Henosis™,2020-07-23,0,0,5.99,0,HENOSIS™ is a mysterious 2D Platform Puzzler w...,"['English', 'French', 'Italian', 'German', 'Sp...",[],...,0,0,0,Odd Critter Games,Odd Critter Games,"Single-player,Full controller support","Adventure,Casual,Indie","2D Platformer,Atmospheric,Surreal,Mystery,Puzz...",0,20000
4,1139950,Two Weeks in Painland,2020-02-03,0,0,0.0,0,ABOUT THE GAME Play as a hacker who has arrang...,"['English', 'Spanish - Spain']",[],...,0,0,0,Unusual Games,Unusual Games,"Single-player,Steam Achievements","Adventure,Indie","Indie,Adventure,Nudity,Violent,Sexual Content,...",0,20000
