In [1]:
import os
import sys

# Locate the ``src`` and ``sql`` folders that sit next to the notebook's working
# directory (i.e. under its parent) and append both to the import search path.
curr_dir = os.getcwd()
project_root = os.path.dirname(curr_dir)
src_dir, sql_dir = (os.path.join(project_root, sub) for sub in ("src", "sql"))
sys.path.extend([src_dir, sql_dir])

In [19]:
import itertools
import warnings
from ast import literal_eval
from collections import Counter

import pandas as pd
from db import get_db
from sqlalchemy import text

# Suppress every warning category and let pandas display up to 100 columns.
warnings.simplefilter("ignore")
pd.options.display.max_columns = 100

### Data Extraction from SQL Database

In [3]:
def fetch_data(source: str) -> pd.DataFrame:
    """
    Run the SQL query stored in a file and return the result as a DataFrame.

    Parameters:
    source (str): File name of the SQL query, resolved relative to ``sql_dir``.

    Returns:
    pandas.DataFrame: One row per fetched record; column names are taken from
    the keys reported by the query result.

    Raises:
    OSError: If the query file cannot be opened.
    Any exception raised while executing the query is propagated after the
    database handle has been closed.

    """
    # Read the query before opening a database handle so that a missing or
    # unreadable file does not leave a handle open.
    with open(os.path.join(sql_dir, source), "r") as f:
        query = text(f.read())

    db = get_db()
    try:
        result = db.execute(query)
        df = pd.DataFrame(result.fetchall(), columns=result.keys())
    finally:
        # Always release the handle, even when execution or fetching fails.
        db.close()

    return df

# Run the query stored in get_all_steamspy_data.sql and preview the first rows.
steamspy_data = fetch_data("get_all_steamspy_data.sql")
steamspy_data.head()

Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,ccu,languages,genre,tags
0,10,Counter-Strike,Valve,Valve,,231805,6061,0.0,"10,000,000 .. 20,000,000",0,0,0,0,999.0,999.0,0,12571,"English, French, German, Italian, Spanish - Sp...",Action,"{""FPS"": 4899, ""PvP"": 907, ""1980s"": 278, ""1990'..."
1,20,Team Fortress Classic,Valve,Valve,,7136,1087,0.0,"50,000 .. 100,000",0,0,0,0,499.0,499.0,0,84,"English, French, German, Italian, Spanish - Sp...",Action,"{""FPS"": 327, ""Mod"": 39, ""Co-op"": 98, ""Funny"": ..."
2,30,Day of Defeat,Valve,Valve,,6140,668,0.0,"5,000,000 .. 10,000,000",0,0,0,0,499.0,499.0,0,88,"English, French, German, Italian, Spanish - Spain",Action,"{""FPS"": 798, ""War"": 158, ""Co-op"": 36, ""Retro"":..."
3,40,Deathmatch Classic,Valve,Valve,,2457,518,0.0,"100,000 .. 200,000",0,0,0,0,499.0,499.0,0,4,"English, French, German, Italian, Spanish - Sp...",Action,"{""FPS"": 150, ""Gore"": 20, ""Co-op"": 16, ""Retro"":..."
4,50,Half-Life: Opposing Force,Gearbox Software,Valve,,21262,1086,0.0,"2,000,000 .. 5,000,000",0,0,0,0,499.0,499.0,0,109,"English, French, German, Korean",Action,"{""FPS"": 917, ""Gore"": 57, ""Co-op"": 43, ""Retro"":..."


Create a copy of the `steamspy_data` dataset before starting the cleaning process.

In [6]:
# Work on a copy so the frame returned by fetch_data stays untouched.
raw_steamspy_data = steamspy_data.copy()

### Process Null values

Since the data is queried from SQL, some null values are read as strings.

In [7]:
def process_null(df: pd.DataFrame) -> pd.DataFrame:
    """
    Process null values in a DataFrame by replacing specific values with None.

    The following exact (case-sensitive) strings are treated as null markers:
    '', 'none', 'null', 'N/a', 'N/A', 'NA', 'None', 'n/a'.

    Args:
        df (pandas.DataFrame): The DataFrame to process. It is not modified.

    Returns:
        pandas.DataFrame: A new DataFrame with the null markers replaced.

    """
    null_markers = ['', 'none', 'null', 'N/a', 'N/A', 'NA', 'None', 'n/a']

    # Map every marker to None explicitly. Passing a list together with
    # ``value=None`` is version-dependent in pandas (older releases ignore the
    # explicit None and pad-fill from the previous row instead), whereas the
    # dict form always substitutes a null.
    return df.replace(dict.fromkeys(null_markers))

# Replace the placeholder strings with nulls, then count nulls per column.
raw_steam_data = process_null(raw_steamspy_data)
raw_steam_data.isnull().sum()

appid                  0
name                  17
developer            319
publisher            315
score_rank         74112
positive               0
negative               0
userscore              0
owners                 0
average_forever        0
average_2weeks         0
median_forever         0
median_2weeks          0
price                 29
initialprice          22
discount              22
ccu                    0
languages             94
genre                361
tags                 320
dtype: int64

### Dropping Columns and Rows 

Some columns are not needed as these are in `steam_data`. Others are temporary and change on a weekly/daily basis.

In [8]:
def process_col_rows(df):
    """
    Remove rows without a game name and drop the columns that are not needed.

    Args:
        df (pandas.DataFrame): The DataFrame to process. It is not modified.

    Returns:
        pandas.DataFrame: A new DataFrame without rows whose ``name`` is null
        and without the discarded columns.

    """
    unwanted_columns = [
        'score_rank',  # too many missing values
        'userscore',  # too little variance (most have 0)
        # provided by Steam data
        'genre', 'developer', 'publisher', 'price', 'initialprice', 'discount',
        # not interested in temporally specific columns
        'average_2weeks', 'median_2weeks', 'ccu',
    ]

    # Drop missing games first, then the unwanted columns.
    named_games = df.dropna(subset=['name'])
    return named_games.drop(columns=unwanted_columns)

# Drop unnamed games and unneeded columns, then preview the first rows.
col_row_df = process_col_rows(raw_steam_data)
col_row_df.head()

Unnamed: 0,appid,name,positive,negative,owners,average_forever,median_forever,languages,tags
0,10,Counter-Strike,231805,6061,"10,000,000 .. 20,000,000",0,0,"English, French, German, Italian, Spanish - Sp...","{""FPS"": 4899, ""PvP"": 907, ""1980s"": 278, ""1990'..."
1,20,Team Fortress Classic,7136,1087,"50,000 .. 100,000",0,0,"English, French, German, Italian, Spanish - Sp...","{""FPS"": 327, ""Mod"": 39, ""Co-op"": 98, ""Funny"": ..."
2,30,Day of Defeat,6140,668,"5,000,000 .. 10,000,000",0,0,"English, French, German, Italian, Spanish - Spain","{""FPS"": 798, ""War"": 158, ""Co-op"": 36, ""Retro"":..."
3,40,Deathmatch Classic,2457,518,"100,000 .. 200,000",0,0,"English, French, German, Italian, Spanish - Sp...","{""FPS"": 150, ""Gore"": 20, ""Co-op"": 16, ""Retro"":..."
4,50,Half-Life: Opposing Force,21262,1086,"2,000,000 .. 5,000,000",0,0,"English, French, German, Korean","{""FPS"": 917, ""Gore"": 57, ""Co-op"": 43, ""Retro"":..."


### Processing Tags

In [15]:
tags = col_row_df['tags']
# Each non-empty string is parsed into a dict; anything else (None, NaN, '')
# is treated as "no tags" rather than being handed to literal_eval.
parsed_tags = tags.apply(lambda x: literal_eval(x) if isinstance(x, str) and x else {})

# Iterating a dict yields its keys, so chaining the dicts gives every tag name.
unique_tags = sorted(set(itertools.chain.from_iterable(parsed_tags)))

print('Number of unique tags:', len(unique_tags))

# Lay the sorted tags out in a grid that is 15 columns wide. The row count is
# derived from the number of tags so that no tag is cut off and no empty rows
# are produced when the tag count changes.
num_columns = 15
num_rows = -(-len(unique_tags) // num_columns)  # ceiling division

# Reshape the list into the desired DataFrame shape
ut = [unique_tags[i * num_columns:(i + 1) * num_columns] for i in range(num_rows)]

# Create the DataFrame
utdf = pd.DataFrame(ut)
utdf

Number of unique tags: 450


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1980s,1990's,2.5D,2D,2D Fighter,2D Platformer,360 Video,3D,3D Fighter,3D Platformer,3D Vision,4 Player Local,4X,6DOF,8-bit Music
1,ATV,Abstract,Action,Action RPG,Action RTS,Action Roguelike,Action-Adventure,Addictive,Adventure,Agriculture,Aliens,Alternate History,Ambient,America,Animation & Modeling
2,Anime,Arcade,Archery,Arena Shooter,Artificial Intelligence,Assassin,Asymmetric VR,Asynchronous Multiplayer,Atmospheric,Audio Production,Auto Battler,Automation,Automobile Sim,BMX,Base-Building
3,Baseball,Based On A Novel,Basketball,Batman,Battle Royale,Beat 'em up,Beautiful,Benchmark,Bikes,Birds,Blood,Board Game,Boomer Shooter,Boss Rush,Bowling
4,Boxing,Building,Bullet Hell,Bullet Time,CRPG,Capitalism,Card Battler,Card Game,Cartoon,Cartoony,Casual,Cats,Character Action Game,Character Customization,Chess
5,Choices Matter,Choose Your Own Adventure,Cinematic,City Builder,Class-Based,Classic,Clicker,Co-op,Co-op Campaign,Coding,Cold War,Collectathon,Colony Sim,Colorful,Combat
6,Combat Racing,Comedy,Comic Book,Competitive,Conspiracy,Controller,Conversation,Cooking,Cozy,Crafting,Creature Collector,Cricket,Crime,Crowdfunded,Cult Classic
7,Cute,Cyberpunk,Cycling,Dark,Dark Comedy,Dark Fantasy,Dark Humor,Dating Sim,Deckbuilding,Demons,Design & Illustration,Destruction,Detective,Difficult,Dinosaurs
8,Diplomacy,Documentary,Dog,Dragons,Drama,Driving,Dungeon Crawler,Dungeons & Dragons,Dwarf,Dynamic Narration,Dystopian,Early Access,Economy,Education,Electronic
9,Electronic Music,Elf,Emotional,Epic,Episodic,Escape Room,Experience,Experimental,Exploration,Extraction Shooter,FMV,FPS,Faith,Family Friendly,Fantasy


In [12]:
# Show the first remaining row by position; a label lookup (tags[0]) raises
# KeyError when the row labelled 0 was removed by the earlier dropna.
tags.iloc[0]

'{"FPS": 4899, "PvP": 907, "1980s": 278, "1990\'s": 1230, "Action": 5471, "Classic": 2823, "Shooter": 3397, "Assassin": 238, "Military": 648, "Strategy": 628, "Survival": 313, "Tactical": 1370, "e-sports": 1217, "Nostalgia": 181, "Old School": 806, "Team-Based": 1897, "Competitive": 1632, "Multiplayer": 3445, "First-Person": 1739, "Score Attack": 296}'

Since the tags are based on votes, it makes sense to keep top 3 or 5 for a game.

In [28]:
# For every tag, count the number of games whose tag dict contains it.
tagc = Counter(
    tag
    for tag_votes in parsed_tags
    for tag in tag_votes.keys()
)

tagc

Counter({'Indie': 44389,
         'Singleplayer': 41361,
         'Action': 33028,
         'Casual': 31877,
         'Adventure': 31171,
         '2D': 21835,
         'Strategy': 15898,
         'Simulation': 15861,
         'RPG': 13920,
         'Puzzle': 13871,
         '3D': 13606,
         'Atmospheric': 13514,
         'Colorful': 11529,
         'Pixel Graphics': 11273,
         'Story Rich': 11014,
         'Exploration': 10377,
         'Cute': 10155,
         'First-Person': 9820,
         'Early Access': 9728,
         'Arcade': 9671,
         'Fantasy': 9443,
         'Multiplayer': 9241,
         'Funny': 8209,
         'Shooter': 8018,
         'Retro': 7897,
         'Horror': 7755,
         'Platformer': 7551,
         'Family Friendly': 7535,
         'Action-Adventure': 7479,
         'Sci-fi': 7261,
         'Relaxing': 7231,
         'Anime': 7223,
         'Female Protagonist': 6407,
         'Third Person': 5950,
         'Top-Down': 5899,
         'Difficult': 

### Languages Supported

In [31]:
# Split each comma-separated language string into a list; missing values
# become an empty list.
langs = col_row_df['languages'].apply(lambda x: x.split(', ') if x else [])

# Tally how many games list each language.
langc = Counter(
    language
    for game_languages in langs
    for language in game_languages
)

langc

Counter({'English': 73143,
         'German': 19083,
         'Simplified Chinese': 18933,
         'French': 18460,
         'Russian': 17075,
         'Spanish - Spain': 16917,
         'Japanese': 15734,
         'Italian': 12631,
         'Korean': 9977,
         'Portuguese - Brazil': 9632,
         'Traditional Chinese': 9242,
         'Polish': 7353,
         'Portuguese - Portugal': 5632,
         'Turkish': 5315,
         'Spanish - Latin America': 4260,
         'Dutch': 4132,
         'Czech': 3073,
         'Ukrainian': 2951,
         'Swedish': 2768,
         'Hungarian': 2500,
         'Arabic': 2410,
         'Thai': 2405,
         'Danish': 2271,
         'Norwegian': 2202,
         'Finnish': 2171,
         'Romanian': 1934,
         'Greek': 1902,
         'Not supported': 1849,
         'Vietnamese': 1698,
         'Bulgarian': 1694,
         'Indonesian': 464,
         'Catalan': 146,
         'Hindi': 134,
         'Slovak': 107,
         'Malay': 65,
         'Heb

# Test

userscore
0.0      74112
100.0        4
95.0         3
51.0         2
80.0         2
Name: count, dtype: int64

In [20]:
# For every column, report how many rows hold the literal string '[]'.
for column in steamspy_data.columns:
    is_empty_list_string = steamspy_data[column] == '[]'
    print(column, steamspy_data[is_empty_list_string].shape[0])

appid 0
name 0
developer 0
publisher 0
score_rank 0
positive 0
negative 0
userscore 0
owners 0
average_forever 0
average_2weeks 0
median_forever 0
median_2weeks 0
price 0
initialprice 0
discount 0
ccu 0
languages 0
genre 0
tags 0
