In [1]:
import os
import sys

# Locate the "src" and "sql" folders that sit next to the current working
# directory (i.e. inside its parent directory).
curr_dir = os.getcwd()
project_root = os.path.dirname(curr_dir)
src_dir = os.path.join(project_root, "src")
sql_dir = os.path.join(project_root, "sql")

# Register both folders on the import path. Entries already present are
# skipped so that re-running this cell does not pile up duplicates.
for extra_path in (src_dir, sql_dir):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

In [2]:
import warnings
from ast import literal_eval

import pandas as pd
from db import get_db
from sqlalchemy import text

# Silence every warning category for the rest of the session, and let pandas
# render up to 100 columns when displaying a DataFrame.
warnings.simplefilter("ignore")
pd.options.display.max_columns = 100

### Data Extraction from SQL Database

In [3]:
def fetch_data(source: str) -> pd.DataFrame:
    """
    Fetches data from a specified source and returns it as a pandas DataFrame.

    Parameters:
    source (str): The name of the source file containing the SQL query.
        The file is looked up inside ``sql_dir`` and its whole content is
        executed as one query.

    Returns:
    pandas.DataFrame: The fetched data as a DataFrame, with one row per
        result row and columns named after the keys of the query result.

    Raises:
    OSError: If the SQL file cannot be opened. This happens before any
        database handle is acquired.

    """
    # Read the query first so that a missing or unreadable file never leaves
    # a database handle open.
    with open(os.path.join(sql_dir, source), "r") as f:
        query = text(f.read())

    db = get_db()
    try:
        result = db.execute(query)
        rows = result.fetchall()
        columns = result.keys()
        return pd.DataFrame(rows, columns=columns)
    finally:
        # Always release the handle, even when executing the query, fetching
        # the rows or building the DataFrame raises.
        db.close()

# Run the query stored in get_all_steamspy_data.sql and preview the first rows.
steamspy_data = fetch_data("get_all_steamspy_data.sql")
steamspy_data.head()

Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,ccu,languages,genre,tags
0,10,Counter-Strike,Valve,Valve,,231805,6061,0.0,"10,000,000 .. 20,000,000",0,0,0,0,999.0,999.0,0,12571,"English, French, German, Italian, Spanish - Sp...",Action,"{""FPS"": 4899, ""PvP"": 907, ""1980s"": 278, ""1990'..."
1,20,Team Fortress Classic,Valve,Valve,,7136,1087,0.0,"50,000 .. 100,000",0,0,0,0,499.0,499.0,0,84,"English, French, German, Italian, Spanish - Sp...",Action,"{""FPS"": 327, ""Mod"": 39, ""Co-op"": 98, ""Funny"": ..."
2,30,Day of Defeat,Valve,Valve,,6140,668,0.0,"5,000,000 .. 10,000,000",0,0,0,0,499.0,499.0,0,88,"English, French, German, Italian, Spanish - Spain",Action,"{""FPS"": 798, ""War"": 158, ""Co-op"": 36, ""Retro"":..."
3,40,Deathmatch Classic,Valve,Valve,,2457,518,0.0,"100,000 .. 200,000",0,0,0,0,499.0,499.0,0,4,"English, French, German, Italian, Spanish - Sp...",Action,"{""FPS"": 150, ""Gore"": 20, ""Co-op"": 16, ""Retro"":..."
4,50,Half-Life: Opposing Force,Gearbox Software,Valve,,21262,1086,0.0,"2,000,000 .. 5,000,000",0,0,0,0,499.0,499.0,0,109,"English, French, German, Korean",Action,"{""FPS"": 917, ""Gore"": 57, ""Co-op"": 43, ""Retro"":..."


Creating a copy of the `steamspy_data` dataset before starting the cleaning process.

In [4]:
# Independent copy: changes made to raw_steamspy_data do not affect steamspy_data.
raw_steamspy_data = steamspy_data.copy()

### Process Null values

Since the data is queried from SQL, some null values are read as strings (for example `''`, `'N/A'` or `'None'`) and need to be converted to real nulls.

In [6]:
def process_null(df, null_values=None):
    """
    Process null values in a DataFrame by replacing specific values with None.

    Args:
        df (pandas.DataFrame): The DataFrame to process. It is not modified.
        null_values (list, optional): Cell values to treat as null. Defaults
            to the placeholder strings '', 'none', 'null', 'N/a', 'N/A',
            'NA', 'None' and 'n/a'. Values are matched as whole cell
            contents (no regex or substring matching), so a whitespace-only
            string such as ' ' is only replaced when it is listed.

    Returns:
        pandas.DataFrame: A new DataFrame with null values replaced.

    """
    if null_values is None:
        null_values = ['', 'none', 'null', 'N/a', 'N/A', 'NA', 'None', 'n/a']

    tokens = list(null_values)
    if not tokens:
        # Nothing to replace; still hand back an independent frame so the
        # caller can mutate the result without touching the input.
        return df.copy()

    # replace() without inplace returns a new frame, so the input is left
    # untouched without a separate copy. value=None is passed explicitly so
    # that None itself is used as the replacement value.
    return df.replace(to_replace=tokens, value=None)

# Convert placeholder strings to real nulls, then count the nulls in each column.
raw_steam_data = process_null(raw_steamspy_data)
raw_steam_data.isnull().sum()

appid                  0
name                  17
developer            319
publisher            315
score_rank         74112
positive               0
negative               0
userscore              0
owners                 0
average_forever        0
average_2weeks         0
median_forever         0
median_2weeks          0
price                 29
initialprice          22
discount              22
ccu                    0
languages             94
genre                361
tags                 320
dtype: int64

In [None]:
# For every column of steamspy_data, print how many cells hold exactly one
# space character.
for column in steamspy_data.columns:
    single_space_rows = steamspy_data.loc[steamspy_data[column] == ' ']
    print(column, len(single_space_rows))