Imports section:

In [56]:
import json
import pandas as pd
import os
import numpy as np
from sqlalchemy import create_engine
import ast
import numpy as np

Process each table of the stage schema (all credentials need to be set up in the `.env` file in the current directory).

In [2]:
# Database connection settings are read from the process environment.
# NOTE(review): nothing in this notebook loads the .env file itself, so the
# variables must already be exported to the kernel (e.g. by the IDE) - confirm.
_required_db_vars = ("DB_USERNAME", "DB_PASSWORD", "DB_HOST", "DB_PORT", "DB_NAME")
_missing_db_vars = [name for name in _required_db_vars if os.getenv(name) is None]
if _missing_db_vars:
    # Fail fast with a readable message: an unset variable would otherwise be
    # interpolated into the connection URL as the literal text "None".
    raise RuntimeError(
        "Missing database settings in the environment: " + ", ".join(_missing_db_vars)
    )

db_username = os.getenv("DB_USERNAME")
db_password = os.getenv("DB_PASSWORD")
db_host = os.getenv("DB_HOST")
db_port = os.getenv("DB_PORT")
db_name = os.getenv("DB_NAME")

# Shared SQLAlchemy engine used by every cell below.
engine = create_engine(
    f"postgresql://{db_username}:{db_password}@{db_host}:{db_port}/{db_name}"
)

# Stage-schema tables processed by the table-wide clean-up helpers.
table_names = ['all_steam', 'genres', 'mdn_play_time','metacritic_review','play_time_by_player','regions','time_to_beat']

Initial extraction into the stage area:

In [7]:
def cut_csv(params_json):
    """Trim a CSV file to a chosen set of columns and save the result.

    Args:
        params_json: JSON object (as a string) with the keys
            "csv_file_path" (path of the source CSV),
            "separator" (field separator of the source CSV) and
            "columns" (names of the columns to keep).

    Columns of the source file that are not listed in "columns" are dropped;
    listed names that do not exist in the file are ignored. Missing values are
    written as the text "null". The result is saved comma-separated, without
    the index, as ./Datasets/Processed/<source file name>, and the file name
    is printed.

    Returns:
        None.
    """
    params = json.loads(params_json)
    csv_file_path = params["csv_file_path"]
    wanted_columns = set(params["columns"])

    df = pd.read_csv(csv_file_path, sep=params["separator"], encoding="utf-8")

    # Everything the caller did not ask for is discarded in a single call.
    unwanted_columns = [col for col in df.columns if col not in wanted_columns]
    df = df.drop(columns=unwanted_columns).fillna("null")

    # Create the output directory on first use so a fresh checkout does not fail.
    output_dir = "./Datasets/Processed"
    os.makedirs(output_dir, exist_ok=True)

    file_name = os.path.basename(csv_file_path)
    df.to_csv(f"{output_dir}/{file_name}", index=False)
    print(file_name)

# One descriptor per source CSV: where it lives, how its fields are separated,
# and which columns survive the cut. The files are processed in this order.
csv_extraction_specs = [
    {
        "csv_file_path": "./Datasets/Selected/metacritic.csv",
        "columns": ["metascore", "name", "console"],
        "separator": ",",
    },
    {
        "csv_file_path": "./Datasets/Selected/requirements.csv",
        "columns": [
            "steam_appid",
            "pc_requirements",
            "mac_requirements",
            "linux_requirements",
            "minimum",
        ],
        "separator": ";",
    },
    {
        "csv_file_path": "./Datasets/Selected/all_steam.csv",
        "columns": [
            "game",
            "link",
            "release",
            "rating",
            "publisher",
            "developer",
            "detected_technologies",
            "all_time_peak",
            "all_time_peak_date",
        ],
        "separator": ",",
    },
    {
        "csv_file_path": "./Datasets/Selected/regions.csv",
        "columns": [
            "Name",
            "Platform",
            "Year_of_Release",
            "Genre",
            "Publisher",
            "NA_players",
            "EU_players",
            "JP_players",
            "Other_players",
            "Global_players",
            "Developer",
            "Rating",
        ],
        "separator": ",",
    },
    {
        "csv_file_path": "./Datasets/Selected/time_to_beat.csv",
        "columns": [
            "achievements",
            "description",
            "developers",
            "gfq_difficulty",
            "gfq_rating",
            "grnk_score",
            "hltb_complete",
            "hltb_single",
            "igdb_score",
            "igdb_single",
            "igdb_uscore",
            "languages",
            "name",
            "platforms",
            "published_hltb",
            "publishers",
            "tags",
            "voiceovers"
        ],
        "separator": ",",
    },
]

# cut_csv expects its parameters as a JSON string, so each descriptor is serialized.
for extraction_spec in csv_extraction_specs:
    cut_csv(json.dumps(extraction_spec))

Convert all game names to a common format:

In [9]:
def reduceToCommon(col_name):
    """Normalize the text of `col_name` in every stage table listed in `table_names`.

    Each table is read from the `stage` schema, the column is cleaned and the
    whole table is written back (replacing the existing one, without the index).

    Clean-up, in order:
      1. punctuation that is not both preceded and followed by a digit becomes a space;
      2. any punctuation still left (i.e. between two digits) is removed;
      3. runs of underscores become a single space;
      4. runs of whitespace collapse to a single space;
      5. the text is lower-cased and stripped of surrounding whitespace.
    """
    # (pattern, replacement) pairs, applied one after another.
    cleanup_steps = (
        (r'(?<!\d)[^\w\s]|[^\w\s](?!\d)', ' '),
        (r'[^\w\s]', ''),
        (r'\_+', ' '),
        (r'\s+', ' '),
    )

    for stage_table in table_names:
        frame = pd.read_sql_table(stage_table, engine, schema='stage')

        cleaned = frame[col_name]
        for pattern, replacement in cleanup_steps:
            cleaned = cleaned.replace(pattern, replacement, regex=True)

        frame[col_name] = cleaned.str.lower().str.strip()
        frame.to_sql(stage_table, schema='stage', con=engine, if_exists='replace', index=False)


reduceToCommon('game')


Replace all undefined values with actual nulls:

In [4]:
def reduceUndefined():
    """Turn placeholder values into real nulls in every table of `table_names`.

    Each table is read from the `stage` schema, cleaned and written back
    (replacing the existing table, without the index):
      * string cells are stripped of surrounding whitespace;
      * cells consisting only of question marks ("?", "???", ...) become NaN;
      * cells equal to the text "null" become NaN.
    """
    for table_name in table_names:
        df = pd.read_sql_table(table_name, engine, schema='stage')
        df = df.map(lambda x: x.strip() if isinstance(x, str) else x)
        # The pattern is anchored on purpose: with a non-string replacement,
        # pandas swaps the WHOLE cell whenever the regex is found anywhere in
        # it, so an unanchored r'\?+' would null every text containing a "?".
        df = df.replace(to_replace=r'^\?+$', value=np.nan, regex=True)
        df = df.replace(to_replace='null', value=np.nan)
        df.to_sql(table_name, schema='stage', con=engine, if_exists='replace', index=False)


reduceUndefined()

Convert dates to a single type:

In [45]:
def convertDate(df, col_name):
    """Convert `df[col_name]` to datetime values, in place, and return `df`.

    Every value is matched against the known layouts below; the first layout
    that fits wins. Values that fit none of them are handed to pandas' generic
    parser as a last resort, and whatever still cannot be parsed becomes NaT.
    A column that already has a datetime dtype is left untouched.

    Supported layouts: 'dd.mm.yyyy HH:MM:SS', 'dd.mm.yyyy',
    'yyyy-mm-dd HH:MM:SS', 'yyyy-mm-dd' and a bare year 'yyyy' (mapped to
    1 January of that year; years stored as numbers, e.g. 2006 or 2006.0,
    are accepted too).

    Args:
        df: DataFrame holding the column; modified in place.
        col_name: name of the column to convert.

    Returns:
        The same `df` object.
    """
    column = df[col_name]
    if pd.api.types.is_datetime64_any_dtype(column):
        return df

    # Work on a positional copy of the text so frames with a duplicated index
    # (e.g. after explode) are handled as well. A float year such as "2006.0"
    # is reduced to "2006" before matching.
    text = (
        column.astype(str)
        .str.strip()
        .str.replace(r'^(\d{4})\.0+$', r'\1', regex=True)
        .reset_index(drop=True)
    )

    # Most specific layouts first, so a later, looser match never overrides
    # an earlier one (only still-missing entries are filled).
    known_formats = (
        '%d.%m.%Y %H:%M:%S',
        '%d.%m.%Y',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d',
        '%Y',
    )
    parsed = pd.to_datetime(text, format=known_formats[0], errors='coerce')
    for date_format in known_formats[1:]:
        parsed = parsed.fillna(pd.to_datetime(text, format=date_format, errors='coerce'))

    # Last resort for non-null values in some other layout.
    leftover = parsed.isna() & column.notna().to_numpy()
    if leftover.any():
        parsed = parsed.fillna(pd.to_datetime(text[leftover], errors='coerce'))

    df[col_name] = parsed.to_numpy()
    return df

Supporting functions:

In [5]:
def printNulls(df):
    """Print, for every column of `df`, how many of its values are null and how many are not.

    For each column the value counts of its null mask (True = null) are
    printed, followed by an empty line.
    """
    null_mask = df.isnull()
    for column_name in list(null_mask.columns):
        mask_counts = null_mask[column_name].value_counts()
        print(mask_counts)
        print("")

def parseString(df,col_name,separator):
    """Split a delimited text column and give every piece its own row.

    The column of the passed frame is overwritten with the split lists (the
    input frame is modified), then a new frame with one row per list element
    and a fresh 0..n-1 index is returned.
    """
    pieces = df[col_name].str.split(separator)
    df[col_name] = pieces
    one_row_per_piece = df.explode(col_name)
    return one_row_per_piece.reset_index(drop=True)

def drop_nulls_overlim(df, target_columns, limit):
    """Drop, in place, the candidate columns whose share of nulls reaches `limit`.

    Args:
        df: DataFrame to modify.
        target_columns: candidate column names; names not present in `df` are skipped.
        limit: fraction of null values (0..1) at or above which a column is dropped.
    """
    present = [name for name in target_columns if name in df.columns]
    if not present:
        return

    null_share = df[present].isnull().mean()
    too_sparse = [name for name, share in null_share.items() if share >= limit]
    df.drop(columns=too_sparse, inplace=True)

def convertToCommonScore(df, col_name, initial_scale):
    """Rescale a score column to a 0-100 scale, rounded to one decimal, in place.

    Args:
        df: DataFrame holding the column; modified in place.
        col_name: name of the score column (values must be convertible to float).
        initial_scale: maximum of the scale the scores are currently on.
    """
    scores = df[col_name].astype(float)
    # Round through the Series API: writing into `df[col].values` only works
    # while that array is a writable view of the frame, which pandas does not
    # guarantee (it is read-only under Copy-on-Write).
    df[col_name] = (scores / initial_scale * 100).round(1)

def parseArraish(df, col_name):
    """Expand a column of list literals (e.g. "['a', 'b']") into one row per element.

    String cells are evaluated with `ast.literal_eval`; cells that are not
    strings (nulls, values that are already lists) are left as they are
    instead of raising. The column of the passed frame is overwritten with the
    evaluated values, and the exploded frame is returned. The index is NOT
    reset, so rows produced from the same source row share its index label.

    Raises:
        ValueError / SyntaxError: if a string cell is not a valid Python literal.
    """
    def _evaluate(cell):
        # literal_eval only accepts text; anything else is passed through.
        return ast.literal_eval(cell) if isinstance(cell, str) else cell

    df[col_name] = df[col_name].apply(_evaluate)
    df = df.explode(col_name)
    return df

def mapToCommon(df,col_name,map):
    """Replace the values of `col_name` by their canonical form, in place.

    Args:
        df: DataFrame holding the column; modified in place.
        col_name: name of the column to normalize.
        map: mapping (or function) from raw value to canonical value.
            Note that the parameter name shadows the built-in `map`.

    Values for which the mapping yields nothing (no entry, or a null result)
    keep their original value rather than being turned into NaN.
    """
    original = df[col_name]
    mapped = original.map(map)
    # Fall back to the untouched value wherever the mapping produced a null.
    df[col_name] = mapped.where(mapped.notna(), original)

def toType(df,type,*args):
    """Cast each of the named columns of `df` to `type`, in place.

    Args:
        df: DataFrame to modify.
        type: target dtype, passed to `Series.astype` (the name shadows the built-in).
        *args: names of the columns to convert, processed in the given order.
    """
    for column_name in args:
        converted = df[column_name].astype(type)
        df[column_name] = converted
        

Maps for reducing differences in identical columns:

In [90]:
# Raw platform tag -> common lower-case operating-system name.
os_map = dict(
    WIN='windows',
    MAC='mac',
    LNX='linux',
    Ubuntu='linux',
    WIN7='windows',
)

'Play time by player' dataset:

In [None]:
query = 'SELECT * FROM stage.play_time_by_player'
df = pd.read_sql_query(query, engine)

# 'action_type' is only present before the first run of this cell.
has_action_type = 'action_type' in df.columns

# Fix data types
df['game'] = df['game'].astype('object')
if has_action_type:
    df['action_type'] = df['action_type'].astype('object')
df['time'] = df['time'].astype('float64')

# Keep only the 'play' records; the action column is then constant and is dropped
if has_action_type:
    is_play = df['action_type'] == 'play'
    df = df[is_play]
    df.drop('action_type', axis=1, inplace=True)

# Aggregate: mean play time per game, rounded to one decimal
mean_time = df.groupby('game')['time'].mean()
df = mean_time.reset_index()
df['time'] = df['time'].round(1)

df.to_sql('play_time_by_player', schema='stage', con=engine, if_exists='replace', index=True, index_label='id')

'Genres' dataset:

In [None]:
query = 'SELECT * FROM stage.genres'
df = pd.read_sql_query(query, engine)

# Fix data types
for text_column in ('game', 'developer', 'genres'):
    df[text_column] = df[text_column].astype('object')

# Remove the summary column when the table still has it
df = df.drop(columns='summary', errors='ignore')

# One row per genre: the column holds list literals, which parseArraish expands
df = df.rename(columns={'genres':'genre'})
df = parseArraish(df,'genre')

# Missing developers become a one-element list literal before the expansion
df['developer'] = df['developer'].replace(to_replace=np.nan, value="['Unknown']", regex=True)
df = parseArraish(df,'developer')

# Rows without a release date are discarded; the rest is converted to datetime (in place)
df.dropna(subset='release_date',inplace=True)
convertDate(df,'release_date')

# The old id is replaced by a fresh running index, written back as 'id'
df.drop('id',axis=1,inplace=True)
df = df.reset_index(drop=True)

df.to_sql('genres', schema='stage', con=engine, if_exists='replace', index=True, index_label='id')

'Median play time' dataset:

In [None]:
query = 'SELECT * FROM stage.mdn_play_time'
df = pd.read_sql_query(query, engine)

# parseString requires the separator explicitly.
# NOTE(review): ';' is assumed to be the delimiter of the multi-value columns
# of this table - confirm against the stage data.
multi_value_separator = ';'
for multi_value_column in ('genres', 'platforms', 'developer', 'publisher'):
    df = parseString(df, multi_value_column, multi_value_separator)

# 'owners' holds a "low-high" range; it is replaced by the midpoint of the range
df[['start', 'end']] = df['owners'].str.split('-', expand=True)
df['owners'] = (df['start'].astype(int) + df['end'].astype(int)) / 2
df.drop(['start', 'end'], axis=1, inplace=True)
df['owners'] = df['owners'].astype(np.int64)

# Converts the column in place
convertDate(df,'release_date')

df = df.rename(columns={'platforms': 'os'})

# The old id is replaced by the running index, written back as 'id'
df.drop('id',axis=1,inplace=True)
df = df.rename(columns={'genres':'genre'})
df.to_sql('mdn_play_time', schema='stage', con=engine, if_exists='replace', index=True, index_label='id')

'Metacritic review' dataset:

In [None]:
query = 'SELECT * FROM stage.metacritic_review'
df = pd.read_sql_query(query, engine)

# Scores are stored as floats
toType(df, float, 'metascore')

# The old id is replaced by the running index, written back as 'id'
df.drop(columns='id', inplace=True)
df.to_sql(
    'metacritic_review',
    schema='stage',
    con=engine,
    if_exists='replace',
    index=True,
    index_label='id',
)

'All_steam' dataset:

In [None]:
query = 'SELECT * FROM stage.all_steam'
df = pd.read_sql_query(query, engine)

# Rows without a peak date are discarded
df = df.dropna(subset=['all_time_peak_date'])
df = df.rename(columns={'release':'release_date'})

# One row per detected technology. The index is rebuilt afterwards: explode
# repeats the source row's label, and the index is what gets written as 'id'.
df['technologies'] = df['technologies'].str.split('; ')
df = df.explode('technologies').reset_index(drop=True)

# "<type>.<name>" -> two columns. Only the first dot separates them (n=1), so
# names that contain dots stay intact; reindex guarantees both columns exist
# even when no value contains a dot.
technology_parts = (
    df['technologies'].str.split('.', n=1, expand=True).reindex(columns=[0, 1])
)
df['technology_type'] = technology_parts[0]
df['technology'] = technology_parts[1]
df.drop('technologies',axis=1,inplace=True)

df['rating'] = df['rating'].astype(float)
# Both conversions modify the frame in place
convertDate(df,'release_date')
convertDate(df,'all_time_peak_date')
df['all_time_peak'] = df['all_time_peak'].astype(int)

# The old id is replaced by the running index, written back as 'id'
df.drop('id',axis=1,inplace=True)
df.to_sql('all_steam', schema='stage', con=engine, if_exists='replace', index=True, index_label='id')

'Regions' dataset:

In [None]:
query = 'SELECT * FROM stage.regions'
df = pd.read_sql_query(query, engine)

# Sparse optional columns are dropped when 30% or more of their values are null
columns_to_check = ['rating', 'developer']
drop_nulls_overlim(df,columns_to_check,0.3)

# Rows lacking any of the mandatory fields are discarded
df.dropna(subset=['game', 'publisher', 'year_of_release'], inplace=True)

toType(df,float,'na_players','jp_players','eu_players','other_players','global_players')

# Year goes through float first and then int, so it reaches convertDate as a whole number
toType(df,float,'year_of_release')
toType(df,int,'year_of_release')
# Converts the column in place
convertDate(df,'year_of_release')

# The old id is replaced by the index, written back as 'id'
df.drop('id',axis=1,inplace=True)
df = df.rename(columns={'year_of_release':'release_date'})
df.to_sql('regions', schema='stage', con=engine, if_exists='replace', index=True, index_label='id')

'Time to beat' dataset:

In [None]:
query = 'SELECT * FROM stage.time_to_beat'
df = pd.read_sql_query(query, engine)

# Sparse descriptive columns are dropped when 30% or more of their values are null
columns_to_check = ['description','tags','voiceovers','published_hltb_date']
drop_nulls_overlim(df,columns_to_check,0.3)

# Rows lacking any of the mandatory measures are discarded
df.dropna(subset='gfq_difficulty',inplace=True)
df.dropna(subset='hltb_single',inplace=True)
df.dropna(subset='hltb_complete',inplace=True)
df.dropna(subset='igdb_uscore',inplace=True)
df.dropna(subset='gfq_rating',inplace=True)
columns_to_check = ['grnk_score','igdb_score','igdb_single','gfq_rating','igdb_uscore']
drop_nulls_overlim(df,columns_to_check,0.3)

# The source CSV names this column 'achievements'; the misspelled name is only
# used when the stage table really has it, so neither the fill nor the casts
# below refer to a column that does not exist.
achievements_col = 'achievements' if 'achievements' in df.columns else 'achivements'
df.fillna({achievements_col: 0}, inplace=True)

# gfq_rating is rescaled from a 0-5 scale to 0-100
convertToCommonScore(df,'gfq_rating',5)

if 'platforms' in df.columns:
    df = df.rename(columns={'platforms':'os'})

df = parseString(df,'os',',')
mapToCommon(df,'os',os_map)

df = parseString(df,'languages',',')
if 'languages' in df.columns:
    df = df.rename(columns={'languages':'language'})

df = parseString(df,'developers',',')
if 'developers' in df.columns:
    df = df.rename(columns={'developers':'developer'})

df = parseString(df,'publishers',',')
if 'publishers' in df.columns:
    df = df.rename(columns={'publishers':'publisher'})

# Achievement counts go through float first and are then stored as integers
toType(df,float,'hltb_single','hltb_complete','gfq_rating','igdb_uscore',achievements_col)
toType(df,int,achievements_col)

# The old id is replaced by the running index, written back as 'id'
df.drop('id',axis=1,inplace=True)
df.to_sql('time_to_beat', schema='stage', con=engine, if_exists='replace', index=True, index_label='id')

In [None]:
#Execute all above