In [27]:
import os
import sys

# Resolve the sibling "src" and "sql" directories relative to the current working
# directory (this assumes the notebook is started from a folder that sits next to them).
curr_dir = os.getcwd()
src_dir = os.path.join(os.path.dirname(curr_dir), "src")
sql_dir = os.path.join(os.path.dirname(curr_dir), "sql")

# Register each directory only once, so re-running this cell does not keep
# appending duplicate entries to sys.path.
for extra_dir in (src_dir, sql_dir):
    if extra_dir not in sys.path:
        sys.path.append(extra_dir)

In [28]:
import json
import warnings
from ast import literal_eval

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from data_cleaning import print_steam_links
from db import get_db
from settings import Path
from sqlalchemy import text

warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", 100)

### Data Extraction from SQL Database

In [29]:
def fetch_data(source: str) -> pd.DataFrame:
    """
    Fetches data from a specified source and returns it as a pandas DataFrame.

    Parameters:
    source (str): The name of the file inside `sql_dir` containing the SQL query.

    Returns:
    pandas.DataFrame: The fetched data as a DataFrame, one column per result key.

    Raises:
    OSError: If the SQL file cannot be opened.
    Any exception raised while executing the query is propagated after the
    database handle has been closed.

    """
    # Read the query first so that a missing or unreadable file never leaves
    # a database handle open.
    with open(os.path.join(sql_dir, source), "r") as f:
        query = text(f.read())

    # NOTE(review): get_db() presumably returns a SQLAlchemy session/connection;
    # only .execute() and .close() are relied upon here.
    db = get_db()
    try:
        result = db.execute(query)
        df = pd.DataFrame(result.fetchall(), columns=list(result.keys()))
    finally:
        # Always release the handle, even when the query fails.
        db.close()

    return df

steam_data = fetch_data("get_all_steam_data.sql")

In [30]:
steam_data.head()

Unnamed: 0,type,name,appid,required_age,is_free,controller_support,dlc,detailed_description,about_the_game,short_description,supported_languages,reviews,header_image,capsule_image,website,requirements,developers,publishers,price_overview,platform,metacritic,categories,genres,recommendations,achievements,release_date,coming_soon
0,game,Counter-Strike,10,0,0,,[],Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...,"English\n*\n, French\n*\n, German\n*\n, Italia...",,https://shared.akamai.steamstatic.com/store_it...,https://shared.akamai.steamstatic.com/store_it...,,"{""minimum"": ""\r\n\t\t\t<p><strong>Minimum:</st...","[""Valve""]","[""Valve""]","{""final"": 99, ""initial"": 999, ""currency"": ""USD...","{""mac"": true, ""linux"": true, ""windows"": true}",88,"[{""id"": 1, ""description"": ""Multi-player""}, {""i...","[{""id"": ""1"", ""description"": ""Action""}]",150423,0,"Nov 1, 2000",0
1,game,Team Fortress Classic,20,0,0,,[],One of the most popular online action games of...,One of the most popular online action games of...,One of the most popular online action games of...,"English, French, German, Italian, Spanish - Sp...",,https://shared.akamai.steamstatic.com/store_it...,https://shared.akamai.steamstatic.com/store_it...,,"{""minimum"": ""\r\n\t\t\t<p><strong>Minimum:</st...","[""Valve""]","[""Valve""]","{""final"": 74, ""initial"": 499, ""currency"": ""USD...","{""mac"": true, ""linux"": true, ""windows"": true}",0,"[{""id"": 1, ""description"": ""Multi-player""}, {""i...","[{""id"": ""1"", ""description"": ""Action""}]",6133,0,"Apr 1, 1999",0
2,game,Day of Defeat,30,0,0,,[],Enlist in an intense brand of Axis vs. Allied ...,Enlist in an intense brand of Axis vs. Allied ...,Enlist in an intense brand of Axis vs. Allied ...,"English, French, German, Italian, Spanish - Spain",,https://shared.akamai.steamstatic.com/store_it...,https://shared.akamai.steamstatic.com/store_it...,http://www.dayofdefeat.com/,"{""minimum"": ""\r\n\t\t\t<p><strong>Minimum:</st...","[""Valve""]","[""Valve""]","{""final"": 74, ""initial"": 499, ""currency"": ""USD...","{""mac"": true, ""linux"": true, ""windows"": true}",79,"[{""id"": 1, ""description"": ""Multi-player""}, {""i...","[{""id"": ""1"", ""description"": ""Action""}]",4074,0,"May 1, 2003",0
3,game,Deathmatch Classic,40,0,0,,[],Enjoy fast-paced multiplayer gaming with Death...,Enjoy fast-paced multiplayer gaming with Death...,Enjoy fast-paced multiplayer gaming with Death...,"English, French, German, Italian, Spanish - Sp...",,https://shared.akamai.steamstatic.com/store_it...,https://shared.akamai.steamstatic.com/store_it...,,"{""minimum"": ""\r\n\t\t\t<p><strong>Minimum:</st...","[""Valve""]","[""Valve""]","{""final"": 74, ""initial"": 499, ""currency"": ""USD...","{""mac"": true, ""linux"": true, ""windows"": true}",0,"[{""id"": 1, ""description"": ""Multi-player""}, {""i...","[{""id"": ""1"", ""description"": ""Action""}]",2149,0,"Jun 1, 2001",0
4,game,Half-Life: Opposing Force,50,0,0,,[],Return to the Black Mesa Research Facility as ...,Return to the Black Mesa Research Facility as ...,Return to the Black Mesa Research Facility as ...,"English, French, German, Korean",,https://shared.akamai.steamstatic.com/store_it...,https://shared.akamai.steamstatic.com/store_it...,,"{""minimum"": ""\r\n\t\t\t<p><strong>Minimum:</st...","[""Gearbox Software""]","[""Valve""]","{""final"": 74, ""initial"": 499, ""currency"": ""USD...","{""mac"": true, ""linux"": true, ""windows"": true}",0,"[{""id"": 2, ""description"": ""Single-player""}, {""...","[{""id"": ""1"", ""description"": ""Action""}]",19244,0,"Nov 1, 1999",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52934,game,Grocery Simulator,3032530,0,0,,[],"Realistic Store Management\nDesign, customize,...","Realistic Store Management\nDesign, customize,...",Experience the thrill of managing your own gro...,English\n*\n*\nlanguages with full audio support,,https://shared.akamai.steamstatic.com/store_it...,https://shared.akamai.steamstatic.com/store_it...,,"{""minimum"": ""<strong>Minimum:</strong><br><ul ...","[""Roman Savenok""]","[""Roman Savenok""]","{""final"": 689, ""initial"": 689, ""currency"": ""EU...","{""mac"": false, ""linux"": false, ""windows"": true}",0,"[{""id"": 2, ""description"": ""Single-player""}, {""...","[{""id"": ""1"", ""description"": ""Action""}, {""id"": ...",0,0,"22 Jun, 2024",0
52935,game,Quescaper,3033880,0,0,,[],You were thrown into a dungeon with the undead...,You were thrown into a dungeon with the undead...,You were thrown into a dungeon with the undead...,"English, Russian",,https://shared.akamai.steamstatic.com/store_it...,https://shared.akamai.steamstatic.com/store_it...,,"{""minimum"": ""<strong>Minimum:</strong><br><ul ...","[""Neki4 Electronics""]","[""Neki4 Electronics""]","{""final"": 199, ""initial"": 199, ""currency"": ""EU...","{""mac"": false, ""linux"": false, ""windows"": true}",0,"[{""id"": 2, ""description"": ""Single-player""}, {""...","[{""id"": ""1"", ""description"": ""Action""}, {""id"": ...",0,9,"19 Jun, 2024",0
52936,game,USD Clicker,3035710,0,0,,"[3037840, 3037850, 3037860, 3037870, 3037880, ...",USD Clicker\nis a simple clicker game where yo...,USD Clicker\nis a simple clicker game where yo...,The most simple clicker game with endless USD ...,English,,https://shared.akamai.steamstatic.com/store_it...,https://shared.akamai.steamstatic.com/store_it...,,"{""minimum"": ""<strong>Minimum:</strong><br><ul ...","[""RazDva Games""]","[""RazDva Games""]","{""final"": 79, ""initial"": 99, ""currency"": ""EUR""...","{""mac"": false, ""linux"": false, ""windows"": true}",0,"[{""id"": 2, ""description"": ""Single-player""}, {""...","[{""id"": ""4"", ""description"": ""Casual""}, {""id"": ...",0,30,"20 Jun, 2024",0
52937,game,Asterisk 2: Next -Now a Days-,3037470,0,0,,[],"Introduction\n""Asterisk 2: Next Now a Day"" is ...","Introduction\n""Asterisk 2: Next Now a Day"" is ...","""Asterisk 2: Next Now a Day"" is a JRPG created...",Simplified Chinese,,https://shared.akamai.steamstatic.com/store_it...,https://shared.akamai.steamstatic.com/store_it...,,"{""minimum"": ""<strong>Minimum:</strong><br><ul ...","[""暗de刺客""]","[""TVM Studio""]","{""final"": 424, ""initial"": 499, ""currency"": ""EU...","{""mac"": false, ""linux"": false, ""windows"": true}",0,"[{""id"": 2, ""description"": ""Single-player""}, {""...","[{""id"": ""25"", ""description"": ""Adventure""}, {""i...",0,0,"25 Jun, 2024",0


In [31]:
null_counts = steam_data.isnull().sum()
null_counts

type                        0
name                        0
appid                       0
required_age                0
is_free                     0
controller_support      40879
dlc                         0
detailed_description       36
about_the_game             37
short_description          31
supported_languages        70
reviews                 44879
header_image                0
capsule_image               0
website                     0
requirements                0
developers                  0
publishers                  0
price_overview              0
platform                    0
metacritic                  0
categories                  0
genres                      0
recommendations             0
achievements                0
release_date                0
coming_soon                 0
dtype: int64

Creating a copy of raw dataset before starting the cleaning process.

In [32]:
raw_steam_data = steam_data.copy()

### Processing Age

In [33]:
raw_steam_data['required_age'].value_counts()

required_age
0     52126
17      477
18      127
13       91
16       42
12       28
15       13
10        9
6         8
7         8
3         4
14        3
11        2
5         1
Name: count, dtype: int64

Reducing the number of categories that ages fall into by following the [PEGI age ratings](https://pegi.info) categories. For example, instead of comparing games rated as 5, 6, 7 or 8, we could compare games rated 5+ or 8+.

In [34]:
def process_age(df):
    """
    Process the age column in the given DataFrame by converting the age ratings to specific age groups.

    Each raw `required_age` value is mapped to the upper bound of the bin it falls into,
    e.g. 5, 6 and 7 all become 7, while 17 and 18 become 18. A value of 0 stays 0.
    Values outside the range 0-18 fall outside every bin and become NaN.

    Parameters:
    df (DataFrame): The DataFrame containing the age column. It is not modified.

    Returns:
    DataFrame: A copy of the DataFrame with the age column processed (categorical dtype).
    """
    # PEGI Age ratings: 3, 7, 12, 16, 18
    cut_points = [-1, 0, 3, 7, 12, 16, 18]
    categories = [0, 3, 7, 12, 16, 18]

    # Work on a copy, like the other process_* helpers, so the caller's frame keeps
    # its raw ages and the cell can be re-run safely.
    df = df.copy()
    df['required_age'] = pd.cut(df['required_age'], bins=cut_points, labels=categories)

    return df

age_df = process_age(raw_steam_data)
age_df['required_age'].value_counts().sort_index()

required_age
0     52126
3         4
7        17
12       39
16      149
18      604
Name: count, dtype: int64

### Processing Platforms

In [35]:
platforms_first_row = age_df['platform'].iloc[0]
print(type(platforms_first_row))
platforms_first_row

<class 'str'>


'{"mac": true, "linux": true, "windows": true}'

First task is to recognise the data as dictionaries rather than strings.

In [36]:
eval_first_row = json.loads(platforms_first_row)
print(type(eval_first_row))
print(eval_first_row)
eval_first_row['windows']

<class 'dict'>
{'mac': True, 'linux': True, 'windows': True}


True

Formatting the output. Let's keep things simple and return a string of supported platforms

In [37]:
';'.join(eval_first_row.keys())

'mac;linux;windows'

Keeping only the keys whose value is `True`

In [38]:
platforms = {'windows': True, 'mac': True, 'linux': False}
print([x for x in platforms.keys() if platforms[x]])
';'.join(x for x in platforms.keys() if platforms[x])

['windows', 'mac']


'windows;mac'

Creating a function using pandas [Series.apply](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.apply.html)

In [39]:
def process_platforms(df):
    """
    Replace the JSON text in the 'platform' column with a list of supported platforms.

    Args:
        df (pandas.DataFrame): The DataFrame containing the platforms column.

    Returns:
        pandas.DataFrame: A copy of the DataFrame in which each 'platform' value is a
        semicolon-separated string of the platforms flagged as true.
    """

    def _supported_platforms(raw):
        """
        Turn one JSON object of platform flags into a semicolon-separated string.

        Args:
            raw (str): The JSON string representing the platforms.

        Returns:
            str: The names of the platforms whose flag is truthy, in their original order.
        """
        flags = json.loads(raw)
        supported = [name for name, enabled in flags.items() if enabled]

        return ';'.join(supported)

    result = df.copy()
    result['platform'] = result['platform'].map(_supported_platforms)

    return result

platforms_df = process_platforms(age_df)
platforms_df['platform'].value_counts()

platform
windows              39330
mac;linux;windows     6465
mac;windows           5428
linux;windows         1703
mac                      9
linux                    3
mac;linux                1
Name: count, dtype: int64

### Processing Languages

In [40]:
platforms_df['supported_languages'].value_counts().head(10)

supported_languages
English                                                           14321
English\n*\n*\nlanguages with full audio support                  12526
English, Russian                                                    930
English, Simplified Chinese                                         730
English, Japanese                                                   659
Simplified Chinese                                                  509
Simplified Chinese\n*\n*\nlanguages with full audio support         386
English\n*\n, Russian\n*\n*\nlanguages with full audio support      359
English, Portuguese - Brazil                                        274
English, French                                                     264
Name: count, dtype: int64

English plus other languages make up most of the data. It seems reasonably safe to assume that if the app is in English, the word English will appear somewhere in this string. Use the Series.apply to check if the string 'english' appears in each row.

In [41]:
def process_language(df):
    """
    Reduce the 'supported_languages' column to a binary 'english' flag.

    Parameters:
    df (DataFrame): The input DataFrame containing the language data.

    Returns:
    DataFrame: A new DataFrame without rows that lack language data, where
    'supported_languages' is replaced by 'english' (1 if the text mentions
    English in any letter case, otherwise 0).

    """

    def _mentions_english(languages):
        # Case-insensitive substring check on the raw language text.
        return 1 if 'english' in languages.lower() else 0

    with_languages = df.dropna(subset=['supported_languages']).copy()
    with_languages['english'] = with_languages['supported_languages'].apply(_mentions_english)

    return with_languages.drop(columns=['supported_languages'])

language_df = process_language(platforms_df)
language_df[['name', 'english']].head()

Unnamed: 0,name,english
0,Counter-Strike,1
1,Team Fortress Classic,1
2,Day of Defeat,1
3,Deathmatch Classic,1
4,Half-Life: Opposing Force,1


In [42]:
language_df['english'].value_counts()

english
1    51149
0     1720
Name: count, dtype: int64

### Processing Developers and Publishers

In [43]:
print('Developers null counts:', language_df[language_df['developers'] == "null"].shape[0])
print('Developers empty list counts:', language_df[language_df['developers'] == '[""]'].shape[0])

# Missing values in these columns are stored as the literal string "null" rather than as
# pandas nulls (isnull() is already 0 for publishers), so publishers are checked with the
# same marker as developers.
print('\nPublishers null counts:', language_df[language_df['publishers'] == "null"].shape[0])
print('Publishers empty list counts:', language_df[language_df['publishers'] == '[""]'].shape[0])

Developers null counts: 185
Developers empty list counts: 0

Publishers null counts: 0
Publishers empty list counts: 433


A few options for dealing with these two columns:

- Remove all rows missing either developer or publisher information
- Impute missing information by replacing the missing column with the column we have (i.e. if developers is missing, fill it with the value in publishers)
- Fill missing information with 'Unknown' or 'None'

In [44]:
no_dev = language_df[language_df['developers'] == "null"]
print('Total games missing developer:', no_dev.shape[0], '\n')
print_steam_links(no_dev[:5])

Total games missing developer: 185 

Crash Time 2: http://store.steampowered.com/app/11390
18 Wheels of Steel: Extreme Trucker: http://store.steampowered.com/app/33730
Prison Tycoon 4: SuperMax: http://store.steampowered.com/app/33750
Jewel Quest Pack: http://store.steampowered.com/app/37960
Mahjong Quest Collection: http://store.steampowered.com/app/38000


In [45]:
no_pub = language_df[language_df['publishers'] == '[""]']
print('\nTotal games missing publisher:', no_pub.shape[0], '\n')
print_steam_links(no_pub[:5])


Total games missing publisher: 433 

RIP - Trilogy™: http://store.steampowered.com/app/2540
Vigil: Blood Bitterness™: http://store.steampowered.com/app/2570
ThreadSpace: Hyperbol: http://store.steampowered.com/app/2720
Bullet Candy: http://store.steampowered.com/app/6600
Loki: http://store.steampowered.com/app/7260


In [46]:
no_dev_or_pub = language_df[(language_df['developers']=="null") & (language_df['publishers']=='[""]')]
print('\nTotal games missing developer and publisher:', no_dev_or_pub.shape[0], '\n')
print_steam_links(no_dev_or_pub[:5])


Total games missing developer and publisher: 115 

Guardians of Graxia: http://store.steampowered.com/app/90500
Patterns: http://store.steampowered.com/app/218980
PlayClaw 5 - Game Recording and Streaming: http://store.steampowered.com/app/237370
Artemis Spaceship Bridge Simulator: http://store.steampowered.com/app/247350
A Walk in the Dark: http://store.steampowered.com/app/248730


Some titles may have been self-published, while others simply have wrong or missing data. As the priority is creating a clean dataset, and there are only a few hundred such rows, it is fine to remove them from the data. It is also safe to assume that multiple developers or publishers may have been involved in a game.

In [47]:
def process_developers_and_publishers(df):
    """
    Report how many rows list more than one developer or more than one publisher.

    Args:
        df (pandas.DataFrame): The DataFrame containing the 'developers' and 'publishers' columns,
            each holding the text form of a list.

    Returns:
        None

    Rows whose 'developers' value is the string "null" or whose 'publishers' value is '[""]' are
    excluded first. The remaining values in both columns are converted from strings to lists using
    the `literal_eval` function from the `ast` module, and the number of rows holding more than one
    entry is printed for each column.

    Note:
        All work happens on a filtered copy; the input DataFrame is left untouched.
    """
    has_developer = df['developers'] != "null"
    has_publisher = df['publishers'] != '[""]'
    parsed = df[has_developer & has_publisher].copy()

    for column in ('developers', 'publishers'):
        parsed[column] = parsed[column].apply(literal_eval)

        # .str.len() on a column of lists gives the number of entries per row.
        multi_valued = parsed[column].str.len() > 1

        print('Rows in {} column with multiple values:'.format(column), len(parsed[multi_valued]))

process_developers_and_publishers(language_df)

Rows in developers column with multiple values: 3473
Rows in publishers column with multiple values: 1760


The columns can be formatted by joining multiple developers and publishers into a single string. Since a number of developers and publishers have a comma in their name (for example PopCap Games, Inc.), they can be joined on a semi-colon. There are also some entries that have `[NA]` or `[N/A]`.

In [48]:
def process_developers_and_publishers(df):
    """
    Clean the 'developers' and 'publishers' columns into single-string columns.

    Args:
        df (pandas.DataFrame): The DataFrame containing the 'developers' and 'publishers' columns,
            each holding the text form of a list.

    Returns:
        pandas.DataFrame: A new DataFrame in which 'developers' and 'publishers' are replaced by
        'developer' and 'publisher', each a semicolon-separated string of names.

    Rows are removed when 'developers' is the string "null", when 'publishers' is '[""]',
    '["NA"]' or '["N/A"]', or when either raw value already contains a semicolon (which would
    clash with the join delimiter).
    """

    def _join_names(raw):
        # Parse the list text and join its entries on the semicolon delimiter.
        return ';'.join(literal_eval(raw))

    has_developer = df['developers'] != "null"
    has_publisher = df['publishers'] != '[""]'
    cleaned = df[has_developer & has_publisher].copy()

    dev_has_delimiter = cleaned['developers'].str.contains(';')
    pub_has_delimiter = cleaned['publishers'].str.contains(';')
    cleaned = cleaned[~dev_has_delimiter & ~pub_has_delimiter]

    not_na = cleaned['publishers'] != '["NA"]'
    not_n_a = cleaned['publishers'] != '["N/A"]'
    cleaned = cleaned[not_na & not_n_a]

    cleaned['developer'] = cleaned['developers'].apply(_join_names)
    cleaned['publisher'] = cleaned['publishers'].apply(_join_names)

    return cleaned.drop(columns=['developers', 'publishers'])

dev_pub_df = process_developers_and_publishers(language_df)
dev_pub_df[['name', 'appid', 'developer', 'publisher']].head()

Unnamed: 0,name,appid,developer,publisher
0,Counter-Strike,10,Valve,Valve
1,Team Fortress Classic,20,Valve,Valve
2,Day of Defeat,30,Valve,Valve
3,Deathmatch Classic,40,Valve,Valve
4,Half-Life: Opposing Force,50,Gearbox Software,Valve


### Processing Categories and Genres