In [1]:
import os
import sys

# Build the path to `steam_sales/steam_etl` under the parent of the current
# working directory, so the result depends on where the notebook is launched.
curr_dir = os.getcwd()
pkg_dir = os.path.join(os.path.dirname(curr_dir), "steam_sales", "steam_etl")
# Put that directory on the import path; presumably this is what makes the
# `db`, `settings` and `utils` imports in the next cell resolvable.
sys.path.append(pkg_dir)

In [2]:
import json
import warnings
from ast import literal_eval

import dateparser
import pandas as pd
from bs4 import BeautifulSoup
from db import get_db
from settings import Path
from sqlalchemy import text
from utils import check_na, print_steam_links

warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", 100)

### Data Extraction from SQL Database

In [3]:
def fetch_data(source: str) -> pd.DataFrame:
    """
    Run the SQL query stored in a file and return its result as a DataFrame.

    Parameters:
    source (str): File name of the SQL query, resolved against `Path.sql_queries`.

    Returns:
    pandas.DataFrame: One row per result row, with columns named after the
    columns of the query result.

    """
    # Imported locally under an alias so the function does not depend on the
    # notebook-level name `text`, which a later cell may rebind.
    from sqlalchemy import text as sql_text

    with open(os.path.join(Path.sql_queries, source), "r") as f:
        query = sql_text(f.read())

    # Read the column names and rows while the database context is still open,
    # so the result is never consumed after its connection has been released.
    with get_db() as db:
        result = db.execute(query)
        columns = list(result.keys())
        data = result.fetchall()

    return pd.DataFrame(data, columns=columns)

steam_data = fetch_data("get_all_steam_data.sql")
steam_data.head()

Unnamed: 0,type,name,appid,required_age,is_free,controller_support,dlc,detailed_description,about_the_game,short_description,supported_languages,reviews,header_image,capsule_image,website,requirements,developers,publishers,price_overview,platform,metacritic,categories,genres,recommendations,achievements,release_date,coming_soon
0,game,Counter-Strike,10,0,0,,[],Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...,"English\n*\n, French\n*\n, German\n*\n, Italia...",,https://shared.akamai.steamstatic.com/store_it...,https://shared.akamai.steamstatic.com/store_it...,,"{""minimum"": ""\r\n\t\t\t<p><strong>Minimum:</st...","[""Valve""]","[""Valve""]","{""final"": 99, ""initial"": 999, ""currency"": ""USD...","{""mac"": true, ""linux"": true, ""windows"": true}",88,"[{""id"": 1, ""description"": ""Multi-player""}, {""i...","[{""id"": ""1"", ""description"": ""Action""}]",150423,0,"Nov 1, 2000",0
1,game,Team Fortress Classic,20,0,0,,[],One of the most popular online action games of...,One of the most popular online action games of...,One of the most popular online action games of...,"English, French, German, Italian, Spanish - Sp...",,https://shared.akamai.steamstatic.com/store_it...,https://shared.akamai.steamstatic.com/store_it...,,"{""minimum"": ""\r\n\t\t\t<p><strong>Minimum:</st...","[""Valve""]","[""Valve""]","{""final"": 74, ""initial"": 499, ""currency"": ""USD...","{""mac"": true, ""linux"": true, ""windows"": true}",0,"[{""id"": 1, ""description"": ""Multi-player""}, {""i...","[{""id"": ""1"", ""description"": ""Action""}]",6133,0,"Apr 1, 1999",0
2,game,Day of Defeat,30,0,0,,[],Enlist in an intense brand of Axis vs. Allied ...,Enlist in an intense brand of Axis vs. Allied ...,Enlist in an intense brand of Axis vs. Allied ...,"English, French, German, Italian, Spanish - Spain",,https://shared.akamai.steamstatic.com/store_it...,https://shared.akamai.steamstatic.com/store_it...,http://www.dayofdefeat.com/,"{""minimum"": ""\r\n\t\t\t<p><strong>Minimum:</st...","[""Valve""]","[""Valve""]","{""final"": 74, ""initial"": 499, ""currency"": ""USD...","{""mac"": true, ""linux"": true, ""windows"": true}",79,"[{""id"": 1, ""description"": ""Multi-player""}, {""i...","[{""id"": ""1"", ""description"": ""Action""}]",4074,0,"May 1, 2003",0
3,game,Deathmatch Classic,40,0,0,,[],Enjoy fast-paced multiplayer gaming with Death...,Enjoy fast-paced multiplayer gaming with Death...,Enjoy fast-paced multiplayer gaming with Death...,"English, French, German, Italian, Spanish - Sp...",,https://shared.akamai.steamstatic.com/store_it...,https://shared.akamai.steamstatic.com/store_it...,,"{""minimum"": ""\r\n\t\t\t<p><strong>Minimum:</st...","[""Valve""]","[""Valve""]","{""final"": 74, ""initial"": 499, ""currency"": ""USD...","{""mac"": true, ""linux"": true, ""windows"": true}",0,"[{""id"": 1, ""description"": ""Multi-player""}, {""i...","[{""id"": ""1"", ""description"": ""Action""}]",2149,0,"Jun 1, 2001",0
4,game,Half-Life: Opposing Force,50,0,0,,[],Return to the Black Mesa Research Facility as ...,Return to the Black Mesa Research Facility as ...,Return to the Black Mesa Research Facility as ...,"English, French, German, Korean",,https://shared.akamai.steamstatic.com/store_it...,https://shared.akamai.steamstatic.com/store_it...,,"{""minimum"": ""\r\n\t\t\t<p><strong>Minimum:</st...","[""Gearbox Software""]","[""Valve""]","{""final"": 74, ""initial"": 499, ""currency"": ""USD...","{""mac"": true, ""linux"": true, ""windows"": true}",0,"[{""id"": 2, ""description"": ""Single-player""}, {""...","[{""id"": ""1"", ""description"": ""Action""}]",19244,0,"Nov 1, 1999",0


Creating a copy of `steam_data` dataset before starting the cleaning process.

In [4]:
raw_steam_data = steam_data.copy()

### Process Null values

Since the data is queried from SQL, some null values are read as strings (for example `''`, `'null'` or `'{}'`) and need to be converted to real nulls.

In [5]:
def process_null(df):
    """
    Turn string placeholders for "no value" into real nulls.

    Args:
        df (pandas.DataFrame): The DataFrame to process. It is not modified.

    Returns:
        pandas.DataFrame: A new DataFrame in which every cell equal to one of
        the placeholder strings has been replaced with None.

    """
    null_markers = ['', 'none', 'null', 'N/a', 'n/a', '["none"]', '["null"]', '{}', ' ']

    # A non-inplace replace returns a new frame, leaving the caller's data untouched.
    return df.replace(null_markers, None)

raw_steam_data = process_null(raw_steam_data)
raw_steam_data.isnull().sum()

type                        0
name                       15
appid                       0
required_age                0
is_free                     0
controller_support      57183
dlc                         0
detailed_description      121
about_the_game            146
short_description          64
supported_languages        82
reviews                 63623
header_image                0
capsule_image               0
website                 36352
requirements               54
developers                232
publishers                 11
price_overview           9895
platform                    0
metacritic                  0
categories                801
genres                    133
recommendations             0
achievements                0
release_date              112
coming_soon                 0
dtype: int64

### Processing Age

In [6]:
raw_steam_data['required_age'].value_counts()

required_age
0     72952
17      606
18      155
13      121
16       50
12       35
10       14
15       14
7         9
6         8
3         5
14        4
11        2
21        1
5         1
Name: count, dtype: int64

Reducing the number of categories that ages fall into by following the [PEGI age ratings](https://pegi.info) categories. For example, instead of comparing games rated as 5, 6, 7 or 8, we could compare games rated 5+ or 8+. 

In [7]:
def process_age(df):
    """
    Bucket the 'required_age' column into a small set of age ratings.

    Parameters:
    df (DataFrame): The input DataFrame containing the 'required_age' column.

    Returns:
    DataFrame: A new DataFrame without the rows whose 'required_age' is null,
    and with 'required_age' replaced by its rating bucket (a categorical).

    """
    # Each bin is right-inclusive: (-1, 0] -> 0, (0, 3] -> 3, ..., (16, 1000] -> 18
    bin_edges = [-1, 0, 3, 7, 12, 16, 1000]
    rating_labels = [0, 3, 7, 12, 16, 18]

    has_age = df['required_age'].notna()
    rated = df.loc[has_age].copy()
    rated['required_age'] = pd.cut(rated['required_age'], bins=bin_edges, labels=rating_labels)

    return rated

age_df = process_age(raw_steam_data)
age_df['required_age'].value_counts().sort_index()

required_age
0     72952
3         5
7        18
12       51
16      189
18      762
Name: count, dtype: int64

### Processing Platforms

In [8]:
platforms_first_row = age_df['platform'].iloc[0]
print(type(platforms_first_row))
platforms_first_row

<class 'str'>


'{"mac": true, "linux": true, "windows": true}'

First task is to recognise the data as dictionaries rather than strings.

In [9]:
eval_first_row = json.loads(platforms_first_row)
print(type(eval_first_row))
print(eval_first_row)
eval_first_row['windows']

<class 'dict'>
{'mac': True, 'linux': True, 'windows': True}


True

Formatting the output. Let's keep things simple and return a string of supported platforms

In [10]:
';'.join(eval_first_row.keys())

'mac;linux;windows'

Keeping only keys that have `True` value

In [11]:
platforms = {'windows': True, 'mac': True, 'linux': False}
print([x for x in platforms.keys() if platforms[x]])
';'.join(x for x in platforms.keys() if platforms[x])

['windows', 'mac']


'windows;mac'

Creating a function by using pandas [Series.apply](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.apply.html)

In [12]:
def process_platforms(df):
    """
    Replace the JSON text in the 'platform' column with a list of supported platforms.

    Args:
        df (pandas.DataFrame): The DataFrame containing the 'platform' column.

    Returns:
        pandas.DataFrame: A copy of the DataFrame in which each 'platform' value
        is a semicolon-separated string of the platforms flagged as true.
    """
    def supported_platforms(raw):
        """
        Return the names of the platforms whose flag is truthy, joined by ';'.

        Args:
            raw (str): JSON object mapping platform names to booleans.

        Returns:
            str: The supported platform names, in the order they appear in the JSON.
        """
        flags = json.loads(raw)
        enabled = [name for name, is_supported in flags.items() if is_supported]

        return ';'.join(enabled)

    result = df.copy()
    result['platform'] = result['platform'].map(supported_platforms)

    return result

platforms_df = process_platforms(age_df)
platforms_df['platform'].value_counts()

platform
windows              55962
mac;linux;windows     8073
mac;windows           7453
linux;windows         2466
mac                     11
linux                   11
mac;linux                1
Name: count, dtype: int64

### Processing Languages

In [13]:
platforms_df['supported_languages'].value_counts().head(10)

supported_languages
English                                                           19799
English\n*\n*\nlanguages with full audio support                  17331
English, Russian                                                   1386
English, Simplified Chinese                                        1020
English, Japanese                                                   888
Simplified Chinese                                                  829
Simplified Chinese\n*\n*\nlanguages with full audio support         649
English\n*\n, Russian\n*\n*\nlanguages with full audio support      457
English, Portuguese - Brazil                                        422
English, French                                                     395
Name: count, dtype: int64

English plus other languages make up most of the data. It seems reasonably safe to assume that if the app is in English, the word English will appear somewhere in this string. Use `Series.apply` to check whether the string 'english' appears in each row. Rows with null values are dropped, as there are very few of them.

In [14]:
def process_language(df):
    """
    Reduce the 'supported_languages' column to a single English/not-English flag.

    Parameters:
    df (DataFrame): The input DataFrame containing the 'supported_languages' column.

    Returns:
    DataFrame: A new DataFrame without the rows whose 'supported_languages' is
    null, with an 'english' column (1 or 0) added and 'supported_languages' removed.

    """
    def mentions_english(languages):
        # Case-insensitive substring check on the raw language string
        return 1 if 'english' in languages.lower() else 0

    with_languages = df.dropna(subset=['supported_languages']).copy()
    with_languages['english'] = with_languages['supported_languages'].map(mentions_english)

    return with_languages.drop(columns='supported_languages')

language_df = process_language(platforms_df)
language_df[['name', 'english']].head()

Unnamed: 0,name,english
0,Counter-Strike,1
1,Team Fortress Classic,1
2,Day of Defeat,1
3,Deathmatch Classic,1
4,Half-Life: Opposing Force,1


In [15]:
language_df['english'].value_counts()

english
1    71174
0     2721
Name: count, dtype: int64

### Processing Developers and Publishers

In [16]:
print('Developers null counts:', language_df[language_df['developers'].isnull()].shape[0])
print('Developers empty list counts:', language_df[language_df['developers'] == '[""]'].shape[0])

print('\nPublishers null counts:', language_df['publishers'].isnull().sum())
print('Publishers empty list counts:', language_df[language_df['publishers'] == '[""]'].shape[0])

Developers null counts: 214
Developers empty list counts: 0

Publishers null counts: 10
Publishers empty list counts: 504


A few options for dealing with these two columns:

- Remove all rows missing either developer or publisher information
- Impute missing information by replacing the missing column with the column we have (i.e. if developers is missing, fill it with the value in publishers)
- Fill missing information with 'Unknown' or 'None'

In [17]:
no_dev = language_df[language_df['developers'].isnull()]
print('Total games missing developer:', no_dev.shape[0], '\n')
print_steam_links(no_dev[:5])

Total games missing developer: 214 

Crash Time 2: http://store.steampowered.com/app/11390
18 Wheels of Steel: Extreme Trucker: http://store.steampowered.com/app/33730
Prison Tycoon 4: SuperMax: http://store.steampowered.com/app/33750
Jewel Quest Pack: http://store.steampowered.com/app/37960
Mahjong Quest Collection: http://store.steampowered.com/app/38000


In [18]:
no_pub = language_df[language_df['publishers'] == '[""]']
print('\nTotal games missing publisher:', no_pub.shape[0], '\n')
print_steam_links(no_pub[:5])


Total games missing publisher: 504 

RIP - Trilogy™: http://store.steampowered.com/app/2540
Vigil: Blood Bitterness™: http://store.steampowered.com/app/2570
ThreadSpace: Hyperbol: http://store.steampowered.com/app/2720
Bullet Candy: http://store.steampowered.com/app/6600
Loki: http://store.steampowered.com/app/7260


In [19]:
no_dev_or_pub = language_df[(language_df['developers'].isnull()) & (language_df['publishers'] == '[""]')]
print('\nTotal games missing developer and publisher:', no_dev_or_pub.shape[0], '\n')
print_steam_links(no_dev_or_pub[:5])


Total games missing developer and publisher: 137 

Guardians of Graxia: http://store.steampowered.com/app/90500
Patterns: http://store.steampowered.com/app/218980
PlayClaw 5 - Game Recording and Streaming: http://store.steampowered.com/app/237370
Artemis Spaceship Bridge Simulator: http://store.steampowered.com/app/247350
A Walk in the Dark: http://store.steampowered.com/app/248730


Some titles may have been self-published, while others simply have wrong or missing data. As the priority is creating a clean dataset, and there are only a few hundred such rows, it is fine to remove them from the data. It is also safe to assume that multiple developers or publishers may have been involved in a game. The columns can be formatted by joining the multiple developers and publishers into a single string. Since a number of developers and publishers have a comma in their name (for example PopCap Games, Inc.), they are joined on a semicolon. Some entries contain `[NA]`, `[N/A]` or `[N/a]`; these are treated as missing and removed as well.

In [20]:
def process_developers_and_publishers(df):
    """
    Clean the developers and publishers columns in the given DataFrame.

    Rows are removed when developers or publishers is null, when publishers is
    an empty or blank list ('[""]' or '[" "]'), when either column is a
    placeholder list such as '["N/A"]', '["NA"]' or '["null"]' (any case), or
    when either column contains ';' (the separator used for the joined output).

    Args:
        df (pandas.DataFrame): The DataFrame containing the developers and
            publishers columns. It is not modified.

    Returns:
        pandas.DataFrame: A new DataFrame with 'developer' and 'publisher'
        columns holding the names joined by ';', and the original 'developers'
        and 'publishers' columns dropped.

    """
    # Non-capturing group: str.contains only needs a yes/no answer, and a
    # capturing group makes pandas emit a warning about match groups.
    placeholder_pattern = r'(?i)\["(?:n/a|na|null)"\]'
    blank_lists = ['[""]', '[" "]']

    keep = (
        df['developers'].notna()
        & df['publishers'].notna()
        & ~df['publishers'].isin(blank_lists)
    )
    for col in ('developers', 'publishers'):
        keep &= ~df[col].str.contains(placeholder_pattern, na=False)
        keep &= ~df[col].str.contains(';', na=False)

    # Explicit copy so the column assignments below never target a view of the input
    df = df[keep].copy()

    def parse_name_list(val):
        """
        Convert a serialized list of names into a ';'-joined string.

        The text is parsed as JSON first, so JSON escapes are decoded properly,
        with a Python-literal fallback for values that are not valid JSON.
        Returns '' when the value cannot be parsed at all, and the value itself
        when it parses to something other than a list.
        """
        try:
            result = json.loads(val)
        except (TypeError, ValueError):
            try:
                result = literal_eval(val)
            except (ValueError, SyntaxError):
                return ''

        if isinstance(result, list):
            # Skip empty entries; str() keeps non-string items from breaking join
            return ';'.join(str(item) for item in result if item)
        return val

    df['developer'] = df['developers'].apply(parse_name_list)
    df['publisher'] = df['publishers'].apply(parse_name_list)

    df = df.drop(['developers', 'publishers'], axis=1)

    return df

dev_pub_df = process_developers_and_publishers(language_df)
dev_pub_df[['name', 'appid', 'developer', 'publisher']].head()


Unnamed: 0,name,appid,developer,publisher
0,Counter-Strike,10,Valve,Valve
1,Team Fortress Classic,20,Valve,Valve
2,Day of Defeat,30,Valve,Valve
3,Deathmatch Classic,40,Valve,Valve
4,Half-Life: Opposing Force,50,Gearbox Software,Valve


Helper cell to check if any values of `[n/a]` variations were missed.

In [21]:
# temp = check_na(dev_pub_df, 'publisher')
# temp.head()

### Processing Price

In [22]:
dev_pub_df[dev_pub_df['price_overview'].isnull()].shape[0]

9696

In [23]:
# Compare the 0/1 `is_free` flag explicitly so the mask is boolean by construction,
# matching the `is_free == 0` check used for the non-free case.
free_and_null_price = dev_pub_df[(dev_pub_df['is_free'] == 1) & (dev_pub_df['price_overview'].isnull())]
print("Games that are marked free and have price as null: ", free_and_null_price.shape[0])

Games that are marked free and have price as null:  6040


It turns out this accounts for most of the missing values in the `price_overview` column. These missing values can be handled by setting the final price as 0. This makes intuitive sense - free games wouldn't have a price.

In [24]:
not_free_and_null_price = dev_pub_df[(dev_pub_df['is_free'] == 0) & (dev_pub_df['price_overview'].isnull())]
print("Steam store links to some games that are not free and have missing price\n")
print_steam_links(not_free_and_null_price[:5])

Steam store links to some games that are not free and have missing price

Half-Life 2: Lost Coast: http://store.steampowered.com/app/340
Final DOOM: http://store.steampowered.com/app/2290
Quake II Mission Pack: The Reckoning: http://store.steampowered.com/app/2330
Quake II Mission Pack: Ground Zero: http://store.steampowered.com/app/2340
The Ship: Single Player: http://store.steampowered.com/app/2420


Some of these games are sold as part of the main game, some have been removed while others have their name replaced.

In [25]:
dev_pub_df['price_overview'][37]

'{"final": 119, "initial": 599, "currency": "USD", "final_formatted": "$1.19", "discount_percent": 80, "initial_formatted": "$5.99"}'

This will be formatted by taking only the `currency` and `initial` keys, which are extracted into two separate columns. There are multiple currencies, so it makes sense to convert all prices to USD.

In [26]:
def process_price(df, currency_rates=None):
    """
    Turn the 'price_overview' column into a single 'price' column in USD.

    The initial (undiscounted) price is read from each price overview, set to 0
    for rows where 'is_free' is 1, converted from cents to currency units, and
    finally multiplied by the exchange rate of its currency. Rows without a
    price overview that are not free keep the sentinel price -1.

    Args:
        df (pandas.DataFrame): The DataFrame containing the 'price_overview'
            and 'is_free' columns. It is not modified.
        currency_rates (dict, optional): Mapping of currency code to its USD
            exchange rate. Defaults to the fixed rates used by this notebook
            (EUR, TWD, SGD, BRL, AUD). USD is always treated as rate 1.

    Returns:
        pandas.DataFrame: A new DataFrame with a float 'price' column added and
        the 'is_free' and 'price_overview' columns dropped (along with the
        temporary 'currency' column).

    Raises:
        KeyError: If a row uses a currency that has no exchange rate.
    """
    if currency_rates is None:
        currency_rates = {'EUR': 1.08, 'TWD': 0.03, 'SGD': 0.74, 'BRL': 0.18, 'AUD': 0.67}
    # USD is the target currency, so its rate is fixed at 1
    rates = {**currency_rates, 'USD': 1.0}

    def parse_price(x):
        """
        Parse one price overview into a dict with 'currency' and 'initial' keys.

        Strings are parsed as JSON, falling back to a Python literal; dicts are
        returned as they are. Anything else (None, NaN) is treated as a missing
        price and yields the sentinel {'currency': 'USD', 'initial': -1}.
        """
        if isinstance(x, dict):
            return x
        if not isinstance(x, str):
            return {'currency': 'USD', 'initial': -1}
        try:
            return json.loads(x)
        except ValueError:
            return literal_eval(x)

    df = df.copy()
    overview = df['price_overview'].apply(parse_price)

    df['currency'] = overview.apply(lambda x: x['currency'])
    # Work in floats from the start: the values are about to be divided by 100,
    # and writing fractional results back into an integer column is not safe.
    price = overview.apply(lambda x: x['initial']).astype('float64')

    price = price.mask(df['is_free'] == 1, 0.0)
    # Prices are stored in cents; only positive prices are scaled, so the 0 and
    # -1 markers are left untouched.
    price = price.where(~(price > 0), price / 100)

    # Vectorised conversion: look up one rate per row instead of a row-wise apply
    rate = df['currency'].map(rates)
    unknown = df.loc[rate.isna(), 'currency'].unique()
    if len(unknown):
        raise KeyError(f"No USD exchange rate for currency: {', '.join(map(str, unknown))}")

    df['price'] = price * rate

    df = df.drop(['is_free', 'currency', 'price_overview'], axis=1)

    return df

price_df = process_price(dev_pub_df)
price_df[['name', 'price']].head()

Unnamed: 0,name,price
0,Counter-Strike,9.99
1,Team Fortress Classic,4.99
2,Day of Defeat,4.99
3,Deathmatch Classic,4.99
4,Half-Life: Opposing Force,4.99


### Processing Categories and Genres

In [27]:
print("Number of null values for categories: ", price_df[price_df['categories'].isnull()].shape[0])
print("Number of null values for genres: ", price_df[price_df['genres'].isnull()].shape[0])

Number of null values for categories:  797
Number of null values for genres:  99


Randomly inspect null rows by using [Dataframe.sample](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sample.html) method.

In [28]:
print_steam_links(price_df[price_df['categories'].isnull()].sample(5, random_state=0))

RPG Paper Maker: http://store.steampowered.com/app/1066860
PhotoTangler Collage Maker: http://store.steampowered.com/app/586560
MasterPlan: http://store.steampowered.com/app/1269310
Home Office Tasker: http://store.steampowered.com/app/1727670
Hypnosia - Application d'Hypnose avec Reconnaissance Vocale: http://store.steampowered.com/app/2952000


All of the above rows of categories are applications or software of some kind, and not actually games. It would be best to remove these.

In [29]:
print_steam_links(price_df[price_df['genres'].isnull()].sample(5, random_state=0))

ChessBase 13 Academy: http://store.steampowered.com/app/377340
General Staff: Black Powder Battle Designer Bundle: http://store.steampowered.com/app/2231050
Call of Cthulhu®: Dark Corners of the Earth: http://store.steampowered.com/app/22340
River Relaxation VR: http://store.steampowered.com/app/938760
Evil Shogun: http://store.steampowered.com/app/1505830


Most of these are games, which suggests that genre data simply wasn't supplied. These rows can be removed, as there are only a few of them.

In [30]:
def process_categories_and_genres(df):
    """
    Flatten the 'categories' and 'genres' columns into ';'-joined descriptions.

    Parameters:
    df (DataFrame): The input DataFrame containing the 'categories' and 'genres' columns.

    Returns:
    DataFrame: A new DataFrame without the rows where either column is null,
    and with both columns replaced by the joined 'description' of their entries.
    """
    def join_descriptions(raw):
        # `raw` is the text form of a list of dicts; keep each 'description'
        entries = literal_eval(raw)
        return ';'.join(entry['description'] for entry in entries)

    has_both = df['categories'].notna() & df['genres'].notna()
    labelled = df[has_both].copy()

    labelled['categories'] = labelled['categories'].apply(join_descriptions)
    labelled['genres'] = labelled['genres'].apply(join_descriptions)

    return labelled

cat_gen_df = process_categories_and_genres(price_df)
cat_gen_df[['appid', 'categories', 'genres']].head()

Unnamed: 0,appid,categories,genres
0,10,Multi-player;PvP;Online PvP;Shared/Split Scree...,Action
1,20,Multi-player;PvP;Online PvP;Shared/Split Scree...,Action
2,30,Multi-player;Valve Anti-Cheat enabled;Family S...,Action
3,40,Multi-player;PvP;Online PvP;Shared/Split Scree...,Action
4,50,Single-player;Multi-player;Valve Anti-Cheat en...,Action


### Processing Controller Support

In [31]:
cat_gen_df['controller_support'].value_counts()

controller_support
full    16681
Name: count, dtype: int64

Since it's either `full` or `None`, these can be replaced with 1's and 0's respectively.

In [32]:
def process_controller(df):
    """
    Encode the 'controller_support' column as a 0/1 flag.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the 'controller_support' column.

    Returns:
    pandas.DataFrame: A copy of the DataFrame in which 'controller_support' is
    1 where the original value was 'full' and 0 for anything else.
    """
    def is_full_support(value):
        return 1 if value == 'full' else 0

    result = df.copy()
    result['controller_support'] = result['controller_support'].map(is_full_support)

    return result

con_sup_df = process_controller(cat_gen_df)
con_sup_df['controller_support'].value_counts()

controller_support
0    55700
1    16681
Name: count, dtype: int64

### Processing DLC

In [33]:
con_sup_df['dlc'].value_counts()

dlc
[]                                                        60454
[1477350]                                                     5
[689890]                                                      2
[952210]                                                      2
[283750]                                                      2
                                                          ...  
[3075210, 3075230]                                            1
[3027720, 3044890, 3044900]                                   1
[2930440]                                                     1
[2931080]                                                     1
[3037840, 3037850, 3037860, 3037870, 3037880, 3037890]        1
Name: count, Length: 11911, dtype: int64

These values can be replaced by finding out the length of the list as it makes sense to know the number of dlc's a game has.

In [34]:
def process_dlc(df):
    """
    Replace each 'dlc' value with the number of DLCs it lists.

    Args:
        df (DataFrame): The input DataFrame containing the 'dlc' column, whose
            values are string representations of lists.

    Returns:
        DataFrame: A copy of the input DataFrame with 'dlc' holding the length
        of each parsed list.

    """
    result = df.copy()
    # Parse the list text and keep only how many items it contains
    result['dlc'] = result['dlc'].map(lambda raw: len(literal_eval(raw)))

    return result

dlc_df = process_dlc(con_sup_df)

### Processing Requirements

In [35]:
# NOTE(review): this rebinds the notebook-level name `text`, hiding the
# sqlalchemy `text` imported in the first import cell. Any code that looks up
# the notebook-level `text` after this cell has run gets this string instead.
# `[0]` is a label lookup, so it needs index label 0 to still be present.
text = dlc_df['requirements'][0]
text

'{"minimum": "\\r\\n\\t\\t\\t<p><strong>Minimum:</strong> 500 mhz processor, 96mb ram, 16mb video card, Windows XP, Mouse, Keyboard, Internet Connection<br /></p>\\r\\n\\t\\t\\t<p><strong>Recommended:</strong> 800 mhz processor, 128mb ram, 32mb+ video card, Windows XP, Mouse, Keyboard, Internet Connection<br /></p>\\r\\n\\t\\t\\t"}'

The strings are full of html formatting, which is presumably parsed to display the information on the website.

In [36]:
parse_text = BeautifulSoup(text, 'lxml')
plain_text = parse_text.get_text()
plain_text

'{"minimum": "\\r\\n\\t\\t\\tMinimum: 500 mhz processor, 96mb ram, 16mb video card, Windows XP, Mouse, Keyboard, Internet Connection\\r\\n\\t\\t\\tRecommended: 800 mhz processor, 128mb ram, 32mb+ video card, Windows XP, Mouse, Keyboard, Internet Connection\\r\\n\\t\\t\\t"}'

In [37]:
def process_requirement(df):
    """
    Strip HTML markup from the 'requirements' column.

    Parameters:
    df (DataFrame): The input DataFrame containing the 'requirements' column.

    Returns:
    DataFrame: A copy of the input DataFrame in which each 'requirements' value
    is the text extracted by BeautifulSoup, or 'Not available' when the
    original value was falsy (for example None or an empty string).

    """
    def strip_markup(raw):
        if not raw:
            return 'Not available'
        return BeautifulSoup(raw, 'lxml').get_text()

    result = df.copy()
    result['requirements'] = result['requirements'].apply(strip_markup)

    return result

req_df = process_requirement(dlc_df)

### Processing Dates

In [38]:
# `&` binds tighter than `==`, so the comparison must be parenthesised to
# combine "date is missing" with "coming_soon equals 1".
print("Dates missing and coming soon is true: ", req_df[req_df['release_date'].isnull() & (req_df['coming_soon'] == 1)].shape[0])

Dates missing and coming soon is true:  19


In [39]:
req_df['release_date'].head()

0    Nov 1, 2000
1    Apr 1, 1999
2    May 1, 2003
3    Jun 1, 2001
4    Nov 1, 1999
Name: release_date, dtype: object

Release dates are stored as strings such as `Nov 1, 2000`. Some dates are not in English, so they are parsed using [dateparser](https://dateparser.readthedocs.io/en/latest/). The parsed values are then converted to the `datetime64` data type, which displays as `YYYY-MM-DD`.

In [40]:
def process_date(df):
    """
    Convert the 'release_date' column from text to pandas datetimes.

    Args:
        df (pandas.DataFrame): The DataFrame containing the 'release_date' column.

    Returns:
        pandas.DataFrame: A copy of the DataFrame with 'release_date' parsed.
        Values that are not strings, or that cannot be parsed, become NaT.

    """
    df = df.copy()

    def parse_date(date_string):
        """
        Parse a single date string with dateparser.

        Parameters:
        date_string (str): The date string to be parsed.

        Returns:
        pd.Timestamp: The parsed date, or NaT when the value is not a string
        or parsing fails.

        """
        if not isinstance(date_string, str):
            return pd.NaT

        def attempt(candidate):
            return pd.to_datetime(dateparser.parse(candidate))

        try:
            return attempt(date_string)
        except Exception:
            # Deliberately broad: any parsing failure triggers the retry below,
            # but KeyboardInterrupt/SystemExit are no longer swallowed.
            pass

        try:
            # Second chance: some date strings only parse once spaces are removed
            return attempt(date_string.replace(" ", ""))
        except Exception:
            # One unparseable date should not abort the whole column
            return pd.NaT

    df['release_date'] = df['release_date'].apply(parse_date)

    return df

date_df = process_date(req_df)
date_df[['appid', 'name', 'release_date']].head()

Unnamed: 0,appid,name,release_date
0,10,Counter-Strike,2000-11-01
1,20,Team Fortress Classic,1999-04-01
2,30,Day of Defeat,2003-05-01
3,40,Deathmatch Classic,2001-06-01
4,50,Half-Life: Opposing Force,1999-11-01


### Processing Game Descriptions

Since these columns are text, it makes sense to combine them all under one column, as they can be used for models that work on textual content. The `website` and `header_image` columns are also merged into this.

In [41]:
def process_descriptions(df):
    """
    Combine the text columns of a DataFrame into a single 'description' column.

    The detailed, about-the-game and short descriptions are joined with spaces,
    then followed by ' Website: <website>' and ' Game Image: <header_image>'.
    Missing website or image values are written as 'Not available'. When all
    three description fields are missing or blank, the description text itself
    is written as 'Not available'.

    Args:
        df (pandas.DataFrame): The DataFrame containing the columns to be
            processed. It is not modified.

    Returns:
        pandas.DataFrame: A copy of the DataFrame with the combined
        'description' column added and the five source columns dropped.
    """
    df = df.copy()

    game_text = (
        df['detailed_description'].fillna('') + ' '
        + df['about_the_game'].fillna('') + ' '
        + df['short_description'].fillna('')
    )
    # The placeholder has to be applied before the website/image suffix is
    # appended; afterwards the combined string can never be empty.
    game_text = game_text.where(game_text.str.strip() != '', 'Not available')

    df['description'] = (
        game_text
        + ' Website: ' + df['website'].fillna('Not available')
        + ' Game Image: ' + df['header_image'].fillna('Not available')
    )
    df = df.drop(['detailed_description', 'about_the_game', 'short_description', 'website', 'header_image'], axis=1)

    return df

desc_df = process_descriptions(date_df)
desc_df[['appid', 'name', 'description']].head()

Unnamed: 0,appid,name,description
0,10,Counter-Strike,Play the world's number 1 online action game. ...
1,20,Team Fortress Classic,One of the most popular online action games of...
2,30,Day of Defeat,Enlist in an intense brand of Axis vs. Allied ...
3,40,Deathmatch Classic,Enjoy fast-paced multiplayer gaming with Death...
4,50,Half-Life: Opposing Force,Return to the Black Mesa Research Facility as ...


### Miscellaneous Changes

Almost all columns have been dealt with. The `reviews` and `capsule_image` columns can be dropped. `release_date` is split into `year`, `month` and `day` columns.

In [42]:
def misc(df):
    """
    Apply the remaining clean-up steps to the given DataFrame.

    Adds nullable 'year', 'month' and 'day' columns derived from
    'release_date', and drops the 'capsule_image' and 'reviews' columns.

    Parameters:
    df (pandas.DataFrame): The input DataFrame to be cleaned.

    Returns:
    pandas.DataFrame: The cleaned copy of the DataFrame.
    """
    result = df.copy()

    released = result['release_date'].dt
    for part in ('year', 'month', 'day'):
        # 'Int16' is a nullable integer type, so missing dates stay missing
        result[part] = getattr(released, part).astype('Int16')

    return result.drop(columns=['capsule_image', 'reviews'])

clean_steam_data = misc(desc_df)
clean_steam_data.head()

Unnamed: 0,type,name,appid,required_age,controller_support,dlc,requirements,platform,metacritic,categories,genres,recommendations,achievements,release_date,coming_soon,english,developer,publisher,price,description,year,month,day
0,game,Counter-Strike,10,0,0,0,"{""minimum"": ""\r\n\t\t\tMinimum: 500 mhz proces...",mac;linux;windows,88,Multi-player;PvP;Online PvP;Shared/Split Scree...,Action,150423,0,2000-11-01,0,1,Valve,Valve,9.99,Play the world's number 1 online action game. ...,2000,11,1
1,game,Team Fortress Classic,20,0,0,0,"{""minimum"": ""\r\n\t\t\tMinimum: 500 mhz proces...",mac;linux;windows,0,Multi-player;PvP;Online PvP;Shared/Split Scree...,Action,6133,0,1999-04-01,0,1,Valve,Valve,4.99,One of the most popular online action games of...,1999,4,1
2,game,Day of Defeat,30,0,0,0,"{""minimum"": ""\r\n\t\t\tMinimum: 500 mhz proces...",mac;linux;windows,79,Multi-player;Valve Anti-Cheat enabled;Family S...,Action,4074,0,2003-05-01,0,1,Valve,Valve,4.99,Enlist in an intense brand of Axis vs. Allied ...,2003,5,1
3,game,Deathmatch Classic,40,0,0,0,"{""minimum"": ""\r\n\t\t\tMinimum: 500 mhz proces...",mac;linux;windows,0,Multi-player;PvP;Online PvP;Shared/Split Scree...,Action,2149,0,2001-06-01,0,1,Valve,Valve,4.99,Enjoy fast-paced multiplayer gaming with Death...,2001,6,1
4,game,Half-Life: Opposing Force,50,0,0,0,"{""minimum"": ""\r\n\t\t\tMinimum: 500 mhz proces...",mac;linux;windows,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,19244,0,1999-11-01,0,1,Gearbox Software,Valve,4.99,Return to the Black Mesa Research Facility as ...,1999,11,1


### Final Steps

Except dates, all columns are devoid of null values.

In [43]:
clean_steam_data.isnull().sum()

type                    0
name                    0
appid                   0
required_age            0
controller_support      0
dlc                     0
requirements            0
platform                0
metacritic              0
categories              0
genres                  0
recommendations         0
achievements            0
release_date          168
coming_soon             0
english                 0
developer               0
publisher               0
price                   0
description             0
year                  168
month                 168
day                   168
dtype: int64

Memory usage analysis of raw data and clean data.

In [44]:
raw_steam_data.info(verbose=False, memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73977 entries, 0 to 73976
Columns: 27 entries, type to coming_soon
dtypes: int64(7), object(20)
memory usage: 489.6 MB


In [45]:
clean_steam_data.info(verbose=False, memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Index: 72381 entries, 0 to 73976
Columns: 23 entries, type to day
dtypes: Int16(3), category(1), datetime64[ns](1), float64(1), int64(8), object(9)
memory usage: 388.3 MB
