# SteamSpy Data Cleaning

## Import Libraries and Inspect Data

In [13]:
# standard library imports
from ast import literal_eval
import itertools
# import time
# import re

# third-party imports
import numpy as np
import pandas as pd

# customisations
pd.options.display.max_columns = 100

In [14]:
# read the input CSV and preview its first rows
raw_steamspy_data = pd.read_csv(
    filepath_or_buffer='../data/exports/steam_partially_clean.csv',
)
raw_steamspy_data.head()

Unnamed: 0,AppID,Name,Release date,Estimated owners,Peak CCU,Required age,Price,DLC count,Supported languages,Windows,Mac,Linux,User score,Positive,Negative,Achievements,Recommendations,Average playtime forever,Average playtime two weeks,Median playtime forever,Median playtime two weeks,Categories,Genres,Tags,Developer,Publisher
0,20200,Galactic Bowling,2008-10-21,0 - 20000,0,0,19.99,0,['English'],True,False,False,0,6,11,30,0,0,0,0,0,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","Indie,Casual,Sports,Bowling",Perpetual FX Creative,Perpetual FX Creative
1,655370,Train Bandit,2017-10-12,0 - 20000,0,0,0.99,0,"['English', 'French', 'Italian', 'German', 'Sp...",True,True,False,0,53,5,12,0,0,0,0,0,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc...",Rusty Moyher,Wild Rooster
2,1355720,Henosis™,2020-07-23,0 - 20000,0,0,5.99,0,"['English', 'French', 'Italian', 'German', 'Sp...",True,True,True,0,3,0,0,0,0,0,0,0,"Single-player,Full controller support","Adventure,Casual,Indie","2D Platformer,Atmospheric,Surreal,Mystery,Puzz...",Odd Critter Games,Odd Critter Games
3,1139950,Two Weeks in Painland,2020-02-03,0 - 20000,0,0,0.0,0,"['English', 'Spanish - Spain']",True,True,False,0,50,8,17,0,0,0,0,0,"Single-player,Steam Achievements","Adventure,Indie","Indie,Adventure,Nudity,Violent,Sexual Content,...",Unusual Games,Unusual Games
4,1469160,Wartune Reborn,2021-02-26,50000 - 100000,68,0,0.0,0,['English'],True,False,False,0,87,49,0,0,0,0,0,0,"Single-player,Multi-player,MMO,PvP,Online PvP,...","Adventure,Casual,Free to Play,Massively Multip...","Turn-Based Combat,Massively Multiplayer,Multip...",7Road,7Road


In [15]:
# number of missing values per column in the raw frame
raw_steamspy_data.isna().sum()

AppID                         0
Name                          0
Release date                  0
Estimated owners              0
Peak CCU                      0
Required age                  0
Price                         0
DLC count                     0
Supported languages           0
Windows                       0
Mac                           0
Linux                         0
User score                    0
Positive                      0
Negative                      0
Achievements                  0
Recommendations               0
Average playtime forever      0
Average playtime two weeks    0
Median playtime forever       0
Median playtime two weeks     0
Categories                    0
Genres                        0
Tags                          0
Developer                     0
Publisher                     0
dtype: int64

## Remove unwanted columns

In [16]:
# frequency of each user score, most common first
user_score_counts = raw_steamspy_data['User score'].value_counts()
user_score_counts.head()

User score
0      62521
100        5
80         2
84         2
46         2
Name: count, dtype: int64

In [17]:
# columns removed from the cleaned dataset, with the reason for each
drop_cols = [
    # too little variance (most have 0)
    'User score',
    # not interested in temporally specific columns
    'Average playtime two weeks',
    'Median playtime two weeks',
    'Peak CCU',
]

## Process & Export Tags Column

In [18]:
# pull out the raw tag column for inspection
tags = raw_steamspy_data['Tags']

# show the value stored under index label 0, then the first few rows
first_row_tags = tags.loc[0]
print(first_row_tags)
tags.head()

Indie,Casual,Sports,Bowling


0                          Indie,Casual,Sports,Bowling
1    Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc...
2    2D Platformer,Atmospheric,Surreal,Mystery,Puzz...
3    Indie,Adventure,Nudity,Violent,Sexual Content,...
4    Turn-Based Combat,Massively Multiplayer,Multip...
Name: Tags, dtype: object

In [19]:
def parse_tags(x, limit=3):
    """Split a comma-separated tag string and keep only the leading tags.

    Parameters
    ----------
    x : str, list, tuple or missing value
        Comma-separated tags such as ``'Indie,Casual,Sports,Bowling'``.
        A list/tuple is treated as already parsed and is only truncated.
        Any other value (e.g. ``NaN`` or ``None``) is treated as "no tags".
    limit : int, default 3
        Maximum number of tags to keep, counted from the start.

    Returns
    -------
    list
        At most ``limit`` tags, in their original order; ``[]`` when there
        are no tags.
    """
    if isinstance(x, str):
        # Slicing already copes with fewer than `limit` items, so no length
        # check is needed. An empty string means "no tags", not one empty tag.
        return x.split(',')[:limit] if x else []
    if isinstance(x, (list, tuple)):
        # Already parsed (e.g. the function was applied twice): just truncate.
        return list(x[:limit])
    # Missing value: return an empty list instead of raising AttributeError.
    return []
    
# preview the result of applying parse_tags to every row
parsed_tags_preview = tags.apply(parse_tags)
parsed_tags_preview.head()

0                              [Indie, Casual, Sports]
1                      [Indie, Action, Pixel Graphics]
2                [2D Platformer, Atmospheric, Surreal]
3                           [Indie, Adventure, Nudity]
4    [Turn-Based Combat, Massively Multiplayer, Mul...
Name: Tags, dtype: object

## Handle Owners Column

In [20]:
# select the estimated-owners column and preview it
owners = raw_steamspy_data.loc[:, 'Estimated owners']
owners.head()

0         0 - 20000
1         0 - 20000
2         0 - 20000
3         0 - 20000
4    50000 - 100000
Name: Estimated owners, dtype: object

## Define Function

In [21]:
def process_tags(df, export=False,
                 export_path='../data/exports/steamspy_tag_data.csv',
                 max_tags=3):
    """Return a copy of ``df`` whose 'Tags' column holds a short list of tags.

    The input frame is not modified, so the function can be called again on
    the same frame and give the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'AppID' and 'Tags'. 'Tags' is expected to
        hold comma-separated strings.
    export : bool, default False
        When True, write the 'AppID' and *unprocessed* 'Tags' columns to
        ``export_path`` before the tags are truncated.
    export_path : str, default '../data/exports/steamspy_tag_data.csv'
        Destination CSV for the exported tag data.
    max_tags : int, default 3
        Maximum number of tags kept per row, counted from the start.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns, where 'Tags' is a list of at most
        ``max_tags`` tags (``[]`` for rows without tags).
    """
    if export:
        # Export first so the file keeps the complete tag strings.
        df[['AppID', 'Tags']].to_csv(export_path, index=False)
        print(f"Exported tag data to '{export_path}'")

    def _top_tags(x):
        # Keep only the first `max_tags` tags; slicing handles short lists.
        if isinstance(x, str):
            return x.split(',')[:max_tags] if x else []
        if isinstance(x, (list, tuple)):
            # Already parsed: just truncate.
            return list(x[:max_tags])
        # Missing value (NaN/None): no tags rather than an AttributeError.
        return []

    # assign() builds a new frame, leaving the caller's frame untouched.
    return df.assign(Tags=df['Tags'].apply(_top_tags))


def process(df):
    """Run the cleaning steps on a copy of ``df`` and return the result.

    Steps, in order:
    1. drop the columns listed in the module-level ``drop_cols``;
    2. pass the frame through ``process_tags`` with ``export=True``, which
       also writes the full tag data to file.
    """
    # work on a copy so the frame passed in is never modified
    trimmed = df.copy().drop(columns=drop_cols)

    # keep top tags, exporting full tag data to file
    return process_tags(trimmed, export=True)


# run the full cleaning pipeline on the raw frame and preview the result
steamspy_data = raw_steamspy_data.pipe(process)
steamspy_data.head()

Exported tag data to '../data/exports/steamspy_tag_data.csv'


Unnamed: 0,AppID,Name,Release date,Estimated owners,Required age,Price,DLC count,Supported languages,Windows,Mac,Linux,Positive,Negative,Achievements,Recommendations,Average playtime forever,Median playtime forever,Categories,Genres,Tags,Developer,Publisher
0,20200,Galactic Bowling,2008-10-21,0 - 20000,0,19.99,0,['English'],True,False,False,6,11,30,0,0,0,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","[Indie, Casual, Sports]",Perpetual FX Creative,Perpetual FX Creative
1,655370,Train Bandit,2017-10-12,0 - 20000,0,0.99,0,"['English', 'French', 'Italian', 'German', 'Sp...",True,True,False,53,5,12,0,0,0,"Single-player,Steam Achievements,Full controll...","Action,Indie","[Indie, Action, Pixel Graphics]",Rusty Moyher,Wild Rooster
2,1355720,Henosis™,2020-07-23,0 - 20000,0,5.99,0,"['English', 'French', 'Italian', 'German', 'Sp...",True,True,True,3,0,0,0,0,0,"Single-player,Full controller support","Adventure,Casual,Indie","[2D Platformer, Atmospheric, Surreal]",Odd Critter Games,Odd Critter Games
3,1139950,Two Weeks in Painland,2020-02-03,0 - 20000,0,0.0,0,"['English', 'Spanish - Spain']",True,True,False,50,8,17,0,0,0,"Single-player,Steam Achievements","Adventure,Indie","[Indie, Adventure, Nudity]",Unusual Games,Unusual Games
4,1469160,Wartune Reborn,2021-02-26,50000 - 100000,0,0.0,0,['English'],True,False,False,87,49,0,0,0,0,"Single-player,Multi-player,MMO,PvP,Online PvP,...","Adventure,Casual,Free to Play,Massively Multip...","[Turn-Based Combat, Massively Multiplayer, Mul...",7Road,7Road


In [22]:
# read the exported tag file back in and preview it
exported_tag_data = pd.read_csv('../data/exports/steamspy_tag_data.csv')
exported_tag_data.head()

Unnamed: 0,AppID,Tags
0,20200,"Indie,Casual,Sports,Bowling"
1,655370,"Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc..."
2,1355720,"2D Platformer,Atmospheric,Surreal,Mystery,Puzz..."
3,1139950,"Indie,Adventure,Nudity,Violent,Sexual Content,..."
4,1469160,"Turn-Based Combat,Massively Multiplayer,Multip..."


## Check and Export Clean Data

In [23]:
# number of missing values per column in the cleaned frame
steamspy_data.isna().sum()

AppID                       0
Name                        0
Release date                0
Estimated owners            0
Required age                0
Price                       0
DLC count                   0
Supported languages         0
Windows                     0
Mac                         0
Linux                       0
Positive                    0
Negative                    0
Achievements                0
Recommendations             0
Average playtime forever    0
Median playtime forever     0
Categories                  0
Genres                      0
Tags                        0
Developer                   0
Publisher                   0
dtype: int64

In [24]:
# write the cleaned dataset to disk, leaving out the index column
steamspy_data.to_csv(
    path_or_buf='../data/steam_clean.csv',
    index=False,
)