## Importing libraries and datasets

In [1]:
# Importing libraries

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [2]:
# Render floating-point values with two decimal places in pandas output

pd.set_option('display.float_format', '{:.2f}'.format)

In [3]:
# Show every column when a DataFrame is displayed (None removes the column limit)

pd.options.display.max_columns = None

In [4]:
# Defining the path to datasets.
# The project root can be overridden with the STEAM_ANALYSIS_PATH environment
# variable so the notebook runs on other machines; when the variable is not set,
# the original local folder is used.

path = os.environ.get('STEAM_ANALYSIS_PATH', r'C:\Users\efens\cf_tasks\2023-08 Steam Analysis')

In [5]:
# Load the "steam" project data (steam.csv) from the original-data folder

steam_raw = pd.read_csv(
    filepath_or_buffer=os.path.join(path, '02 Data', '021 Original Data', 'steam.csv'),
)

In [6]:
# Load the "valve_player" project data (Valve_Player_Data.csv) from the original-data folder

player_raw = pd.read_csv(
    filepath_or_buffer=os.path.join(path, '02 Data', '021 Original Data', 'Valve_Player_Data.csv'),
)

In [7]:
# Load the "game developer" project data (gamedev_raw.xlsx) from the original-data folder

gamedev_raw = pd.read_excel(
    io=os.path.join(path, '02 Data', '021 Original Data', 'gamedev_raw.xlsx'),
)

## Cleaning and Consistency checks

## 01. Steam dataset

In [8]:
# Preview the first rows of the Steam dataset
steam_raw.head()

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,124534,3339,17612,317,10000000-20000000,7.19
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,3318,633,277,62,5000000-10000000,3.99
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,FPS;World War II;Multiplayer,0,3416,398,187,34,5000000-10000000,3.99
3,40,Deathmatch Classic,2001-06-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,1273,267,258,184,5000000-10000000,3.99
4,50,Half-Life: Opposing Force,1999-11-01,1,Gearbox Software,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,FPS;Action;Sci-fi,0,5250,288,624,415,5000000-10000000,3.99


In [9]:
# Summary statistics for the selected numeric columns of the Steam dataset
steam_raw[['appid', 'achievements', 'positive_ratings', 'negative_ratings', 'average_playtime', 'median_playtime', 'price']].describe()

Unnamed: 0,appid,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,price
count,27075.0,27075.0,27075.0,27075.0,27075.0,27075.0,27075.0
mean,596203.51,45.25,1000.56,211.03,149.8,146.06,6.08
std,250894.17,352.67,18988.72,4284.94,1827.04,2353.88,7.87
min,10.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,401230.0,0.0,6.0,2.0,0.0,0.0,1.69
50%,599070.0,7.0,24.0,9.0,0.0,0.0,3.99
75%,798760.0,23.0,126.0,42.0,0.0,0.0,7.19
max,1069460.0,9821.0,2644404.0,487076.0,190625.0,190625.0,421.99


In [10]:
# Investigating the column median_playtime
# (the result variable keeps its original "ap_" prefix)

ap_value_counts = steam_raw['median_playtime'].value_counts()
ap_value_counts

0       20905
1         155
3          72
2          52
9          48
        ...  
2379        1
2478        1
1052        1
2390        1
816         1
Name: median_playtime, Length: 1312, dtype: int64

- This means that many games have been purchased but not yet played. Will this also be true for the most popular game, which has the maximum number of users? And what about expensive games?

In [11]:
# Investigating the column price: how many games share each price point

pr_value_counts = steam_raw['price'].value_counts()
pr_value_counts

3.99     3211
0.79     2892
0.00     2560
6.99     2050
7.19     1304
         ... 
20.51       1
11.95       1
3.92        1
6.59        1
6.10        1
Name: price, Length: 282, dtype: int64

- There are many free games available on the Steam platform. What are the top 100 free games? How many users do they have? And what is their median_playtime?

In [12]:
# Data type of every column in the Steam dataset
steam_raw.dtypes

appid                 int64
name                 object
release_date         object
english               int64
developer            object
publisher            object
platforms            object
required_age          int64
categories           object
genres               object
steamspy_tags        object
achievements          int64
positive_ratings      int64
negative_ratings      int64
average_playtime      int64
median_playtime       int64
owners               object
price               float64
dtype: object

#### Checking and cleaning

In [13]:
# Count the missing values in each column of the Steam dataset

missing_values = steam_raw.isna().sum()
missing_values

appid               0
name                0
release_date        0
english             0
developer           0
publisher           0
platforms           0
required_age        0
categories          0
genres              0
steamspy_tags       0
achievements        0
positive_ratings    0
negative_ratings    0
average_playtime    0
median_playtime     0
owners              0
price               0
dtype: int64

- No missing values were found.

In [14]:
# Checking for mixed-type values.
# A column is reported when its cells hold more than one Python type.
# Series.map replaces the deprecated DataFrame.applymap, and counting the
# distinct types also works on an empty frame (no .iloc[0] lookup).

for col in steam_raw.columns:
    cell_types = steam_raw[col].map(type)
    if cell_types.nunique() > 1:
        print(col)

- no mixed-type values

In [15]:
# Collect the rows that are exact duplicates of an earlier row

steam_dups = steam_raw.loc[steam_raw.duplicated()]

In [16]:
# Show the duplicated rows, if any (an empty table means there are none)
steam_dups.head()

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price


- no duplicates found

## 02. Valve Player

In [17]:
# Preview the first rows of the Valve player dataset
player_raw.head()

Unnamed: 0,Month_Year,Avg_players,Gain,Percent_Gain,Peak_Players,URL,Date,Game_Name
0,September 2021,512350.92,268.96,+0.05%,942519,https://steamcharts.com/app/730,2021-09-01,Counter Strike: Global Offensive
1,August 2021,512081.96,6014.6,+1.19%,802544,https://steamcharts.com/app/730,2021-08-01,Counter Strike: Global Offensive
2,July 2021,506067.36,-43279.72,-7.88%,763523,https://steamcharts.com/app/730,2021-07-01,Counter Strike: Global Offensive
3,June 2021,549347.08,-110541.81,-16.75%,929940,https://steamcharts.com/app/730,2021-06-01,Counter Strike: Global Offensive
4,May 2021,659888.89,-63457.63,-8.77%,1087197,https://steamcharts.com/app/730,2021-05-01,Counter Strike: Global Offensive


In [18]:
# Summary statistics for the average and peak player counts
player_raw[['Avg_players','Peak_Players']].describe()

Unnamed: 0,Avg_players,Peak_Players
count,5271.0,5271.0
mean,37631.49,70320.72
std,103295.06,194194.82
min,0.0,0.0
25%,6720.2,13029.0
50%,12358.4,23447.0
75%,24149.76,46101.5
max,1584886.77,3236027.0


In [19]:
# Data type of every column in the Valve player dataset
player_raw.dtypes

Month_Year       object
Avg_players     float64
Gain            float64
Percent_Gain     object
Peak_Players      int64
URL              object
Date             object
Game_Name        object
dtype: object

In [20]:
# Investigating the column Game_Name: number of rows per game

pl_value_counts = player_raw['Game_Name'].value_counts()
pl_value_counts

Counter Strike: Global Offensive    111
Left 4 Dead 2                       111
Dota 2                              111
Team Fortress 2                     111
Garry's Mod                         111
                                   ... 
NARAKA: BLADEPOINT                    2
Tales of Arise                        1
NBA 2k22                              1
Cookie Clicker                        1
Pathfinder: Wrath of Righteous        1
Name: Game_Name, Length: 98, dtype: int64

In [21]:
# Investigating the column URL: number of rows per source URL
# NOTE: this reuses the name pl_value_counts, replacing the Game_Name counts

pl_value_counts = player_raw['URL'].value_counts()
pl_value_counts

https://steamcharts.com/app/730        111
https://steamcharts.com/app/550        111
https://steamcharts.com/app/570        111
https://steamcharts.com/app/440        111
https://steamcharts.com/app/4000       111
                                      ... 
https://steamcharts.com/app/1203220      2
https://steamcharts.com/app/740130       1
https://steamcharts.com/app/1644960      1
https://steamcharts.com/app/1454400      1
https://steamcharts.com/app/1184370      1
Name: URL, Length: 98, dtype: int64

- This dataset compiles data from various steam charts. 

#### Cleaning and checking

In [22]:
# Count the missing values in each column of the Valve player dataset

pl_missing_values = player_raw.isna().sum()
pl_missing_values

Month_Year       0
Avg_players      0
Gain            98
Percent_Gain    98
Peak_Players     0
URL              0
Date             0
Game_Name        0
dtype: int64

- There are 98 missing values in the Gain and Percent Gain columns. As these two columns are not relevant for further analysis, I decided to create a new dataset without them.

In [23]:
# Build the cleaned player dataset: keep only the listed columns (Gain and
# Percent_Gain are left out) as an independent copy

player_clean = player_raw.loc[
    :, ['Game_Name','Month_Year', 'Avg_players', 'Peak_Players', 'Date', 'URL']
].copy()

In [24]:
# Checking the new dataset: preview its first rows
player_clean.head()

Unnamed: 0,Game_Name,Month_Year,Avg_players,Peak_Players,Date,URL
0,Counter Strike: Global Offensive,September 2021,512350.92,942519,2021-09-01,https://steamcharts.com/app/730
1,Counter Strike: Global Offensive,August 2021,512081.96,802544,2021-08-01,https://steamcharts.com/app/730
2,Counter Strike: Global Offensive,July 2021,506067.36,763523,2021-07-01,https://steamcharts.com/app/730
3,Counter Strike: Global Offensive,June 2021,549347.08,929940,2021-06-01,https://steamcharts.com/app/730
4,Counter Strike: Global Offensive,May 2021,659888.89,1087197,2021-05-01,https://steamcharts.com/app/730


In [25]:
# Checking for mixed-type values.
# A column is reported when its cells hold more than one Python type.
# Series.map replaces the deprecated DataFrame.applymap, and counting the
# distinct types also works on an empty frame (no .iloc[0] lookup).

for col in player_clean.columns:
    cell_types = player_clean[col].map(type)
    if cell_types.nunique() > 1:
        print(col)

- no mixed-type values found

In [26]:
# Collect the rows of the cleaned player data that exactly repeat an earlier row,
# then show them (an empty table means there are none)

player_dups = player_clean.loc[player_clean.duplicated()]
player_dups.head()

Unnamed: 0,Game_Name,Month_Year,Avg_players,Peak_Players,Date,URL


- No duplicates found.

## 03. Video games developers

In [27]:
# Preview the first rows of the game developer dataset
gamedev_raw.head()

Unnamed: 0,Developer,City,Administrative_division,Country,Est.
0,0verflow,Tokyo,,Japan,1997
1,11 bit studios,Warsaw,Masovian Voivodeship,Poland,2010
2,1C Company,Moscow,,Russia,1991
3,1-Up Studio,Tokyo,,Japan,2000
4,2K Czech,Brno,,Czech Republic,1997


In [28]:
# Data type of every column in the game developer dataset
gamedev_raw.dtypes

Developer                  object
City                       object
Administrative_division    object
Country                    object
Est.                       object
dtype: object

In [29]:
# Investigating the column Administrative_division: number of developers per division

dev_value_counts = gamedev_raw['Administrative_division'].value_counts()
dev_value_counts

California         131
England             87
Washington          38
Texas               29
Massachusetts       19
                  ... 
Karnataka            1
Iowa                 1
Wales                1
Minnesota            1
South Australia      1
Name: Administrative_division, Length: 62, dtype: int64

In [30]:
# Investigating the column Country: number of developers per country
# NOTE: this reuses the name dev_value_counts, replacing the Administrative_division counts

dev_value_counts = gamedev_raw['Country'].value_counts()
dev_value_counts

United States     337
Japan             158
United Kingdom     99
Canada             46
Sweden             32
France             27
Germany            23
Czech Republic     18
South Korea        17
Australia          15
Poland             14
China              14
Finland            13
Russia             12
Netherlands        10
Denmark             8
Ukraine             7
Hungary             7
Croatia             6
Spain               5
Italy               5
Bulgaria            5
Austria             5
Slovakia            4
Belgium             4
Chile               4
Turkey              4
Norway              4
New Zealand         3
Slovenia            3
Argentina           2
Malaysia            2
Singapore           2
Mexico              2
Ireland             2
Greece              2
Indonesia           2
India               2
Iceland             2
Malta               1
England             1
Cameroon            1
Taiwan              1
Belarus             1
Cyprus              1
South Afri

#### Cleaning and checking

In [31]:
# Count the missing values in each column of the game developer dataset

gd_missing_values = gamedev_raw.isna().sum()
gd_missing_values

Developer                    2
City                        25
Administrative_division    444
Country                      7
Est.                       211
dtype: int64

- There are many missing values in the dataset.

In [32]:
# Keep only the rows that have a developer name

gamedev_raw = gamedev_raw[gamedev_raw['Developer'].notna()]

In [33]:
# Confirm that no missing developer names remain

dev_missing_values = gamedev_raw['Developer'].isna().sum()
dev_missing_values

0

In [34]:
# List of columns with missing values
columns_to_replace = ['City', 'Administrative_division', 'Country', 'Est.']

# Replace missing values with "N/A" for those columns.
# fillna with a column -> value mapping returns a new frame, so nothing is
# assigned into the row-filtered frame (that slice assignment can raise
# SettingWithCopyWarning) and the cell stays safe to re-run.
gamedev_raw = gamedev_raw.fillna({col: 'N/A' for col in columns_to_replace})

In [35]:
# Re-count the missing values per column after the "N/A" replacement

new_missing_values = gamedev_raw.isna().sum()
new_missing_values

Developer                  0
City                       0
Administrative_division    0
Country                    0
Est.                       0
dtype: int64

In [36]:
# Build the working developer dataset: same columns in a new order, as an independent copy

dev_clean = gamedev_raw.loc[
    :, ['Developer','Est.', 'Country', 'City', 'Administrative_division']
].copy()

In [37]:
# Preview the first rows of the reordered developer dataset
dev_clean.head()

Unnamed: 0,Developer,Est.,Country,City,Administrative_division
0,0verflow,1997,Japan,Tokyo,
1,11 bit studios,2010,Poland,Warsaw,Masovian Voivodeship
2,1C Company,1991,Russia,Moscow,
3,1-Up Studio,2000,Japan,Tokyo,
4,2K Czech,1997,Czech Republic,Brno,


In [38]:
# Checking for mixed-type values.
# A column is reported when its cells hold more than one Python type.
# Series.map replaces the deprecated DataFrame.applymap, and counting the
# distinct types also works on an empty frame (no .iloc[0] lookup).

for col in dev_clean.columns:
    cell_types = dev_clean[col].map(type)
    if cell_types.nunique() > 1:
        print(col)

Est.


In [39]:
# Investigating the column Est., which the mixed-type check flagged

dev_est_value_counts = dev_clean['Est.'].value_counts()
dev_est_value_counts

N/A                                                       210
2000                                                       43
2002                                                       41
1999                                                       38
1997                                                       36
2001                                                       33
1998                                                       32
1994                                                       31
2006                                                       30
1996                                                       30
2005                                                       29
1995                                                       29
2009                                                       24
2003                                                       22
2007                                                       21
1993                                                       20
1992    

In [40]:
# Importing module with regular expression

import re

In [41]:
# Column from which the founding year is extracted
column_to_extract = 'Est.'

# The first standalone four-digit number in a value is taken as the year
pattern = r'\b(\d{4})\b'

# Vectorised extraction: every value is converted to text and scanned once
# (the previous lambda ran re.search twice per value). Values without a
# four-digit number get the placeholder 'N/A'.
dev_clean['Year'] = (
    dev_clean[column_to_extract]
    .astype(str)
    .str.extract(pattern, expand=False)
    .fillna('N/A')
)

In [42]:
# Investigating the new column Year: number of developers per extracted year

y_value_counts = dev_clean['Year'].value_counts()
y_value_counts

N/A     210
2000     43
2002     41
1999     38
1997     37
2001     33
1998     32
1994     31
2006     30
1996     30
1995     29
2005     29
2009     24
2003     22
1993     21
2007     21
1992     19
2010     18
2004     17
1991     17
2008     17
1988     16
1989     14
1990     14
1986     13
1982     12
2012     12
1984     11
1987     11
1985     11
2011     10
2014      9
1979      8
2013      7
1978      5
1980      4
2015      3
2017      3
1983      3
2018      3
1981      2
1960      2
2016      2
1974      2
1976      1
1953      1
1975      1
1973      1
1889      1
1969      1
2022      1
1955      1
1967      1
Name: Year, dtype: int64

In [43]:
# Build the final developer dataset: the extracted Year takes the place of Est.

gamedev_clean = dev_clean.loc[
    :, ['Developer','Year', 'Country', 'City', 'Administrative_division']
].copy()

In [44]:
# Checking for mixed-type values.
# A column is reported when its cells hold more than one Python type.
# Series.map replaces the deprecated DataFrame.applymap, and counting the
# distinct types also works on an empty frame (no .iloc[0] lookup).

for col in gamedev_clean.columns:
    cell_types = gamedev_clean[col].map(type)
    if cell_types.nunique() > 1:
        print(col)

- No mixed-type data remains.

In [45]:
# Preview the first rows of the final developer dataset
gamedev_clean.head()

Unnamed: 0,Developer,Year,Country,City,Administrative_division
0,0verflow,1997,Japan,Tokyo,
1,11 bit studios,2010,Poland,Warsaw,Masovian Voivodeship
2,1C Company,1991,Russia,Moscow,
3,1-Up Studio,2000,Japan,Tokyo,
4,2K Czech,1997,Czech Republic,Brno,


In [46]:
# Collect the rows of the final developer data that exactly repeat an earlier row,
# then show them (an empty table means there are none)

dev_dups = gamedev_clean.loc[gamedev_clean.duplicated()]
dev_dups.head()

Unnamed: 0,Developer,Year,Country,City,Administrative_division


- No duplicates found.

## Exporting datasets

In [47]:
# Write the Steam data to the prepared-data folder as steam_clean.csv (no index column)
steam_raw.to_csv(
    path_or_buf=os.path.join(path, '02 Data', '022 Prepared Data', 'steam_clean.csv'),
    index=False,
    header=True,
)

In [48]:
# Write the cleaned player data to the prepared-data folder as player_clean.csv (no index column)
player_clean.to_csv(
    path_or_buf=os.path.join(path, '02 Data', '022 Prepared Data', 'player_clean.csv'),
    index=False,
    header=True,
)

In [49]:
# Write the cleaned developer data to the prepared-data folder as gamedev_clean.xlsx (no index column)
gamedev_clean.to_excel(
    excel_writer=os.path.join(path, '02 Data', '022 Prepared Data', 'gamedev_clean.xlsx'),
    index=False,
    header=True,
)