## Preparing data for analysis (Part 1)

In [1]:
# Importing libraries

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [2]:
# Change the format of the values visualization

pd.options.display.float_format = '{:.2f}'.format

In [3]:
# Reset the maximum number of columns to display to default (all columns)

pd.set_option('display.max_columns', None)

In [4]:
# Defining the path to datasets

path = r'C:\Users\efens\cf_tasks\2023-08 Steam Analysis'

## 00. Importing project data

In [5]:
# Importing project data "steam"

steam_clean = pd.read_csv(os.path.join(path, '02 Data', '022 Prepared Data', 'steam_clean.csv'))

In [6]:
# Importing project data "game developer"

gamedev_clean = pd.read_excel(os.path.join(path, '02 Data', '022 Prepared Data', 'gamedev_clean.xlsx'))

## 01.  Currency conversion from GBP to EUR

In [1]:
# Creating a new column price_eur for price in euro to the exchange rate

steam_clean['price_eur'] = steam_clean['price'].apply(lambda x: x*1.16)

NameError: name 'steam_clean' is not defined

In [8]:
steam_clean.head(3)

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price,price_eur
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,124534,3339,17612,317,10000000-20000000,7.19,8.34
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,3318,633,277,62,5000000-10000000,3.99,4.63
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,FPS;World War II;Multiplayer,0,3416,398,187,34,5000000-10000000,3.99,4.63


## 02. Separating the values in owners column

#### Creating two columns from the column "owners" owners_min and owners_max

In [9]:
# Splitting values and create new columns

steam_clean[['owners_min', 'owners_max']] = steam_clean['owners'].str.split('-', expand=True)

# Converting the new columns to numeric type

steam_clean['owners_min'] = pd.to_numeric(steam_clean['owners_min'])
steam_clean['owners_max'] = pd.to_numeric(steam_clean['owners_max'])

In [10]:
steam_clean.head(1)

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price,price_eur,owners_min,owners_max
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,124534,3339,17612,317,10000000-20000000,7.19,8.34,10000000,20000000


#### Creating the new column with average meaning for owners

In [11]:
steam_clean['owners_avg'] = (steam_clean['owners_max'] + steam_clean['owners_min']) / 2

In [12]:
steam_clean.head(1)

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price,price_eur,owners_min,owners_max,owners_avg
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,124534,3339,17612,317,10000000-20000000,7.19,8.34,10000000,20000000,15000000.0


In [13]:
# Displaying summary statistics for 'owners_min', 'owners_max', and 'owners_avg'

steam_clean[['owners_min', 'owners_max','owners_avg']].describe()

Unnamed: 0,owners_min,owners_max,owners_avg
count,27075.0,27075.0,27075.0
mean,80126.69,188054.29,134090.49
std,870897.7,1786800.66,1328088.58
min,0.0,20000.0,10000.0
25%,0.0,20000.0,10000.0
50%,0.0,20000.0,10000.0
75%,20000.0,50000.0,35000.0
max,100000000.0,200000000.0,150000000.0


In [14]:
# Counting the occurrences of each unique value in the 'owners_avg' column

steam_clean['owners_avg'].value_counts()

10000.00        18596
35000.00         3059
75000.00         1695
150000.00        1386
350000.00        1272
750000.00         513
1500000.00        288
3500000.00        193
7500000.00         46
15000000.00        21
35000000.00         3
75000000.00         2
150000000.00        1
Name: owners_avg, dtype: int64

## 03. Preparing column categories for further analysis

In [15]:
steam_clean['categories'].value_counts()

Single-player                                                                                                              6110
Single-player;Steam Achievements                                                                                           2334
Single-player;Steam Achievements;Steam Trading Cards                                                                        848
Single-player;Partial Controller Support                                                                                    804
Single-player;Steam Trading Cards                                                                                           792
                                                                                                                           ... 
Single-player;Steam Achievements;Steam Trading Cards;Captions available;Partial Controller Support;Commentary available       1
Single-player;Steam Achievements;Full controller support;Steam Trading Cards;VR Support;Steam Cloud;Stea

In [16]:
# Concatenating all categories into a single string
all_categories = ';'.join(steam_clean['categories'])

# Splitting the concatenated string and extracting unique values
unique_categories = set(all_categories.split(';'))

print(unique_categories)

{'Partial Controller Support', 'Multi-player', 'Steam Turn Notifications', 'Shared/Split Screen', 'Includes Source SDK', 'Includes level editor', 'Steam Workshop', 'Captions available', 'In-App Purchases', 'Local Co-op', 'Cross-Platform Multiplayer', 'Online Multi-Player', 'VR Support', 'Valve Anti-Cheat enabled', 'Single-player', 'Steam Leaderboards', 'SteamVR Collectibles', 'Local Multi-Player', 'MMO', 'Stats', 'Steam Achievements', 'Online Co-op', 'Mods (require HL2)', 'Co-op', 'Mods', 'Steam Trading Cards', 'Commentary available', 'Full controller support', 'Steam Cloud'}


#### Creating a new column 'gameplay"

1. If value contains "Single-player" - then save as a "Single-player"
2. If value contains 'Co-op', or 'Local Multi-Player', or 'Cross-Platform Multiplayer', or 'Multi-player',or 'Local Co-op' - save as 'Multi-player'
3. If value contains 'Online Multi-Player', 'Online Co-op' save as 'Online Multi-Player'
4. If value containg 'MMO' save as 'MMO'
5. If value contains for example 1. and 2. --> 2 is more important so save as the value for "2"
6. If value doesnt contain any of the listed examples save as 'Other"

In [17]:
# Writing if statement

def categorize_gameplay(categories):
    if 'Multi-player' in categories:
        if any(tag in categories for tag in ['Co-op', 'Local Multi-Player', 'Cross-Platform Multiplayer']):
            return 'Multi-Player'
        elif 'Online Multi-Player' in categories or 'Online Co-op' in categories:
            return 'Online Multi-Player'
        else:
            return 'Multi-Player'
    elif 'Single-player' in categories:
        return 'Single Player'
    elif 'MMO' in categories:
        return 'MMO'
    else:
        return 'Other'

In [18]:
# Applying the categorize_gameplay function to create a new column

steam_clean['gameplay_type'] = steam_clean['categories'].apply(categorize_gameplay)

In [19]:

steam_clean['gameplay_type'].value_counts()

Single Player          22478
Multi-Player            3503
Other                    509
Online Multi-Player      471
MMO                      114
Name: gameplay_type, dtype: int64

In [20]:
steam_clean.head()

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price,price_eur,owners_min,owners_max,owners_avg,gameplay_type
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,124534,3339,17612,317,10000000-20000000,7.19,8.34,10000000,20000000,15000000.0,Multi-Player
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,3318,633,277,62,5000000-10000000,3.99,4.63,5000000,10000000,7500000.0,Multi-Player
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,FPS;World War II;Multiplayer,0,3416,398,187,34,5000000-10000000,3.99,4.63,5000000,10000000,7500000.0,Multi-Player
3,40,Deathmatch Classic,2001-06-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,1273,267,258,184,5000000-10000000,3.99,4.63,5000000,10000000,7500000.0,Multi-Player
4,50,Half-Life: Opposing Force,1999-11-01,1,Gearbox Software,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,FPS;Action;Sci-fi,0,5250,288,624,415,5000000-10000000,3.99,4.63,5000000,10000000,7500000.0,Multi-Player


## 04.  Creating a new dataset with geographic information

In [21]:
steam_clean['developer'].value_counts()

Choice of Games               94
KOEI TECMO GAMES CO., LTD.    72
Ripknot Systems               62
Laush Dmitriy Sergeevich      51
Nikita "Ghost_RUS"            50
                              ..
CRAPPY ZOMBIE GAME STUDIO      1
Ramon Mujica                   1
Oomst Games                    1
Joe Censored Games             1
Adept Studios GD               1
Name: developer, Length: 17113, dtype: int64

In [23]:
gamedev_clean.head(3)

Unnamed: 0,Developer,Year,Country,City,Administrative_division
0,0verflow,1997.0,Japan,Tokyo,
1,11 bit studios,2010.0,Poland,Warsaw,Masovian Voivodeship
2,1C Company,1991.0,Russia,Moscow,


In [25]:
# Converting Year to a proper datetime

gamedev_clean['Year'] = pd.to_datetime(gamedev_clean['Year'], format='%Y').dt.strftime('%Y')

gamedev_clean.head(3)

Unnamed: 0,Developer,Year,Country,City,Administrative_division
0,0verflow,1997,Japan,Tokyo,
1,11 bit studios,2010,Poland,Warsaw,Masovian Voivodeship
2,1C Company,1991,Russia,Moscow,


In [26]:
gamedev_clean['Developer'].value_counts()

Facepunch Studios          2
Punch Entertainment        2
Red Thread Games           2
Schell Games               2
VoxelStorm                 2
                          ..
Irem                       1
Iron Galaxy Studios        1
Iron Lore Entertainment    1
Irrational Games           1
Zoonami                    1
Name: Developer, Length: 886, dtype: int64

In [27]:
# Changing column names to a lower case

gamedev_clean.columns =gamedev_clean.columns.str.lower()

In [28]:
gamedev_clean.head(1)

Unnamed: 0,developer,year,country,city,administrative_division
0,0verflow,1997,Japan,Tokyo,


In [29]:
steam_clean.shape

(27075, 23)

In [30]:
gamedev_clean.shape

(945, 5)

In [31]:
# Merging the dataframes based on 'developer'

merged_steam = steam_clean.merge(gamedev_clean, on='developer', how='left')

In [32]:
merged_steam.head(3)

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price,price_eur,owners_min,owners_max,owners_avg,gameplay_type,year,country,city,administrative_division
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,124534,3339,17612,317,10000000-20000000,7.19,8.34,10000000,20000000,15000000.0,Multi-Player,1996,United States,Kirkland,Washington
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,3318,633,277,62,5000000-10000000,3.99,4.63,5000000,10000000,7500000.0,Multi-Player,1996,United States,Kirkland,Washington
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,FPS;World War II;Multiplayer,0,3416,398,187,34,5000000-10000000,3.99,4.63,5000000,10000000,7500000.0,Multi-Player,1996,United States,Kirkland,Washington


In [33]:
# Rename the 'year' column to 'dev_year'
merged_steam.rename(columns={'year': 'dev_year'}, inplace=True)

In [34]:
merged_steam.head(3)

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price,price_eur,owners_min,owners_max,owners_avg,gameplay_type,dev_year,country,city,administrative_division
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,124534,3339,17612,317,10000000-20000000,7.19,8.34,10000000,20000000,15000000.0,Multi-Player,1996,United States,Kirkland,Washington
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,3318,633,277,62,5000000-10000000,3.99,4.63,5000000,10000000,7500000.0,Multi-Player,1996,United States,Kirkland,Washington
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,FPS;World War II;Multiplayer,0,3416,398,187,34,5000000-10000000,3.99,4.63,5000000,10000000,7500000.0,Multi-Player,1996,United States,Kirkland,Washington


In [35]:
# Checking for missing values

missing_values = merged_steam.isnull().sum()
missing_values

appid                          0
name                           0
release_date                   0
english                        0
developer                      0
publisher                      0
platforms                      0
required_age                   0
categories                     0
genres                         0
steamspy_tags                  0
achievements                   0
positive_ratings               0
negative_ratings               0
average_playtime               0
median_playtime                0
owners                         0
price                          0
price_eur                      0
owners_min                     0
owners_max                     0
owners_avg                     0
gameplay_type                  0
dev_year                   26100
country                    25726
city                       25756
administrative_division    26423
dtype: int64

In [36]:
# Select rows with missing 'Country' values for specific 'Developer' values

missing_country = merged_steam.loc[merged_steam['country'].isnull(), 'developer'].unique()

In [37]:
count_missing_country_values = len(missing_country)
count_missing_country_values

16781

In [38]:
# Fill missing values in the merged DataFrame with 'N/A'

merged_steam.fillna('N/A', inplace=True)

In [39]:
# Checking for missing values

missing_values = merged_steam.isnull().sum()
missing_values

appid                      0
name                       0
release_date               0
english                    0
developer                  0
publisher                  0
platforms                  0
required_age               0
categories                 0
genres                     0
steamspy_tags              0
achievements               0
positive_ratings           0
negative_ratings           0
average_playtime           0
median_playtime            0
owners                     0
price                      0
price_eur                  0
owners_min                 0
owners_max                 0
owners_avg                 0
gameplay_type              0
dev_year                   0
country                    0
city                       0
administrative_division    0
dtype: int64

#### Limitation: 

1. I have not been able to find a database of game developers' countries, so information on the location and price range of games will be very limited. 

2. Further information gathering and recalculation is required at a later date.

## Exporting the datasets

In [41]:
merged_steam.to_csv(os.path.join(path, '02 Data','022 Prepared Data', 'merged_steam.csv'), header=True, index=False)