In [1]:
import warnings
# NOTE(review): with no category or module argument this suppresses every
# warning for the rest of the session, including pandas deprecation and
# chained-assignment warnings that would point at real problems below.
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [109]:
# Load the games dataset; the path is relative to the working directory.
df = pd.read_csv('datasets/games.csv')

In [110]:
# Show the first five rows (head() default) as a quick look at the data.
df.head()

Unnamed: 0,Name,Platform,Year_of_Release,Genre,NA_sales,EU_sales,JP_sales,Other_sales,Critic_Score,User_Score,Rating
0,Wii Sports,Wii,2006.0,Sports,41.36,28.96,3.77,8.45,76.0,8.0,E
1,Super Mario Bros.,NES,1985.0,Platform,29.08,3.58,6.81,0.77,,,
2,Mario Kart Wii,Wii,2008.0,Racing,15.68,12.76,3.79,3.29,82.0,8.3,E
3,Wii Sports Resort,Wii,2009.0,Sports,15.61,10.93,3.28,2.95,80.0,8.0,E
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,11.27,8.89,10.22,1.0,,,


In [111]:
# Print column dtypes and non-null counts to spot missing data and wrong types.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16715 entries, 0 to 16714
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             16713 non-null  object 
 1   Platform         16715 non-null  object 
 2   Year_of_Release  16446 non-null  float64
 3   Genre            16713 non-null  object 
 4   NA_sales         16715 non-null  float64
 5   EU_sales         16715 non-null  float64
 6   JP_sales         16715 non-null  float64
 7   Other_sales      16715 non-null  float64
 8   Critic_Score     8137 non-null   float64
 9   User_Score       10014 non-null  object 
 10  Rating           9949 non-null   object 
dtypes: float64(6), object(5)
memory usage: 1.4+ MB


In [139]:
# Show 10 random rows. A fixed random_state makes the displayed sample
# identical on every Restart & Run All instead of changing on each execution.
df.sample(10, random_state=42)

Unnamed: 0,name,platform,year_of_release,genre,na_sales,eu_sales,jp_sales,other_sales,critic_score,user_score,rating
12245,Gundam Try Age SP,3DS,2014-01-01,Strategy,0.0,0.0,0.07,0.0,,,
7696,UFC: Tapout,XB,2002-01-01,Fighting,0.14,0.05,0.0,0.01,,,
4545,Naruto: Ninja Council 3,DS,2006-01-01,Action,0.39,0.0,0.0,0.03,56.0,6.5,E10+
6041,"Sakura Wars 4: Koi Seyo,Otome",DC,2002-01-01,Adventure,0.0,0.0,0.29,0.0,,,
7418,Prince of Persia: Revelations,PSP,2005-01-01,Adventure,0.18,0.01,0.0,0.02,,,
1475,The Elder Scrolls V: Skyrim,PS4,2016-01-01,Role-Playing,0.48,0.59,0.04,0.21,,,
12105,Heavenly Guardian,Wii,2007-01-01,Shooter,0.06,0.0,0.0,0.0,,5.0,E
8054,Mega Man Soccer,SNES,1993-01-01,Sports,0.04,0.01,0.13,0.0,,,
2915,Chocobo's Dungeon 2,PS,1998-01-01,Role-Playing,0.04,0.03,0.58,0.05,,,
12722,Samurai Warriors 4: Empires,PS3,2015-01-01,Action,0.0,0.0,0.06,0.0,,5.0,T


### Data Preprocessing

Make columns lowercase.

I will apply the `str.lower()` method to the column names.

In [113]:
# Normalise every column label to lowercase, then display the new labels.
df.columns = df.columns.map(str.lower)
df.columns

Index(['name', 'platform', 'year_of_release', 'genre', 'na_sales', 'eu_sales',
       'jp_sales', 'other_sales', 'critic_score', 'user_score', 'rating'],
      dtype='object')

Convert data types:

I will convert `year_of_release` to datetime, to allow time-based analysis by year.

In [114]:
# Current dtype of the release-year column, checked before converting it.
df['year_of_release'].dtype

dtype('float64')

In [115]:
# Parse the release year as a datetime using the four-digit-year format.
df['year_of_release'] = pd.to_datetime(df['year_of_release'], format='%Y')

I will convert `user_score` to float, because its ratings are decimal numbers that are currently stored as strings.

In [116]:
# Current dtype of the user-score column, checked before converting it.
df['user_score'].dtype

dtype('O')

In [117]:
# Frequency of each distinct user-score value, to reveal non-numeric entries.
df['user_score'].value_counts()

tbd    2424
7.8     324
8       290
8.2     282
8.3     254
       ... 
0.9       2
1.3       2
2.3       2
0         1
9.7       1
Name: user_score, Length: 96, dtype: int64

In [118]:
# First rows whose user score is the literal string "tbd".
df.loc[df['user_score'] == 'tbd'].head()

Unnamed: 0,name,platform,year_of_release,genre,na_sales,eu_sales,jp_sales,other_sales,critic_score,user_score,rating
119,Zumba Fitness,Wii,2010-01-01,Sports,3.45,2.59,0.0,0.66,,tbd,E
301,Namco Museum: 50th Anniversary,PS2,2005-01-01,Misc,2.08,1.35,0.0,0.54,61.0,tbd,E10+
520,Zumba Fitness 2,Wii,2011-01-01,Sports,1.51,1.03,0.0,0.27,,tbd,T
645,uDraw Studio,Wii,2010-01-01,Misc,1.65,0.57,0.0,0.2,71.0,tbd,E
657,Frogger's Adventures: Temple of the Frog,GBA,NaT,Adventure,2.15,0.18,0.0,0.07,73.0,tbd,E


But first I will change the "tbd" values to 5, the midpoint of the user score scale (maximum 10), because on average the users have not explicitly decided whether it was a good or bad game.

In [119]:
# Overwrite the "tbd" placeholder with 5 so the column can be cast to float.
is_tbd = df['user_score'].eq('tbd')
df.loc[is_tbd, 'user_score'] = 5

In [120]:
# Cast the user scores from strings to floating-point numbers.
df['user_score'] = df['user_score'].astype(float)

In [121]:
#missing values

Filling in missing values

In [141]:
# Number of missing values in each column.
df.isna().sum()

name                  0
platform              0
year_of_release       0
genre                 0
na_sales              0
eu_sales              0
jp_sales              0
other_sales           0
critic_score       8578
user_score            0
rating             6766
dtype: int64

`name` column has two missing values

In [127]:
# Rows where the game name is missing.
df.loc[df['name'].isna()]

Unnamed: 0,name,platform,year_of_release,genre,na_sales,eu_sales,jp_sales,other_sales,critic_score,user_score,rating


In [150]:
# Rows whose name is exactly 'Game'.
df.loc[df['name'].eq('Game')]

Unnamed: 0,name,platform,year_of_release,genre,na_sales,eu_sales,jp_sales,other_sales,critic_score,user_score,rating
659,Game,GEN,1993-01-01,Misc,1.78,0.53,0.0,0.08,,-1.0,
14244,Game,GEN,1993-01-01,Misc,0.0,0.0,0.03,0.0,,-1.0,


Looking at the rows with missing names, it seems to be the same game recorded twice, with a separate record for `jp_sales`.
I will fill in the missing names with a placeholder name.

In [125]:
# Give rows with a missing name the placeholder 'Game'. This has to be live
# code: while commented out, a fresh kernel run leaves the names as NaN and
# no longer matches the outputs saved in this notebook.
df.loc[df['name'].isnull(), 'name'] = 'Game'

In [151]:
# Japanese sales for the rows whose name is 'Game'. Row mask and column are
# selected in a single .loc call instead of chained indexing.
df.loc[df['name'] == 'Game', 'jp_sales']

659      0.00
14244    0.03
Name: jp_sales, dtype: float64

`genre` has two missing values.

I will fill them in as Miscellaneous (`Misc`), since it cannot be determined exactly which genre they belong to.

In [130]:
# Label rows with no genre as 'Misc'.
df['genre'] = df['genre'].fillna('Misc')

`year_of_release` has 269 randomly missing values.

The values could be missing because of a lack of entry during data collection, or a program error during retrieval from the database.

I will forward-fill them, so each missing year takes the value from the row before it.

In [133]:
# Disabled inspection cell: when uncommented it lists the rows whose
# year_of_release is null.
# NOTE(review): the saved output below cannot have come from this cell as it
# now stands (the line is commented out) - re-run it or clear the output.
#df[df['year_of_release'].isnull()]

Unnamed: 0,name,platform,year_of_release,genre,na_sales,eu_sales,jp_sales,other_sales,critic_score,user_score,rating
8547,Alone in the Dark: The New Nightmare,PS,NaT,Adventure,0.09,0.06,0.0,0.01,77.0,8.1,M
183,Madden NFL 2004,PS2,NaT,Sports,4.26,0.26,0.01,0.71,94.0,8.5,E
3352,The Golden Compass,Wii,NaT,Action,0.26,0.28,0.0,0.07,35.0,6.8,E10+
16329,Brothers in Arms: Furious 4,X360,NaT,Shooter,0.01,0.0,0.0,0.0,,,M
13929,Sega Rally 2006,PS2,NaT,Racing,0.0,0.0,0.04,0.0,,,
10465,Disney's Chicken Little: Ace In Action,Wii,NaT,Shooter,0.09,0.0,0.0,0.01,72.0,5.0,E10+
10486,Atsumare! Power Pro Kun no DS Koushien,DS,NaT,Sports,0.0,0.0,0.1,0.0,,,
7353,Ghostbusters II,2600,NaT,Action,0.2,0.01,0.0,0.0,,,
9293,Disney's Cinderella: Magical Dreams,GBA,NaT,Platform,0.1,0.04,0.0,0.0,69.0,5.0,E
8680,Jurassic Park: The Game,X360,NaT,Action,0.15,0.0,0.0,0.01,60.0,3.6,T


In [136]:
# Forward-fill missing release years and assign the result back to the column.
# Calling fillna(..., inplace=True) on df['col'] acts on an intermediate object
# and does not update df under pandas Copy-on-Write; fillna(method=...) is also
# deprecated in favour of .ffill().
# NOTE(review): forward-fill copies the year from the preceding row, which is
# a different game - confirm this imputation is acceptable for the
# year-based analysis.
df['year_of_release'] = df['year_of_release'].ffill()

`critic_score` and `user_score` have missing values, which could have arisen at the collection point when a user or critic did not provide a score.

I will fill in the `user_score` with -1, to show that this data was not provided.

In [138]:
# Disabled inspection cell: when uncommented it lists the rows whose
# critic_score is null.
# NOTE(review): the saved output below cannot have come from this cell as it
# now stands (the line is commented out) - re-run it or clear the output.
#df[df['critic_score'].isnull()]

Unnamed: 0,name,platform,year_of_release,genre,na_sales,eu_sales,jp_sales,other_sales,critic_score,user_score,rating
1,Super Mario Bros.,NES,1985-01-01,Platform,29.08,3.58,6.81,0.77,,,
4,Pokemon Red/Pokemon Blue,GB,1996-01-01,Role-Playing,11.27,8.89,10.22,1.00,,,
5,Tetris,GB,1989-01-01,Puzzle,23.20,2.26,4.22,0.58,,,
9,Duck Hunt,NES,1984-01-01,Shooter,26.93,0.63,0.28,0.47,,,
10,Nintendogs,DS,2005-01-01,Simulation,9.05,10.95,1.93,2.74,,,
...,...,...,...,...,...,...,...,...,...,...,...
16710,Samurai Warriors: Sanada Maru,PS3,2016-01-01,Action,0.00,0.00,0.01,0.00,,,
16711,LMA Manager 2007,X360,2006-01-01,Sports,0.00,0.01,0.00,0.00,,,
16712,Haitaka no Psychedelica,PSV,2016-01-01,Adventure,0.00,0.00,0.01,0.00,,,
16713,Spirits & Spells,GBA,2003-01-01,Platform,0.01,0.00,0.00,0.00,,,


In [140]:
# Mark missing user scores with the sentinel -1 and assign the result back.
# Calling fillna(..., inplace=True) on df['col'] acts on an intermediate object
# and does not update df under pandas Copy-on-Write.
# NOTE(review): -1 is a sentinel, not a rating - exclude it (user_score >= 0)
# before computing means or correlations on this column.
df['user_score'] = df['user_score'].fillna(-1)

I will leave the missing values in `critic_score` as they are, because they do not affect my analysis.

The `rating` column will also be left with its missing values, since this feature will be critical in my analysis.

In [143]:
# Count rows that exactly repeat an earlier row across all columns.
df.duplicated().sum()

0

#### Calculations:

- Total sales for each game
    - I will add the na, eu, jp and other sales columns

In [161]:
# Total sales per game: the sum of the four regional sales columns.
df['total_sales'] = (
    df['eu_sales']
    .add(df['na_sales'])
    .add(df['jp_sales'])
    .add(df['other_sales'])
)

In [162]:
# Preview the first 10 rows, which now include the total_sales column.
df.head(10)

Unnamed: 0,name,platform,year_of_release,genre,na_sales,eu_sales,jp_sales,other_sales,critic_score,user_score,rating,total_sales
0,Wii Sports,Wii,2006-01-01,Sports,41.36,28.96,3.77,8.45,76.0,8.0,E,82.54
1,Super Mario Bros.,NES,1985-01-01,Platform,29.08,3.58,6.81,0.77,,-1.0,,40.24
2,Mario Kart Wii,Wii,2008-01-01,Racing,15.68,12.76,3.79,3.29,82.0,8.3,E,35.52
3,Wii Sports Resort,Wii,2009-01-01,Sports,15.61,10.93,3.28,2.95,80.0,8.0,E,32.77
4,Pokemon Red/Pokemon Blue,GB,1996-01-01,Role-Playing,11.27,8.89,10.22,1.0,,-1.0,,31.38
5,Tetris,GB,1989-01-01,Puzzle,23.2,2.26,4.22,0.58,,-1.0,,30.26
6,New Super Mario Bros.,DS,2006-01-01,Platform,11.28,9.14,6.5,2.88,89.0,8.5,E,29.8
7,Wii Play,Wii,2006-01-01,Misc,13.96,9.18,2.93,2.84,58.0,6.6,E,28.91
8,New Super Mario Bros. Wii,Wii,2009-01-01,Platform,14.44,6.94,4.7,2.24,87.0,8.4,E,28.32
9,Duck Hunt,NES,1984-01-01,Shooter,26.93,0.63,0.28,0.47,,-1.0,,28.31


### Data Analysis

In [163]:
# Number of games (non-null names) recorded for each release year.
df.pivot_table(
    values='name',
    index='year_of_release',
    aggfunc=['count'],
)

Unnamed: 0_level_0,count
Unnamed: 0_level_1,name
year_of_release,Unnamed: 1_level_2
1980-01-01,9
1981-01-01,47
1982-01-01,37
1983-01-01,17
1984-01-01,14
1985-01-01,14
1986-01-01,21
1987-01-01,16
1988-01-01,15
1989-01-01,17
