## Import Essential libraries

In [29]:
# For data processing
import numpy as np
import pandas as pd

## Import the Dataset

In [30]:
df = pd.read_csv('Raw Data.csv')
df.head()

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8.0,322.0,Nintendo,E
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,,,,,,
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8.0,192.0,Nintendo,E
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,,,,,,


In [31]:
print("Data type : ", type(df))
print("Data dims : ", df.shape)

Data type :  <class 'pandas.core.frame.DataFrame'>
Data dims :  (16719, 16)


In [32]:
print(df.dtypes)

Name                object
Platform            object
Year_of_Release    float64
Genre               object
Publisher           object
NA_Sales           float64
EU_Sales           float64
JP_Sales           float64
Other_Sales        float64
Global_Sales       float64
Critic_Score       float64
Critic_Count       float64
User_Score          object
User_Count         float64
Developer           object
Rating              object
dtype: object


---

## Explore the Dataset

In [33]:
# Information about the Variables
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16719 entries, 0 to 16718
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             16717 non-null  object 
 1   Platform         16719 non-null  object 
 2   Year_of_Release  16450 non-null  float64
 3   Genre            16717 non-null  object 
 4   Publisher        16665 non-null  object 
 5   NA_Sales         16719 non-null  float64
 6   EU_Sales         16719 non-null  float64
 7   JP_Sales         16719 non-null  float64
 8   Other_Sales      16719 non-null  float64
 9   Global_Sales     16719 non-null  float64
 10  Critic_Score     8137 non-null   float64
 11  Critic_Count     8137 non-null   float64
 12  User_Score       10015 non-null  object 
 13  User_Count       7590 non-null   float64
 14  Developer        10096 non-null  object 
 15  Rating           9950 non-null   object 
dtypes: float64(9), object(7)
memory usage: 2.0+ MB


Notice that more than half of the data rows may be missing Critic_Score, Critic_Count, User_Count or Rating which are potentially some of our more important variables

---

## Clean the Dataset

Once we are done with the basic exploration of variables, it's time to *clean* and *tidy-up* the dataset.

In [89]:
# Create a copy of the Dataset
dfclean = df.copy()

In [90]:
# Remove all spaces and dots from Variable Names
# dfclean.columns = dfclean.columns.str.replace(".","")
# dfclean.columns = dfclean.columns.str.replace("_"," ")

In [91]:
dfclean.head()


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8.0,322.0,Nintendo,E
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,,,,,,
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8.0,192.0,Nintendo,E
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,,,,,,


In [92]:
dfclean.info

<bound method DataFrame.info of                                 Name Platform  Year_of_Release         Genre  \
0                         Wii Sports      Wii           2006.0        Sports   
1                  Super Mario Bros.      NES           1985.0      Platform   
2                     Mario Kart Wii      Wii           2008.0        Racing   
3                  Wii Sports Resort      Wii           2009.0        Sports   
4           Pokemon Red/Pokemon Blue       GB           1996.0  Role-Playing   
...                              ...      ...              ...           ...   
16714  Samurai Warriors: Sanada Maru      PS3           2016.0        Action   
16715               LMA Manager 2007     X360           2006.0        Sports   
16716        Haitaka no Psychedelica      PSV           2016.0     Adventure   
16717               Spirits & Spells      GBA           2003.0      Platform   
16718            Winning Post 8 2016      PSV           2016.0    Simulation   

       

Remove irrelevant data

> No developer
> No user score
> No user count
> No critic score
> No critic count
> “TBD” under User_Score


In [97]:
dfclean.dropna(subset=['Developer'], inplace=True)
dfclean.dropna(subset=['User_Score'], inplace=True)
dfclean.dropna(subset=['User_Count'], inplace=True)
dfclean.dropna(subset=['Critic_Score'], inplace=True)
dfclean.dropna(subset=['Critic_Count'], inplace=True)
dfclean=dfclean[dfclean['User_Score'] != 'tbd']

In [98]:
dfclean.info

<bound method DataFrame.info of                                     Name Platform  Year_of_Release      Genre  \
0                             Wii Sports      Wii           2006.0     Sports   
2                         Mario Kart Wii      Wii           2008.0     Racing   
3                      Wii Sports Resort      Wii           2009.0     Sports   
6                  New Super Mario Bros.       DS           2006.0   Platform   
7                               Wii Play      Wii           2006.0       Misc   
...                                  ...      ...              ...        ...   
16677     Mortal Kombat: Deadly Alliance      GBA           2002.0   Fighting   
16696  Metal Gear Solid V: Ground Zeroes       PC           2014.0     Action   
16700                             Breach       PC           2011.0    Shooter   
16706            STORM: Frontline Nation       PC           2011.0   Strategy   
16709                            15 Days       PC           2009.0  Adventure

---

## Retrieve Cleaned Dataset

Once we are done with the cleaning of dataset, it's time to retrieve the dataset that we will be working on for the subsequent steps.

In [99]:

dfclean.to_csv('Cleaned Data.csv')