In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

In [2]:
# Load the previously cleaned games dataset and preview its first rows.
data = pd.read_csv(filepath_or_buffer="cleaned_data2.csv")
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,New_User_Score
0,0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8.0,322.0,Nintendo,E,1
1,2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E,1
2,3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8.0,192.0,Nintendo,E,1
3,6,New Super Mario Bros.,DS,2006.0,Platform,Nintendo,11.28,9.14,6.5,2.88,29.8,89.0,65.0,8.5,431.0,Nintendo,E,1
4,7,Wii Play,Wii,2006.0,Misc,Nintendo,13.96,9.18,2.93,2.84,28.92,58.0,41.0,6.6,129.0,Nintendo,E,0


In [3]:
# Reference year used to compute how long each game has been on the market.
current_year = 2016


def calculate_years_since_release(year):
    """Return the number of years between ``year`` and ``current_year``.

    Works on a single number or on a whole pandas Series (element-wise
    subtraction). A missing year (NaN) yields NaN in the result.
    """
    return current_year - year


# One vectorised subtraction over the column instead of a per-row .apply();
# the resulting values are the same.
data['years_since_release'] = calculate_years_since_release(data['Year_of_Release'])
data

Unnamed: 0.1,Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,New_User_Score,years_since_release
0,0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8.0,322.0,Nintendo,E,1,10.0
1,2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E,1,8.0
2,3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8.0,192.0,Nintendo,E,1,7.0
3,6,New Super Mario Bros.,DS,2006.0,Platform,Nintendo,11.28,9.14,6.50,2.88,29.80,89.0,65.0,8.5,431.0,Nintendo,E,1,10.0
4,7,Wii Play,Wii,2006.0,Misc,Nintendo,13.96,9.18,2.93,2.84,28.92,58.0,41.0,6.6,129.0,Nintendo,E,0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7008,16677,Mortal Kombat: Deadly Alliance,GBA,2002.0,Fighting,Midway Games,0.01,0.00,0.00,0.00,0.01,81.0,12.0,8.8,9.0,Criterion Games,M,1,14.0
7009,16696,Metal Gear Solid V: Ground Zeroes,PC,2014.0,Action,Konami Digital Entertainment,0.00,0.01,0.00,0.00,0.01,80.0,20.0,7.6,412.0,Kojima Productions,M,0,2.0
7010,16700,Breach,PC,2011.0,Shooter,Destineer,0.01,0.00,0.00,0.00,0.01,61.0,12.0,5.8,43.0,Atomic Games,T,0,5.0
7011,16706,STORM: Frontline Nation,PC,2011.0,Strategy,Unknown,0.00,0.01,0.00,0.00,0.01,60.0,12.0,7.2,13.0,SimBin,E10+,0,5.0


In [4]:
# Count how many rows (games) each developer has; value_counts() lists the
# most frequent developers first.
value_counts = data['Developer'].value_counts()
print(value_counts)


EA Canada                                 152
EA Sports                                 145
Capcom                                    128
Ubisoft                                   104
Konami                                    100
                                         ... 
Netherock Ltd.                              1
Interchannel                                1
Warner Bros. Interactive Entertainment      1
Crawfish Interactive                        1
DTP Entertainment                           1
Name: Developer, Length: 1312, dtype: int64


In [5]:
# Explicit allow-list of developer names that are labelled 1. Kept identical
# to the names previously compared one by one.
_FLAGGED_DEVELOPERS = frozenset({
    'EA Canada',
    'EA Sports',
    'Capcom',
    'Ubisoft',
    'Konami',
    'Ubisoft Montreal',
    'EA Tiburon',
    'Omega Force',
    "Traveller's Tales",
    'Electronic Arts',
    'Visual Concepts',
    'Nintendo',
    'Codemasters',
    'Vicarious Visions',
    'Namco',
})


def map_to_binary2(value):
    """Return 1 if ``value`` is one of the flagged developer names, else 0.

    The match is exact and case-sensitive. Non-string inputs (for example
    NaN or None for a missing developer) never match and return 0.
    """
    # The isinstance guard keeps unhashable or non-string values from raising
    # on the set lookup; they fall through to 0.
    if isinstance(value, str) and value in _FLAGGED_DEVELOPERS:
        return 1
    return 0

In [6]:
# Developer_clas = 1 when the developer is on the map_to_binary2 list
# (per the author: developers with more than 50 games developed), else 0.
data['Developer_clas'] = data['Developer'].map(map_to_binary2)
data

Unnamed: 0.1,Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,New_User_Score,years_since_release,Developer_clas
0,0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8.0,322.0,Nintendo,E,1,10.0,1
1,2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E,1,8.0,1
2,3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8.0,192.0,Nintendo,E,1,7.0,1
3,6,New Super Mario Bros.,DS,2006.0,Platform,Nintendo,11.28,9.14,6.50,2.88,29.80,89.0,65.0,8.5,431.0,Nintendo,E,1,10.0,1
4,7,Wii Play,Wii,2006.0,Misc,Nintendo,13.96,9.18,2.93,2.84,28.92,58.0,41.0,6.6,129.0,Nintendo,E,0,10.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7008,16677,Mortal Kombat: Deadly Alliance,GBA,2002.0,Fighting,Midway Games,0.01,0.00,0.00,0.00,0.01,81.0,12.0,8.8,9.0,Criterion Games,M,1,14.0,0
7009,16696,Metal Gear Solid V: Ground Zeroes,PC,2014.0,Action,Konami Digital Entertainment,0.00,0.01,0.00,0.00,0.01,80.0,20.0,7.6,412.0,Kojima Productions,M,0,2.0,0
7010,16700,Breach,PC,2011.0,Shooter,Destineer,0.01,0.00,0.00,0.00,0.01,61.0,12.0,5.8,43.0,Atomic Games,T,0,5.0,0
7011,16706,STORM: Frontline Nation,PC,2011.0,Strategy,Unknown,0.00,0.01,0.00,0.00,0.01,60.0,12.0,7.2,13.0,SimBin,E10+,0,5.0,0


In [7]:
# Number of rows whose Developer_clas flag is 0.
count_of_zeros = data['Developer_clas'].eq(0).sum()
print(count_of_zeros)

5695


In [8]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7013 entries, 0 to 7012
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           7013 non-null   int64  
 1   Name                 7013 non-null   object 
 2   Platform             7013 non-null   object 
 3   Year_of_Release      6890 non-null   float64
 4   Genre                7013 non-null   object 
 5   Publisher            7009 non-null   object 
 6   NA_Sales             7013 non-null   float64
 7   EU_Sales             7013 non-null   float64
 8   JP_Sales             7013 non-null   float64
 9   Other_Sales          7013 non-null   float64
 10  Global_Sales         7013 non-null   float64
 11  Critic_Score         7013 non-null   float64
 12  Critic_Count         7013 non-null   float64
 13  User_Score           7013 non-null   float64
 14  User_Count           7013 non-null   float64
 15  Developer            7013 non-null   o

In [9]:
def map_to_binary3(value):
    """Binarise ``value`` against a 0.50 threshold.

    Returns 1 when ``value >= 0.50`` and 0 otherwise.
    """
    return 1 if value >= 0.50 else 0

In [11]:
# Flag games whose Global_Sales is at least 0.50 (map_to_binary3 uses >= 0.50,
# i.e. 500k copies or more if Global_Sales is in millions as the author implies).
data['Derived_global_sales'] = data['Global_Sales'].apply(map_to_binary3)
# Remove rows with no years_since_release value (the release year was missing).
# Reassign rather than mutate in place so the step is explicit.
data = data.dropna(subset=['years_since_release'])
data

Unnamed: 0.1,Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,...,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,New_User_Score,years_since_release,Developer_clas,Derived_global_sales
0,0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,...,76.0,51.0,8.0,322.0,Nintendo,E,1,10.0,1,1
1,2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,...,82.0,73.0,8.3,709.0,Nintendo,E,1,8.0,1,1
2,3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,...,80.0,73.0,8.0,192.0,Nintendo,E,1,7.0,1,1
3,6,New Super Mario Bros.,DS,2006.0,Platform,Nintendo,11.28,9.14,6.50,2.88,...,89.0,65.0,8.5,431.0,Nintendo,E,1,10.0,1,1
4,7,Wii Play,Wii,2006.0,Misc,Nintendo,13.96,9.18,2.93,2.84,...,58.0,41.0,6.6,129.0,Nintendo,E,0,10.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7008,16677,Mortal Kombat: Deadly Alliance,GBA,2002.0,Fighting,Midway Games,0.01,0.00,0.00,0.00,...,81.0,12.0,8.8,9.0,Criterion Games,M,1,14.0,0,0
7009,16696,Metal Gear Solid V: Ground Zeroes,PC,2014.0,Action,Konami Digital Entertainment,0.00,0.01,0.00,0.00,...,80.0,20.0,7.6,412.0,Kojima Productions,M,0,2.0,0,0
7010,16700,Breach,PC,2011.0,Shooter,Destineer,0.01,0.00,0.00,0.00,...,61.0,12.0,5.8,43.0,Atomic Games,T,0,5.0,0,0
7011,16706,STORM: Frontline Nation,PC,2011.0,Strategy,Unknown,0.00,0.01,0.00,0.00,...,60.0,12.0,7.2,13.0,SimBin,E10+,0,5.0,0,0


In [12]:
# Persist the dataset with the three derived columns added above:
# years_since_release, Developer_clas and Derived_global_sales.
# index=False keeps the DataFrame index out of the written file.
data.to_csv(path_or_buf='cleaned_derived_data.csv', index=False)