In [1]:
# Importation des modules nécessaires

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
%matplotlib inline

In [3]:
# Set the working directory
# NOTE(review): the call below is disabled and hard-codes an absolute, machine-specific
# Windows path; as long as it stays commented out, relative paths used by the notebook
# are resolved against whatever directory the kernel was started in.

#import os
#os.chdir(r"C:\Users\victo\Documents\DataScientest\Data Analyst\1000 - Travail de Groupe\Workspace\04 - Workspace - Machine Learning")

In [2]:
# Build the DataFrame from the cleaned CSV export, then preview it and report
# its row count and the number of missing values per column.

vgsales = pd.read_csv("vgsales_cleaned_franchise_random.csv")

display(vgsales.head())

row_count = len(vgsales)
missing_per_column = vgsales.isna().sum()

print(row_count)
print('\n')
print(missing_per_column)

Unnamed: 0,Rank,Name,Franchise,basename,Genre,Platform,Publisher,Developer,Year,Random_Values,Estimated_Sales
0,1,Wii Sports,Undefined Franchise,wii-sports,Sports,Wii,Nintendo,Nintendo EAD,2006.0,0.961891,82.86
1,2,Super Mario Bros.,Mario,super-mario-bros,Platform,NES,Nintendo,Nintendo EAD,1985.0,0.648642,40.24
2,3,Mario Kart Wii,Mario,mario-kart-wii,Racing,Wii,Nintendo,Nintendo EAD,2008.0,0.429207,37.14
3,4,PlayerUnknown's Battlegrounds,Undefined Franchise,playerunknowns-battlegrounds,Shooter,PC,PUBG Corporation,PUBG Corporation,2017.0,0.089494,36.6
4,5,Wii Sports Resort,Undefined Franchise,wii-sports-resort,Sports,Wii,Nintendo,Nintendo EAD,2009.0,0.05638,33.09


21233


Rank                 0
Name                 0
Franchise            0
basename             0
Genre                0
Platform             0
Publisher          581
Developer          651
Year                29
Random_Values        0
Estimated_Sales      0
dtype: int64


In [3]:
# Drop the rows whose Year is missing.
# Year is encoded further down with a StandardScaler, which keeps missing values as they
# are, and the ML algorithm would not be able to handle them.

# Rebinding the name (rather than mutating in place) yields the same filtered frame.
vgsales = vgsales.dropna(subset=['Year'])
vgsales.isna().sum()

Rank                 0
Name                 0
Franchise            0
basename             0
Genre                0
Platform             0
Publisher          562
Developer          650
Year                 0
Random_Values        0
Estimated_Sales      0
dtype: int64

In [4]:
# Remove the columns that must not be given to the model:
#   - Name:     cardinality too high to be encoded
#   - Rank:     gives away too much information about the data
#   - basename: same reason as Name

vgsales = vgsales.drop(columns=['Name', 'Rank', 'basename'])
vgsales.head()


Unnamed: 0,Franchise,Genre,Platform,Publisher,Developer,Year,Random_Values,Estimated_Sales
0,Undefined Franchise,Sports,Wii,Nintendo,Nintendo EAD,2006.0,0.961891,82.86
1,Mario,Platform,NES,Nintendo,Nintendo EAD,1985.0,0.648642,40.24
2,Mario,Racing,Wii,Nintendo,Nintendo EAD,2008.0,0.429207,37.14
3,Undefined Franchise,Shooter,PC,PUBG Corporation,PUBG Corporation,2017.0,0.089494,36.6
4,Undefined Franchise,Sports,Wii,Nintendo,Nintendo EAD,2009.0,0.05638,33.09


In [5]:
# Separate the explanatory variables (every column except Estimated_Sales)
# from the target variable (Estimated_Sales).

target = vgsales['Estimated_Sales']
feats = vgsales.drop(columns='Estimated_Sales')

display(feats, target)

Unnamed: 0,Franchise,Genre,Platform,Publisher,Developer,Year,Random_Values
0,Undefined Franchise,Sports,Wii,Nintendo,Nintendo EAD,2006.0,0.961891
1,Mario,Platform,NES,Nintendo,Nintendo EAD,1985.0,0.648642
2,Mario,Racing,Wii,Nintendo,Nintendo EAD,2008.0,0.429207
3,Undefined Franchise,Shooter,PC,PUBG Corporation,PUBG Corporation,2017.0,0.089494
4,Undefined Franchise,Sports,Wii,Nintendo,Nintendo EAD,2009.0,0.056380
...,...,...,...,...,...,...,...
21228,Undefined Franchise,Puzzle,DS,505 Games,Crush Digital,2010.0,0.008455
21229,Undefined Franchise,Strategy,PC,ValuSoft,ValuSoft,2007.0,0.692902
21230,Tales,Action,PSP,Sony Computer Entertainment,Acquire,2007.0,0.223675
21231,Undefined Franchise,Shooter,PC,Atari,Kamehan Studios,2002.0,0.822668


0        82.86
1        40.24
2        37.14
3        36.60
4        33.09
         ...  
21228     0.00
21229     0.00
21230     0.00
21231     0.00
21232     0.00
Name: Estimated_Sales, Length: 21204, dtype: float64

In [6]:
# Split the data into a training set and a test set (20% of the rows go to the test set)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    feats,
    target,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
)

print(X_train.shape, X_test.shape, sep='\n')

(16963, 7)
(4241, 7)


In [7]:
# Encoding
# Franchise, Genre, Platform, Publisher, Developer: OneHotEncoder

from sklearn.preprocessing import OneHotEncoder

categorical_cols = ['Franchise', 'Genre', 'Platform', 'Publisher', 'Developer']

# handle_unknown='ignore': a category that was not seen during fit does not raise at
# transform time; the one-hot columns of that feature are all zeros for the row.
ohe = OneHotEncoder(drop=None, handle_unknown='ignore')

# The encoder is fitted on the training rows only and then reused on the test rows.
# Each result is densified and wrapped in a DataFrame with default integer labels.
X_train_ohe = pd.DataFrame(ohe.fit_transform(X_train[categorical_cols]).toarray())
X_test_ohe = pd.DataFrame(ohe.transform(X_test[categorical_cols]).toarray())

print(X_train_ohe.shape, X_test_ohe.shape, sep='\n')

(16963, 4116)
(4241, 4116)


In [8]:
# Encoding
# Year: StandardScaler

from sklearn.preprocessing import StandardScaler

# Reshape the Year values into a single-column 2-D array before scaling
year_train = X_train['Year'].to_numpy().reshape(-1, 1)
year_test = X_test['Year'].to_numpy().reshape(-1, 1)

# The scaler is fitted on the training rows only and then reused on the test rows
sc = StandardScaler()
X_train_sc = pd.DataFrame(sc.fit_transform(year_train))
X_test_sc = pd.DataFrame(sc.transform(year_test))

print(X_train_sc.shape, X_test_sc.shape, sep='\n')

(16963, 1)
(4241, 1)


In [9]:
# Gather the encoded blocks (one-hot columns + scaled Year) into the final feature frames

# Both inputs were built with default integer column labels, so a plain column-wise
# concat labels the scaled Year column 0 -- the same label as the first one-hot column.
# ignore_index=True relabels the concatenated axis 0..n-1, which keeps every column
# label unique; the values, their order and the resulting shapes are unchanged.
X_train = pd.concat([X_train_ohe, X_train_sc], axis=1, ignore_index=True)
X_test = pd.concat([X_test_ohe, X_test_sc], axis=1, ignore_index=True)

print(X_train.shape)
print(X_test.shape)

(16963, 4117)
(4241, 4117)


In [10]:
# Sanity check: total number of missing values left in each feature frame
train_missing = X_train.isna().sum().sum()
test_missing = X_test.isna().sum().sum()

print(train_missing, test_missing, sep='\n')

0
0


In [14]:
# DecisionTreeRegressor with default hyperparameters (only the random seed is set)

from sklearn.tree import DecisionTreeRegressor

regressor = DecisionTreeRegressor(random_state=42)
regressor.fit(X_train, y_train)

# Evaluate on the training set first, then on the held-out test set
train_score = regressor.score(X_train, y_train)
test_score = regressor.score(X_test, y_test)

# NOTE(review): the recorded run reports about 0.99 on the training set versus about
# 0.17 on the test set, i.e. the tree fits the training data far better than unseen data.
# The message text (including the "Regrossor" spelling) is runtime output and is kept as is.
print("Score du modèle de Decision Tree Regrossor sur le jeu de données d'entraînement:" , train_score)
print("Score du modèle de Decision Tree Regrossor sur le jeu de données de test:" , test_score)


Score du modèle de Decision Tree Regrossor sur le jeu de données d'entraînement: 0.9891689995370239
Score du modèle de Decision Tree Regrossor sur le jeu de données de test: 0.1677429974525112
