In [1]:
import pandas as pd
import tensorflow as tf
import sklearn

In [2]:
from tensorflow.keras.layers import Dense, Dropout, Activation, Input
from tensorflow. keras.models import Model
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [3]:
# Read the raw video-game sales table from the local CSV file.
base = pd.read_csv(filepath_or_buffer='games.csv')
# Leaving the frame as the last expression makes the notebook render it.
base

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8,322.0,Nintendo,E
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,,,,,,
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8,192.0,Nintendo,E
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16714,Samurai Warriors: Sanada Maru,PS3,2016.0,Action,Tecmo Koei,0.00,0.00,0.01,0.00,0.01,,,,,,
16715,LMA Manager 2007,X360,2006.0,Sports,Codemasters,0.00,0.01,0.00,0.00,0.01,,,,,,
16716,Haitaka no Psychedelica,PSV,2016.0,Adventure,Idea Factory,0.00,0.00,0.01,0.00,0.01,,,,,,
16717,Spirits & Spells,GBA,2003.0,Platform,Wanadoo,0.01,0.00,0.00,0.00,0.01,,,,,,


O objetivo é prever as vendas da América do Norte, da Europa e do Japão.

In [5]:
# Discard every column that will not be used, in a single drop call.
base = base.drop(columns=['Other_Sales', 'Global_Sales', 'Developer'])

In [6]:
# (rows, columns) of the frame after removing the unused columns.
base.shape

(16719, 13)

In [7]:
# Check for missing values: total number of nulls found in each column.
base.isna().sum()

Unnamed: 0,0
Name,2
Platform,0
Year_of_Release,269
Genre,2
Publisher,54
NA_Sales,0
EU_Sales,0
JP_Sales,0
Critic_Score,8582
Critic_Count,8582


In [8]:
# Keep only complete records: operating along the row axis, any row that
# contains at least one missing value is removed.
base = base.dropna(axis='index', how='any')

In [9]:
# (rows, columns) of the frame once rows with missing values are gone.
base.shape

(6825, 13)

In [10]:
# Recount the nulls per column to confirm that none are left.
base.isnull().sum()

Unnamed: 0,0
Name,0
Platform,0
Year_of_Release,0
Genre,0
Publisher,0
NA_Sales,0
EU_Sales,0
JP_Sales,0
Critic_Score,0
Critic_Count,0


In [11]:
# Number of rows per game title, most frequent first.
base['Name'].value_counts()

Unnamed: 0_level_0,count
Name,Unnamed: 1_level_1
Need for Speed: Most Wanted,8
Madden NFL 07,8
LEGO Star Wars II: The Original Trilogy,8
The Sims 2,7
Terraria,7
...,...
Castlevania: Portrait of Ruin,1
Suzuki TT Superbikes,1
Rumble Roses,1
Sherlock Holmes: The Mystery of the Mummy,1


In [12]:
# 'Name' behaves almost like a row identifier: there are very many distinct
# titles and each one appears only a handful of times. That makes it a poor
# attribute for training the neural network, so it is removed.
base = base.drop(columns='Name')

In [13]:
# (rows, columns) of the frame after dropping 'Name'.
base.shape

(6825, 12)

In [14]:
# Remaining column names, in order.
base.columns

Index(['Platform', 'Year_of_Release', 'Genre', 'Publisher', 'NA_Sales',
       'EU_Sales', 'JP_Sales', 'Critic_Score', 'Critic_Count', 'User_Score',
       'User_Count', 'Rating'],
      dtype='object')

In [15]:
# Build X, the matrix of predictors.
# Columns are selected by name rather than by position, so a change in the
# frame's column layout raises a KeyError instead of silently picking other
# columns. The order is kept as-is because the one-hot encoding step
# addresses the categorical predictors by their position in X (0, 2, 3, 8).
# NOTE(review): User_Score values show up as text ('8', '8.3') in X;
# consider converting that column to a numeric dtype explicitly.
colunas_previsoras = [
    'Platform', 'Year_of_Release', 'Genre', 'Publisher',
    'Critic_Score', 'Critic_Count', 'User_Score', 'User_Count', 'Rating',
]
X = base[colunas_previsoras].values

In [16]:
# Inspect the predictor array (NumPy abbreviates the display).
X

array([['Wii', 2006.0, 'Sports', ..., '8', 322.0, 'E'],
       ['Wii', 2008.0, 'Racing', ..., '8.3', 709.0, 'E'],
       ['Wii', 2009.0, 'Sports', ..., '8', 192.0, 'E'],
       ...,
       ['PC', 2014.0, 'Action', ..., '7.6', 412.0, 'M'],
       ['PC', 2011.0, 'Shooter', ..., '5.8', 43.0, 'T'],
       ['PC', 2011.0, 'Strategy', ..., '7.2', 13.0, 'E10+']], dtype=object)

In [17]:
# Regression targets: one array per region whose sales are to be predicted.
# Selected by column name so the targets cannot shift to another column if
# the frame's layout changes.
y_na = base['NA_Sales'].values  # North America
y_eu = base['EU_Sales'].values  # Europe
y_jp = base['JP_Sales'].values  # Japan

In [18]:
# Column names again, as a reference for the positions used by the encoder.
base.columns

Index(['Platform', 'Year_of_Release', 'Genre', 'Publisher', 'NA_Sales',
       'EU_Sales', 'JP_Sales', 'Critic_Score', 'Critic_Count', 'User_Score',
       'User_Count', 'Rating'],
      dtype='object')

In [19]:
# Turn the categorical predictors into numeric ones with one-hot encoding.
# They are addressed by their position in X (0, 2, 3 and 8); every other
# column is kept through remainder='passthrough'.
onehotencoder = ColumnTransformer(
    transformers=[("OneHot", OneHotEncoder(), [0, 2, 3, 8])],
    remainder='passthrough',
)
# .toarray() turns the transformer's sparse result into a dense array.
# NOTE: X is overwritten here, so rebuild X before re-running this cell.
X_codificado = onehotencoder.fit_transform(X)
X = X_codificado.toarray()

In [20]:
# (rows, columns) of X after one-hot encoding.
X.shape

(6825, 303)

In [21]:
# Network layers: an input, two hidden ReLU layers, and three linear
# single-unit outputs that all read the second hidden layer.
# Layer widths are derived from X instead of being hard-coded, so the layers
# still match the data if the one-hot encoding yields a different number of
# columns.
n_entradas = X.shape[1]  # 303 for the encoded data shown in this notebook
n_saidas = 3             # one output per target array
# (inputs + outputs) // 2 reproduces the original 153 hidden units when
# there are 303 inputs — presumably the rule the 153 came from; confirm.
n_ocultos = (n_entradas + n_saidas) // 2

camada_entrada = Input(shape=(n_entradas,))
camada_oculta1 = Dense(units=n_ocultos, activation='relu')(camada_entrada)
camada_oculta2 = Dense(units=n_ocultos, activation='relu')(camada_oculta1)
camada_saida1 = Dense(units=1, activation='linear')(camada_oculta2)
camada_saida2 = Dense(units=1, activation='linear')(camada_oculta2)
camada_saida3 = Dense(units=1, activation='linear')(camada_oculta2)


In [22]:
# Assemble the model: one input tensor feeding three output tensors.
saidas = [camada_saida1, camada_saida2, camada_saida3]
regressor = Model(inputs=camada_entrada, outputs=saidas)

In [23]:
# Configure training: Adam optimizer with mean squared error as the loss.
regressor.compile(optimizer='adam', loss='mse')

In [24]:
# Train on X with the three target arrays given in the same order as the
# model's outputs; no validation data is set aside in this call.
regressor.fit(
    x=X,
    y=[y_na, y_eu, y_jp],
    batch_size=100,
    epochs=500,
)

Epoch 1/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 12029.7764
Epoch 2/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 14.1436
Epoch 3/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 2.6482
Epoch 4/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.8207
Epoch 5/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 6.9556
Epoch 6/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 2.0601
Epoch 7/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 1.8027
Epoch 8/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 2.2326
Epoch 9/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 2.1609
Epoch 10/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 1

<keras.src.callbacks.history.History at 0x7e7000d0cbe0>

In [25]:
# Predict with the trained model; the result is unpacked into one array per
# output, in the order the outputs were passed to Model.
# NOTE(review): the predictions are made on the same X used for training, so
# any metric computed from them measures fit to the training data only.
previsao_na, previsao_eu, previsao_jp = regressor.predict(X)

[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [26]:
# First-output predictions and their mean.
previsao_na, previsao_na.mean()

(array([[13.53205   ],
        [13.906495  ],
        [ 8.335899  ],
        ...,
        [ 0.07983461],
        [ 0.0890702 ],
        [ 0.03672114]], dtype=float32),
 0.41008183)

In [27]:
from sklearn.metrics import mean_absolute_error

In [28]:
# Mean absolute error between the y_na targets and their predictions.
mean_absolute_error(y_na, previsao_na)

0.28860174967517777

In [29]:
# Second-output predictions and their mean.
previsao_eu, previsao_eu.mean()

(array([[9.673459  ],
        [9.797995  ],
        [5.7888346 ],
        ...,
        [0.09236246],
        [0.0958841 ],
        [0.08665353]], dtype=float32),
 0.2322673)

In [30]:
# The y_eu target values and their mean, for comparison with the predicted mean.
y_eu, y_eu.mean()

(array([2.896e+01, 1.276e+01, 1.093e+01, ..., 1.000e-02, 0.000e+00,
        1.000e-02]),
 0.23608937728937732)

In [31]:
# Third-output predictions and their mean.
previsao_jp, previsao_jp.mean()

(array([[3.7413905 ],
        [3.576462  ],
        [2.3958724 ],
        ...,
        [0.02753938],
        [0.03710453],
        [0.03369569]], dtype=float32),
 0.08544452)