# PyVG: Data Science to predict Video Games sales
>Équipe: Alexis Terrasse, Henri-François Mole, Hsan Drissi, Stephane Lelievre
>
>Promo: DS_Oct21
---
## Essai de classification avec le VotingClassifier
---

In [1]:
import pandas as pd

from sklearn import preprocessing
from sklearn.ensemble import VotingClassifier, StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, cross_validate, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline


In [2]:
# Load the video-game sales CSV into a DataFrame and preview it.
df = pd.read_csv("../data/vgsales_eda2_ready.csv", sep=",")
print('Input shape', df.shape)  # the last recorded run printed (8478, 85)
df.head()

Input shape (8478, 85)


Unnamed: 0,Name,Platform,Genre,Global_Sales,game_key,Name_meta,Genre_meta,Score_pro,Score_user,Publisher,...,genre_Platform,genre_Puzzle,genre_Racing,genre_Role-Playing,genre_Shooter,genre_Simulation,genre_Sports,genre_Strategy,genre_Visual+Novel,labels
0,Grand Theft Auto: San Andreas,XB,Action,1.95,grand-theft-auto-san-andreas,Grand Theft Auto: San Andreas,"Action Adventure, Modern",9.3,8.8,rockstar games,...,0,0,0,0,0,0,0,0,0,7
1,Grand Theft Auto: Vice City,PC,Action,0.04,grand-theft-auto-vice-city,Grand Theft Auto: Vice City,"Action Adventure, Modern, Modern, Open-World",9.4,8.8,rockstar games,...,0,0,0,0,0,0,0,0,0,4
2,Grand Theft Auto III,PC,Action,0.01,grand-theft-auto-iii,Grand Theft Auto III,"Action Adventure, Modern, Modern, Open-World",9.3,8.2,rockstar games,...,0,0,0,0,0,0,0,0,0,4
3,Grand Theft Auto IV,PC,Action,0.87,grand-theft-auto-iv,Grand Theft Auto IV,"Action Adventure, Modern, Modern, Open-World",9.0,7.0,rockstar games,...,0,0,0,0,0,0,0,0,0,6
4,Grand Theft Auto: Liberty City Stories,PSP,Action,7.72,grand-theft-auto-liberty-city-stories,Grand Theft Auto: Liberty City Stories,"Action Adventure, Modern, Modern, Open-World",8.8,7.8,rockstar games,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Separate the explanatory variables from the variable to predict.
# `Global_Sales` is cut into `bins` quantile-based classes encoded as the
# integers 0 .. bins-1 (labels=False makes qcut return the bin index directly).
bins = 8
target = pd.qcut(df['Global_Sales'], q=bins, labels=False).astype('int64')

# Features: every non-object column except the raw sales, the log-sales and 'year'.
# Further exclusions the author left disabled: 'day', 'month', 'quarter', 'anom'.
# NOTE(review): the integer `labels` column visible in the preview is kept as a
# feature here; confirm it is not derived from Global_Sales (would leak the target).
data = df.select_dtypes(exclude='object').drop(
    columns=['Global_Sales', 'Global_Sales.log', 'year']
)

# Hold out 20 % of the rows as a test set; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.20,
    random_state=0,
)


In [4]:
# Preprocessing step: standardise the features.
# The scaler is fitted on the training split only; the same fitted transform
# is then applied to both the training and the test split.
scaler = preprocessing.StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
# Base estimators evaluated individually and combined in the voting ensemble.

# k-nearest neighbours with k = 5
clf1 = KNeighborsClassifier(n_neighbors=5)
# random forest, seeded so that repeated runs build the same trees
clf2 = RandomForestClassifier(random_state=123)
# logistic regression with the solver iteration cap set to 1000
clf3 = LogisticRegression(max_iter=1000)

In [6]:
# Ensemble of the three base estimators; with voting='hard' the predicted class
# is the majority vote over the class labels predicted by each estimator.
vclf = VotingClassifier(
    estimators=[
        ('knn', clf1),
        ('rf', clf2),
        ('lr', clf3),
    ],
    voting='hard',
)


In [7]:
# 3-fold cross-validator, shuffled with a fixed seed (random_state=111) so the
# folds are identical from one run to the next.
cv3 = KFold(n_splits=3, random_state=111, shuffle=True)

# Report the cross-validated accuracy of each individual classifier and of the
# Voting Classifier.
#
# The scaler lives inside a pipeline and receives the *unscaled* X_train, so it is
# re-fitted on the training part of every fold. Cross-validating directly on
# X_train_scaled would let each validation fold contribute to the scaling
# statistics (mean / std), i.e. leak information into training.
# cross_validate clones the estimator it receives, so clf1..clf3 and vclf
# themselves are left unfitted.
for clf, label in zip([clf1, clf2, clf3, vclf], ['KNN', 'Random Forest', 'Logistic Regression', 'Voting Classifier']):
    model = make_pipeline(preprocessing.StandardScaler(), clf)
    scores = cross_validate(model, X_train, y_train, cv=cv3, scoring=['accuracy'])
    accuracy = scores['test_accuracy']
    print ('[%s]: \n Accuracy: %0.2f (+/- %0.2f)' % (label, accuracy.mean(), accuracy.std()))

# NOTE: an F1 report was previously sketched here as commented-out code. Since
# y_train holds multi-class quantile bins, it would need an averaged scorer such
# as 'f1_macro' (the plain 'f1' scorer is binary-only) added to `scoring`.

[KNN]: 
 Accuracy: 0.27 (+/- 0.01)
[Random Forest]: 
 Accuracy: 0.40 (+/- 0.01)
[Logistic Regression]: 
 Accuracy: 0.34 (+/- 0.00)
[Voting Classifier]: 
 Accuracy: 0.35 (+/- 0.00)
