In [None]:
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, StackingClassifier
from sklearn.preprocessing import LabelEncoder as LB
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, cross_val_score, train_test_split
import numpy as np

In [None]:
# Load the cleaned games dataset from the notebook's working directory.
df = pd.read_csv('games_clean.csv')
# Show only a short preview instead of rendering the whole frame as cell output.
df.head()

In [None]:
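# Disabled step: mean-imputation of missing values (each column's NaNs would be
# replaced by that column's mean). It is commented out, so it is not executed.
#for name in df.columns:
#    if df[name].isna().sum() > 0:
#        df[name].fillna(df[name].mean(), inplace=True)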

In [None]:

# Columns excluded from the feature matrix.
dropped_columns = ['RatingsBreakdown-Recommended', 'RatingsBreakdown-Meh',
                   'RatingsBreakdown-Exceptional', 'RatingsBreakdown-Skip', 'Presence']
df = df.drop(columns=dropped_columns)

# Encode the target ('Metacritic') as integer class ids 0..n_classes-1.
# Plain column assignment replaces the column, so it takes the integer dtype of
# the encoded labels. With pandas >= 2.0, `df.loc[:, col] = ...` writes in place
# and keeps an object dtype when the original column is non-numeric, which
# scikit-learn classifiers reject as an "unknown" label type.
LE = LB()
df['Metacritic'] = LE.fit_transform(df['Metacritic'])

# LabelEncoder stores its classes sorted, so class_names[k] is the original
# label that was encoded as k (unique() would give order of appearance instead).
class_names = LE.classes_

# Features: every remaining column except the target.
X = df.drop(columns=['Metacritic'])
Y = df['Metacritic']

# 70/30 hold-out split. shuffle=False keeps the file order: the first 70% of
# the rows are used for training and the last 30% for testing.
# NOTE(review): if the CSV is sorted in any meaningful way this split (and the
# unshuffled KFold used later) may be biased - confirm the row order is arbitrary.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, shuffle=False)

In [None]:
####### Bagging classifier
# Search the number of bagged decision trees (10..29) and keep the ensemble
# size with the highest mean accuracy under 49-fold cross-validation.

# The splitter does not depend on the loop variable, so build it once.
cv = KFold(n_splits=49, shuffle=False)

bestNumEstimator = 0
valuescore = 0
for i in range(10, 30):
    # The base tree is passed positionally: the keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in
    # 1.4), while the first positional argument works on both old and new versions.
    clf = BaggingClassifier(DTC(max_depth=13, criterion='entropy'),
                            n_estimators=i, random_state=0)
    cv_scores = cross_val_score(clf, X=X, y=Y, cv=cv, scoring='accuracy')
    meannp = np.mean(cv_scores)
    if valuescore < meannp:
        valuescore = meannp
        bestNumEstimator = i

# Report the best score found (valuescore), not the score of the last
# iteration (meannp), so that it matches the reported number of estimators.
print('Score of bagging decision tree with 49-fold cross-val. is: ', valuescore,
      ' with a num. of estimators of ', bestNumEstimator)

In [None]:
####### Random Forest classifier
# random_state is fixed (same seed as the bagging cell) so that every score in
# this cell is reproducible and the n_estimators comparison below is not
# dominated by seed-to-seed noise.

### K fold cross-val
cv = KFold(n_splits=49, shuffle=False)
clf = RandomForestClassifier(n_estimators=100, max_depth=13, criterion='entropy',
                             random_state=0)
cv_scores = cross_val_score(clf, X=X, y=Y, cv=cv, scoring='accuracy')
meannp = np.mean(cv_scores)
print('Score of RFC with 49-fold cross-val. and max_depth of 13 is: ', meannp)

### 70/30 hold-out evaluation (labelled "cross-val 70/30" in the printed message)
clffited = clf.fit(X_train, y_train)
# accuracy_score expects (y_true, y_pred); the value is the same either way,
# but the documented order is used here.
scoreCV = accuracy_score(y_test, clffited.predict(X_test))
print('Score of RFC with cross-val 70/30 and max_depth of 13 is: ', scoreCV)


# Tune the number of trees (85..114) using the 49-fold cross-validated accuracy
# and keep the value with the highest mean score.
bestnumest = 0
bestestim = 0
for i in range(85, 115):  # it will take a while: 30 forests x 49 folds each
    clf = RandomForestClassifier(n_estimators=i, max_depth=13, criterion='entropy',
                                 random_state=0)
    cv_scores = cross_val_score(clf, X=X, y=Y, cv=cv, scoring='accuracy')
    meannp = np.mean(cv_scores)
    if bestestim < meannp:
        bestestim = meannp
        bestnumest = i

print('Best num of estimators is: ', bestnumest, ' with an score of ', bestestim)

In [None]:
####### Stacking classifier
# Stack the bagged decision trees and the random forest. No final_estimator is
# given, so StackingClassifier uses its default meta-learner.

# Each name now matches its model ('bc' = bagging, 'rfc' = random forest);
# the labels were previously swapped. The order of the estimators is unchanged.
estimators = [
    # Base tree passed positionally: works with both the old `base_estimator`
    # keyword and the `estimator` keyword that replaced it in scikit-learn 1.2.
    ('bc', BaggingClassifier(DTC(max_depth=13, criterion='entropy'),
                             n_estimators=25, random_state=0)),
    # Seeded like the bagging model so the stacked score is reproducible.
    ('rfc', RandomForestClassifier(n_estimators=105, max_depth=13, criterion='entropy',
                                   random_state=0)),
]

clf = StackingClassifier(estimators=estimators)
clffited = clf.fit(X_train, y_train)
# 70/30 hold-out accuracy; accuracy_score expects (y_true, y_pred).
scoreCV = accuracy_score(y_test, clffited.predict(X_test))
print('Score of Stacking with cross-val 70/30 and max_depth of 13 is: ', scoreCV)
