In [None]:
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn import tree
from sklearn.preprocessing import LabelEncoder as LB
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import KFold, cross_val_score, LeaveOneOut, cross_validate
import numpy as np
import datetime
import graphviz

In [None]:
# Read 'games_clean.csv' (path is relative to the notebook's working directory)
# into the DataFrame that the rest of the notebook works on.
df = pd.read_csv(filepath_or_buffer='games_clean.csv')
# Bare expression: the notebook renders the frame as this cell's output.
df

In [None]:

# Remove the ratings-breakdown and 'Presence' columns so they are not used as
# features. Rebinding `df` (instead of inplace=True) keeps the step a plain
# expression and avoids in-place mutation of the loaded frame.
df = df.drop(columns=['RatingsBreakdown-Recommended', 'RatingsBreakdown-Meh',
                      'RatingsBreakdown-Exceptional', 'RatingsBreakdown-Skip',
                      'Presence'])

# Encode the target labels as integers 0..n_classes-1.
LE = LB()
# Whole-column assignment replaces the column together with its dtype.
# `df.loc[:, 'Metacritic'] = ...` writes into the existing column in place on
# pandas >= 2.0, which can leave an object-dtype column of ints that
# scikit-learn classifiers reject as an "unknown" label type.
df['Metacritic'] = LE.fit_transform(df['Metacritic'])

# LabelEncoder maps code i to LE.classes_[i] (sorted order). The tree plotting
# helpers expect class names in ascending order of the encoded class, so the
# names must come from `classes_`; `unique()` returns order of first appearance
# and would attach the wrong label to the nodes.
class_names = [str(label) for label in LE.classes_]

# Features are every remaining column except the target.
X = df.drop(columns='Metacritic')
Y = df['Metacritic']
# 70/30 hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=404)

In [None]:
# See Tree with max depth
# Fit an entropy tree with no depth limit on the training split and draw it.
full_depth_tree = DTC(criterion='entropy').fit(X_train, y_train)
fig = plt.figure(figsize=(40,40), dpi=400)
_ = tree.plot_tree(full_depth_tree,
                   # Take the names from the frame the tree was fitted on and pass
                   # plain lists: scikit-learn 1.2.x validates these parameters as
                   # `list` and rejects a pandas Index / NumPy array.
                   feature_names=list(X_train.columns),
                   class_names=[str(label) for label in class_names],
                   filled=True)

In [None]:
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit


def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Draw three diagnostic panels for an estimator using `learning_curve`.

    Panel 0 shows training and cross-validation score against the number of
    training examples, panel 1 shows fit time against the number of training
    examples, and panel 2 shows cross-validation score against fit time.
    Each curve is the mean over the CV splits with a +/- one standard
    deviation band.

    Parameters
    ----------
    estimator : object
        Estimator forwarded to `learning_curve`.
    title : str
        Title set on the first panel.
    X, y : array-like
        Data and target forwarded to `learning_curve`.
    axes : sequence of matplotlib axes, optional
        At least three axes (indices 0, 1 and 2 are used). When None, a new
        1x3 figure of size (20, 5) is created.
    ylim : tuple, optional
        Unpacked into `set_ylim` of the first panel when given.
    cv, n_jobs, train_sizes :
        Forwarded unchanged to `learning_curve`.

    Returns
    -------
    module
        The `matplotlib.pyplot` module (`plt`), as in the original helper.
    """
    if axes is None:
        _, axes = plt.subplots(1, 3, figsize=(20, 5))

    axes[0].set_title(title)
    if ylim is not None:
        axes[0].set_ylim(*ylim)
    axes[0].set_xlabel("Training examples")
    axes[0].set_ylabel("Score")

    # return_times=True adds fit and score times; score times are not plotted.
    train_sizes, train_scores, test_scores, fit_times, _ = \
        learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
                       train_sizes=train_sizes,
                       return_times=True)
    # Aggregate over the CV splits (axis=1): one mean/std per training size.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)

    # Plot learning curve
    axes[0].grid()
    axes[0].fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="r")
    axes[0].fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1,
                         color="g")
    axes[0].plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training score")
    axes[0].plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")
    axes[0].legend(loc="best")

    # Plot n_samples vs fit_times
    axes[1].grid()
    axes[1].plot(train_sizes, fit_times_mean, 'o-')
    axes[1].fill_between(train_sizes, fit_times_mean - fit_times_std,
                         fit_times_mean + fit_times_std, alpha=0.1)
    axes[1].set_xlabel("Training examples")
    axes[1].set_ylabel("fit_times")
    axes[1].set_title("Scalability of the model")

    # Plot fit_time vs score
    # Mean fit time is measured, so it is not guaranteed to increase with the
    # training size. Order the points by fit time so the line and the shaded
    # band are drawn left to right instead of doubling back on themselves.
    by_fit_time = np.argsort(fit_times_mean)
    fit_times_sorted = fit_times_mean[by_fit_time]
    scores_mean_sorted = test_scores_mean[by_fit_time]
    scores_std_sorted = test_scores_std[by_fit_time]
    axes[2].grid()
    axes[2].plot(fit_times_sorted, scores_mean_sorted, 'o-')
    axes[2].fill_between(fit_times_sorted,
                         scores_mean_sorted - scores_std_sorted,
                         scores_mean_sorted + scores_std_sorted, alpha=0.1)
    axes[2].set_xlabel("fit_times")
    axes[2].set_ylabel("Score")
    axes[2].set_title("Performance of the model")

    return plt

# Learning-curve diagnostics for a default decision tree on the full X / Y data.
title = 'Accuracy'
# Ten random 80/20 splits; the fixed seed makes the splits repeatable.
cv = ShuffleSplit(test_size=0.2, n_splits=10, random_state=0)
plot_learning_curve(
    estimator=DTC(),
    title=title,
    X=X,
    y=Y,
    cv=cv,
    n_jobs=8,
    ylim=(0.4, 1.01),
)

In [None]:
####### Kfold cross-validation
def _select_tree_by_kfold(features, target, fold_counts=range(20, 101, 20),
                          depths=range(5, 14), n_jobs=16):
    """Search tree depth and number of folds with unshuffled K-fold CV.

    For every (n_splits, max_depth) pair an entropy decision tree is
    cross-validated on `features` / `target`. The pair with the highest mean
    test accuracy wins (the first one found is kept on ties).

    Returns
    -------
    tuple
        (best_depth, best_mean_accuracy, best_n_splits, best_estimator), where
        `best_estimator` is the fold model of the winning pair with the highest
        test-fold accuracy. If no pair scores above 0 the values stay at
        0, 0, 0 and an unfitted DTC().
    """
    best_depth, best_mean, best_k, best_estimator = 0, 0, 0, DTC()
    for n_folds in fold_counts:
        splitter = KFold(n_splits=n_folds, shuffle=False)
        for depth in depths:
            results = cross_validate(DTC(max_depth=depth, criterion='entropy'),
                                     X=features, y=target, cv=splitter,
                                     scoring='accuracy', return_estimator=True,
                                     n_jobs=n_jobs)
            fold_scores = results['test_score']
            mean_score = np.mean(fold_scores)
            if best_mean < mean_score:
                best_depth, best_mean, best_k = depth, mean_score, n_folds
                # Keep the fold model with the best test-fold score; equal
                # scores resolve to the later fold, matching a max over
                # (score, index) tuples.
                top_fold = max(range(len(fold_scores)),
                               key=lambda j: (fold_scores[j], j))
                best_estimator = results['estimator'][top_fold]
    return best_depth, best_mean, best_k, best_estimator


# Search on the training split, then score the kept model on the held-out test split.
indexi, bestmean, betterk, DTCestimator = _select_tree_by_kfold(X_train, y_train)
# Report the stored best depth (`indexi`), not the loop variable, which always
# ends at the last depth tried.
print('best depth is: ', indexi, ' having the best score of: ', bestmean, ' and the num of folders is: ', betterk)
print('Accuracy in test', '--->', accuracy_score(y_test,DTCestimator.predict(X_test)))

# Same search on the full data set. Each call starts from a fresh state, so a
# model kept by the previous search is never carried over.
indexi, bestmean, betterk, DTCestimator = _select_tree_by_kfold(X, Y)
print('best depth is: ', indexi, ' having the best score of: ', bestmean, ' and the num of folders is: ', betterk)
# NOTE(review): the kept model was trained on k-1 of the k folds of X, so this
# figure is mostly training accuracy, not an estimate of generalisation.
print('Accuracy within the whole dataset', '--->', accuracy_score(Y,DTCestimator.predict(X)))
print('')

In [None]:
####### Hold-out selection of max_depth (each depth is scored on the test split)

# depth -> (accuracy on the test split, fitted tree)
maxDep = {}
for i in range(5,14):
    DT = DTC(max_depth=i, criterion='entropy')
    DTCfitted = DT.fit(X_train, y_train)
    # accuracy_score takes (y_true, y_pred); the value is the same either way,
    # but the documented order keeps the call correct if the metric changes.
    maxDep[i] = (accuracy_score(y_test, DTCfitted.predict(X_test)), DTCfitted)

# Depth with the highest accuracy; on ties the smallest depth wins because
# max() keeps the first maximum and depths were inserted in ascending order.
K, (A, TR) = max(maxDep.items(), key=lambda item: item[1][0])
fig = plt.figure(figsize=(40,40), dpi=400)
_ = tree.plot_tree(TR,
                   # Plain lists: scikit-learn 1.2.x validates these parameters
                   # as `list` and rejects a pandas Index / NumPy array.
                   feature_names=list(X_train.columns),
                   class_names=[str(label) for label in class_names],
                   filled=True)

print('depth is: ', K, ' with a score of: ', A)

In [None]:
# Export the tree held in TR to DOT source (out_file=None returns it as a string)
# and wrap it in a graphviz.Source so the notebook renders it as the cell output.
dot_data = tree.export_graphviz(
    TR,
    out_file=None,
    # Every column of df except the target.
    feature_names=df.columns[df.columns != 'Metacritic'],
    class_names=class_names,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph