In [1]:
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn import tree
from sklearn.preprocessing import LabelEncoder as LB
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import KFold, cross_val_score, LeaveOneOut, cross_validate
import numpy as np
import datetime
#import graphviz

In [None]:
# Load the games table from the CSV next to the notebook. Later cells drop
# columns from `df` and label-encode its 'Metacritic' column.
df = pd.read_csv('games_clean.csv')

# Preview only the first rows rather than rendering the whole frame.
df.head()

In [None]:

# Remove the ratings-breakdown and presence columns so they are not used as
# features. Reassigning (instead of inplace=True) keeps the statement chainable.
df = df.drop(columns=['RatingsBreakdown-Recommended', 'RatingsBreakdown-Meh',
                      'RatingsBreakdown-Exceptional', 'RatingsBreakdown-Skip',
                      'Presence'])

# Encode the target as integer codes.
# Plain column assignment is used on purpose: `df.loc[:, 'Metacritic'] = ...`
# writes into the existing column in place on recent pandas, which leaves an
# object-dtype target that scikit-learn rejects as an unknown label type.
LE = LB()
df['Metacritic'] = LE.fit_transform(df['Metacritic'])

# Class labels indexed by encoded value: LabelEncoder numbers classes in sorted
# order (LE.classes_), which generally differs from the order of appearance
# returned by Series.unique(). Using unique() mislabels tree nodes and legends.
class_names = [str(label) for label in LE.classes_]

X = df.loc[:, df.columns != 'Metacritic']
Y = df['Metacritic']

# 70/30 hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=404)

In [None]:
# See Tree with max depth: fit an entropy tree with no depth limit on the
# training split and draw it. The seed makes the fitted tree reproducible.
full_tree = DTC(criterion='entropy', random_state=404).fit(X_train, y_train)

# Take the labels from the fitted tree's own class order and decode them with
# the encoder, so each node is labelled with the class it actually predicts
# (also correct if the training split happens to miss a class).
full_tree_labels = [str(label) for label in LE.inverse_transform(full_tree.classes_.astype(int))]

fig = plt.figure(figsize=(40,40), dpi=400)
_ = tree.plot_tree(full_tree,
               feature_names=list(X_train.columns),  # plain list rather than a pandas Index
               class_names=full_tree_labels,
               filled=True)

In [None]:
####### K-fold cross-validation
def kfold_depth_search(features, target, fold_counts=range(20, 101, 20),
                       depths=range(5, 14), random_state=404):
    """Grid-search the number of folds and the tree depth by mean CV accuracy.

    For every combination of `fold_counts` (KFold n_splits, unshuffled) and
    `depths` (max_depth of an entropy decision tree) the mean test accuracy
    over the folds is computed; the combination with the highest mean wins
    (the first one encountered wins ties).

    Parameters
    ----------
    features, target : feature table and label vector passed to cross_validate.
    fold_counts : iterable of int, values tried for KFold(n_splits=...).
    depths : iterable of int, values tried for max_depth.
    random_state : seed given to every tree so repeated runs agree.

    Returns
    -------
    (best_depth, best_mean, best_k, best_estimator)
        best_estimator is the already-fitted tree from the single fold with the
        highest test score inside the winning combination (ties go to the
        highest fold index). It was trained on that fold's training part only.

    Raises
    ------
    ValueError
        If no combination produced a mean accuracy above 0.
    """
    best_depth, best_mean, best_k, best_estimator = 0, 0, 0, None
    for k in fold_counts:
        cv = KFold(n_splits=k, shuffle=False)
        for depth in depths:
            cv_scores = cross_validate(
                DTC(max_depth=depth, criterion='entropy', random_state=random_state),
                X=features, y=target, cv=cv, scoring='accuracy', return_estimator=True)
            fold_scores = cv_scores['test_score']
            mean_score = np.mean(fold_scores)
            if best_mean < mean_score:
                best_mean, best_depth, best_k = mean_score, depth, k
                best_fold = max((score, fold) for fold, score in enumerate(fold_scores))[1]
                best_estimator = cv_scores['estimator'][best_fold]
    if best_estimator is None:
        raise ValueError('no fold/depth combination reached a mean accuracy above 0')
    return best_depth, best_mean, best_k, best_estimator


# Search on the training split, then score the selected estimator on the held-out test split.
indexi, bestmean, betterk, DTCestimator = kfold_depth_search(X_train, y_train)
# Report the selected depth (indexi), not the last value of the loop variable.
print('best depth is: ', indexi, ' having the best score of: ', bestmean, ' and the num of folders is: ', betterk)
print('Accuracy in test', '--->', accuracy_score(y_test,DTCestimator.predict(X_test)))

# Same search on the whole dataset. The accuracy below is measured on rows the
# selected estimator was largely trained on, so it is an optimistic figure.
indexi, bestmean, betterk, DTCestimator = kfold_depth_search(X, Y)
print('best depth is: ', indexi, ' having the best score of: ', bestmean, ' and the num of folders is: ', betterk)
print('Accuracy within the whole dataset', '--->', accuracy_score(Y,DTCestimator.predict(X)))
print('')

In [None]:
####### Hold-out depth selection (single train/test split, not cross-validation)

# Fit one entropy tree per candidate depth on the training split and record
# its accuracy on the test split together with the fitted tree.
maxDep = {}
for depth in range(5,14):
    fitted_tree = DTC(max_depth=depth, criterion='entropy', random_state=404).fit(X_train, y_train)
    maxDep[depth] = (accuracy_score(y_test, fitted_tree.predict(X_test)), fitted_tree)

# Best depth by test accuracy; on ties the shallowest depth wins (first maximum
# in insertion order). NOTE: the depth is chosen on the test split itself, so
# the printed score is an optimistic estimate.
K, (A, TR) = max(maxDep.items(), key=lambda item: item[1][0])

# Decode labels from the tree's own class order so nodes are labelled correctly.
best_tree_labels = [str(label) for label in LE.inverse_transform(TR.classes_.astype(int))]

fig = plt.figure(figsize=(40,40), dpi=400)
_ = tree.plot_tree(TR,
               feature_names=list(X_train.columns),  # plain list rather than a pandas Index
               class_names=best_tree_labels,
               filled=True)

print('depth is: ', K, ' with a score of: ', A)

In [None]:
# graphviz is only present as a commented-out import in the notebook's import
# cell, so `graphviz.Source` would raise NameError; import it where it is used.
# Requires the `graphviz` Python package and the Graphviz binaries.
import graphviz

# Render the selected tree `TR` as a Graphviz graph. Class labels are decoded
# from the tree's own class order so each node shows the class it predicts.
dot_data = tree.export_graphviz(
    TR,
    out_file=None,
    feature_names=list(X.columns),
    class_names=[str(label) for label in LE.inverse_transform(TR.classes_.astype(int))],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Parameters
plot_colors = "rybg"      # one matplotlib colour letter per class, reused cyclically
plot_step = 0.02          # desired grid spacing for the decision surface
max_grid_points = 300     # cap per axis so wide-ranged features cannot exhaust memory
grid_columns = 3          # subplots per row

# Encoded classes actually present in the target and their readable labels
# (decoded through the encoder so legend entries match the encoded values).
class_codes = np.unique(Y)
n_classes = len(class_codes)
class_labels = [str(label) for label in LE.inverse_transform(class_codes.astype(int))]

# Feature index pairs [first, second] with first < second; `alpha` trims the
# range of `first` to the leading (1 - alpha) share of the columns.
alpha = 0.1
n_features = X.shape[1]
n_first = int(n_features - n_features*alpha)
all_comb = [[first, second] for first in range(n_first) for second in range(first + 1, n_features)]


def _grid_axis(values, step=plot_step, max_points=max_grid_points, pad=1.0):
    """Return evenly spaced coordinates covering `values` padded by `pad` on both sides.

    The spacing is about `step`, but the number of points never exceeds `max_points`.
    """
    lower = float(values.min()) - pad
    upper = float(values.max()) + pad
    n_points = int(min(np.ceil((upper - lower) / step), max_points))
    return np.linspace(lower, upper, max(n_points, 2))


# Size the subplot grid from the number of pairs instead of a fixed 2x3 layout.
grid_rows = max(1, -(-len(all_comb) // grid_columns))
fig, axes = plt.subplots(grid_rows, grid_columns,
                         figsize=(5 * grid_columns, 4 * grid_rows), squeeze=False)
flat_axes = axes.ravel()

for ax, pair in zip(flat_axes, all_comb):
    # We only take the two corresponding features
    pair_aux = [X.columns[pair[0]], X.columns[pair[1]]]
    X_aux = X.loc[:, pair_aux]
    y_aux = Y

    # Train on the two selected features only, as a plain array so the model
    # can later predict on the (unnamed) mesh-grid points.
    DTCfitted = DecisionTreeClassifier(random_state=404).fit(X_aux.to_numpy(), y_aux)

    # Plot the decision boundary
    xx, yy = np.meshgrid(_grid_axis(X_aux.iloc[:, 0]), _grid_axis(X_aux.iloc[:, 1]))
    Z = DTCfitted.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    cs = ax.contourf(xx, yy, Z, cmap=plt.cm.RdYlBu)

    ax.set_xlabel(pair_aux[0])
    ax.set_ylabel(pair_aux[1])

    # Plot the training points, one scatter call per class
    for position, (code, label) in enumerate(zip(class_codes, class_labels)):
        in_class = (y_aux == code).to_numpy()
        ax.scatter(X_aux.iloc[in_class, 0], X_aux.iloc[in_class, 1],
                   color=plot_colors[position % len(plot_colors)], label=label,
                   edgecolor='black', s=15)
    ax.axis("tight")

# Hide grid cells that have no feature pair to show.
for unused_ax in flat_axes[len(all_comb):]:
    unused_ax.set_visible(False)

fig.suptitle("Decision surface of a decision tree using paired features")
if all_comb:
    # Legend on the last populated subplot.
    flat_axes[len(all_comb) - 1].legend(loc='lower right', borderpad=0, handletextpad=0)
fig.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)

# Finally, draw a tree trained on all features of the whole dataset.
plt.figure()
DTCfitted = DecisionTreeClassifier(random_state=404).fit(X, Y)
plot_tree(DTCfitted, filled=True)
plt.show()