In [19]:
# Decision Tree
# Import everything you need
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix


In [268]:
#### Load the data ... 
dataset = load_wine()

# Split it in train and test set (mind the value for test_size and random_state- how does they affect your model?)
X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.40, random_state=42)

# Initialize a model and train it (yes, it is actually just the *.fit() call)
dt_wine = tree.DecisionTreeClassifier(criterion="gini")
dt_wine = dt_wine.fit(X_train,y_train)

# Compute the predictions with your trained model
y_pred = dt_wine.predict(X_test)

# Get a first impression: How did it work?
test_accuracy= metrics.accuracy_score(y_test, y_pred)
print("Accuracy: ", '%.2f'% (test_accuracy*100),"%")


Accuracy:  94.44 %


In [269]:
# You almost never get 100% accuracy; to check where the error may come from use a confusion matrix
print(confusion_matrix(y_test, y_pred))
    

[[23  3  0]
 [ 0 27  0]
 [ 1  0 18]]


In [204]:
# You can calculate true positive etc. from the confusion matrix....or use scikit to do it for you. What is the difference
# between precision and recall (do not mind micro avg. etc.)

print(classification_report(y_test, y_pred, 
                   target_names=dataset.target_names))

              precision    recall  f1-score   support

     class_0       0.81      0.96      0.88        26
     class_1       0.81      0.78      0.79        27
     class_2       0.93      0.74      0.82        19

    accuracy                           0.83        72
   macro avg       0.85      0.83      0.83        72
weighted avg       0.84      0.83      0.83        72



In [5]:
# You think the model could do better? You called the decision tree with default values; anything you would like to change?
# If so, train the model with a different configuration and test it again. Will your evaluation metric change? If so, how?

tree.DecisionTreeClassifier()

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [None]:
# integrate with crossvalidation

#### Load the data ... 
dataset = load_wine()

# So just import it instead of 'train_test_split'
from sklearn.model_selection import cross_val_score

# Get some data, use X for the datapoints and y for the label (or rename but then change accordingly)

# Define a classifier

dt_wine = tree.DecisionTreeClassifier()

# Here, the k-fold crossvalidation starts with k=10; you do not pass separated train and test sets anymore 
scores = cross_val_score(dt

In [270]:
# integrate with crossvalidation

#### Load the data ... 
dataset = load_wine()

# So just import it instead of 'train_test_split'
from sklearn.model_selection import cross_val_score

# Get some data, use X for the datapoints and y for the label (or rename but then change accordingly)

# Define a classifier

dt_wine = tree.DecisionTreeClassifier()

# Here, the k-fold crossvalidation starts with k=10; you do not pass separated train and test sets anymore 
scores = cross_val_score(dt_wine, dataset.data, dataset.target, cv=10, scoring='accuracy')
# Prints all scores of all folds
print(scores)

# Prints the average, this is actually what you want and report in a paper
print(scores.mean(), scores.std())

[0.88888889 0.88888889 0.66666667 0.88888889 0.83333333 0.83333333
 1.         0.94444444 0.94117647 0.76470588]
0.865032679738562 0.0913519991478125


In [276]:
# integrate with grid search

'''An even easier approach (and even more "black box") can be achieved using grid search. 
*Adapt* this code for the integration into your experiments'''

# Import gridsearch with crossvalidation 
from sklearn.model_selection import GridSearchCV

# Get some data; please use X for the datapoints and y for the labels, or rename and change it accordingly

# create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(random_state=list(range(0, 50)), criterion=["gini", "entropy"])
print(param_grid)

dt_wine = tree.DecisionTreeClassifier()
# Setup the grid with your classifier and the default cross-validation value k=10
grid = GridSearchCV(dt_wine, param_grid, cv=10, scoring='accuracy')

# ... and train it as before
grid.fit(dataset.data, dataset.target)

# Get the mean accuracy over all folds
grid_mean_scores = grid.cv_results_['mean_test_score']
print(grid_mean_scores)

# It is also important to get the standard deviation per fold 
grid_std_score= grid.cv_results_['std_test_score']
print(grid_std_score)

# Even more high-level, we can just use 'pandas' to print it nicer
import pandas as pd
pd.DataFrame(grid.cv_results_)[['mean_test_score', 'std_test_score', 'params']]

# But usually, we are interested in the best fit, which returns the best achieved accuracy over all folds and the value
# for the parameter under investigation

print(grid.best_score_, grid.best_params_)

{'random_state': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49], 'criterion': ['gini', 'entropy']}
[0.87058824 0.86503268 0.87614379 0.86503268 0.85947712 0.85947712
 0.85947712 0.87058824 0.87058824 0.85947712 0.85392157 0.88169935
 0.85947712 0.87058824 0.85947712 0.88169935 0.86503268 0.87614379
 0.85947712 0.85947712 0.86503268 0.8872549  0.86503268 0.86503268
 0.87058824 0.87058824 0.86503268 0.87614379 0.85359477 0.86503268
 0.85392157 0.87058824 0.86503268 0.85392157 0.85359477 0.85359477
 0.87026144 0.87058824 0.87026144 0.88169935 0.86470588 0.87058824
 0.86503268 0.85915033 0.87058824 0.87058824 0.85947712 0.86470588
 0.85915033 0.87026144 0.89869281 0.89869281 0.92091503 0.8869281
 0.8875817  0.89313725 0.8872549  0.90424837 0.90424837 0.90424837
 0.89869281 0.89869281 0.89313725 0.89836601 0.89313725 0.89313725
 0.91535948 0.9042483