## Part 3: Decision Tree

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import scipy
import sklearn
from sklearn.metrics import accuracy_score, classification_report, precision_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree

def unpickle(file):
    """Load and return the object pickled in ``file``.

    The file is opened in binary mode and decoded with ``encoding='bytes'``,
    so 8-bit strings stored by the pickler come back as ``bytes`` (which is
    why callers index the result with keys such as ``b'data'``).

    NOTE: unpickling can execute arbitrary code; only call this on files
    from a trusted source.
    """
    import pickle

    with open(file, 'rb') as handle:
        return pickle.load(handle, encoding='bytes')

# Relative path of the folder holding the CIFAR-10 batch files. File names are
# appended to it by plain string concatenation, so the trailing slash is required.
folder_dir = 'cifar-10-batches-py/' 

### Loading CIFAR Data

In [4]:
# Relative path of the folder holding the CIFAR-10 batch files; file names are
# appended by string concatenation, so the trailing slash is required.
folder_dir = 'cifar-10-batches-py/'

# Human-readable class names, decoded from the bytes stored in the metadata file.
metadata_dict = unpickle(folder_dir + "batches.meta")
label_names = np.array([label.decode('ascii') for label in metadata_dict[b'label_names']])

# Collect the five training batches and stack them once at the end: calling
# np.vstack inside the loop re-copies everything loaded so far on every pass.
train_batches = []
cifar_train_filenames = []
cifar_train_labels = []
for i in range(1, 6):
    cifar_train_data_dict = unpickle(folder_dir + "data_batch_" + str(i))
    train_batches.append(cifar_train_data_dict[b'data'])
    # Mirrors how the test batch's file names are kept below.
    cifar_train_filenames += cifar_train_data_dict[b'filenames']
    cifar_train_labels += cifar_train_data_dict[b'labels']
cifar_train_data = np.vstack(train_batches)

# Channels-last view (N, 32, 32, 3) of the training images; only needed for plt.imshow().
cifar_train_data_rolled = np.rollaxis(cifar_train_data.reshape(-1, 3, 32, 32), 1, 4)

cifar_test_data_dict = unpickle(folder_dir + "test_batch")
cifar_test_data = cifar_test_data_dict[b'data']
cifar_test_filenames = cifar_test_data_dict[b'filenames']
cifar_test_labels = cifar_test_data_dict[b'labels']

# The classifier consumes one flat feature vector per image. The sample count
# is taken from the data itself instead of being hard-coded.
cifar_train_data = cifar_train_data.reshape(len(cifar_train_data), -1)
cifar_test_data = cifar_test_data.reshape(len(cifar_test_data), -1)

# Every image must have exactly one label.
assert len(cifar_train_labels) == len(cifar_train_data)
assert len(cifar_test_labels) == len(cifar_test_data)

print(cifar_train_data.shape)
print(cifar_test_data.shape)

(50000, 3072)
(10000, 3072)


### Rescaling from [0,255] to [-1,1]

In [5]:
# Map pixel values from [0, 255] onto [-1, 1]: 0 -> -1, 127.5 -> 0, 255 -> 1.
X_train = (cifar_train_data / 127.5) - 1
X_test = (cifar_test_data / 127.5) - 1

# Labels arrive as plain Python lists; turn them into arrays for scikit-learn.
y_train = np.array(cifar_train_labels)
y_test = np.array(cifar_test_labels)

# Report the shapes in the same order as before.
for _caption, _array in (
    ("X_train.shape", X_train),
    ("X_test.shape", X_test),
    ("y_test.shape", y_test),
    ("y_train.shape", y_train),
):
    print(_caption, _array.shape)

X_train.shape (50000, 3072)
X_test.shape (10000, 3072)
y_test.shape (10000,)
y_train.shape (50000,)


### Instantiate the Decision Tree model

In [6]:
# Base estimator for the hyperparameter search. scikit-learn permutes the
# candidate features at every split, so without a fixed random_state two runs
# can grow different trees; seeding makes the results reproducible.
tree = DecisionTreeClassifier(random_state=0)

### Train the Decision Tree model (randomized hyperparameter search)

In [5]:
# Search space: max_depth in 1..74 and min_samples_split in 2..49
# (scikit-learn rejects an integer min_samples_split below 2).
param_distribution = {'max_depth': range(1, 75), 'min_samples_split': range(2, 50)}

# Draw 100 random parameter combinations and score each with 2-fold
# cross-validation; random_state fixes which combinations are drawn.
# fit() returns the search object itself, so best_tree is the fitted
# RandomizedSearchCV (not a bare tree); the winning tree is exposed as
# best_tree.best_estimator_.
best_tree = RandomizedSearchCV(
    tree,
    param_distribution,
    n_iter=100,
    cv=2,
    random_state=0,
    verbose=3,
).fit(X_train, y_train)

print('best Hyperparameters = ' + str(best_tree.best_params_))


Fitting 2 folds for each of 70 candidates, totalling 140 fits
[CV 1/2] END max_depth=40, min_samples_split=41;, score=0.273 total time=  59.4s
[CV 2/2] END max_depth=40, min_samples_split=41;, score=0.275 total time= 1.0min
[CV 1/2] END ..max_depth=6, min_samples_split=8;, score=0.268 total time=  26.8s
[CV 2/2] END ..max_depth=6, min_samples_split=8;, score=0.266 total time=  26.7s
[CV 1/2] END ...max_depth=48, min_samples_split=1;, score=nan total time=   0.5s
[CV 2/2] END ...max_depth=48, min_samples_split=1;, score=nan total time=   0.5s
[CV 1/2] END max_depth=34, min_samples_split=49;, score=0.275 total time= 1.0min
[CV 2/2] END max_depth=34, min_samples_split=49;, score=0.278 total time= 1.3min
[CV 1/2] END ..max_depth=9, min_samples_split=7;, score=0.287 total time=  43.8s
[CV 2/2] END ..max_depth=9, min_samples_split=7;, score=0.283 total time=  49.3s
[CV 1/2] END max_depth=24, min_samples_split=24;, score=0.267 total time= 1.3min
[CV 2/2] END max_depth=24, min_samples_split=24

2 fits failed out of a total of 140.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\axels\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\axels\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\tree\_classes.py", line 937, in fit
    super().fit(
  File "C:\Users\axels\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\tree\_classes.py", line 250, in fit
    raise ValueError(
ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the int

best Hyperparameters = {'min_samples_split': 49, 'max_depth': 10}


### Testing the DecisionTree model

In [6]:
# Accuracy of the best estimator found by the search on the held-out test set.
print(best_tree.score(X_test, y_test))
print(best_tree)
# np.int was deprecated in NumPy 1.20 and removed in 1.24; the builtin int is
# the drop-in replacement and yields the same [2 4 8 16] array.
print(np.logspace(start=1, stop=4, base=2, num=4, dtype=int))

0.3065
RandomizedSearchCV(cv=2, estimator=DecisionTreeClassifier(), n_iter=70,
                   param_distributions={'max_depth': range(1, 50),
                                        'min_samples_split': range(1, 50)},
                   random_state=0, verbose=3)
[ 2  4  8 16]


### Visualizing the Decision Tree Classifier Results

In [7]:
def plot_decision_tree(decision_tree_classifier, tree_depth, plot_depth, figsize=(25,10)):
    """Draw the top layers of a fitted decision tree with matplotlib.

    Parameters
    ----------
    decision_tree_classifier
        Fitted tree estimator, handed to scikit-learn's ``plot_tree``.
    tree_depth
        Value shown in the figure title as the depth of the tree.
    plot_depth
        Number of layers to draw, passed to ``plot_tree`` as ``max_depth``.
    figsize : tuple, default (25, 10)
        Figure size in inches.
    """
    fig, ax = plt.subplots(figsize=figsize, dpi=350)
    # `rotate` is intentionally not passed: it had no effect on the matplotlib
    # rendering and was removed from plot_tree in scikit-learn 1.0, where
    # passing it raises a TypeError.
    plot_tree(
        decision_tree=decision_tree_classifier,
        max_depth=plot_depth,    # only plot the top plot_depth layers
        fontsize=8,
        ax=ax)
    ax.set_title('Decision Tree Classifier (depth = ' + str(tree_depth) + ')')

    plt.show()

In [None]:
# best_tree is the fitted RandomizedSearchCV wrapper, which has no get_depth()
# and cannot be drawn directly. The tree refit with the best hyperparameters
# is available as best_estimator_.
fitted_tree = best_tree.best_estimator_
tree_depth = fitted_tree.get_depth()
plot_depth = 2  # draw only the top two layers to keep the figure readable

plot_decision_tree(fitted_tree, tree_depth=tree_depth, plot_depth=plot_depth)