In [2]:
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plot
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

In [3]:
dataset = pd.read_csv('datasets/ads.csv')
X = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/4, random_state=0
)

In [6]:
scaler_x = StandardScaler()
X_train = scaler_x.fit_transform(X_train)
X_test = scaler_x.transform(X_test)
X_train



array([[ 0.58164944, -0.88670699],
       [-0.60673761,  1.46173768],
       [-0.01254409, -0.5677824 ],
       [-0.60673761,  1.89663484],
       [ 1.37390747, -1.40858358],
       [ 1.47293972,  0.99784738],
       [ 0.08648817, -0.79972756],
       [-0.01254409, -0.24885782],
       [-0.21060859, -0.5677824 ],
       [-0.21060859, -0.19087153],
       [-0.30964085, -1.29261101],
       [-0.30964085, -0.5677824 ],
       [ 0.38358493,  0.09905991],
       [ 0.8787462 , -0.59677555],
       [ 2.06713324, -1.17663843],
       [ 1.07681071, -0.13288524],
       [ 0.68068169,  1.78066227],
       [-0.70576986,  0.56295021],
       [ 0.77971394,  0.35999821],
       [ 0.8787462 , -0.53878926],
       [-1.20093113, -1.58254245],
       [ 2.1661655 ,  0.93986109],
       [-0.01254409,  1.22979253],
       [ 0.18552042,  1.08482681],
       [ 0.38358493, -0.48080297],
       [-0.30964085, -0.30684411],
       [ 0.97777845, -0.8287207 ],
       [ 0.97777845,  1.8676417 ],
       [-0.01254409,

In [12]:
# The criterion "entropy" focuses on information entropy instead of the
# default looking at minority information.
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier = classifier.fit(X_train, y_train)

In [13]:
# in order to see the accuracy of this model we use a confusion matrix
y_prediction = classifier.predict(X_test)
matrix = confusion_matrix(y_test, y_prediction)
matrix

array([[62,  6],
       [ 3, 29]])

In [8]:
# now we create a plotting function and plot our train and test sets
def plot_classifier(X_set, y_set, set_description='Training'):
    """
    We visualise the decision boundary. First create a new meshgrid from
    our test set and fill it with datapoints for every value of 0.01
    in between our min and max of the first and second column.
    Subtracting and adding 1 to each, so our datapoints don't 
    get squashed up to the sides of the graph.
    """
    X1, X2 = np.meshgrid(
        np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
        np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01)
    )
    # we then go over each data point in our new mesh and predict if the value is 0 or 1 and apply
    # a color to it.
    plot.contourf(
        X1, X2, 
        classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
        alpha=0.75,
        cmap=ListedColormap(('red', 'green'))
     )
    # we set the limits of the graph to the limits of our mesh grid.
    plot.xlim(X1.min(), X1.max())
    plot.ylim(X2.min(), X2.max())

    # and add our training set data points.
    for i, j in enumerate(np.unique(y_set)):
        plot.scatter(
            X_set[y_set == j, 0],
            X_set[y_set == j, 1],
            c=ListedColormap(('red', 'green'))(i),
            label=j
        )

    plot.title(f'Decision Tree classification ({set_description})')
    plot.xlabel('Age')
    plot.ylabel('Estimated salary')
    plot.legend()
    plot.show()
print(y_train)
plot_classifier(X_train, y_train)
plot_classifier(X_test, y_test, 'Testing')

[0 1 0 1 1 1 0 0 0 0 0 0 1 1 1 0 1 0 0 1 0 1 0 1 0 0 1 1 1 1 0 1 0 1 0 0 1
 0 0 1 0 0 0 0 0 1 1 1 1 0 0 0 1 0 1 0 1 0 0 1 0 0 0 1 0 0 0 1 1 0 0 1 0 1
 1 1 0 0 1 1 0 0 1 1 0 1 0 0 1 1 0 1 1 1 0 0 0 0 0 1 0 0 1 1 1 1 1 0 1 1 0
 1 0 0 0 0 0 0 0 1 1 0 0 1 0 0 1 0 0 0 1 0 1 1 0 1 0 0 0 0 1 0 0 0 1 1 0 0
 0 0 1 0 1 0 0 0 1 0 0 0 0 1 1 1 0 0 0 0 0 0 1 1 1 1 1 0 1 0 0 0 0 0 1 0 0
 0 0 0 0 1 1 0 1 0 1 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 0 0
 0 1 1 0 0 0 0 1 0 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 1 1 0 0 0
 0 0 1 0 1 1 0 0 0 0 0 1 0 1 0 0 1 0 0 1 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1
 0 0 0 0]


NameError: name 'classifier' is not defined