In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import tree, metrics
from sklearn.model_selection import train_test_split

#import graphviz

In [2]:
data = pd.read_csv('https://raw.githubusercontent.com/NeilPandey/Test_Dataset/master/car.data',
names=['buying','maint','doors','persons','lug_boot','safety','class1'])
data.head(10)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class1
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
5,vhigh,vhigh,2,2,med,high,unacc
6,vhigh,vhigh,2,2,big,low,unacc
7,vhigh,vhigh,2,2,big,med,unacc
8,vhigh,vhigh,2,2,big,high,unacc
9,vhigh,vhigh,2,4,small,low,unacc


In [3]:
data.dtypes

buying      object
maint       object
doors       object
persons     object
lug_boot    object
safety      object
class1      object
dtype: object

In [4]:
data.class1.unique()

array(['unacc', 'acc', 'vgood', 'good'], dtype=object)

In [5]:
data.isna().sum()

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
class1      0
dtype: int64

In [6]:
data.shape

(1728, 7)

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
buying      1728 non-null object
maint       1728 non-null object
doors       1728 non-null object
persons     1728 non-null object
lug_boot    1728 non-null object
safety      1728 non-null object
class1      1728 non-null object
dtypes: object(7)
memory usage: 94.6+ KB


In [8]:
data['class1'],class_names = pd.factorize(data['class1'])

In [9]:
print(class_names)

Index(['unacc', 'acc', 'vgood', 'good'], dtype='object')


In [10]:
print(data['class1'].unique())

[0 1 2 3]


In [11]:
data['buying'],_ = pd.factorize(data['buying'])

In [12]:
data['maint'],_ = pd.factorize(data['maint'])
data['doors'],_ = pd.factorize(data['doors'])
data['persons'],_ = pd.factorize(data['persons'])
data['lug_boot'],_ = pd.factorize(data['lug_boot'])
data['safety'],_ = pd.factorize(data['safety'])

In [13]:
data.head(10)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class1
0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0
2,0,0,0,0,0,2,0
3,0,0,0,0,1,0,0
4,0,0,0,0,1,1,0
5,0,0,0,0,1,2,0
6,0,0,0,0,2,0,0
7,0,0,0,0,2,1,0
8,0,0,0,0,2,2,0
9,0,0,0,1,0,0,0


In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
buying      1728 non-null int64
maint       1728 non-null int64
doors       1728 non-null int64
persons     1728 non-null int64
lug_boot    1728 non-null int64
safety      1728 non-null int64
class1      1728 non-null int64
dtypes: int64(7)
memory usage: 94.6 KB


In [15]:
#DATA PREPN
X = data.iloc[:,0:-1]

In [16]:
y = data.iloc[:,-1]

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.25, 
                                                    random_state=0)

In [18]:
dtree = tree.DecisionTreeClassifier(criterion='entropy', random_state=0)

In [19]:
dtree.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [20]:
y_pred = dtree.predict(X_test)
len(y_pred)

432

In [21]:
count_misclassified = (y_test != y_pred).sum()

In [22]:
print('Misclassified samples: {}'.format(count_misclassified))

Misclassified samples: 12


In [23]:
accuracy = metrics.accuracy_score(y_test, y_pred)

In [24]:
print('Accuracy: {:.2f}'.format(accuracy))

Accuracy: 0.97


In [25]:
X_test.head(5)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
1318,3,0,0,2,1,1
124,0,1,0,1,2,1
648,1,2,0,0,0,0
249,0,2,1,0,2,0
1599,3,2,3,0,2,0


In [26]:
y_test.head(5)

1318    0
124     0
648     0
249     0
1599    0
Name: class1, dtype: int64

In [27]:
dtree.predict([[2,1,0,1,0,2]])

array([1], dtype=int64)

In [28]:
tree.plot_tree(dtree)

AttributeError: module 'sklearn.tree' has no attribute 'plot_tree'

In [31]:
tree.export_graphviz(dtree)

digraph Tree {
node [shape=box] ;
0 [label="X[5] <= 0.5\nentropy = 1.191\nsamples = 1296\nvalue = [914, 285, 49, 48]"] ;
1 [label="entropy = 0.0\nsamples = 446\nvalue = [446, 0, 0, 0]"] ;
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
2 [label="X[3] <= 0.5\nentropy = 1.474\nsamples = 850\nvalue = [468, 285, 49, 48]"] ;
0 -> 2 [labeldistance=2.5, labelangle=-45, headlabel="False"] ;
3 [label="entropy = 0.0\nsamples = 286\nvalue = [286, 0, 0, 0]"] ;
2 -> 3 ;
4 [label="X[0] <= 1.5\nentropy = 1.633\nsamples = 564\nvalue = [182, 285, 49, 48]"] ;
2 -> 4 ;
5 [label="X[1] <= 0.5\nentropy = 1.0\nsamples = 290\nvalue = [148, 142, 0, 0]"] ;
4 -> 5 ;
6 [label="entropy = 0.0\nsamples = 66\nvalue = [66, 0, 0, 0]"] ;
5 -> 6 ;
7 [label="X[4] <= 0.5\nentropy = 0.948\nsamples = 224\nvalue = [82, 142, 0, 0]"] ;
5 -> 7 ;
8 [label="X[5] <= 1.5\nentropy = 0.927\nsamples = 73\nvalue = [48, 25, 0, 0]"] ;
7 -> 8 ;
9 [label="entropy = 0.0\nsamples = 36\nvalue = [36, 0, 0, 0]"] ;
8 -> 9 ;
10 [labe

In [None]:
fn=['buying','maint','doors','persons','lug_boot','safety']
cn=['unacc', 'acc', 'vgood', 'good']
fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (4,4), dpi=300)
tree.plot_tree(dtree,
               feature_names = fn, 
               class_names=cn,
               filled = True);
fig.savefig('imagename.png')