## Decision Trees Classifier

### Loading the Data 

In [1]:
import pandas as pd
import numpy as np 

#### Dataset link: https://archive.ics.uci.edu/dataset/19/car+evaluation

#. Attribute Values:

   buying: v-high, high, med, low
   maint: v-high, high, med, low
   doors: 2, 3, 4, 5-more
   persons: 2, 4, more
   lug_boot: small, med, big
   safety: low, med, high

In [2]:
df = pd.read_csv('./Data/car.data')
df.head()
#notice that there is no column information

Unnamed: 0,vhigh,vhigh.1,2,2.1,small,low,unacc
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


In [3]:
#lets create the headers manually as a list + class
headers = [
'buying',
'maint',
'doors',
'persons',
'lug_boot',
'safety',
'class']

In [4]:
#chet the created list to verify if everything is correct 
headers

['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']

In [5]:
# put the new created headers with the table of data 
df.columns = headers
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


### Preliminary Analysis of Data 

In [6]:
#shape of the dataset 
df.shape
#rows = 1727 
#columns = 7

(1727, 7)

In [7]:
#info
df.info()
#all of them are object / string and we need numeric data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1727 entries, 0 to 1726
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1727 non-null   object
 1   maint     1727 non-null   object
 2   doors     1727 non-null   object
 3   persons   1727 non-null   object
 4   lug_boot  1727 non-null   object
 5   safety    1727 non-null   object
 6   class     1727 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [8]:
#describe
df.describe()
#count = for all 1727
#unique = there are only 3 or 4 values
#calls columns is that we want to predict 

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
count,1727,1727,1727,1727,1727,1727,1727
unique,4,4,4,3,3,3,4
top,high,high,3,4,med,med,unacc
freq,432,432,432,576,576,576,1209


In [10]:
#lets see if there null values
df.isnull().any()

buying      False
maint       False
doors       False
persons     False
lug_boot    False
safety      False
class       False
dtype: bool

In [11]:
df.isnull().sum()

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
class       0
dtype: int64

In [12]:
#look at the class column and unique values 
df['class'].unique()

array(['unacc', 'acc', 'vgood', 'good'], dtype=object)

In [14]:
#even better, call the value_counts
df['class'].value_counts()

class
unacc    1209
acc       384
good       69
vgood      65
Name: count, dtype: int64

### Preprocessing

In [15]:
#the data frame 
df.head()
#how to convert the data into numeric data?

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


In [17]:
#doors column
df['doors'].value_counts()
#notice the 5more, as string

doors
3        432
4        432
5more    432
2        431
Name: count, dtype: int64

In [18]:
df['doors']
#datatype = object

0           2
1           2
2           2
3           2
4           2
        ...  
1722    5more
1723    5more
1724    5more
1725    5more
1726    5more
Name: doors, Length: 1727, dtype: object

In [19]:
#call again the df
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


#### ...preprocessing

In [21]:
#OrdinalEncoder 
from sklearn.preprocessing import OrdinalEncoder

In [22]:
#create the instance
enc = OrdinalEncoder()

In [25]:
#create the fit_transform method and transform all the column without class column 
ar = enc.fit_transform(df.drop('class', axis=1))
#result = a numpy array

In [26]:
#lets tranform the numpy array into a pandas DF
dft = pd.DataFrame(ar, columns=headers[:-1]) #exlude class column
dft.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,3.0,3.0,0.0,0.0,2.0,2.0
1,3.0,3.0,0.0,0.0,2.0,0.0
2,3.0,3.0,0.0,0.0,1.0,1.0
3,3.0,3.0,0.0,0.0,1.0,2.0
4,3.0,3.0,0.0,0.0,1.0,0.0


In [27]:
df.head()
# 7 columns
# contain string values 

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


In [28]:
#dft.info()
dft.info()
# all are numeric values
# 6 columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1727 entries, 0 to 1726
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   buying    1727 non-null   float64
 1   maint     1727 non-null   float64
 2   doors     1727 non-null   float64
 3   persons   1727 non-null   float64
 4   lug_boot  1727 non-null   float64
 5   safety    1727 non-null   float64
dtypes: float64(6)
memory usage: 81.1 KB


### Decision Tree Classifier

In [29]:
dft.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,3.0,3.0,0.0,0.0,2.0,2.0
1,3.0,3.0,0.0,0.0,2.0,0.0
2,3.0,3.0,0.0,0.0,1.0,1.0
3,3.0,3.0,0.0,0.0,1.0,2.0
4,3.0,3.0,0.0,0.0,1.0,0.0


In [30]:
# X as attribute 
X = dft

In [31]:
# y as target (df because in dft we dont have class column)
y = df['class']

#### ...split them now 

In [32]:
#train_test_split
from sklearn.model_selection import train_test_split

##### train_test_split() / shift + tab to find the line that we will use...
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [33]:
#create the training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


In [34]:
#import the algorithm = Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

In [35]:
#after that, Create an object and the instance 
tree = DecisionTreeClassifier()
#gini is by default but its work also with entropy, same.

#### ...train the model

In [36]:
#train the model
tree.fit(X_train, y_train)
#the model is ready to make predictions

In [37]:
#make predictions
y_pred = tree.predict(X_test)

In [38]:
#check how good the model is...classification_report, confusion_matrix
from sklearn.metrics import classification_report, confusion_matrix

In [40]:
print(classification_report(y_test, y_pred))
# the result show = really good classifier

              precision    recall  f1-score   support

         acc       0.98      0.94      0.96       127
        good       0.79      0.83      0.81        18
       unacc       0.99      1.00      1.00       399
       vgood       0.89      0.92      0.91        26

    accuracy                           0.98       570
   macro avg       0.91      0.93      0.92       570
weighted avg       0.98      0.98      0.98       570



In [41]:
#confusion matrix
print(confusion_matrix(y_test, y_pred))
#we have four classes, it will only 2 by 2 matrix 
# accuracy very high = 120,15,399,24= values)

[[120   2   3   2]
 [  2  15   0   1]
 [  0   0 399   0]
 [  0   2   0  24]]


## Random Forest Classifier

In [42]:
#call the random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

In [43]:
#lets create the random forest classifier object and the instance 
rfc = RandomForestClassifier()

In [44]:
#train the model /object
rfc.fit(X_train, y_train)

In [45]:
#lets make some predictions
y_pred_rfc = rfc.predict(X_test)

In [46]:
#call again the classification_report and confusion_matrix
from sklearn.metrics import classification_report, confusion_matrix

In [47]:
#check the classification_report for random forest
print(classification_report(y_test, y_pred_rfc)) #y_pred_rfc
#this is the classification report 
#notice the weighted avg 0.96

              precision    recall  f1-score   support

         acc       0.94      0.89      0.91       127
        good       0.86      0.67      0.75        18
       unacc       0.98      0.99      0.98       399
       vgood       0.80      0.92      0.86        26

    accuracy                           0.96       570
   macro avg       0.89      0.87      0.88       570
weighted avg       0.96      0.96      0.96       570



In [48]:
# print the classification report for the decision tree
print(classification_report(y_test, y_pred)) #y_pred
#notice the weighted avg 0.98
#quit better 

              precision    recall  f1-score   support

         acc       0.98      0.94      0.96       127
        good       0.79      0.83      0.81        18
       unacc       0.99      1.00      1.00       399
       vgood       0.89      0.92      0.91        26

    accuracy                           0.98       570
   macro avg       0.91      0.93      0.92       570
weighted avg       0.98      0.98      0.98       570



In [49]:
#confusion_matrix for Random forest
print(confusion_matrix(y_test, y_pred_rfc))

[[113   2  10   2]
 [  2  12   0   4]
 [  3   0 396   0]
 [  2   0   0  24]]


In [50]:
#confusion matrix for the Decision Tree 
print(confusion_matrix(y_test, y_pred))

[[120   2   3   2]
 [  2  15   0   1]
 [  0   0 399   0]
 [  0   2   0  24]]
