In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)


In [2]:
dataset = pd.read_csv("sample/car_evaluation.csv")

dataset.head(15)

Unnamed: 0,vhigh,vhigh.1,2,2.1,small,low,unacc
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc
5,vhigh,vhigh,2,2,big,low,unacc
6,vhigh,vhigh,2,2,big,med,unacc
7,vhigh,vhigh,2,2,big,high,unacc
8,vhigh,vhigh,2,4,small,low,unacc
9,vhigh,vhigh,2,4,small,med,unacc


In [3]:
dataset.columns

Index(['vhigh', 'vhigh.1', '2', '2.1', 'small', 'low', 'unacc'], dtype='object')

In [4]:
dataset.rename(columns = {'vhigh':'price_buying', 'vhigh.1':'price_maintanance','2':'doors','2.1':'seaters','small':'bootspace','low':'safety','unacc':'decision'}, inplace = True)

In [5]:
dataset.columns

Index(['price_buying', 'price_maintanance', 'doors', 'seaters', 'bootspace',
       'safety', 'decision'],
      dtype='object')

In [6]:
# summarize all null values

dataset.isnull().sum()


price_buying         0
price_maintanance    0
doors                0
seaters              0
bootspace            0
safety               0
decision             0
dtype: int64

In [7]:
categorical = [cat_var for cat_var in dataset.columns if dataset[cat_var].dtypes == 'O']
print(categorical)


['price_buying', 'price_maintanance', 'doors', 'seaters', 'bootspace', 'safety', 'decision']


In [8]:
numerical = [num_var for num_var in dataset.columns if dataset[num_var].dtypes != 'O']
print(numerical)

[]


In [9]:
dataset['price_buying'].value_counts()

high     432
low      432
med      432
vhigh    431
Name: price_buying, dtype: int64

In [10]:
dataset['price_maintanance'].value_counts()

high     432
low      432
med      432
vhigh    431
Name: price_maintanance, dtype: int64

In [11]:
dataset['doors'].value_counts()

4        432
5more    432
3        432
2        431
Name: doors, dtype: int64

In [12]:
dataset['seaters'].value_counts()

4       576
more    576
2       575
Name: seaters, dtype: int64

In [13]:
dataset['bootspace'].value_counts()

big      576
med      576
small    575
Name: bootspace, dtype: int64

In [14]:
dataset['safety'].value_counts()

high    576
med     576
low     575
Name: safety, dtype: int64

In [15]:
dataset['decision'].value_counts()

unacc    1209
acc       384
good       69
vgood      65
Name: decision, dtype: int64

In [16]:
# We have decided that the decision as the target variable

In [17]:
pip install category_encoders


Note: you may need to restart the kernel to use updated packages.


In [18]:
import category_encoders as ce
encoders = ce.OrdinalEncoder(cols = ['price_buying', 'price_maintanance', 'doors', 'seaters', 'bootspace', 'safety'])


In [19]:
x1 = dataset.drop(['decision'], axis = 1)
y1 = dataset['decision']

In [20]:
x1 = encoders.fit_transform(x1)

  elif pd.api.types.is_categorical(cols):


In [21]:
from sklearn.model_selection import train_test_split as ttst
x1_train, x1_test, y1_train, y1_test = ttst(x1,y1, test_size = 0.45, random_state = 0)



In [22]:
# default n_estimator
from sklearn.ensemble import RandomForestClassifier as RFC
rfc_default_classifier = RFC(random_state = 0)

rfc_default_classifier.fit(x1_train, y1_train)

RandomForestClassifier(random_state=0)

In [23]:
y_default_predict = rfc_default_classifier.predict(x1_test)

In [24]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

cm1 = confusion_matrix(y1_test,y_default_predict)
acs1 = accuracy_score(y1_test,y_default_predict) * 100
y_def_clsrpr = classification_report(y1_test,y_default_predict)

print(" The accuracy score is:  ",acs1," \n and the confusion matrix are: \n \n ", cm1, "\n \n \n \n \n \n "," Classification Report \n \n ",y_def_clsrpr)

 The accuracy score is:   96.7866323907455  
 and the confusion matrix are: 
 
  [[170   1   2   0]
 [  4  22   0   3]
 [  6   0 533   0]
 [  9   0   0  28]] 
 
 
 
 
 
   Classification Report 
 
                precision    recall  f1-score   support

         acc       0.90      0.98      0.94       173
        good       0.96      0.76      0.85        29
       unacc       1.00      0.99      0.99       539
       vgood       0.90      0.76      0.82        37

    accuracy                           0.97       778
   macro avg       0.94      0.87      0.90       778
weighted avg       0.97      0.97      0.97       778



In [25]:
# Let us see for 10 estimator

x2 = dataset.drop(['decision'], axis = 1)
y2 = dataset['decision']

In [26]:
x2 = encoders.fit_transform(x2)

  elif pd.api.types.is_categorical(cols):


In [27]:
from sklearn.model_selection import train_test_split as ttst
x2_train, x2_test, y2_train, y2_test = ttst(x2,y2, test_size = 0.45, random_state = 0)



In [28]:
from sklearn.ensemble import RandomForestClassifier as RFC
rfc_10_classifier = RFC(n_estimators = 10, random_state = 0)

rfc_10_classifier.fit(x1_train, y1_train)

RandomForestClassifier(n_estimators=10, random_state=0)

In [29]:
y_10_predict = rfc_10_classifier.predict(x2_test)

In [30]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

cm2 = confusion_matrix(y2_test,y_10_predict)
acs2 = accuracy_score(y2_test,y_10_predict) * 100
y_10_clsrpr = classification_report(y2_test,y_10_predict)

print(" The accuracy score is:  ",acs2," \n and the confusion matrix are: \n \n ", cm2, "\n \n \n \n \n \n "," Classification Report \n \n ",y_10_clsrpr)

 The accuracy score is:   96.65809768637533  
 and the confusion matrix are: 
 
  [[168   2   3   0]
 [  4  22   0   3]
 [  7   0 532   0]
 [  7   0   0  30]] 
 
 
 
 
 
   Classification Report 
 
                precision    recall  f1-score   support

         acc       0.90      0.97      0.94       173
        good       0.92      0.76      0.83        29
       unacc       0.99      0.99      0.99       539
       vgood       0.91      0.81      0.86        37

    accuracy                           0.97       778
   macro avg       0.93      0.88      0.90       778
weighted avg       0.97      0.97      0.97       778



In [31]:
x3= dataset.drop(['decision'], axis = 1)
y3 = dataset['decision']

In [32]:
x3 = encoders.fit_transform(x1)

  elif pd.api.types.is_categorical(cols):


In [33]:
from sklearn.model_selection import train_test_split as ttst
x3_train, x3_test, y3_train, y3_test = ttst(x3,y3, test_size = 0.45, random_state = 0)



In [34]:
from sklearn.ensemble import RandomForestClassifier as RFC
rfc_100_classifier = RFC(n_estimators = 100, random_state = 0)

rfc_100_classifier.fit(x1_train, y1_train)

RandomForestClassifier(random_state=0)

In [35]:
y_100_predict = rfc_100_classifier.predict(x2_test)

In [36]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

cm3 = confusion_matrix(y2_test,y_100_predict)
acs3 = accuracy_score(y2_test,y_100_predict) * 100
y_100_clsrpr = classification_report(y2_test,y_100_predict)

print(" The accuracy score is:  ",acs3," \n and the confusion matrix are: \n \n ", cm3, "\n \n \n \n \n \n "," Classification Report \n \n ",y_100_clsrpr)

 The accuracy score is:   96.7866323907455  
 and the confusion matrix are: 
 
  [[170   1   2   0]
 [  4  22   0   3]
 [  6   0 533   0]
 [  9   0   0  28]] 
 
 
 
 
 
   Classification Report 
 
                precision    recall  f1-score   support

         acc       0.90      0.98      0.94       173
        good       0.96      0.76      0.85        29
       unacc       1.00      0.99      0.99       539
       vgood       0.90      0.76      0.82        37

    accuracy                           0.97       778
   macro avg       0.94      0.87      0.90       778
weighted avg       0.97      0.97      0.97       778

