In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # statistical data visualization
#reading our car evaluation dataset
df = pd.read_csv('car_evaluation.csv')

In [3]:
# view dimensions of dataset
df.shape
# preview the dataset
df.head()

Unnamed: 0,vhigh,vhigh.1,2,2.1,small,low,unacc
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


In [14]:
# Rename column names
col_names=df.columns=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


In [15]:
# View summary of dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1727 entries, 0 to 1726
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1727 non-null   object
 1   maint     1727 non-null   object
 2   doors     1727 non-null   object
 3   persons   1727 non-null   object
 4   lug_boot  1727 non-null   object
 5   safety    1727 non-null   object
 6   class     1727 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [16]:
#  I will check the frequency counts of categorical variables
for col in col_names:
 df[col].value_counts()

In [17]:
# Explore class variable
df['class'].value_counts

<bound method IndexOpsMixin.value_counts of 0       unacc
1       unacc
2       unacc
3       unacc
4       unacc
        ...  
1722     good
1723    vgood
1724    unacc
1725     good
1726    vgood
Name: class, Length: 1727, dtype: object>

In [18]:
# Missing values in variables
print('Missing values are:',df.isnull().sum())

Missing values are: buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
class       0
dtype: int64


In [19]:
# Declare feature vector and target variable
X = df.drop(['class'], axis=1)
y = df['class']

In [20]:
# split data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

In [21]:
# check data types in X_train
print(X_train.dtypes)

buying      object
maint       object
doors       object
persons     object
lug_boot    object
safety      object
dtype: object


In [22]:
import category_encoders as ce
# Initialize an OrdinalEncoder with specified column names
#encoder = ce.OrdinalEncoder(col_names)
encoder = ce.OrdinalEncoder(cols=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'])

# Fit and transform the training data using the encoder
X_train = encoder.fit_transform(X_train)
# Fit and transform the test data using the same encoder
# Note: It's better to use transform instead of fit_transform for the test set to maintain consistency 
# with training encoding
X_test = encoder.transform(X_test)
# Display the first few rows of the encoded training data
print(X_train.head())
# Display the first few rows of the encoded test data
X_test.head()

      buying  maint  doors  persons  lug_boot  safety
83         1      1      1        1         1       1
48         1      1      2        2         1       2
468        2      1      2        3         2       2
155        1      2      2        2         1       1
1043       3      2      3        2         2       1


Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
599,2,2,3,1,3,1
932,3,1,3,3,3,1
628,2,2,1,1,3,3
1497,4,2,1,3,1,2
1262,3,4,3,2,1,1


In [29]:
# Import the RandomForestClassifier from scikit-learn's ensemble module
from sklearn.ensemble import RandomForestClassifier
# Initialize a RandomForestClassifier with a specified random seed for reproducibility
rfc = RandomForestClassifier(random_state=0)
# Fit the RandomForestClassifier on the training data
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

In [30]:
# Display the first few rows of the encoded test data
print(X_test.head())

      buying  maint  doors  persons  lug_boot  safety
599        2      2      3        1         3       1
932        3      1      3        3         3       1
628        2      2      1        1         3       3
1497       4      2      1        3         1       2
1262       3      4      3        2         1       1


#### Check the accuracy

In [33]:
from sklearn.metrics import accuracy_score
print('The accuracy score is {0:.4f}'.format(accuracy_score(y_test,y_pred)))

The accuracy score is 0.9649


In [35]:
feature_scores = pd.Series(rfc.feature_importances_, 
index=X_train.columns).sort_values(ascending=False)
print(feature_scores)
feature_scores = pd.Series(rfc.feature_importances_, 
index=X_train.columns).sort_values(ascending=False)
random_colors = sns.color_palette("viridis", len(feature_scores))
print(feature_scores)

safety      0.291657
persons     0.235380
buying      0.160692
maint       0.134143
lug_boot    0.111595
doors       0.066533
dtype: float64
safety      0.291657
persons     0.235380
buying      0.160692
maint       0.134143
lug_boot    0.111595
doors       0.066533
dtype: float64


In [None]:
# Creating a seaborn bar plot
sns.barplot(x=feature_scores, y=feature_scores.index)
# Add labels to the graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
# Add title to the graph
plt.title("Visualizing Important Features")
# Visualize the graph
plt.show()

In [None]:
encoder = ce.OrdinalEncoder(cols=['buying', 'maint', 'persons', 'lug_boot', 'safety'])
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)
# instantiate the classifier with n_estimators = 100
clf = RandomForestClassifier(random_state=0)
# fit the model to the training set
clf.fit(X_train, y_train)
# Predict on the test set results
y_pred = clf.predict(X_test)
# Check accuracy score
print('Model accuracy score with doors variable removed : {0:0.4f}'. 
format(accuracy_score(y_test, y_pred)))

In [None]:
# confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix\n\n')
cm