In [1]:
"""Steps for classification:
1. *Create the dataframe properly-->pd.read_csv(),pd.read_excel()
2. Preprocessing the data:
a. Feature selection-->domain knowledge-->drop()
b. *Handling missing values-->df.isnull().sum(),dropna(),fillna()
c. Outlier handling[Optional]-->boxplot()
3. *Converting categorical data to numerical-->map(),pd.get_dummies(),OneHotEncoder(),LabelEncoder()
4. *Create X and Y-->X=df.values[:,:-1], Y=df.values[:,-1]
5. Scaling the data[Optional]-->StandardScaler(),MinMaxScaler()
6. *Splitting the data into train and test(validation)-->train_test_split()
7. *Building the model:
a. Create the model-->obj=AlgoName()
b. Train the model-->obj.fit(X_train, Y_train)
c. Predict using the model-->Y_pred=obj.predict(X_test)
8. *Evaluating the model:
a. confusion_matrix(Y_test,Y_pred)
b. accuracy_score(Y_test,Y_pred)
c. classification_report(Y_test,Y_pred)
9. *Tuning the model:
a. Feature selection
b. Dedicated approach-->Adjustment of threshold
c. Stochastic Gradient Descent"""

'Steps for classification:\n1. *Create the dataframe properly-->pd.read_csv(),pd.read_excel()\n2. Preprocessing the data:\na. Feature selection-->domain knowledge-->drop()\nb. *Handling missing values-->df.isnull().sum(),dropna(),fillna()\nc. Outlier handling[Optional]-->boxplot()\n3. *Converting categorical data to numerical-->map(),pd.get_dummies(),OneHotEncoder(),LabelEncoder()\n4. *Create X and Y-->X=df.values[:,:-1], Y=df.values[:,-1]\n5. Scaling the data[Optional]-->StandardScaler(),MinMaxScaler()\n6. *Splitting the data into train and test(validation)-->train_test_split()\n7. *Building the model:\na. Create the model-->obj=AlgoName()\nb. Train the model-->obj.fit(X_train, Y_train)\nc. Predict using the model-->Y_pred=obj.predict(X_test)\n8. *Evaluating the model:\na. confusion_matrix(Y_test,Y_pred)\nb. accuracy_score(Y_test,Y_pred)\nc. classification_report(Y_test,Y_pred)\n9. *Tuning the model:\na. Feature selection\nb. Dedicated approach-->Adjustment of threshold\nc. Stochastic

In [2]:
import numpy as np
import pandas as pd

In [3]:
cars_train=pd.read_csv(r'cars_train.csv', header=None)
cars_test=pd.read_csv(r'cars_test.csv', header=None)

In [4]:
print(cars_train.shape)
cars_train.head()

(1382, 7)


Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,high,3,more,small,low,unacc
1,low,vhigh,3,4,small,med,unacc
2,low,high,5more,more,big,low,unacc
3,high,med,4,2,small,med,unacc
4,low,low,3,more,big,med,good


In [5]:
print(cars_test.shape)
cars_test.head()

(346, 7)


Unnamed: 0,0,1,2,3,4,5,6
0,med,vhigh,5more,4,small,low,unacc
1,vhigh,high,2,2,big,med,unacc
2,low,high,2,more,small,low,unacc
3,vhigh,vhigh,3,2,big,high,unacc
4,low,med,4,4,med,med,good


In [6]:
cars_train.columns=['buying','maint','doors','persons','lug_boot','safety','classes']
cars_test.columns=['buying','maint','doors','persons','lug_boot','safety','classes']

In [7]:
cars_train.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,classes
0,vhigh,high,3,more,small,low,unacc
1,low,vhigh,3,4,small,med,unacc
2,low,high,5more,more,big,low,unacc
3,high,med,4,2,small,med,unacc
4,low,low,3,more,big,med,good


In [8]:
cars_test.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,classes
0,med,vhigh,5more,4,small,low,unacc
1,vhigh,high,2,2,big,med,unacc
2,low,high,2,more,small,low,unacc
3,vhigh,vhigh,3,2,big,high,unacc
4,low,med,4,4,med,med,good


In [9]:
cars_train.isnull().sum()

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
classes     0
dtype: int64

In [10]:
cars_test.isnull().sum()

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
classes     0
dtype: int64

In [11]:
cars_test.drop('classes',axis=1,inplace=True)
cars_test.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,med,vhigh,5more,4,small,low
1,vhigh,high,2,2,big,med
2,low,high,2,more,small,low
3,vhigh,vhigh,3,2,big,high
4,low,med,4,4,med,med


In [12]:
colname=cars_train.columns
colname

Index(['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'classes'], dtype='object')

In [13]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()

for x in colname:
    cars_train[x]=le.fit_transform(cars_train[x])

In [14]:
cars_train.head()
#acc==>0
#good==>1
#unacc==>2
#evgood==>3

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,classes
0,3,0,1,2,2,1,2
1,1,3,1,1,2,2,2
2,1,0,3,2,0,1,2
3,0,2,2,0,2,2,2
4,1,1,1,2,0,2,1


In [15]:
colname=cars_test.columns
colname

Index(['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'], dtype='object')

In [16]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()

for x in colname:
    cars_test[x]=le.fit_transform(cars_test[x])

In [17]:
cars_test.head()
#acc==>0
#good==>1
#unacc==>2
#evgood==>3

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,2,3,3,1,2,1
1,3,0,0,0,0,2
2,1,0,0,2,2,1
3,3,3,1,0,0,0
4,1,2,2,1,1,2


In [18]:
X=cars_train.values[:,0:-1]
Y=cars_train.values[:,-1]
Y=Y.astype(int)

In [19]:
print(X.shape)
print(Y.shape)

(1382, 6)
(1382,)


In [20]:
from sklearn.preprocessing import StandardScaler

scaler=StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [21]:
print(X)

[[ 1.33507272 -1.3488262  -0.45682233  1.21505861  1.22565305  0.00176987]
 [-0.44760409  1.32688358 -0.45682233 -0.01064285  1.22565305  1.22474807]
 [-0.44760409 -1.3488262   1.33418038  1.21505861 -1.21505663  0.00176987]
 ...
 [-1.33894249  1.32688358  1.33418038 -0.01064285  0.00529821 -1.22120833]
 [ 0.44373431  0.43498032  0.43867903 -0.01064285 -1.21505663  0.00176987]
 [ 0.44373431 -0.45692294  1.33418038  1.21505861  1.22565305 -1.22120833]]


In [22]:
from sklearn.model_selection import train_test_split

#Split the data into test and train
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=10)

In [23]:
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(1105, 6)
(277, 6)
(1105,)
(277,)


# Running Decision Tree model

In [24]:
#predicting using the Decision_Tree_Calssifier
from sklearn.tree import DecisionTreeClassifier

model_DecisionTree=DecisionTreeClassifier(random_state=10, criterion='gini')
#min_samples_leaf, min_samples_split, max_depth, max_features, max_leaf_nodes

#fit the model on the data and predict the values
model_DecisionTree.fit(X_train,Y_train)
Y_pred=model_DecisionTree.predict(X_test)
#print(Y_pred)
print(list(zip(Y_test,Y_pred)))

[(2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (1, 1), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (0, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (0, 0), (1, 0), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (3, 3), (0, 0), (2, 2), (0, 0), (3, 3), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (1, 0), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (0, 0), (3, 3), (0, 0), (0, 0), (2, 2), (2, 2), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (3, 3), (2, 2), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (0, 0), (3, 3), (2, 2), (0, 0), (3, 3), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (3, 3), (2, 2), (0, 0), (2, 2), (0, 0),

In [25]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
#confusion matrix
print(confusion_matrix(Y_test,Y_pred))
print(accuracy_score(Y_test,Y_pred))
print(classification_report(Y_test,Y_pred))

[[ 69   1   1   0]
 [  4   8   0   0]
 [  0   0 185   0]
 [  0   0   0   9]]
0.9783393501805054
              precision    recall  f1-score   support

           0       0.95      0.97      0.96        71
           1       0.89      0.67      0.76        12
           2       0.99      1.00      1.00       185
           3       1.00      1.00      1.00         9

    accuracy                           0.98       277
   macro avg       0.96      0.91      0.93       277
weighted avg       0.98      0.98      0.98       277



In [26]:
model_DecisionTree.score(X_train,Y_train)

1.0

In [27]:
"""The accuracy on the traning data is 100% but at the same time the accuracy on the test data is also about 98%.
This indicates that the model might not be an ovefitted model"""

'The accuracy on the traning data is 100% but at the same time the accuracy on the test data is also about 98%.\nThis indicates that the model might not be an ovefitted model'

In [28]:
print((list(zip(cars_train.columns[0:-1],
               model_DecisionTree.feature_importances_))))
'''The total of all the feature importances will be 1 which means 100% of contribution. In this output we may get to see some
variables returning a 0.0000.. kind of value or a very negligible value like 0.0005 these variables can be eliminated given 
that they are showing zero or very low contribution to the growth of the tree.
A zero value indicates that the variable was never found as a significant variable and was never use for splitting the tree.'''

[('buying', 0.21976792783843332), ('maint', 0.18220929667385857), ('doors', 0.062005934167191426), ('persons', 0.19425872158174767), ('lug_boot', 0.09772725134941933), ('safety', 0.24403086838934968)]


'The total of all the feature importances will be 1 which means 100% of contribution. In this output we may get to see some\nvariables returning a 0.0000.. kind of value or a very negligible value like 0.0005 these variables can be eliminated given \nthat they are showing zero or very low contribution to the growth of the tree.\nA zero value indicates that the variable was never found as a significant variable and was never use for splitting the tree.'

In [29]:
sample=pd.DataFrame()
 
sample["Column"]=cars_train.columns[0:-1]
sample["Imp value"]=model_DecisionTree.feature_importances_
 
sample.sort_values("Imp value",ascending=False)

Unnamed: 0,Column,Imp value
5,safety,0.244031
0,buying,0.219768
3,persons,0.194259
1,maint,0.182209
4,lug_boot,0.097727
2,doors,0.062006


# Continuation Visualization of Decision Tree 2-12

In [30]:
from sklearn import tree
with open(r"model_DecisionTree.txt", "w") as f:
    
    f = tree.export_graphviz(model_DecisionTree, feature_names=cars_train.columns[0:-1],
                             out_file=f)
    
#generate the file and upload the code in webgraphviz.com to plot the decision tree
#model_DecisionTree txt file is generated in the directory
#use Webgraphviz.com to view the graph

In [31]:
#predicting using the Decision_Tree_Calssifier
from sklearn.tree import DecisionTreeClassifier

model_DecisionTree=DecisionTreeClassifier(random_state=10,
                                          criterion='gini',
                                          splitter='best',
                                          min_samples_leaf=3,
                                         min_samples_split=5,
                                         max_depth=10)
#min_samples_leaf, min_samples_split, max_depth, max_features, max_leaf_nodes

#fit the model on the data and predict the values
model_DecisionTree.fit(X_train,Y_train)
Y_pred=model_DecisionTree.predict(X_test)
#print(Y_pred)
print(list(zip(Y_test,Y_pred)))

[(2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (1, 1), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (0, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (0, 2), (1, 3), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (3, 3), (0, 0), (2, 2), (0, 0), (3, 3), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (0, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (1, 1), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (0, 0), (3, 3), (0, 0), (0, 0), (2, 2), (2, 2), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (3, 3), (2, 2), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (0, 0), (3, 3), (2, 2), (0, 0), (3, 3), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (3, 3), (2, 2), (0, 0), (2, 2), (0, 0),

In [32]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
#confusion matrix
print(confusion_matrix(Y_test,Y_pred))
print(accuracy_score(Y_test,Y_pred))
print(classification_report(Y_test,Y_pred))

[[ 66   0   5   0]
 [  1   8   1   2]
 [  2   0 183   0]
 [  0   0   0   9]]
0.9602888086642599
              precision    recall  f1-score   support

           0       0.96      0.93      0.94        71
           1       1.00      0.67      0.80        12
           2       0.97      0.99      0.98       185
           3       0.82      1.00      0.90         9

    accuracy                           0.96       277
   macro avg       0.94      0.90      0.91       277
weighted avg       0.96      0.96      0.96       277



In [33]:
from sklearn.linear_model import LogisticRegression
#create a model
classifier=LogisticRegression(multi_class='multinomial')
#fitting training data to the model
classifier.fit(X_train,Y_train)

Y_pred=classifier.predict(X_test)
print(Y_pred)

[2 2 2 2 2 0 2 2 2 0 0 2 0 2 2 2 2 2 0 2 2 2 2 0 2 2 2 2 3 0 2 2 2 0 2 2 2
 2 2 2 2 2 0 2 0 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 0 3
 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 0 2 2 2 2 2
 0 2 0 0 2 2 2 2 2 2 2 2 2 0 2 2 0 2 2 0 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 0 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 0 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 0 2 0 2 2 2 0 2 2 0 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 0 0 2 2 0 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 3 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2]


In [34]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
#confusion matrix
print(confusion_matrix(Y_test,Y_pred))
print(accuracy_score,(Y_test,Y_pred))
print(classification_report(Y_test,Y_pred))

[[ 11   0  58   2]
 [  2   0  10   0]
 [ 18   0 167   0]
 [  6   0   2   1]]
<function accuracy_score at 0x0000021C218C19D8> (array([2, 2, 2, 2, 2, 2, 0, 2, 2, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2,
       2, 2, 2, 2, 2, 2, 0, 2, 0, 1, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 2,
       3, 0, 2, 0, 3, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2, 2,
       1, 2, 0, 0, 2, 2, 0, 3, 0, 0, 2, 2, 0, 2, 0, 2, 2, 0, 2, 2, 0, 2,
       2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 2, 0, 3, 2, 2, 0, 0, 2, 2, 2, 0, 2,
       0, 3, 2, 0, 3, 2, 2, 2, 2, 2, 3, 2, 0, 2, 0, 2, 2, 0, 2, 2, 2, 2,
       0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 3, 0, 0, 2, 2, 0, 2, 2, 2, 2,
       2, 0, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2,
       2, 2, 2, 2, 2, 1, 0, 2, 2, 2, 2, 2, 0, 0, 2, 3, 1, 2, 2, 0, 0, 2,
       0, 0, 0, 2, 2, 0, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 0, 2, 0, 2, 1, 0,
       0, 1, 1, 2, 2, 2, 0, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2, 2, 0, 2, 2, 2

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [35]:
np.set_printoptions(suppress=True)
Y_pred_prob=classifier.predict_proba(X_test)
Y_pred_prob

array([[0.17723996, 0.04469266, 0.77722473, 0.00084265],
       [0.07716139, 0.02202663, 0.90080527, 0.0000067 ],
       [0.14540796, 0.02338039, 0.83015498, 0.00105667],
       ...,
       [0.16631889, 0.04582859, 0.78778272, 0.0000698 ],
       [0.1018213 , 0.03768644, 0.86046754, 0.00002472],
       [0.05074201, 0.01342706, 0.93581987, 0.00001105]])

In [36]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
 
# first, initialize the classificators
tree= DecisionTreeClassifier(random_state=10) # using the random state for reproducibility
knn= KNeighborsClassifier(n_neighbors=5,metric='euclidean')
svm= SVC(kernel="rbf", gamma=0.1, C=1,random_state=10)
logreg=LogisticRegression(multi_class="multinomial",random_state=10)
 

In [37]:
# now, create a list with the objects 
models= [tree, knn, svm, logreg]

In [38]:
from sklearn.metrics import confusion_matrix, accuracy_score,classification_report
 
for model in models:
    model.fit(X_train, Y_train) # fit the model
    Y_pred= model.predict(X_test) # then predict on the test set
    accuracy= accuracy_score(Y_test, Y_pred) 
    clf_report= classification_report(Y_test, Y_pred) 
    print(confusion_matrix(Y_test,Y_pred))
    print("The accuracy of the ",type(model).__name__, " model is ", accuracy*100 )
    print("Classification report:\n", clf_report)
    print("\n")

[[ 69   1   1   0]
 [  4   8   0   0]
 [  0   0 185   0]
 [  0   0   0   9]]
The accuracy of the  DecisionTreeClassifier  model is  97.83393501805054
Classification report:
               precision    recall  f1-score   support

           0       0.95      0.97      0.96        71
           1       0.89      0.67      0.76        12
           2       0.99      1.00      1.00       185
           3       1.00      1.00      1.00         9

    accuracy                           0.98       277
   macro avg       0.96      0.91      0.93       277
weighted avg       0.98      0.98      0.98       277



[[ 65   1   5   0]
 [  8   4   0   0]
 [  1   0 184   0]
 [  2   0   1   6]]
The accuracy of the  KNeighborsClassifier  model is  93.50180505415162
Classification report:
               precision    recall  f1-score   support

           0       0.86      0.92      0.88        71
           1       0.80      0.33      0.47        12
           2       0.97      0.99      0.98       185


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
cars_test.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,2,3,3,1,2,1
1,3,0,0,0,0,2
2,1,0,0,2,2,1
3,3,3,1,0,0,0
4,1,2,2,1,1,2


In [40]:
test=cars_test.values
test=scaler.transform(test)
print(test)

[[ 0.44373431  1.32688358  1.33418038 -0.01064285  1.22565305  0.00176987]
 [ 1.33507272 -1.3488262  -1.35232368 -1.23634431 -1.21505663  1.22474807]
 [-0.44760409 -1.3488262  -1.35232368  1.21505861  1.22565305  0.00176987]
 ...
 [ 0.44373431 -0.45692294 -0.45682233 -1.23634431  1.22565305 -1.22120833]
 [ 1.33507272  0.43498032 -1.35232368  1.21505861  1.22565305  0.00176987]
 [ 1.33507272 -1.3488262  -0.45682233 -0.01064285  1.22565305  1.22474807]]


In [41]:
#predicting using the Decision_Tree_Calssifier
from sklearn.tree import DecisionTreeClassifier

model_DecisionTree=DecisionTreeClassifier(random_state=10, criterion='gini')
#min_samples_leaf, min_samples_split, max_depth, max_features, max_leaf_nodes

#fit the model on the data and predict the values
model_DecisionTree.fit(X_train,Y_train)

DecisionTreeClassifier(random_state=10)

In [42]:
test_pred=model_DecisionTree.predict(test)
test_pred

array([2, 2, 2, 2, 1, 2, 0, 0, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 3,
       2, 0, 2, 2, 2, 2, 2, 0, 1, 3, 1, 2, 0, 2, 0, 2, 2, 2, 2, 3, 2, 2,
       0, 0, 2, 2, 3, 2, 2, 2, 1, 2, 0, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0,
       2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 1, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 0, 2, 2, 3, 2, 2, 0, 2, 0, 3, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2,
       2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 3, 2, 2,
       2, 2, 0, 0, 2, 2, 2, 2, 3, 2, 0, 2, 1, 0, 2, 2, 2, 2, 2, 3, 0, 0,
       2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 3, 0, 2, 2, 2, 3, 2, 2, 0, 2,
       2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 3, 2, 2, 0,
       2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 0, 2,
       2, 2, 0, 2, 2, 0, 2, 2, 2, 1, 1, 2, 2, 2, 0, 2, 2, 0, 3, 3, 0, 2,
       0, 2, 2, 2, 3, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2,

In [43]:
cars_test=pd.read_csv(r"cars_test.csv", header=None)
cars_test.columns=['buying','maint','doors','persons',
                   'lug_boot','safety','classes']
cars_test["Pred"]=test_pred
cars_test["Pred"]=cars_test["Pred"].replace({0:"acc",1:"good",2:"unacc",
                                         3:"vgood"})
cars_test.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,classes,Pred
0,med,vhigh,5more,4,small,low,unacc,unacc
1,vhigh,high,2,2,big,med,unacc,unacc
2,low,high,2,more,small,low,unacc,unacc
3,vhigh,vhigh,3,2,big,high,unacc,unacc
4,low,med,4,4,med,med,good,good


In [44]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
#confusion matrix
print(confusion_matrix(cars_test.classes,cars_test.Pred))
print(accuracy_score(cars_test.classes,cars_test.Pred))
print(classification_report(cars_test.classes,cars_test.Pred))

[[ 60   1   3   0]
 [  3  10   0   0]
 [  0   0 251   0]
 [  0   0   0  18]]
0.9797687861271677
              precision    recall  f1-score   support

         acc       0.95      0.94      0.94        64
        good       0.91      0.77      0.83        13
       unacc       0.99      1.00      0.99       251
       vgood       1.00      1.00      1.00        18

    accuracy                           0.98       346
   macro avg       0.96      0.93      0.94       346
weighted avg       0.98      0.98      0.98       346



In [45]:
cars_test.to_excel('Decision Test Output.xlsx',header=True)

In [46]:
#predicting using the Random_Forest_classifier
from sklearn.ensemble import RandomForestClassifier

model_RandomForest=RandomForestClassifier(n_estimators=100,
                                         random_state=10,bootstrap=True,
                                         n_jobs=-1)
#fit the model on the data and predict the values
model_RandomForest.fit(X_train,Y_train)

Y_pred=model_RandomForest.predict(X_test)
#job -1 is use to indicate all the CPU to get the job done on priority

In [47]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
#confusion matrix
print(confusion_matrix(Y_test,Y_pred))
print(accuracy_score(Y_test,Y_pred))
print(classification_report(Y_test,Y_pred))

[[ 65   1   5   0]
 [  4   8   0   0]
 [  1   0 184   0]
 [  0   0   0   9]]
0.9602888086642599
              precision    recall  f1-score   support

           0       0.93      0.92      0.92        71
           1       0.89      0.67      0.76        12
           2       0.97      0.99      0.98       185
           3       1.00      1.00      1.00         9

    accuracy                           0.96       277
   macro avg       0.95      0.89      0.92       277
weighted avg       0.96      0.96      0.96       277



In [48]:
#predicting using the Extra_Tree_classifier
from sklearn.ensemble import ExtraTreesClassifier

model_EXT=ExtraTreesClassifier(n_estimators=500,
                                         random_state=10,bootstrap=True,
                                         n_jobs=-1)
#fit the model on the data and predict the values
model_EXT.fit(X_train,Y_train)

Y_pred=model_EXT.predict(X_test)
#job -1 is use to indicate all the CPU to get the job done on priority

In [49]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
#confusion matrix
print(confusion_matrix(Y_test,Y_pred))
print(accuracy_score(Y_test,Y_pred))
print(classification_report(Y_test,Y_pred))

[[ 63   0   8   0]
 [  8   4   0   0]
 [  1   0 184   0]
 [  2   0   0   7]]
0.9314079422382672
              precision    recall  f1-score   support

           0       0.85      0.89      0.87        71
           1       1.00      0.33      0.50        12
           2       0.96      0.99      0.98       185
           3       1.00      0.78      0.88         9

    accuracy                           0.93       277
   macro avg       0.95      0.75      0.81       277
weighted avg       0.93      0.93      0.92       277



# Continutation 3-12

In [56]:
from sklearn.ensemble import ExtraTreesClassifier
 
model_EXT=ExtraTreesClassifier( random_state=10, bootstrap=True)
 
parameter_space = {
    'n_estimators':[100,200,300,400,500],
    'max_depth':[10,15, 8, 12]
    }
from sklearn.model_selection import RandomizedSearchCV
clf = RandomizedSearchCV(model_EXT, parameter_space, n_jobs=-1, cv=5)
#while building the model object pass the arguments for which we want to fix the valuesII
#In the hyperparameter_dict keys should indicate the hyperparameters which we want to tune and the values should indicate all the values that we like to test upon.

In [None]:
"""Grid search CV test for each and every combination based on the values pass. 
This increases the time complexity and the memory utilization especially in case of a huge dataset and a lot of combinations. 
So to overcome this drawback we have randomise search CV which randomly selects certain combinations out of one that are pass
and returns the best from the ones that were tested/tried """

In [57]:
clf.fit(X_train,Y_train)

RandomizedSearchCV(cv=5,
                   estimator=ExtraTreesClassifier(bootstrap=True,
                                                  random_state=10),
                   n_jobs=-1,
                   param_distributions={'max_depth': [10, 15, 8, 12],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500]})

In [58]:
print('Best parameters found:\n', clf.best_params_)

Best parameters found:
 {'n_estimators': 100, 'max_depth': 12}


In [59]:
clf.best_score_ #accuracy of the best params using the 5-fold CV

0.9438914027149321

In [60]:
Y_pred=clf.predict(X_test)

In [61]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
#confusion matrix
print(confusion_matrix(Y_test,Y_pred))
print(accuracy_score(Y_test,Y_pred))
print(classification_report(Y_test,Y_pred))

[[ 65   0   6   0]
 [  8   4   0   0]
 [  1   0 184   0]
 [  2   0   0   7]]
0.9386281588447654
              precision    recall  f1-score   support

           0       0.86      0.92      0.88        71
           1       1.00      0.33      0.50        12
           2       0.97      0.99      0.98       185
           3       1.00      0.78      0.88         9

    accuracy                           0.94       277
   macro avg       0.96      0.76      0.81       277
weighted avg       0.94      0.94      0.93       277

