In [41]:
import sys
import sklearn
import numpy as np
import os
import pandas as pd
import cv2

np.random.seed(42)

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [42]:
from sklearn.tree import DecisionTreeClassifier,plot_tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc

In [60]:
# Importing training data
x = pd.read_csv("x_train_all.csv")
x_train = pd.read_csv("subset_2.csv") # Subset of best features
y_train = pd.read_csv("y_train_all.csv")

# Converting data to matrix
X = x_train.to_numpy()
y = y_train.to_numpy()
x = x.to_numpy()

In [44]:
# Fitting the decision tree model with default settings
# This contains all of the training data
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X, y)

In [45]:
# Predicting the data against itself and printing accuracy
y_pred = tree_clf.predict(X)
accuracy = accuracy_score(y, y_pred)

print(classification_report(y, y_pred))
print("Accuracy: ", accuracy_score(y, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       210
           1       1.00      1.00      1.00      2220
           2       1.00      1.00      1.00      2250
           3       1.00      1.00      1.00      1410
           4       1.00      1.00      1.00      1980
           5       1.00      1.00      1.00       210
           6       1.00      1.00      1.00       360
           7       1.00      1.00      1.00       240
           8       1.00      1.00      1.00       540
           9       1.00      1.00      1.00       270

    accuracy                           1.00      9690
   macro avg       1.00      1.00      1.00      9690
weighted avg       1.00      1.00      1.00      9690

Accuracy:  1.0


In [46]:
# Import KFold to help with cross validation
from sklearn.model_selection import KFold

# Split the data into train and test via 10-fold cross validation
kf = KFold(n_splits=10)
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
print("Training data shape: " + str(len(X_train)))
print("Test data shape: " + str(len(X_test)))

Training data shape: 8721
Test data shape: 969


In [47]:
# Create another model with the new datasets
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)

In [48]:
# Predicting the data against itself and printing accuracy
y_pred = tree_clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("Accuracy: ", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           7       0.16      1.00      0.28       159
           8       0.00      0.00      0.00       540
           9       0.00      0.00      0.00       270

    accuracy                           0.16       969
   macro avg       0.05      0.33      0.09       969
weighted avg       0.03      0.16      0.05       969

Accuracy:  0.16408668730650156


In [49]:
# Create another model with the entire training dataset again
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X, y)

In [50]:
# Import preprocessing test data
x_test = pd.read_csv("test_subset_2.csv") 
y_test = pd.read_csv("y_test_all.csv")

X_test = x_test.to_numpy()
y_test = y_test.to_numpy()

In [51]:
# Predicting the data against test data
y_pred = tree_clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("Accuracy: ", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.03      0.10      0.04        60
           1       0.24      0.74      0.36       720
           2       0.25      0.22      0.23       750
           3       0.00      0.00      0.00       450
           4       0.00      0.00      0.00       660
           5       0.00      0.00      0.00        60
           6       0.00      0.00      0.00        90
           7       0.00      0.00      0.00        60
           8       0.00      0.00      0.00       150
           9       0.00      0.00      0.00        90

    accuracy                           0.23      3090
   macro avg       0.05      0.11      0.06      3090
weighted avg       0.12      0.23      0.14      3090

Accuracy:  0.2262135922330097


In [52]:
tree_clf = DecisionTreeClassifier(
    max_features=9,
    random_state=42)
tree_clf.fit(X, y)

y_pred = tree_clf.predict(X_test)

print("Accuracy: ", accuracy_score(y_test, y_pred))

Accuracy:  0.27346278317152106


# Decision Tree on 30% of original training dataset

In [61]:
from sklearn.model_selection import train_test_split
X_train2, X_test2, y_train2, y_test2 = train_test_split(x, y, test_size = 0.3, random_state = 0)

In [62]:
tree_clf = DecisionTreeClassifier(
    max_features=9,
    random_state=42)
tree_clf.fit(X_train2, y_train2)

y_pred2 = tree_clf.predict(X_test2)

print(classification_report(y_test2, y_pred2))
print("Accuracy: ", accuracy_score(y_test2, y_pred2))

              precision    recall  f1-score   support

           0       0.47      0.45      0.46        65
           1       0.67      0.66      0.67       650
           2       0.63      0.62      0.63       676
           3       0.61      0.66      0.64       426
           4       0.69      0.68      0.69       587
           5       0.57      0.57      0.57        74
           6       0.61      0.68      0.64       114
           7       0.53      0.55      0.54        71
           8       0.65      0.57      0.61       176
           9       0.56      0.60      0.58        68

    accuracy                           0.64      2907
   macro avg       0.60      0.60      0.60      2907
weighted avg       0.64      0.64      0.64      2907

Accuracy:  0.6398348813209495


# Decision Tree on 60% of original training dataset

In [64]:
from sklearn.model_selection import train_test_split
X_train3, X_test3, y_train3, y_test3 = train_test_split(x, y, test_size = 0.6, random_state = 0)

In [65]:
tree_clf = DecisionTreeClassifier(max_features=9,random_state=42)
tree_clf.fit(X_train3, y_train3)

y_pred3 = tree_clf.predict(X_test3)

print(classification_report(y_test3, y_pred3))
print("Accuracy: ", accuracy_score(y_test3, y_pred3))

              precision    recall  f1-score   support

           0       0.45      0.44      0.45       123
           1       0.61      0.65      0.63      1331
           2       0.58      0.58      0.58      1326
           3       0.59      0.60      0.60       854
           4       0.68      0.68      0.68      1182
           5       0.50      0.37      0.42       142
           6       0.54      0.51      0.53       222
           7       0.58      0.56      0.57       149
           8       0.63      0.55      0.59       334
           9       0.47      0.43      0.45       151

    accuracy                           0.60      5814
   macro avg       0.56      0.54      0.55      5814
weighted avg       0.60      0.60      0.60      5814

Accuracy:  0.6037151702786377


# Conclusion on Overfitting

Yes, we noticed the problems of over fitting as we are driving the datasets into subsets and clusters, the data points are overfitting due to which the pixcels points losses the points and the images is not properly saved in the datasets and clusters. Due to that the algorithms are not perfoming well as accuracy and classification report shows. As we are plotting the images of the data that are predicted from the algorithms the predicted data is not much clear which is the reason that caused by overfitting.

# Random Forest

In [66]:
from sklearn.ensemble import RandomForestClassifier
Model=RandomForestClassifier(max_depth=2)
Model.fit(X_train2, y_train2)
y_pred4=Model.predict(X_test2)

# Summary of the predictions made by the classifier
print(classification_report(y_test2,y_pred4))
print(confusion_matrix(y_pred4,y_test2))
#Accuracy Score
print('accuracy is ',accuracy_score(y_pred4,y_test2))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        65
           1       0.41      0.78      0.54       650
           2       0.39      0.47      0.43       676
           3       0.49      0.59      0.54       426
           4       0.70      0.42      0.52       587
           5       0.00      0.00      0.00        74
           6       0.00      0.00      0.00       114
           7       0.00      0.00      0.00        71
           8       0.00      0.00      0.00       176
           9       0.00      0.00      0.00        68

    accuracy                           0.45      2907
   macro avg       0.20      0.23      0.20      2907
weighted avg       0.40      0.45      0.40      2907

[[  0   0   0   0   0   0   0   0   0   0]
 [ 65 506 278  42 243  16   5  11  65   4]
 [  0  72 316  97  82  44  25  40  73  58]
 [  0  55  49 253  17   3  76  20  37   5]
 [  0  17  33  34 245  11   8   0   1   1]
 [  0   0   0   0   0   0 

In [67]:
from sklearn.ensemble import RandomForestClassifier
Model2=RandomForestClassifier(max_depth=2)
Model2.fit(X_train3, y_train3)
y_pred5=Model2.predict(X_test3)

# Summary of the predictions made by the classifier
print(classification_report(y_test3,y_pred5))
print(confusion_matrix(y_pred5,y_test3))
#Accuracy Score
print('accuracy is ',accuracy_score(y_pred5,y_test3))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       123
           1       0.45      0.71      0.55      1331
           2       0.36      0.55      0.43      1326
           3       0.47      0.52      0.49       854
           4       0.66      0.42      0.51      1182
           5       0.00      0.00      0.00       142
           6       0.00      0.00      0.00       222
           7       0.00      0.00      0.00       149
           8       0.00      0.00      0.00       334
           9       0.00      0.00      0.00       151

    accuracy                           0.45      5814
   macro avg       0.19      0.22      0.20      5814
weighted avg       0.39      0.45      0.40      5814

[[  0   0   0   0   0   0   0   0   0   0]
 [113 943 438  50 365  31  11  12 118  13]
 [  9 240 727 252 284  84  62 101 147 125]
 [  0 108 108 441  41   5 125  36  64  12]
 [  1  40  53 111 492  22  24   0   5   1]
 [  0   0   0   0   0   0 