# Best-DT DS1
a better performing Decision Tree found by performing grid search to find the best combination of hyper-parameters. For this, you need to experiment with the following parameter values:
* splitting criterion: gini and entropy
* maximum depth of the tree: 10 and no maximum
* minimum number of samples to split an internal node: experiment with values of your choice
* minimum impurity decrease: experiment with values of your choice
* class weight: None and balanced

## Importing Libraries

In [1]:
import numpy as np
import pandas as pd

## Importing and Splitting the dataset

In [2]:
dataset_train1 = pd.read_csv("dataset/train_1.csv", header=None)
dataset_validate1 = pd.read_csv("dataset/val_1.csv",header=None)
dataset_test1_no_label_1 = pd.read_csv("dataset/test_no_label_1.csv",header=None)
dataset_test1_with_label_1 = pd.read_csv("dataset/test_with_label_1.csv",header=None)
X_training_1 = dataset_train1.iloc[:, :-1].values
Y_training_1 = dataset_train1.iloc[:, -1].values
X_validate_1 = dataset_validate1.iloc[:, :-1].values
Y_validate_1 = dataset_validate1.iloc[:, -1].values
X_test1 = dataset_test1_no_label_1.iloc[:, ].values
Y_test1 = dataset_test1_with_label_1.iloc[:,-1]

## Training Using Training set

In [3]:
from sklearn.tree import export_graphviz
from sklearn.model_selection   import GridSearchCV, cross_val_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings("ignore")

tree_para = {'criterion':['gini','entropy'],'max_depth':[10],'min_impurity_decrease':[0,1.0,2.0,3.0,4.0,5.0],'min_samples_split':[2,4,6,8]}
classifier_Best_DT = GridSearchCV(DecisionTreeClassifier(), tree_para)
classifier_Best_DT.fit(X_training_1, Y_training_1)
classifier = classifier_Best_DT.best_estimator_
classifier.fit(X_training_1, Y_training_1)

DecisionTreeClassifier(criterion='entropy', max_depth=10,
                       min_impurity_decrease=0)

## Validate - Predict Using Validate Set

In [4]:
Y_validate1_pred = classifier.predict(X_validate_1)

## Validate - Output - Confusion Matrix and Classification Report

In [5]:
from sklearn.metrics import confusion_matrix, classification_report
import csv

print(Y_validate1_pred)
cm = confusion_matrix(Y_validate_1, Y_validate1_pred)
print(cm)
cr = classification_report(Y_validate_1, Y_validate1_pred)
print(cr)

[ 9 17 12 13 25 13 15  9  9  9  9 25 22 15  9 24 16  8 20 15 19  9 25 13
 24 12  5  2 20 14 15  1 20  0 24 17 10 22  8 13  9 19 18 20  8 19 14 22
 21  2 10 23 24  7 16 11  2  0 20  2 11 13 25 13 13  2 17  4 22 11 10 25
  3  4  7  4  3  6  1  3 12 22  9  4  7  6  0  4 14  8 18 10 25  5  9  5
  2  0 13 10 14 20 12 13  0 24 13 25 13 13  2  2 11 14 23 21 20  3 14  5
  0 17 11 15 15 11 14  7 25 22 22 16 17  4 14 19 22 10 23 13 18  8 18  3
  9  6  6  4  3 20 22 25 24  5 10 25  4  0 20 10 22  4  0 13 13  2 11 13
 13  6 19 22 11 19 14 15 20 19 23 18 11  5  4  5 24  6 11 10 16 23 13  8
 22  1  9 10  2 10 16 19  8  3 23  0 10  5  9  2  2 23 18  9 10 14 22  2
  8  3 14 21  0 24  2  7 24 25 25  9 13  8  3 11 21 10 13 24  2  8  5]
[[7 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0]
 [0 0 0 2 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 3]
 [0 0 8 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 3 0 0 0 0 0 2 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1]
 [0 0 0 0 3 1 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 

## Test - Predict Using Test Set

In [6]:
Y_test1_pred = classifier.predict(X_test1)

## Test - Output - Confusion Matrix and Classification Report

In [7]:
print(Y_test1_pred)
cm = confusion_matrix(Y_test1, Y_test1_pred)
print(cm)
cr = classification_report(Y_test1, Y_test1_pred)
print(cr)

pd.DataFrame(Y_test1_pred).to_csv('output/Best-DT-DS1.csv', header = None)
with open('output/Best-DT-DS1.csv', 'a') as fd:
    fd.write('\n\nConfusion Matrix\n\n')
    fd.write(pd.DataFrame(cm).to_csv())
    fd.write('\n\n Classification Report\n\n')
    fd.write(cr)

[25 13 22  2 18 17 11 13 13 20  6  9 11 14  4  5 18  3  2 12  3  0  9 17
  9  4 22 20 23 22 25 23 19 14 22 24  2 19  9 17 16 22 13 15 12 13 13 14
 24  0  7  3 22 20 15 14 23  9 25 14 22  2 11 10  0  0 16 20 24 21 24 23
 21  8 22 18 15 25 14 19]
[[3 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 3 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0]
 [0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 2 0 0 0 0 0 

# Best-DT DS2

## Importing and Splitting the dataset

In [8]:
dataset_train2 = pd.read_csv("dataset/train_2.csv", header=None)
dataset_validate2 = pd.read_csv("dataset/val_2.csv",header=None)
dataset_test2_no_label_2 = pd.read_csv("dataset/test_no_label_2.csv",header=None)
dataset_test2_with_label_2 = pd.read_csv("dataset/test_with_label_2.csv",header=None)
X_training_2 = dataset_train2.iloc[:, :-1].values
Y_training_2 = dataset_train2.iloc[:, -1].values
X_validate_2 = dataset_validate2.iloc[:, :-1].values
Y_validate_2 = dataset_validate2.iloc[:, -1].values
X_test2 = dataset_test2_no_label_2.iloc[:, ].values
Y_test2 = dataset_test2_with_label_2.iloc[:,-1]

## Training Using Training set

In [9]:
tree_para2 = {'criterion':['gini','entropy'],'max_depth':[10],'min_impurity_decrease':[0,1.0,2.0,3.0,4.0,5.0],'min_samples_split':[2,4,6,8]}
classifier_Best_DT2 = GridSearchCV(DecisionTreeClassifier(), tree_para2)
classifier_Best_DT2.fit(X_training_2, Y_training_2)
classifier = classifier_Best_DT2.best_estimator_
classifier.fit(X_training_2, Y_training_2)

DecisionTreeClassifier(max_depth=10, min_impurity_decrease=0,
                       min_samples_split=6)

## Validate - Predict Using Validate Set

In [10]:
Y_validate2_pred = classifier.predict(X_validate_2)

## Validate - Output - Confusion Matrix and Classification Report

In [11]:
print(Y_validate2_pred)
cm2 = confusion_matrix(Y_validate_2, Y_validate2_pred)
print(cm2)
cr2 = classification_report(Y_validate_2, Y_validate2_pred)
print(cr2)

[9 7 1 ... 8 0 1]
[[107   4   1   2   9   1   1   0  31   9]
 [ 12 324   1  12   3   1   1   1  20   0]
 [  0   2  26   0   0   0   0   0  12   5]
 [  1  13   0  26   1   2   0   1   0   1]
 [ 15   3   0   0  80  11   0   0  17  24]
 [  2   4   2   1  17 103   0   0   1  35]
 [  3   0   0   0   4   0   0   0  35   3]
 [  0   2   0   0   4   0   0  38   0   1]
 [  3   0   5   2   5   0   0   1 130   4]
 [  1   0   5   1  19  26   1   0  32 290]]
              precision    recall  f1-score   support

           0       0.74      0.65      0.69       165
           1       0.92      0.86      0.89       375
           2       0.65      0.58      0.61        45
           3       0.59      0.58      0.58        45
           4       0.56      0.53      0.55       150
           5       0.72      0.62      0.67       165
           6       0.00      0.00      0.00        45
           7       0.93      0.84      0.88        45
           8       0.47      0.87      0.61       150
          

## Test - Predict Using Test Set

In [12]:
Y_test2_pred = classifier.predict(X_test2)

## Test - Output - Confusion Matrix and Classification Report

In [13]:
print(Y_test2_pred)
cm2 = confusion_matrix(Y_test2, Y_test2_pred)
print(cm2)
cr2 = classification_report(Y_test2, Y_test2_pred)
print(cr2)

pd.DataFrame(Y_test2_pred).to_csv('output/Best-DT-DS2.csv', header = None)
with open('output/Best-DT-DS2.csv', 'a') as fd:
    fd.write('\n\nConfusion Matrix\n\n')
    fd.write(pd.DataFrame(cm2).to_csv())
    fd.write('\n\n Classification Report\n\n')
    fd.write(cr2)

[9 0 0 5 9 1 1 3 8 9 9 1 3 9 8 3 9 4 7 1 8 1 9 7 4 8 1 1 8 8 9 8 8 9 9 1 9
 9 1 1 1 7 9 9 9 9 1 8 5 1 4 0 5 4 0 1 8 5 2 1 1 9 0 0 9 0 1 4 0 9 1 0 1 9
 9 8 9 1 4 9 2 1 8 5 0 8 9 9 7 8 8 4 0 1 5 1 9 9 0 0 4 8 8 2 5 5 8 5 5 4 1
 9 5 9 8 9 8 1 8 9 1 5 5 9 1 0 0 5 4 9 1 3 8 1 9 1 4 9 8 8 5 0 0 9 4 0 9 8
 9 1 1 9 9 1 5 0 1 5 5 3 8 8 0 1 1 1 8 3 9 0 5 9 5 0 9 1 1 0 8 5 4 1 9 9 9
 8 9 4 8 3 1 9 1 8 2 7 9 8 9 9 9 0 9 0 9 5 8 8 5 8 4 4 8 8 9 8 8 1 3 9 8 1
 9 5 9 1 2 5 0 1 9 1 8 8 1 5 5 9 8 1 2 0 9 9 1 1 9 0 9 7 8 1 3 1 9 0 8 5 1
 1 9 1 1 1 4 8 9 2 7 8 5 0 5 1 7 8 5 8 9 9 0 9 4 8 1 5 0 1 5 1 1 9 8 8 0 9
 1 0 8 5 1 5 9 5 1 1 1 7 8 1 9 9 1 4 1 8 5 9 9 7 9 8 9 8 0 9 9 0 1 9 5 1 9
 4 1 9 7 1 9 4 8 8 5 8 0 9 5 9 8 9 9 3 1 8 1 4 5 1 8 3 0 5 2 5 9 8 0 1 1 3
 9 5 1 5 1 8 1 9 8 8 1 5 1 1 1 4 0 4 9 8 9 9 0 1 4 1 2 8 2 1 1 9 4 8 8 1 8
 9 8 5 5 8 1 9 1 1 9 1 4 5 4 8 3 9 8 0 9 5 0 8 9 0 3 9 9 1 1 9 9 5 1 0 0 1
 0 8 8 8 8 9 9 9 1 9 8 0 0 0 1 9 1 8 8 9 8 4 1 8 7 8 9 8 5 9 3 5 5 0 8 8 8
 1 1 8 9 8 9 2 9 0 1 5 4 