## Decision Tree implementation on iris dataset

### Dataset obtaining and preprocessing

In [31]:
#importing basic py libraries
import pandas as pd 
import numpy as np
import seaborn as sns

In [32]:
#loading the iris dataset from seaborn
df =sns.load_dataset('iris')

In [33]:
#The dataset: 
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [34]:
#labelling the target variables
df['species']=df['species'].map({'setosa':0,'versicolor':1,'virginica':2})

In [35]:
#dividing the dataset into features and lables
X=df.drop(['species'],axis=1)
y=df['species']

### Modelling

In [37]:
#Spilliting data into test and train 
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.2,random_state=0)

In [38]:
#importing the decision tree model
from sklearn.tree import DecisionTreeClassifier

In [39]:
#making the entity of decisiontree model
clf = DecisionTreeClassifier()

In [40]:
#fitting the data
clf.fit(X_train,y_train)

DecisionTreeClassifier()

In [41]:
#obtaining the predictions
y_pred = clf.predict(X_test)

### Metrices

In [43]:
#importing the metrices
from sklearn.metrics import  classification_report, accuracy_score, confusion_matrix

#### Results (before tuning) 

In [48]:
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[11  0  0]
 [ 0 13  0]
 [ 0  0  6]]
1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00         6

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



### Performing basic hyperparameter tuning

In [49]:
from sklearn.model_selection import GridSearchCV

In [50]:
#preparing the parameter grid
grid_param = {
    'criterion': ['gini', 'entropy'],
    'max_depth' : range(2,32,1),
    'min_samples_leaf' : range(1,10,1),
    'min_samples_split': range(2,10,1),
    'splitter' : ['best', 'random']
    
}


In [51]:
#preparing the gridsearch model
grid_search = GridSearchCV(estimator=clf,
                     param_grid=grid_param,
                     cv=5,
                    n_jobs =-1)

In [52]:
#fitting the data on tuned model
rf = DecisionTreeClassifier()
grid_search = GridSearchCV(estimator = rf, param_grid = grid_param, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [53]:
grid_search.fit(X_train,y_train)

Fitting 3 folds for each of 8640 candidates, totalling 25920 fits


GridSearchCV(cv=3, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(2, 32),
                         'min_samples_leaf': range(1, 10),
                         'min_samples_split': range(2, 10),
                         'splitter': ['best', 'random']},
             verbose=2)

In [54]:
#obtaining the best parameters
grid_search.best_params_

{'criterion': 'gini',
 'max_depth': 5,
 'min_samples_leaf': 3,
 'min_samples_split': 5,
 'splitter': 'random'}