## 1. Import Python modules

In [1]:
# Load the necessary Python modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [2]:
import os

# Walk the Kaggle input directory and print the full path of every file found in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

## 2. Load IRIS Dataset

In [3]:
# Read the Iris CSV from the Kaggle input directory into a DataFrame.
iris_csv_path = "/kaggle/input/iris-flower-dataset/IRIS.csv"
iris = pd.read_csv(iris_csv_path)

In [4]:
# Structural overview, printed in order: column labels, (rows, columns), per-column dtypes.
for overview in (iris.columns, iris.shape, iris.dtypes):
    print(overview)

In [5]:
# Print pandas' summary of the frame.
iris.info()

In [6]:
# Number of missing values in each column.
# isnull().sum() reports one count per column; DataFrame.value_counts() on the
# boolean mask would instead count distinct row patterns of True/False flags,
# which does not show which column (if any) has gaps.
iris.isnull().sum()

In [7]:
# Number of rows per distinct value of the "species" column (class balance check).
iris["species"].value_counts()

In [8]:
# Descriptive statistics table produced by pandas' describe().
iris.describe()

In [9]:
# Draw each plotted column in its own subplot on one 15x15 inch figure.
iris.plot(subplots=True, figsize=(15, 15))
plt.show()

In [10]:
# Scatter-plot matrix of the columns, coloured by species.
sns.set(font_scale=1.5)
pair_grid = sns.pairplot(iris, hue="species", kind="scatter", height=3, aspect=1)
plt.show()

In [11]:
# Class balance: number of samples per species, shown as a bar chart and a pie chart.
# The labels come from the value_counts() index, so every count is paired with the
# species it belongs to regardless of the order value_counts() returns or of how
# the species names are spelled in the data.
species_counts = iris["species"].value_counts()
flower_names = [str(name) for name in species_counts.index]
values = species_counts.tolist()

# The y-axis is deliberately not shared between the two panels: a pie chart has no
# meaningful y scale, and linking it to the bar chart's axis lets the pie's axis
# settings leak into the bar chart.
fig, axs = plt.subplots(1, 2, figsize=(15, 5))
axs[0].bar(flower_names, values)
axs[0].set_xlabel("species")
axs[0].set_ylabel("count")
axs[1].pie(values, labels=flower_names)
fig.suptitle('Flowers')
plt.show()


In [12]:
# 2x2 grid of box plots, one measurement per panel, grouped by species.
sns.set_style("whitegrid")
fig, axes = plt.subplots(2, 2, figsize=(15, 15))
fig.suptitle('Box-Plots')
# axes.flat walks the grid row by row: petal panels on top, sepal panels below.
measurements = ['petal_width', 'petal_length', 'sepal_width', 'sepal_length']
for panel, measurement in zip(axes.flat, measurements):
    sns.boxplot(ax=panel, x='species', y=measurement, data=iris)
plt.show()

**Observations**
* sepal length did not have any outliers
* sepal width had (almost) no outliers
* petal width and petal length have some outliers

In [13]:
# Summary statistics for the numeric measurement columns only.
# The species column is label-encoded only later in the notebook, so at this point
# it is presumably still text; mean/median/std are undefined for text and make
# agg() fail on current pandas, so non-numeric columns are excluded up front.
iris.select_dtypes(include="number").agg(["min", "max", "mean", "median", "std"])

In [14]:
# 2x2 grid of violin plots, one measurement per panel, grouped by species.
sns.set_style("whitegrid")
fig, axes = plt.subplots(2, 2, figsize=(15, 15))
fig.suptitle('Violin-Plots')
# axes.flat walks the grid row by row: petal panels on top, sepal panels below.
measurements = ['petal_width', 'petal_length', 'sepal_width', 'sepal_length']
for panel, measurement in zip(axes.flat, measurements):
    sns.violinplot(ax=panel, x='species', y=measurement, data=iris)
plt.show()

In [15]:
# 2x2 grid of histograms with a KDE curve overlaid, one measurement per panel,
# coloured by species.
sns.set_style("whitegrid")
fig, axes = plt.subplots(2, 2, figsize=(15, 15))
fig.suptitle('Kde-Plots')
# axes.flat walks the grid row by row: petal panels on top, sepal panels below.
measurements = ['petal_width', 'petal_length', 'sepal_width', 'sepal_length']
for panel, measurement in zip(axes.flat, measurements):
    sns.histplot(ax=panel, hue='species', x=measurement, data=iris, kde=True)
plt.show()

In [16]:
# IQR outlier filter on the petal measurements, one column after the other.
# Rows are kept only when the value lies strictly inside
# (Q1 - 1.5*IQR, Q3 + 1.5*IQR). The quartiles for the second column are computed
# on the rows that survived the first pass.
for column in ("petal_length", "petal_width"):
    q1, q3 = iris[column].quantile(0.25), iris[column].quantile(0.75)
    spread = q3 - q1
    low_fence = q1 - 1.5*spread
    high_fence = q3 + 1.5*spread
    inside_fences = (iris[column] > low_fence) & (iris[column] < high_fence)
    iris = iris[inside_fences]

In [17]:
# IQR outlier filter on the sepal measurements, one column after the other.
# Rows are kept only when the value lies strictly inside
# (Q1 - 1.5*IQR, Q3 + 1.5*IQR). The quartiles for the second column are computed
# on the rows that survived the first pass.
for column in ("sepal_length", "sepal_width"):
    q1, q3 = iris[column].quantile(0.25), iris[column].quantile(0.75)
    spread = q3 - q1
    low_fence = q1 - 1.5*spread
    high_fence = q3 + 1.5*spread
    inside_fences = (iris[column] > low_fence) & (iris[column] < high_fence)
    iris = iris[inside_fences]

In [18]:
# Replace the species labels with integer class codes.
# assign() builds a new frame instead of writing a column into `iris`, which at
# this point is the result of boolean-mask filtering; writing into such a frame
# in place triggers pandas' SettingWithCopyWarning.
le = LabelEncoder()
iris = iris.assign(species=le.fit_transform(iris["species"]))

In [19]:
# Rows per class after outlier filtering and label encoding.
iris["species"].value_counts()

In [20]:
# Display the filtered, label-encoded frame.
iris

In [21]:
# Pairwise column correlations, computed once and reused for both the printed
# table and the annotated heatmap.
correlations = iris.corr()
print(correlations)
sns.heatmap(correlations, annot=True, cmap="YlGnBu")
plt.show()

In [22]:
# Feature matrix: every column except the target.
X = iris.drop(columns=["species"])

In [23]:
# Standardise every feature with StandardScaler.
# fit_transform() returns a new array and leaves its input untouched, so the result
# must be stored or the scaling has no effect on X. Wrapping it in a DataFrame
# keeps the column names and the row index of the unscaled features.
# NOTE(review): the scaler is fitted on all rows before the train/test split. This
# should not matter for the decision tree trained below, but a scale-sensitive
# model should fit the scaler on the training split only.
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
X.head()

## 6. Split train and test on dataset

In [24]:
# Target vector and a 70/30 train/test split.
# random_state pins the shuffle so every metric further down is reproducible on
# Restart & Run All; stratify=Y keeps the class proportions the same in both
# splits, which matters for a data set this small.
Y = iris["species"]
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.30, random_state=42, stratify=Y
)

In [25]:
# Report how many rows ended up in each split.
for label, split in (("train data length:", X_train), ("test data length:", X_test)):
    print(label, len(split))

## 7. DT Model - fit - accuracy - score - matrix

In [26]:
# Baseline decision tree, capped at depth 4.
# random_state is fixed because scikit-learn permutes the candidate features at
# every split; without a seed, ties between equally good splits can resolve
# differently from run to run and change the fitted tree.
dt_model = DecisionTreeClassifier(max_depth=4, random_state=42)
dt_model.fit(X_train, Y_train)

In [27]:
# Score of the fitted tree on the same training split it was fitted on.
dt_model.score(X_train,Y_train)

In [28]:
# Score of the fitted tree on the held-out test split.
dt_model.score(X_test,Y_test)

In [29]:
# Classification report for the tree's predictions on the test split.
test_predictions = dt_model.predict(X_test)
print(classification_report(Y_test, test_predictions))

In [30]:
# Confusion matrix on the test split, computed once and shown both as text and
# as an annotated heatmap.
test_confusion = confusion_matrix(Y_test, dt_model.predict(X_test))
print(test_confusion)
sns.heatmap(test_confusion, annot=True)
plt.show()

In [31]:
# Accuracy of the tree on the training split.
train_predictions = dt_model.predict(X_train)
print(accuracy_score(Y_train, train_predictions))

In [32]:
# Accuracy of the tree on the held-out test split.
test_predictions = dt_model.predict(X_test)
print(accuracy_score(Y_test, test_predictions))

## 8. Model - Feature Importance

In [33]:
# Importance of each feature in the fitted tree, one value per column of X in column order.
print(dt_model.feature_importances_)

In [34]:
# Number of leaves in the fitted tree.
print(dt_model.get_n_leaves())

In [35]:
# Parameters the tree was configured with.
print(dt_model.get_params())

In [36]:
# Depth the fitted tree actually reached (it was capped at max_depth=4 when built).
print(dt_model.get_depth())

## 9. DT Model - Hyperparameter Tuning - Final Model Run

In [37]:
from sklearn.model_selection import GridSearchCV

# 5-fold cross-validated grid search over the split criterion, the depth cap and
# the minimum number of samples required to split a node.
# The base estimator gets a fixed random_state so that ties between equally good
# splits resolve the same way on every run and the selected parameters are
# repeatable.
tree_para = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 20, 30, 40, 50, 70, 90, 120, 150],
    'min_samples_split': [2, 3, 4],
}
clf = GridSearchCV(DecisionTreeClassifier(random_state=42), tree_para, cv=5)
clf.fit(X_train, Y_train)
print(clf.best_params_)
print(clf.best_estimator_)

In [38]:
# Final model, built from the parameters the grid search selected.
# Reading them from clf.best_params_ (instead of retyping values by hand) keeps
# this cell correct when the search picks a different combination, for example
# after the data or the split changes. random_state makes the refit repeatable.
dt_model = DecisionTreeClassifier(random_state=42, **clf.best_params_)
dt_model.fit(X_train, Y_train)

In [39]:
# Confusion matrix of the refitted tree on the test split, computed once and
# shown both as text and as an annotated heatmap.
final_confusion = confusion_matrix(Y_test, dt_model.predict(X_test))
print(final_confusion)
sns.heatmap(final_confusion, annot=True)
plt.show()

In [40]:
# Classification report for the refitted tree's predictions on the test split.
final_predictions = dt_model.predict(X_test)
print(classification_report(Y_test, final_predictions))

In [41]:
# Depth reached by the refitted tree.
print(dt_model.get_depth())

In [42]:
# Number of leaves in the refitted tree.
print(dt_model.get_n_leaves())

**DecisionTreeClassifier on IRIS Data Set:**

**accuracy=0.95**