In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input mount recursively and echo the full path of every file.
for folder, _, files in os.walk('/kaggle/input'):
    for entry in files:
        print(os.path.join(folder, entry))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# importing the Important Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Installs a blanket "ignore" filter: no category or module is specified, so
# every warning raised after this point is suppressed for the whole session.
warnings.filterwarnings("ignore")

In [None]:
# Read the Iris CSV from the Kaggle input mount and preview its first five rows
df = pd.read_csv(filepath_or_buffer="/kaggle/input/iriscsv/Iris.csv")
df.head(n=5)

# Profile Report

In [None]:
# Build a profiling report for `df`. The report object is the last expression
# of the cell, so it is what the notebook displays as the cell output.
# NOTE(review): newer releases of this library are reportedly published under
# the name `ydata_profiling` — confirm which one the runtime image provides.
from pandas_profiling import ProfileReport
ProfileReport(df)

In [None]:
# Dropping the unnecessary "Id" column.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" keeps
# this cell idempotent: re-running it once "Id" is already gone is a no-op
# rather than a KeyError.
df = df.drop(columns="Id", errors="ignore")

In [None]:
# Visual check for null values: draw missingno's matrix plot of `df`
# in a 5x4 figure.
import missingno as msno
msno.matrix(df,figsize=(5,4))

As there are no missing values, we do not need to impute anything.

# 1. Exploratory Data Analysis


In [None]:
# Class balance of the target, shown side by side as a bar chart and a pie chart
species_counts = df["Species"].value_counts()
fig = plt.figure(figsize=(12, 8))

# Left panel: number of rows per species
bar_ax = fig.add_subplot(1, 2, 1)
species_counts.plot(kind="bar", color="orange", ax=bar_ax)
bar_ax.set_title("Species Count Plot")
bar_ax.set_xlabel("Species", fontweight="bold")
bar_ax.set_ylabel("Count", fontweight="bold")

# Right panel: the same counts as shares of the whole
pie_ax = fig.add_subplot(1, 2, 2)
species_counts.plot(kind="pie", shadow=True, ax=pie_ax)

In [None]:
plt.style.use("Solarize_Light2")
def Plot(dataset,features):
    """Draw a distribution plot and a box plot of one column, side by side.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the column to visualise. Both panels read from this
        argument, never from a notebook-level frame.
    features : str
        Name of the column in ``dataset``; also used in the titles and axis
        labels.

    Returns
    -------
    None
        The figure is drawn on a new 12x8 matplotlib figure as a side effect.
    """
    column = dataset[features]
    plt.figure(figsize=(12,8))

    # Left panel: density-normalised histogram with a KDE overlay.
    # sns.histplot is the supported replacement for the deprecated sns.distplot.
    plt.subplot(1,2,1)
    sns.histplot(column,bins=30,color="orange",kde=True,stat="density")
    plt.title(f"{features} DistPlot")
    plt.xlabel(f"{features}",fontweight="bold")
    plt.ylabel("Density",fontweight="bold")

    # Right panel: box plot to check for outliers. The values are passed as
    # `x=` so the orientation matches the x-axis label on every seaborn
    # version, instead of depending on what the first positional argument means.
    plt.subplot(1,2,2)
    sns.boxplot(x=column,color="orange")
    plt.xlabel(f"{features}")
    plt.ylabel("Count")
    plt.title("Boxplot of {}".format(features))

In [None]:
# Distribution and box plot of the sepal length column
Plot(dataset=df, features="SepalLengthCm")

In [None]:
# Distribution and box plot of the sepal width column
Plot(dataset=df, features="SepalWidthCm")

In [None]:
# Distribution and box plot of the petal length column
Plot(dataset=df, features="PetalLengthCm")

In [None]:
# Distribution and box plot of the petal width column
Plot(dataset=df, features="PetalWidthCm")

# 2. Scatter plot of Two Columns

In [None]:
# Sepal length against petal length, coloured by species
sns.scatterplot(x="SepalLengthCm", y="PetalLengthCm", hue="Species", data=df);

In [None]:
# Sepal length against sepal width, coloured by species
sns.scatterplot(x="SepalLengthCm", y="SepalWidthCm", hue="Species", data=df);

In [None]:
# Petal length against petal width, coloured by species
sns.scatterplot(x="PetalLengthCm", y="PetalWidthCm", hue="Species", data=df);

In [None]:
# Sepal length against petal width, coloured by species
sns.scatterplot(x="SepalLengthCm", y="PetalWidthCm", hue="Species", data=df);

# 3. Correlation

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns
corr_matrix = df.corr(numeric_only=True)
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap="Blues");

In [None]:
# Pairwise relationships between all columns, coloured by species
sns.pairplot(data=df, hue="Species")

In [None]:
# Converting Categorical into Numerical Columns
mapping = {"Iris-setosa":0,"Iris-versicolor":1,"Iris-virginica":2}
# Values that are not keys of `mapping` pass through unchanged, so re-running
# this cell on the already-encoded column is a no-op. A plain .map(mapping)
# would turn every label into NaN on the second run.
df["Species"] = df["Species"].map(lambda label: mapping.get(label, label))
# Fail loudly on an unexpected label rather than feeding it to the models.
unknown_labels = set(df["Species"].unique()) - set(mapping.values())
assert not unknown_labels, f"Unmapped Species values: {unknown_labels}"

# Splitting the Data

In [None]:
from sklearn.model_selection import train_test_split

# Target is the encoded species column; features are every other column
y = df["Species"]
X = df.drop(columns="Species")

# Seed NumPy's global random generator
np.random.seed(42)

# Hold out 30% of the rows for testing, with a fixed random_state for the split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=33
)

# Report the shape of each of the four pieces
for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"The shape of {label} is :", part.shape)


# Modelling

In [None]:
# Importing all the models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# (short name, unfitted estimator) pairs, each built with default hyper-parameters
models = [
    ("LR", LogisticRegression()),
    ("DTR", DecisionTreeClassifier()),
    ("RFC", RandomForestClassifier()),
    ("KNN", KNeighborsClassifier()),
]


In [None]:
# Importing important Classification metrics
from sklearn.metrics import confusion_matrix,precision_score,f1_score,accuracy_score,classification_report


In [None]:
# Fit every baseline model on the training split and report its test metrics.
for name,model in models:
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    print(f"====================={model}=====================")

    precision = precision_score(y_test,y_pred,average="weighted")
    f1 = f1_score(y_test,y_pred,average="weighted")
    accuracy = accuracy_score(y_test,y_pred)

    # The scores are formatted inside the f-strings. Wrapping a value in
    # {...} outside an f-string builds a one-element set and prints "{0.96}".
    print(f"The precision score of the model {model} is : {precision}")
    print(f"The F1 score of the model {model} is : {f1 * 100:.2f}")
    print(f"The Accuracy of the model {model} is : {accuracy * 100:.2f}")
    print("-"*64)
    # classification_report expects (y_true, y_pred); swapping them swaps
    # precision and recall in the printed table.
    print(classification_report(y_test,y_pred))

    print("="*100)
    
    

# Hyperparameter Tuning

# 1. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Search space: np.logspace(2, 10) yields 50 log-spaced values of C between
# 1e2 and 1e10 (50 is logspace's default count), crossed with both penalties.
grid = {"C":np.logspace(2,10),"penalty":["l1","l2"]}
# The default lbfgs solver does not support the "l1" penalty, so every "l1"
# candidate would fail to fit and be scored NaN. The saga solver supports both
# penalties on multiclass targets. max_iter is raised to give saga room to
# converge on unscaled features with such weak regularisation.
LR = LogisticRegression(solver="saga",max_iter=5000)
# 10-fold cross-validation over all 100 candidates; n_jobs=-1 only
# parallelises the fits, it does not change the scores.
LR_cv = GridSearchCV(LR,grid,cv=10,n_jobs=-1)
LR_cv.fit(X_train,y_train)
print("The best parameters are :",LR_cv.best_params_)

In [None]:
# Evaluate the tuned logistic regression on the held-out test split.
# GridSearchCV refits the winning candidate on all of X_train (refit=True is
# the default), so best_estimator_ is already fitted with the parameters the
# search selected. Using it keeps this cell in sync with the search instead of
# relying on hand-copied values, and mirrors how the KNN section scores its
# search.
LR_HT = LR_cv.best_estimator_
y_pred = LR_HT.predict(X_test)
# Accuracy in percent, computed once and reused for the message below.
LR_HT_score = (accuracy_score(y_test,y_pred)*100)
print("The model prediction after model tuning is :",LR_HT_score,"%")


In [None]:
# Per-class precision / recall / F1 for the predictions currently held in y_pred
print(classification_report(y_true=y_test, y_pred=y_pred))

# 2. Decision Tree Classifier

In [None]:
import time
%time
from sklearn.model_selection import GridSearchCV

param_grid = {'max_features': ['auto', 'sqrt', 'log2'],
              'ccp_alpha': [0.1, .01, .001],
              'max_depth' : [5, 6, 7, 8, 9],
              'criterion' :['gini', 'entropy'],
              'max_features' : [1,3,5,7]
             }
DTC = DecisionTreeClassifier(random_state=100)
grid_search = GridSearchCV(estimator=DTC, param_grid=param_grid, cv=8, verbose=True)
grid_search.fit(X_train, y_train)

In [None]:
# Display the parameter combination selected by the decision-tree grid search
grid_search.best_params_

In [None]:
# Score the tuned decision tree on the held-out test split.
# best_estimator_ is the winning candidate, already refit on X_train by
# GridSearchCV (refit=True is the default). It carries the same random_state
# as the search, so the evaluated tree is the one that was actually selected
# rather than a hand-copied, unseeded approximation of it.
DTR_HT = grid_search.best_estimator_
y_pred = DTR_HT.predict(X_test)
# Accuracy rounded to two decimals, expressed in percent
DHT_score = np.round(accuracy_score(y_test,y_pred),2)*100
print(DHT_score)

# 3. Random Forest Classifier


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each random-forest hyper-parameter
grid = dict(
    bootstrap=[True],
    max_depth=[90, 100, 110],
    max_features=[2, 3],
    min_samples_leaf=[3, 4, 5],
    min_samples_split=[8, 10, 12],
    n_estimators=[100, 200, 300, 1000],
)

# Exhaustive 5-fold search over the grid, using all available cores
RFC = RandomForestClassifier()
RFC_HT = GridSearchCV(estimator=RFC, param_grid=grid, cv=5, verbose=True, n_jobs=-1)
RFC_HT.fit(X_train, y_train)
RFC_HT.best_params_

In [None]:
# Score the tuned random forest on the held-out test split.
# best_estimator_ is the winning candidate, already refit on X_train by
# GridSearchCV (refit=True is the default). Using it ties this cell to the
# search result instead of hand-copied parameter values.
RFC = RFC_HT.best_estimator_
y_pred_RFC = RFC.predict(X_test)
# Accuracy rounded to two decimals, expressed in percent
RFC_accuracy = np.round(accuracy_score(y_test,y_pred_RFC),2)*100
RFC_accuracy

# 4. KNN Classifier

In [None]:
# Candidate neighbour counts, weighting schemes and distance metrics
grid_params = {
    "n_neighbors": [5, 7, 9, 11, 13, 15],
    "weights": ["uniform", "distance"],
    "metric": ["minkowski", "euclidean", "manhattan"],
}

# 3-fold grid search on all cores; the fitted search object is used directly
# to predict on the test split
KNN = KNeighborsClassifier()
KNN_grid = GridSearchCV(estimator=KNN, param_grid=grid_params, cv=3, n_jobs=-1, verbose=1)
KNN_grid.fit(X_train, y_train)
y_pred = KNN_grid.predict(X_test)

# Accuracy rounded to two decimals, expressed in percent
KNN_accuracy = 100 * np.round(accuracy_score(y_test, y_pred), 2)
KNN_accuracy


In [None]:
# Gather the test accuracy (in percent) of each tuned model into one table
model_names = ["LR", "DTR", "RFC", "KNN"]
model_scores = [LR_HT_score, DHT_score, RFC_accuracy, KNN_accuracy]
Model_fit = [[label, score] for label, score in zip(model_names, model_scores)]
accuracy_df = pd.DataFrame(Model_fit, columns=["MODELS", "ACCURACY"])
accuracy_df

In [None]:
# Bar chart comparing the accuracy percentage of the tuned models
plt.figure(figsize=(8, 8))
ax = sns.barplot(x="MODELS", y="ACCURACY", data=accuracy_df)
ax.set_title("Accuracy Percentage of all Models", fontweight="bold")
ax.set_xlabel("MODELS", fontweight="bold")
ax.set_ylabel("Accuracy Percentage", fontweight="bold");