# 1. Read the dataset to the python environment.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
data=pd.read_excel("iris.xls")

In [3]:
data.head()

Unnamed: 0,SL,SW,PL,PW,Classification
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


# 2. Do necessary pre-processing steps.

In [4]:
data.shape

(150, 5)

In [5]:
data.isna().sum

<bound method NDFrame._add_numeric_operations.<locals>.sum of         SL     SW     PL     PW  Classification
0    False  False  False  False           False
1    False  False  False  False           False
2     True  False  False  False           False
3    False  False  False  False           False
4    False  False  False  False           False
..     ...    ...    ...    ...             ...
145  False  False  False  False           False
146  False  False  False  False           False
147  False  False   True  False           False
148  False  False  False  False           False
149  False  False  False  False           False

[150 rows x 5 columns]>

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   SL              143 non-null    float64
 1   SW              144 non-null    float64
 2   PL              144 non-null    float64
 3   PW              150 non-null    float64
 4   Classification  150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [7]:
data.columns

Index(['SL', 'SW', 'PL', 'PW', 'Classification'], dtype='object')

In [8]:
for i in ['SL', 'SW', 'PL']:
    data[i]=data[i].fillna(data[i].median())

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   SL              150 non-null    float64
 1   SW              150 non-null    float64
 2   PL              150 non-null    float64
 3   PW              150 non-null    float64
 4   Classification  150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


# 3. Find out which classification model gives the best result to predict iris species

# a) Linear Regression

In [10]:
x=data[['SL', 'SW', 'PL', 'PW']]
y=data['Classification']   

In [11]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=42,test_size=0.3)

In [15]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression()
lr.fit(x_train,y_train)
y_pred=lr.predict(x_test)
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score
confusion_matrix(y_test,y_pred)

array([[ 0,  9,  1],
       [ 0,  0,  9],
       [ 0,  0, 11]], dtype=int64)

In [16]:
accuracy_score(y_test,y_pred)

0.36666666666666664

In [21]:
f1_score(y_test,y_pred,average='micro')

0.36666666666666664

# b) KNN 

In [29]:
from sklearn.neighbors import KNeighborsClassifier


In [43]:
k_value=[]
neighbors=np.arange(3,10)
for k in neighbors:
    classifier=KNeighborsClassifier(n_neighbors=k,metric='minkowski')
    classifier.fit(x_train,y_train)
    y_pred=classifier.predict(x_test)
    accurate=accuracy_score(y_test,y_pred)
    k_value.append(accurate)
  

In [44]:
k_value

[0.36666666666666664,
 0.36666666666666664,
 0.36666666666666664,
 0.36666666666666664,
 0.36666666666666664,
 0.36666666666666664,
 0.36666666666666664]

In [34]:
confusion_matrix(y_test,y_pred)

array([[ 0,  0, 10],
       [ 0,  0,  9],
       [ 0,  0, 11]], dtype=int64)

In [35]:
accuracy_score(y_test,y_pred)

0.36666666666666664

In [40]:
f1_score(y_test,y_pred,average='macro')

0.17886178861788615

# c) SVM

In [50]:
from sklearn.svm import SVC

# i) Linear SVM

In [51]:
svm=SVC(kernel='linear')
svm.fit(x_train,y_train)
y_pred=svm.predict(x_test)

In [52]:
accuracy_score(y_test,y_pred)

0.36666666666666664

In [53]:
f1_score(y_test,y_pred,average='macro')

0.23655913978494625

In [54]:
confusion_matrix(y_test,y_pred)

array([[ 0, 10,  0],
       [ 0,  0,  9],
       [ 0,  0, 11]], dtype=int64)

# ii) Radial SVM

In [55]:
svm_radial=SVC(kernel='rbf')
svm_radial.fit(x_train,y_train)
y_pred=svm_radial.predict(x_test)

In [56]:
accuracy_score(y_test,y_pred)

0.16666666666666666

In [57]:
f1_score(y_test,y_pred,average='macro')

0.09523809523809525

In [58]:
confusion_matrix(y_test,y_pred)

array([[ 5,  0,  5],
       [ 9,  0,  0],
       [11,  0,  0]], dtype=int64)

# iii) Polynomial SVM

In [71]:
svm=SVC(kernel='poly',degree=2)
svm.fit(x_train,y_train)
y_pred=svm.predict(x_test)

In [72]:
accuracy_score(y_test,y_pred)

0.3333333333333333

In [73]:
f1_score(y_test,y_pred,average='macro')

0.1801801801801802

In [74]:
confusion_matrix(y_test,y_pred)

array([[10,  0,  0],
       [ 6,  0,  3],
       [11,  0,  0]], dtype=int64)

# d) Decision Tree

In [75]:
from sklearn.tree import DecisionTreeClassifier
dt=DecisionTreeClassifier()
dt.fit(x_train,y_train)
y_pred=dt.predict(x_test)

In [76]:
confusion_matrix(y_test,y_pred)

array([[ 0,  0, 10],
       [ 0,  0,  9],
       [ 0,  0, 11]], dtype=int64)

In [77]:
f1_score(y_test,y_pred,average='macro')

0.17886178861788615

In [78]:
accuracy_score(y_test,y_pred)

0.36666666666666664

# e) Random Forest

In [79]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
rf.fit(x_train,y_train)
y_pred=rf.predict(x_test)

In [80]:
confusion_matrix(y_test,y_pred)

array([[ 0,  0, 10],
       [ 0,  0,  9],
       [ 0,  0, 11]], dtype=int64)

In [81]:
f1_score(y_test,y_pred,average='macro')

0.17886178861788615

In [82]:
accuracy_score(y_test,y_pred)

0.36666666666666664

Identify the important features and build the model using that features

In [89]:
feature_imp=pd.Series(rf.feature_importances_).sort_values(ascending=False)*100

In [90]:
feature_imp

3    53.580259
2    34.553679
0     8.373088
1     3.492973
dtype: float64

In [96]:
x=data[['PL', 'PW']]
y=data['Classification'] 

In [97]:
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=42,test_size=0.3)

In [98]:
rf=RandomForestClassifier()
rf.fit(x_train,y_train)
y_pred=rf.predict(x_test)

In [99]:
confusion_matrix(y_test,y_pred)

array([[19,  0,  0],
       [ 0, 13,  0],
       [ 0,  0, 13]], dtype=int64)

In [100]:
accuracy_score(y_test,y_pred)

1.0

In [101]:
f1_score(y_test,y_pred,average='macro')

1.0

# f) Gradient Boosting

In [102]:
x=data[['SL', 'SW', 'PL', 'PW']]
y=data['Classification']   

In [103]:
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=42,test_size=0.3)

In [104]:
from sklearn.ensemble import GradientBoostingClassifier
grb=GradientBoostingClassifier()
grb.fit(x_train,y_train)
y_pred=grb.predict(x_test)

In [105]:
accuracy_score(y_test,y_pred)

1.0

In [106]:
f1_score(y_test,y_pred,average='macro')

1.0

From the above we understood that gradient boosting and Random forest classification on important features gave the best result in predicting the species