In [54]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including any convergence or deprecation warnings raised by later cells.
warnings.filterwarnings("ignore")

import scipy.stats as stat

# 1. Read the dataset into the Python environment.

In [55]:
# Load the workbook 'iris1.xls' (path is relative to the working directory) into a DataFrame.
data=pd.read_excel('iris1.xls')

In [56]:
# Show the (rows, columns) shape of the loaded frame.
data.shape

(150, 5)

In [57]:
# Preview the first rows of the raw data.
data.head()

Unnamed: 0,SL,SW,PL,PW,Classification
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


# 2. Perform the necessary pre-processing steps

In [58]:
# Count the rows per value of the 'Classification' column to check class balance.
data['Classification'].value_counts()

Iris-virginica     50
Iris-setosa        50
Iris-versicolor    50
Name: Classification, dtype: int64

In [59]:
# Same counts expressed as fractions of the total (normalize=True).
data['Classification'].value_counts(normalize=True)

Iris-virginica     0.333333
Iris-setosa        0.333333
Iris-versicolor    0.333333
Name: Classification, dtype: float64

In [60]:
# Number of missing values in each column.
data.isna().sum()

SL                7
SW                6
PL                6
PW                0
Classification    0
dtype: int64

In [61]:
# Feature matrix: every column except the target 'Classification'.
x = data.drop(columns=['Classification'])

In [62]:
# Target vector: the 'Classification' column.
# NOTE(review): this is taken before the LabelEncoder cell further down, so y
# presumably keeps the original string labels rather than the encoded integers — confirm.
y=data['Classification']

In [63]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [64]:
# Shape of the training feature matrix.
x_train.shape

(120, 4)

In [65]:
# Shape of the test feature matrix.
x_test.shape

(30, 4)

In [39]:
# Column dtypes and non-null counts for the full frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   SL              143 non-null    float64
 1   SW              144 non-null    float64
 2   PL              144 non-null    float64
 3   PW              150 non-null    float64
 4   Classification  150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 5.3+ KB


In [66]:
# Class counts again (same query as earlier in the notebook).
data['Classification'].value_counts()

Iris-virginica     50
Iris-setosa        50
Iris-versicolor    50
Name: Classification, dtype: int64

In [67]:
from sklearn.preprocessing import LabelEncoder

# Replace each listed categorical column in `data` with integer codes.
label_en = LabelEncoder()
a = ['Classification']
for column_name in a:
    data[column_name] = label_en.fit_transform(data[column_name])

In [68]:
# Preview the frame again after label encoding.
data.head()

Unnamed: 0,SL,SW,PL,PW,Classification
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [69]:
# Impute missing measurements with the most frequent value of each column.
#
# Series.mode() returns a Series (a column can have several modes). Passing that
# Series straight to fillna() aligns it on the index, so only rows whose index
# label appears in the mode Series get filled and the rest stay NaN. Take the
# first mode as a scalar instead.
feature_cols = ['SL', 'SW', 'PL', 'PW']
for col in feature_cols:
    data[col] = data[col].fillna(data[col].mode().iloc[0])

# x_train / x_test were split off from `data` before this cell, so they still
# contain the NaNs. Fill them too, using the mode of the training split only so
# that no information from the test rows leaks into the fitted models.
x_train, x_test = x_train.copy(), x_test.copy()
for col in feature_cols:
    train_mode = x_train[col].mode().iloc[0]
    x_train[col] = x_train[col].fillna(train_mode)
    x_test[col] = x_test[col].fillna(train_mode)

# 3. Find out which classification model gives the best result for predicting iris species (also try the random forest algorithm).

# Logistic regression model

In [70]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit a logistic regression with default settings on the training split,
# then predict labels for the held-out test rows.
logit_model = LogisticRegression().fit(x_train, y_train)
y_pred = logit_model.predict(x_test)

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score

In [None]:
# The target has three classes. precision_score / recall_score / f1_score default
# to average='binary', which raises ValueError for a multiclass target, so request
# macro averaging (unweighted mean over classes) explicitly.
print('Accuracy is', accuracy_score(y_test, y_pred))
print('Precision is', precision_score(y_test, y_pred, average='macro'))
print('Recall is', recall_score(y_test, y_pred, average='macro'))
print('f1 score is', f1_score(y_test, y_pred, average='macro'))

For a good model, both accuracy and F1 score should be as high as possible (ideally close to 1).

In [None]:
# Confusion matrix of the test labels against the most recent predictions in y_pred.
confusion_matrix(y_test,y_pred)

# k-NN model

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Try k = 3 ... 14 and record the test-set accuracy obtained for each k.
neighbors = np.arange(3, 15)
acc_values = []
for k in neighbors:
    classifier = KNeighborsClassifier(n_neighbors=k, metric='minkowski').fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    acc_values.append(acc)

In [None]:
# Accuracy for each k, in the same order as `neighbors`.
acc_values

In [None]:
# Accuracy as a function of k, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.plot(neighbors, acc_values, 'o-')
ax.set_xlabel('k value')
ax.set_ylabel('accuracy')

# Standard scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler's mean and standard deviation are learned
# from the training split only and then re-used for the test split. Calling
# fit_transform on the test data would refit the scaler on test statistics, so
# the two splits would be scaled differently and test information would leak.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so that re-running the notebook gives the same model; 42 matches
# the seed already used for the train/test split and the tuned random forest.
dt_model = DecisionTreeClassifier(random_state=42)

In [None]:
# Train the decision tree on the training split.
dt_model.fit(x_train,y_train)

In [None]:
# Predict test labels with the decision tree (overwrites the shared y_pred variable).
y_pred=dt_model.predict(x_test)

In [None]:
# Fraction of test rows the decision tree labelled correctly.
print('Accuracy on decision tree model is',accuracy_score(y_test,y_pred))

In [None]:
# Class distribution of the test labels, useful when reading the confusion matrix.
y_test.value_counts()

In [None]:
# Confusion matrix of the test labels against the decision-tree predictions in y_pred.
confusion_matrix(y_test,y_pred)

# Linear SVM

In [None]:
from sklearn.svm import SVC
# Support vector classifier with a linear kernel; other settings are library defaults.
svm_linear=SVC(kernel='linear')

In [None]:
# Train the linear SVM on the training split.
svm_linear.fit(x_train,y_train)

In [None]:
# Predict test labels with the linear SVM (overwrites the shared y_pred variable).
y_pred=svm_linear.predict(x_test)

In [None]:
# Fraction of test rows the linear SVM labelled correctly.
print('Accuracy using linear svm is',accuracy_score(y_test,y_pred))

# Polynomial SVM

In [None]:
# Support vector classifier with a degree-3 polynomial kernel.
svm_poly=SVC(kernel='poly',degree=3)

In [None]:
# Train the polynomial SVM on the training split.
svm_poly.fit(x_train,y_train)

In [None]:
# Predict test labels with the polynomial SVM (overwrites the shared y_pred variable).
y_pred=svm_poly.predict(x_test)

In [None]:
# Accuracy of the polynomial SVM; the printed message itself does not name the model.
print('Accuracy is',accuracy_score(y_test,y_pred))

# Radial SVM

In [None]:
# Support vector classifier with an RBF (radial basis function) kernel.
svm_radial=SVC(kernel='rbf')

In [None]:
# Train the RBF SVM on the training split.
svm_radial.fit(x_train,y_train)

In [None]:
# Predict test labels with the RBF SVM (overwrites the shared y_pred variable).
y_pred=svm_radial.predict(x_test)

In [None]:
# Accuracy of the RBF SVM; the printed message itself does not name the model.
print('Accuracy is',accuracy_score(y_test,y_pred))

# Random forest classification

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest with default hyper-parameters. The forest is seeded so
# that its predictions and feature importances are the same on every run; 42
# matches the seed used for the train/test split and the tuned forest.
rf = RandomForestClassifier(random_state=42)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)

In [None]:
from sklearn.metrics import f1_score,confusion_matrix

# Three-class target: the default average='binary' raises ValueError, so report
# the macro-averaged F1 (unweighted mean of the per-class F1 scores).
f1_score(y_test, y_pred, average='macro')

In [None]:
# Confusion matrix of the test labels against the random-forest predictions in y_pred.
confusion_matrix(y_test,y_pred)

In [None]:
# Feature importances of the fitted forest as percentages, largest first.
(
    pd.Series(rf.feature_importances_, index=x.columns)
    .sort_values(ascending=False)
    .mul(100)
)

In [None]:
# Custom decision threshold for the class at index 1 of rf.classes_.
#
# The original binary recipe, (proba[:, 1] > threshold).astype(int), yields 0/1
# integers that cannot be compared with the three class labels in y_test. Keep
# the idea in a multiclass-safe form: start from the most probable class, then
# switch to rf.classes_[1] wherever its probability exceeds the threshold.
threshold = 0.22
class_proba = rf.predict_proba(x_test)
y_pred_proba = class_proba[:, 1]
most_likely = rf.classes_[class_proba.argmax(axis=1)]
y_pred = np.where(y_pred_proba > threshold, rf.classes_[1], most_likely)

In [None]:
# Three-class target: average='binary' (the default) raises ValueError, so use
# the macro-averaged F1 instead.
f1_score(y_test, y_pred, average='macro')

In [None]:
# NOTE(review): rf was already fitted on the same x_train / y_train in an earlier
# cell; this second fit looks redundant — confirm whether it is intentional.
rf.fit(x_train,y_train)

In [None]:
# Tuned forest: 500 entropy-criterion trees, depth capped at 10, fixed seed.
rft = RandomForestClassifier(
    criterion='entropy',
    max_depth=10,
    n_estimators=500,
    random_state=42,
)

In [None]:
# Train the tuned forest on the training split.
rft.fit(x_train,y_train)

In [None]:
# Predict with the tuned forest and score it. The target has three classes, so
# the default average='binary' raises ValueError; use the macro-averaged F1.
y_pred = rft.predict(x_test)
f1_score(y_test, y_pred, average='macro')