In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the iris measurements from the Excel workbook. The relative path is resolved
# against the notebook's working directory. pandas reads legacy .xls files through
# the optional xlrd engine, so that package has to be installed.
df=pd.read_excel("iris.xls")

In [3]:
df

Unnamed: 0,SL,SW,PL,PW,Classification
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [4]:
df.shape

(150, 5)

In [None]:
df["Classification"].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [None]:
# duplicated() flags every row that repeats an earlier one (the first occurrence is not flagged)
is_repeat = df.duplicated()
duplicates = df.loc[is_repeat]
print("Duplicate Rows except first occurrence:\n", duplicates)

In [None]:
# Discard repeated rows, keeping the first occurrence of each, then show what is left
df = df.drop_duplicates(keep="first")
print(df)

In [None]:
df.info()

In [None]:
df['Classification'].value_counts(normalize=True)

In [None]:
df.shape

In [None]:
#to find null values
df.isna().sum()

In [None]:
# Median of every numeric measurement, reported separately for each species.
# The boolean masks are kept as named variables so they stay available to later cells.
setosa = df['Classification'] == "Iris-setosa"
versicolor = df['Classification'] == 'Iris-versicolor'
virginica = df['Classification'] == 'Iris-virginica'

for heading, species_mask in (
    ('Iris-setosa', setosa),
    ('\nIris-versicolor', versicolor),
    ('\nIris-virginica', virginica),
):
    print(heading)
    # numeric_only=True skips the string 'Classification' column; without it
    # pandas >= 2.0 raises TypeError when asked for the median of text values.
    print(df[species_mask].median(numeric_only=True))

In [None]:
# Median 'SL' of each species, stored as a {species: median} dictionary
median_sl_by_species = df.groupby('Classification')['SL'].median().to_dict()

# Fill the missing 'SL' values with the median of the row's own species.
# Mapping the species column onto the dictionary is vectorised and gives the same
# values as a row-by-row apply, without calling a Python lambda per row.
# NOTE(review): the medians depend on the class label and are computed on the full
# data set, before the train/test split performed further down.
df['SL'] = df['SL'].fillna(df['Classification'].map(median_sl_by_species))

In [None]:
# Median 'SW' of each species, stored as a {species: median} dictionary
median_sw_by_species = df.groupby('Classification')['SW'].median().to_dict()

# Fill the missing 'SW' values with the median of the row's own species.
# Mapping the species column onto the dictionary is vectorised and gives the same
# values as a row-by-row apply, without calling a Python lambda per row.
df['SW'] = df['SW'].fillna(df['Classification'].map(median_sw_by_species))

In [None]:
# Median 'PL' of each species, stored as a {species: median} dictionary
median_pl_by_species = df.groupby('Classification')['PL'].median().to_dict()

# Fill the missing 'PL' values with the median of the row's own species.
# Mapping the species column onto the dictionary is vectorised and gives the same
# values as a row-by-row apply, without calling a Python lambda per row.
# NOTE(review): 'PW' is not imputed by any cell; confirm via df.isna().sum() that it
# has no missing values.
df['PL'] = df['PL'].fillna(df['Classification'].map(median_pl_by_species))

In [None]:
df.head()

In [None]:
df.isna().sum()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the species names into integer class ids; the fitted encoder stays in `le`
le = LabelEncoder().fit(df['Classification'])
df['Classification'] = le.transform(df['Classification'])

In [None]:
df.head()

In [None]:
# Box plots to spot outliers, one titled figure per measurement column.
# The encoded 'Classification' column is skipped: it is the target label, not a
# measurement, so a box plot of it says nothing about outliers.
for col in df.columns.drop('Classification'):
    fig, ax = plt.subplots()
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f"Distribution of {col}")
    plt.show()

In [None]:
# Outlier treatment for 'SW' starts from its interquartile range.
# interpolation='midpoint' takes the average of the two neighbouring observations
# whenever a quartile falls between two data points.
from scipy import stats
IQR =stats.iqr(df.SW,interpolation='midpoint')
IQR

In [None]:
# Lower and upper fences for 'SW' (1.5 * IQR beyond the quartiles).
# The quartiles use the same 'midpoint' interpolation as the IQR computed in the
# previous cell, so that Q3 - Q1 equals IQR; the default 'linear' interpolation can
# give quartiles that do not match that IQR.
Q1=df.SW.quantile(0.25,interpolation='midpoint')
Q3=df.SW.quantile(0.75,interpolation='midpoint')
min_limit=Q1-1.5*IQR
max_limit=Q3+1.5*IQR
min_limit,max_limit

In [None]:
# Replace 'SW' values outside the fences with the column median.
# The median and the outlier mask are computed once, before anything is overwritten,
# so high and low outliers receive the same replacement value (previously the median
# for the low side was recomputed after the high side had already been modified).
# NOTE(review): this uses the full data set, before the train/test split further down.
sw_median=df['SW'].median()
sw_outliers=(df['SW']>max_limit)|(df['SW']<min_limit)
df.loc[sw_outliers,'SW']=sw_median

In [None]:
df.loc[df.SW>max_limit]

In [None]:
df.loc[df.SW<min_limit]

In [None]:
df.head()

In [None]:
# Separate the predictor columns from the target label
x = df.drop(columns=['Classification'])
y = df['Classification']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
split_parts = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = split_parts

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply them to both splits
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

# LOGISTIC REGRESSION

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the library defaults. fit() returns the estimator itself,
# so `model` and `clf` refer to the same fitted object.
clf = LogisticRegression()
clf.fit(x_train, y_train)
model = clf

In [None]:
y_pred=model.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

# Accuracy plus support-weighted precision, recall and F1 of the logistic-regression
# predictions on the test split.
# The results are stored under their own names (`recall`, `f1`): assigning them to
# `recall_score` / `f1_score` would replace the imported functions with floats and
# break any later call that does not re-import them first.
print("Accuracy :",accuracy_score(y_test,y_pred))
precision=precision_score(y_test,y_pred,average='weighted')
print("precision :",precision)
recall=recall_score(y_test,y_pred,average='weighted')
print("recall_score :",recall)
f1=f1_score(y_test,y_pred,average='weighted')
print("f1_score :",f1)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
print(confusion_matrix(y_test,y_pred))

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Test accuracy of k-nearest-neighbours (Minkowski distance with p=2, i.e. Euclidean)
# for k = 3 .. 14, collected in `metric_k` in the same order as `neighbors`.
# The loop uses its own name (`knn`) so that the logistic-regression `model` and
# `y_pred` created in the earlier cells are not overwritten.
# NOTE(review): the candidate k values are compared on the test split, so the best
# score is likely optimistic; consider cross-validation on the training data instead.
metric_k=[]
neighbors=np.arange(3,15)
for k in neighbors:
    knn=KNeighborsClassifier(n_neighbors=k,metric="minkowski",p=2)
    knn.fit(x_train,y_train)
    metric_k.append(accuracy_score(y_test,knn.predict(x_test)))

In [None]:
print( metric_k)

In [None]:
# Accuracy as a function of the number of neighbours
fig, ax = plt.subplots()
ax.plot(neighbors, metric_k, "o-")
ax.set_xlabel('k value')
ax.set_ylabel('accuracy')
ax.grid()

In [None]:
# Final KNN model with k=10 (Minkowski distance, p=2). fit() returns the estimator
# itself, so `model1` and `classifier` are the same fitted object.
classifier = KNeighborsClassifier(n_neighbors=10, metric="minkowski", p=2)
classifier.fit(x_train, y_train)
model1 = classifier
y_pred1 = classifier.predict(x_test)
acc = accuracy_score(y_test, y_pred1)

In [None]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

# Accuracy first, then the three support-weighted scores of the KNN predictions
print("Accuracy :", accuracy_score(y_test, y_pred1))
for label, scorer in (
    ("precision :", precision_score),
    ("recall_score :", recall_score),
    ("f1_score :", f1_score),
):
    print(label, scorer(y_test, y_pred1, average="weighted"))

# DECISION TREE

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a seeded decision tree on the training split and predict the held-out rows
dt_clf = DecisionTreeClassifier(random_state=42).fit(x_train, y_train)
y_pred_dt = dt_clf.predict(x_test)

In [None]:
# Test-split accuracy and confusion matrix of the decision tree.
# In scikit-learn's confusion matrix, rows are the true classes and columns the predicted ones.
from sklearn.metrics import accuracy_score,confusion_matrix
print("accuracy :",accuracy_score(y_test,y_pred_dt))
print("confusion matrix:\n",confusion_matrix(y_test,y_pred_dt))

# random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a seeded random forest on the training split and predict the held-out rows
rf_clf = RandomForestClassifier(random_state=42).fit(x_train, y_train)
y_pred_rf = rf_clf.predict(x_test)

In [None]:
# Test-split accuracy and confusion matrix of the random forest.
# In scikit-learn's confusion matrix, rows are the true classes and columns the predicted ones.
from sklearn.metrics import accuracy_score,confusion_matrix
print("accuracy :",accuracy_score(y_test,y_pred_rf))
print("confusion matrix:\n",confusion_matrix(y_test,y_pred_rf))

# MODEL COMPARISON AND CONCLUSION

# MODEL ACCURACY:      
LOGISTICREGRESSION :0.966
KNN                :1
DECISIONTREE       :0.966
RANDOMFOREST       :0.933

CONCLUSION:
Compared with the other classification models, KNN is the most accurate model for this scenario.