## Import Libraries

In [138]:
import numpy as np                 # for linear algebra operations
import pandas as pd                # for data manipulation and analysis
import os                          # for interacting with the operating system
import matplotlib.pyplot as plt    # for creating visualizations and enable inline plotting in Jupyter notebooks for matplotlib
%matplotlib inline                 
import seaborn as sns              # for creating statistical graphics

import warnings                    # for handling warning messages
warnings.filterwarnings('ignore')  # ignore warnings during code execution

## Loading the dataset

In [140]:
df = pd.read_csv('Iris.csv')
df.head(15)     # for displaying first 15 rows of data from the table

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
5,6,5.4,3.9,1.7,0.4,Iris-setosa
6,7,4.6,3.4,1.4,0.3,Iris-setosa
7,8,5.0,3.4,1.5,0.2,Iris-setosa
8,9,4.4,2.9,1.4,0.2,Iris-setosa
9,10,4.9,3.1,1.5,0.1,Iris-setosa


In [133]:
df.shape    # 150 rows and 6 columns

(150, 6)

In [141]:
df = df.drop(columns=['Id'])  # deleting a column which is of no use
df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [5]:
df.info()  # Basic information about the data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   SepalLengthCm  150 non-null    float64
 1   SepalWidthCm   150 non-null    float64
 2   PetalLengthCm  150 non-null    float64
 3   PetalWidthCm   150 non-null    float64
 4   Species        150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [7]:
# Number of sample in each categories
df['Species'].value_counts()

Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: Species, dtype: int64

## Preprocessing the dataset

In [9]:
# check for null values
df.isnull().sum()

SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

## Transforming Spacies Column into Numerical Values

In [143]:
df["Species"].replace({"Iris-setosa": 1, "Iris-versicolor": 2, "Iris-virginica": 3}, inplace = True)

df.head(3)   # displaying first 3 rows from the dataset

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,1
1,4.9,3.0,1.4,0.2,1
2,4.7,3.2,1.3,0.2,1


In [144]:
df.tail(3)   # displaying last 3 rows from the dataset

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
147,6.5,3.0,5.2,2.0,3
148,6.2,3.4,5.4,2.3,3
149,5.9,3.0,5.1,1.8,3


## Array Creation

In [151]:
X_data = pd.DataFrame(df,columns=["SepalLengthCm","SepalWidthCm","PetalLengthCm","PetalWidthCm"]).values
Y_data = df.Species.values.reshape(-1,1)

In [152]:
X_data[:10]  # displaying first 10

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1]])

In [153]:
Y_data[:10]  # displaying first 10

array([[1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1]], dtype=int64)

## Model Training

In [154]:
from sklearn.model_selection import train_test_split

x_data_train, x_data_test, y_data_train, y_data_test = train_test_split(X_data, Y_data, test_size=0.20, random_state=42)  
# test_size=20%
# With random_state=42 , we get the same train and test sets across different executions

In [155]:
# Linear Regression 
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [156]:
# model training

model.fit(x_data_train, y_data_train)

In [163]:
print("Accuracy in Linear Regression Model: ", round((model.score(x_data_test, y_data_test) * 100),2),"%")

Accuracy in Linear Regression Model:  94.67 %


In [164]:
# Logistic regression
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [165]:
# model training 

model.fit(x_data_train, y_data_train)

In [166]:
print("Accuracy in Logistic Regression Model: ", round((model.score(x_data_test, y_data_test) * 100),2),"%")

Accuracy in Logistic Regression Model:  100.0 %


### 100% Accuracy achieved but trying out with other models

In [167]:
# KNN - k-nearest neighbours
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()

In [168]:
# model training 

model.fit(x_data_train, y_data_train)

In [169]:
print("Accuracy in KNN Model: ", round((model.score(x_data_test, y_data_test) * 100),2),"%")

Accuracy in KNN Model:  100.0 %


In [171]:
# Decision tree
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()

In [172]:
# model training 

model.fit(x_data_train, y_data_train)

In [173]:
print("Accuracy in Decision Tree Model: ", round((model.score(x_data_test, y_data_test) * 100),2),"%")

Accuracy in Decision Tree Model:  100.0 %
