In [1]:
import numpy as np
import pandas as pd

# Loading the dataset

In [2]:
# Read the Iris measurements from a CSV file; the path is relative to the
# current working directory.
csv_path = 'iris.csv'
df = pd.read_csv(csv_path)

In [3]:
# Preview the first rows to confirm the columns and values loaded as expected.
df.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


# EDA and Missing values

In [4]:
# List the column labels: four measurement columns plus the 'variety' target.
df.columns

Index(['sepal.length', 'sepal.width', 'petal.length', 'petal.width',
       'variety'],
      dtype='object')

In [5]:
# Show the distinct class labels stored in the target column.
df['variety'].unique()

array(['Setosa', 'Versicolor', 'Virginica'], dtype=object)

In [6]:
# Count missing values per column.
df.isna().sum()

sepal.length    0
sepal.width     0
petal.length    0
petal.width     0
variety         0
dtype: int64

The dataset has no missing values, so no imputation or row removal is needed before feeding it into the ML model.
Before training, however, let's first encode the target classes in `variety` as integer labels.

# Label Encoding

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
# Replace each class name in 'variety' with an integer code. The fitted encoder
# is kept so codes can be translated back with encoder.inverse_transform.
# NOTE: this overwrites the text labels in df, so re-running only this cell
# would refit the encoder on the integer codes; re-run from the load cell instead.
encoder = LabelEncoder()
df['variety'] = encoder.fit_transform(df['variety'])

In [9]:
# Confirm the target column now holds integer codes instead of class names.
df['variety'].unique()

array([0, 1, 2])

# Train-Test Split

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
train, test = train_test_split(
    df,
    test_size=0.3,
    random_state=0,
)

In [12]:
# Size of the training partition as (rows, columns).
train.shape

(105, 5)

In [13]:
# Size of the held-out test partition as (rows, columns).
test.shape

(45, 5)

In [14]:
# Training features: every column except the encoded target.
trainX = train.drop(columns='variety')

In [15]:
# Test features: every column except the encoded target.
testX = test.drop(columns='variety')

In [16]:
# Training target: the encoded class column, selected explicitly by label.
trainY = train.loc[:, 'variety']

In [17]:
# Test target: the encoded class column, selected explicitly by label.
testY = test.loc[:, 'variety']

# Building the model

### Logistic Regression

In [18]:
from sklearn.linear_model import LogisticRegression

In [19]:
# Logistic-regression classifier left at scikit-learn's default hyperparameters.
log_reg_model = LogisticRegression()

In [20]:
# Fit on the training features and encoded labels; the cell displays the estimator.
log_reg_model.fit(trainX, trainY)

LogisticRegression()

In [24]:
# Predict encoded class labels for the held-out rows and display them.
prediction = log_reg_model.predict(testX)
prediction

array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1,
       0, 0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 0, 2, 1, 1, 2, 0, 2, 0,
       0])

### Decision Tree

In [25]:
from sklearn.tree import DecisionTreeClassifier

In [26]:
# Decision tree using Gini impurity as the split criterion.
# random_state is fixed (same seed as the train/test split) because
# scikit-learn randomly permutes the candidate features at each split, so an
# unseeded tree can differ between runs of this notebook.
dec_tree = DecisionTreeClassifier(criterion='gini', random_state=0)

In [27]:
# Fit the tree on the training features and encoded labels; the cell displays the estimator.
dec_tree.fit(trainX, trainY)

DecisionTreeClassifier()

In [28]:
# Predict encoded class labels for the held-out rows with the tree and display them.
tree_prediction = dec_tree.predict(testX)
tree_prediction

array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1,
       0, 0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 0, 2, 1, 1, 2, 0, 2, 0,
       0])

# Model Evaluation: Confusion Matrix and Accuracy Score

In [22]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

#### Logistic Regression Model

In [29]:
# Confusion matrix for logistic regression: rows are the true classes,
# columns are the predicted classes.
confusion_matrix(y_true=testY, y_pred=prediction)

array([[16,  0,  0],
       [ 0, 17,  1],
       [ 0,  0, 11]], dtype=int64)

In [32]:
# Fraction of held-out rows whose predicted class matches the true class.
log_reg_accuracy = accuracy_score(y_true=testY, y_pred=prediction)
log_reg_accuracy

0.9777777777777777

#### Decision Tree Model

In [31]:
# Confusion matrix for the decision tree: rows are the true classes,
# columns are the predicted classes.
confusion_matrix(y_true=testY, y_pred=tree_prediction)

array([[16,  0,  0],
       [ 0, 17,  1],
       [ 0,  0, 11]], dtype=int64)

In [33]:
# Fraction of held-out rows the decision tree classified correctly.
dec_tree_accuracy = accuracy_score(y_true=testY, y_pred=tree_prediction)
dec_tree_accuracy

0.9777777777777777