In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier

from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix, accuracy_score
from sklearn.metrics import roc_auc_score,roc_curve,classification_report

In [2]:
# Read the Iris measurements from the CSV in the working directory and
# show the resulting frame as the cell output.
csv_path = 'Iris.csv'
df = pd.read_csv(filepath_or_buffer=csv_path)
df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [3]:
# Summarise column dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [4]:
# List the distinct labels present in the target column.
df['Species'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [5]:
# Encode the species names as integer class ids for scikit-learn.
#
# Series.map + an explicit astype is used instead of Series.replace: replace
# only produced an integer column through implicit object->int downcasting,
# which pandas 2.2 deprecates (FutureWarning) and later versions drop, leaving
# a non-integer target column.
species_codes = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}

# Skip when the column is already encoded so that re-running this cell is
# harmless (mapping the integer codes again would turn them all into NaN).
if not pd.api.types.is_integer_dtype(df['Species']):
    species_encoded = df['Species'].map(species_codes)

    # Any label missing from species_codes maps to NaN; fail loudly rather
    # than passing a partially encoded target on to the model.
    unmapped = df.loc[species_encoded.isna(), 'Species'].unique()
    if len(unmapped) > 0:
        raise ValueError(f'Unexpected Species values: {list(unmapped)}')

    df['Species'] = species_encoded.astype('int64')

In [6]:
# Re-check dtypes after encoding the labels: Species should now be an integer column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    int64  
dtypes: float64(4), int64(2)
memory usage: 7.2 KB


In [7]:
# Target vector: the encoded species label.
y = df['Species']

# Feature matrix: every column except the row identifier and the target itself.
x = df.drop(columns=['Id', 'Species'])

In [9]:
# Display the target Series to eyeball the encoded labels.
y

0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: Species, Length: 150, dtype: int64

In [10]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

In [12]:
# Count test rows per class to confirm the stratified split is balanced.
y_test.value_counts()

0    10
1    10
2    10
Name: Species, dtype: int64

In [13]:
# One-vs-Rest logistic regression: one binary classifier is fitted per class.
#
# LogisticRegression's `multi_class='ovr'` argument is deprecated since
# scikit-learn 1.5 and scheduled for removal; wrapping the estimator in
# OneVsRestClassifier is the replacement the scikit-learn docs recommend.
# NOTE: the per-class fitted models (and their coef_ / intercept_) now live in
# clf_model.estimators_ rather than directly on clf_model.
clf_model = OneVsRestClassifier(LogisticRegression())
clf_model.fit(x_train, y_train)

In [14]:
# Predict a hard class label for every held-out test row and display them.
y_pred = clf_model.predict(x_test)
y_pred

array([2, 0, 1, 0, 0, 0, 2, 2, 2, 1, 0, 1, 2, 1, 2, 0, 2, 1, 1, 2, 1, 1,
       0, 0, 2, 1, 0, 0, 2, 1], dtype=int64)

In [16]:
# y_test  # disabled: uncomment to display the true test labels next to y_pred

In [17]:
# Per-class probability estimates for each test row; columns follow the order
# of clf_model.classes_.
y_pred_prob = clf_model.predict_proba(x_test)
y_pred_prob

array([[3.61965026e-05, 3.41589378e-01, 6.58374426e-01],
       [7.87974999e-01, 2.12022695e-01, 2.30633454e-06],
       [3.26510816e-01, 6.71338722e-01, 2.15046192e-03],
       [8.50466721e-01, 1.49529750e-01, 3.52954463e-06],
       [9.13091357e-01, 8.69001472e-02, 8.49539420e-06],
       [8.97258003e-01, 1.02739864e-01, 2.13294200e-06],
       [5.94589898e-04, 1.56423895e-01, 8.42981515e-01],
       [4.64713312e-04, 3.74118772e-01, 6.25416515e-01],
       [2.03589538e-03, 4.10344609e-01, 5.87619496e-01],
       [3.41040014e-03, 5.84837290e-01, 4.11752310e-01],
       [9.63676918e-01, 3.63207683e-02, 2.31387402e-06],
       [2.81405526e-02, 9.17277183e-01, 5.45822644e-02],
       [2.76924380e-04, 2.46468518e-01, 7.53254558e-01],
       [2.53203623e-02, 9.42989792e-01, 3.16898459e-02],
       [1.02033811e-03, 4.49214344e-01, 5.49765318e-01],
       [9.84935673e-01, 1.50608361e-02, 3.49136696e-06],
       [9.16214104e-05, 2.56830786e-01, 7.43077593e-01],
       [5.00623019e-02, 8.69746

In [18]:
# Fraction of held-out rows whose predicted label matches the true label.
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy on testing dataset is :', test_accuracy)

Accuracy on testing dataset is : 0.9333333333333333


In [19]:
# Score the model on the rows it was fitted on, for comparison with the
# test accuracy.
y_pred_train = clf_model.predict(x_train)
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_pred_train)
print('Accuracy on training dataset is :', train_accuracy)

Accuracy on training dataset is : 0.9583333333333334


In [20]:
# One 2x2 one-vs-rest confusion matrix per class; per the scikit-learn docs
# each is laid out as [[TN, FP], [FN, TP]].
multilabel_confusion_matrix(y_test,y_pred)

array([[[20,  0],
        [ 0, 10]],

       [[19,  1],
        [ 1,  9]],

       [[19,  1],
        [ 1,  9]]], dtype=int64)

In [21]:
# Full multiclass confusion matrix: rows are true labels, columns are
# predicted labels (scikit-learn convention).
confusion_matrix(y_test,y_pred)

array([[10,  0,  0],
       [ 0,  9,  1],
       [ 0,  1,  9]], dtype=int64)

In [23]:
# Per-class precision / recall / F1 / support on the held-out test set.
clf_report = classification_report(y_test, y_pred)

# Print the heading on its own line. The previous literal
# 'Classification report is \n:' pushed the colon onto the report's header
# row, which shifted that row out of alignment with the columns below it.
print('Classification report is :')
print(clf_report)

Classification report is 
:               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       0.90      0.90      0.90        10
           2       0.90      0.90      0.90        10

    accuracy                           0.93        30
   macro avg       0.93      0.93      0.93        30
weighted avg       0.93      0.93      0.93        30

