### Import libraries

In [1]:
import numpy as np
import pandas as pd

#Visualization
import matplotlib.pyplot as plt
import seaborn as sns

#Train test splitting
from sklearn.model_selection import train_test_split

# Model
from sklearn.linear_model import LogisticRegression

#Evaluation
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [10]:
df = pd.read_csv('Iris.csv')
df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [12]:
df['Species'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [13]:
df['Species'] = df['Species'].replace({'Iris-setosa':0, 'Iris-versicolor':1,
                                      'Iris-virginica':2})

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    int64  
dtypes: float64(4), int64(2)
memory usage: 7.2 KB


In [15]:
df.isna().sum()

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [17]:
df.drop(['Id'], axis=1, inplace=True)

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   SepalLengthCm  150 non-null    float64
 1   SepalWidthCm   150 non-null    float64
 2   PetalLengthCm  150 non-null    float64
 3   PetalWidthCm   150 non-null    float64
 4   Species        150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB


### Split the data

In [19]:
X = df.drop(['Species'], axis=1)
y = df['Species']

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, 
                                                    random_state=1)

### Model Building

In [27]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
y_pred = log_reg.predict(X_test)
y_pred

array([0, 1, 1, 0, 2, 1, 2, 0, 0, 2, 1, 0, 2, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       2, 0, 2, 1, 0, 0, 1, 2, 1, 2, 1, 2, 2, 0, 1, 0, 1, 2, 2, 0, 2, 2,
       1], dtype=int64)

In [29]:
# Accuracy Score
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9777777777777777

In [30]:
#confusion matrix
confusion_matrix(y_test, y_pred)

array([[14,  0,  0],
       [ 0, 17,  1],
       [ 0,  0, 13]], dtype=int64)

In [31]:
from sklearn.metrics import multilabel_confusion_matrix

In [32]:
multilabel_confusion_matrix(y_test, y_pred)

array([[[31,  0],
        [ 0, 14]],

       [[27,  0],
        [ 1, 17]],

       [[31,  1],
        [ 0, 13]]], dtype=int64)

In [34]:
#classification
class_report = classification_report(y_test, y_pred)
print(class_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       1.00      0.94      0.97        18
           2       0.93      1.00      0.96        13

    accuracy                           0.98        45
   macro avg       0.98      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



In [35]:
logi_reg1 = LogisticRegression(multi_class='ovr')  #One vs Rest
logi_reg1.fit(X_train, y_train)

In [36]:
y_pred_ovr = logi_reg1.predict(X_test)
y_pred_ovr

array([0, 1, 1, 0, 2, 2, 2, 0, 0, 2, 1, 0, 2, 2, 1, 0, 1, 2, 0, 0, 1, 2,
       2, 0, 2, 1, 0, 0, 1, 2, 1, 2, 1, 2, 2, 0, 1, 0, 1, 2, 2, 0, 1, 2,
       1], dtype=int64)

In [37]:
# Accuracy Score
accuracy_ovr = accuracy_score(y_test, y_pred_ovr)
accuracy_ovr

0.8666666666666667

In [38]:
multilabel_confusion_matrix(y_test, y_pred_ovr)

array([[[31,  0],
        [ 0, 14]],

       [[26,  1],
        [ 5, 13]],

       [[27,  5],
        [ 1, 12]]], dtype=int64)

In [39]:
 #classification
class_report = classification_report(y_test, y_pred_ovr)
print(class_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.93      0.72      0.81        18
           2       0.71      0.92      0.80        13

    accuracy                           0.87        45
   macro avg       0.88      0.88      0.87        45
weighted avg       0.89      0.87      0.87        45

