In [1]:
import numpy as np 
import pandas as pd 
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/iris-dataset/Iris.csv


# Load the dataset

In [2]:
iris_data = pd.read_csv('/kaggle/input/iris-dataset/Iris.csv')
print(iris_data.shape)
print(iris_data.head())

(150, 6)
   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa


We remove the Id column as it has no significance on determining the class labels

In [4]:
iris_data.drop('Id',inplace=True,axis=1)
print(iris_data.head())

   SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0            5.1           3.5            1.4           0.2  Iris-setosa
1            4.9           3.0            1.4           0.2  Iris-setosa
2            4.7           3.2            1.3           0.2  Iris-setosa
3            4.6           3.1            1.5           0.2  Iris-setosa
4            5.0           3.6            1.4           0.2  Iris-setosa


**Let us see the class distribution in the dataset. We can use the value_counts function to get the count of unique values for a given column**

In [5]:
print(iris_data['Species'].value_counts())

Iris-setosa        50
Iris-virginica     50
Iris-versicolor    50
Name: Species, dtype: int64


In [6]:
iris_data.isnull().sum()

SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

That's great! There are no missing values in the dataset

# Partition the dataset into features and labels

**We need to shuffle our dataset as the examples are ordered by classes. Training without shuffling the dataset will lead to poor generalization**

In [7]:
iris_data_shuffled = iris_data.sample(frac=1).reset_index(drop=True)
print(iris_data_shuffled)

     SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm          Species
0              7.7           3.8            6.7           2.2   Iris-virginica
1              5.7           2.8            4.1           1.3  Iris-versicolor
2              5.4           3.7            1.5           0.2      Iris-setosa
3              4.8           3.4            1.9           0.2      Iris-setosa
4              4.8           3.0            1.4           0.1      Iris-setosa
..             ...           ...            ...           ...              ...
145            6.5           2.8            4.6           1.5  Iris-versicolor
146            7.0           3.2            4.7           1.4  Iris-versicolor
147            7.7           2.8            6.7           2.0   Iris-virginica
148            5.6           2.9            3.6           1.3  Iris-versicolor
149            6.7           3.0            5.0           1.7  Iris-versicolor

[150 rows x 5 columns]


**we will create a new dataframe which consists of the 4 features and another dataframe which consists of the class labels**

In [10]:
#drop the Species column. Note that we are not doing it inplace so the original dataframe will not be modified
train_x = iris_data_shuffled.drop('Species',axis=1)
print(train_x.head())

   SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
0            7.7           3.8            6.7           2.2
1            5.7           2.8            4.1           1.3
2            5.4           3.7            1.5           0.2
3            4.8           3.4            1.9           0.2
4            4.8           3.0            1.4           0.1


In [17]:
train_y = iris_data_shuffled['Species']
print(train_y.head())

0     Iris-virginica
1    Iris-versicolor
2        Iris-setosa
3        Iris-setosa
4        Iris-setosa
Name: Species, dtype: object


 **Let us convert the labels into numeric values. We use LabelEncoder for the same**

In [22]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
labels = le.fit_transform(train_y)
print(labels)

[2 1 0 0 0 2 2 0 2 0 1 1 2 2 0 2 2 2 0 1 2 2 1 2 2 2 0 1 0 2 1 1 1 0 0 1 2
 0 1 1 2 2 2 1 1 0 2 0 0 1 1 0 0 1 0 2 0 0 0 0 0 1 2 0 0 1 2 2 1 1 2 1 0 0
 0 0 1 2 1 2 1 1 0 1 1 1 1 0 2 1 0 2 1 0 1 1 1 0 2 2 1 1 2 0 1 2 2 2 2 2 2
 2 1 2 0 0 2 0 2 0 2 2 0 0 2 1 2 0 0 2 2 1 0 1 2 0 1 0 0 0 1 0 1 0 1 1 1 2
 1 1]


convert train_x to numpy array

In [23]:
X = train_x.values
y = labels
print(X.shape)
print(y.shape)

(150, 4)
(150,)


**Let us perform logistic regression. First we split the data into training and testing sets**

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33)
print('no. of training samples:',len(X_train))
print('no. of testing samples:',len(X_test))

no. of training samples: 100
no. of testing samples: 50


**Let us train the model using logistic regression**

In [26]:
lr_model = LogisticRegression(multi_class='multinomial')
lr_model.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [27]:
y_pred = lr_model.predict(X_test)
print(y_pred)

[0 1 2 2 1 2 2 2 0 1 0 1 0 1 1 0 2 2 1 2 0 2 0 2 2 1 0 2 1 0 2 0 0 0 0 1 1
 1 0 1 2 1 0 2 0 1 1 1 2 2]


In [32]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
print('test accuracy:',accuracy_score(y_test,y_pred))

test accuracy: 0.98


In [33]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       0.94      1.00      0.97        16
           2       1.00      0.94      0.97        18

    accuracy                           0.98        50
   macro avg       0.98      0.98      0.98        50
weighted avg       0.98      0.98      0.98        50



In [34]:
print(confusion_matrix(y_test,y_pred))

[[16  0  0]
 [ 0 16  0]
 [ 0  1 17]]
