Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Data Collection and Data Processing

In [11]:
#loading the dataset to a pandas Dataframe
iris_data = pd.read_csv('iris.csv')
iris_data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [12]:
# number of rows and columns
iris_data.shape

(150, 6)

In [13]:
#describe --> statistical measures of the data
iris_data.describe()  

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0,150.0
mean,75.5,5.843333,3.054,3.758667,1.198667
std,43.445368,0.828066,0.433594,1.76442,0.763161
min,1.0,4.3,2.0,1.0,0.1
25%,38.25,5.1,2.8,1.6,0.3
50%,75.5,5.8,3.0,4.35,1.3
75%,112.75,6.4,3.3,5.1,1.8
max,150.0,7.9,4.4,6.9,2.5


In [20]:
iris_data.Species.value_counts()

Iris-virginica     50
Iris-setosa        50
Iris-versicolor    50
Name: Species, dtype: int64

In [21]:
iris_data.isnull().sum()

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [22]:
iris_data.replace({'Species':{'Iris-virginica':0,'Iris-setosa':1,'Iris-versicolor':2}},inplace=True)

In [34]:
# separating data and Labels
X = iris_data.drop(['Id','Species'], axis=1)
Y = iris_data['Species']
print(X)
print(Y)

     SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
0              5.1           3.5            1.4           0.2
1              4.9           3.0            1.4           0.2
2              4.7           3.2            1.3           0.2
3              4.6           3.1            1.5           0.2
4              5.0           3.6            1.4           0.2
..             ...           ...            ...           ...
145            6.7           3.0            5.2           2.3
146            6.3           2.5            5.0           1.9
147            6.5           3.0            5.2           2.0
148            6.2           3.4            5.4           2.3
149            5.9           3.0            5.1           1.8

[150 rows x 4 columns]
0      1
1      1
2      1
3      1
4      1
      ..
145    0
146    0
147    0
148    0
149    0
Name: Species, Length: 150, dtype: int64


Training and Test data

In [35]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.1, stratify=Y, random_state=1)
print(X.shape, X_train.shape, X_test.shape)

(150, 4) (135, 4) (15, 4)


In [26]:
print(X_train)
print(Y_train)

      Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
27    28            5.2           3.5            1.5           0.2
64    65            5.6           2.9            3.6           1.3
60    61            5.0           2.0            3.5           1.0
89    90            5.5           2.5            4.0           1.3
139  140            6.9           3.1            5.4           2.1
..   ...            ...           ...            ...           ...
145  146            6.7           3.0            5.2           2.3
68    69            6.2           2.2            4.5           1.5
141  142            6.9           3.1            5.1           2.3
77    78            6.7           3.0            5.0           1.7
142  143            5.8           2.7            5.1           1.9

[135 rows x 5 columns]
27     1
64     2
60     2
89     2
139    0
      ..
145    0
68     2
141    0
77     2
142    0
Name: Species, Length: 135, dtype: int64


Model Training --> Logistic Regression 

In [36]:
#training the Logistic Regression model with training data
model = LogisticRegression()
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Evaluation

In [37]:
#accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print('Accuracy on training data : ', training_data_accuracy)

Accuracy on training data :  0.9703703703703703


In [38]:
#accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print('Accuracy on test data : ', test_data_accuracy)

Accuracy on test data :  1.0


Making a Predictive System

In [39]:
input_data = (5.6,2.5,3.9,1.12)

# changing the input_data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the np array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = model.predict(input_data_reshaped)
print(prediction)

if (prediction[0]==0):
  print('Iris-virginica')
elif (prediction[0]==1):
  print('Iris-setosa')
else :
  print('Iris-versicolor')


[2]
Iris-versicolor


