In [1]:
# Data science methodology in a notebook: begin by loading the raw dataset.
# The pandas package supplies the DataFrame used to load and describe the data.

import pandas

IRIS_CSV_URL = "http://modcom.co.ke/bigdatasept/datasets/iris.csv"
data = pandas.read_csv(IRIS_CSV_URL)
print(data)

     sepallength sepalwidth  petallength  petalwidth           class
0            5.1        3.5          1.4         0.2     Iris-setosa
1            4.9          3          1.4         0.2     Iris-setosa
2            4.7        3.2          1.3         0.2     Iris-setosa
3            4.6        3.1          1.5         0.2     Iris-setosa
4            5.0        3.6          1.4         0.2     Iris-setosa
5            5.4        3.9          1.7         0.4     Iris-setosa
6            4.6        3.4          1.4         0.3     Iris-setosa
7            5.0        3.4          1.5         0.2     Iris-setosa
8            4.4        2.9          1.4         0.2     Iris-setosa
9            4.9        3.1          1.5         0.1     Iris-setosa
10           5.4        3.7          1.5         0.2     Iris-setosa
11           4.8        3.4          1.6         0.2     Iris-setosa
12           4.8          3          1.4         0.1     Iris-setosa
13           4.3          3       

In [2]:
# Dataset dimensions as a (rows, columns) tuple; each row is one record.

dimensions = data.shape
print(dimensions)

(150, 5)


In [3]:
 # Summary statistics (count, mean, std, min, quartiles, max) via describe().

summary_statistics = data.describe()
print(summary_statistics)

       sepallength  petallength  petalwidth
count   150.000000   148.000000  150.000000
mean      5.843333     3.791216    1.198667
std       0.828066     1.753711    0.763161
min       4.300000     1.000000    0.100000
25%       5.100000     1.600000    0.300000
50%       5.800000     4.400000    1.300000
75%       6.400000     5.100000    1.800000
max       7.900000     6.900000    2.500000


In [4]:
# Show the pairwise relationships between the measurements with corr().
# Only the numeric columns are passed in: at this point the frame still holds
# text columns (the 'class' labels, and sepalwidth while it contains '?'),
# and pandas 2.0+ raises an error on them instead of silently skipping them.

numeric_columns = data.select_dtypes(include='number')
print(numeric_columns.corr())

             sepallength  petallength  petalwidth
sepallength     1.000000     0.870964    0.817954
petallength     0.870964     1.000000    0.962038
petalwidth      0.817954     0.962038    1.000000


In [5]:
# Summarise each flower class with groupby(): number of records,
# mean sepallength and median petallength.

by_class = data.groupby('class')
class_sizes = by_class.size()
sepallength_means = by_class['sepallength'].mean()
petallength_medians = by_class['petallength'].median()

print(
    'Grouping each class by size\n', class_sizes, '\n\n',
    'Grouping each class sepallength by mean\n', sepallength_means,
    '\n\nGrouping each class petallength by median\n', petallength_medians,
)

Grouping each class by size
 class
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
dtype: int64 

 Grouping each class sepallength by mean
 class
Iris-setosa        5.006
Iris-versicolor    5.936
Iris-virginica     6.588
Name: sepallength, dtype: float64 

Grouping each class petallength by median
 class
Iris-setosa        1.50
Iris-versicolor    4.35
Iris-virginica     5.55
Name: petallength, dtype: float64


In [6]:
# Count the null entries in every column: isna() flags each missing cell
# and sum() totals the flags per column.

null_counts = data.isna().sum()
print(null_counts)

sepallength    0
sepalwidth     0
petallength    2
petalwidth     0
class          0
dtype: int64


In [7]:
# Preparing to fix the null inputs: the median of the petallength column
# is the value that will replace them.
petallength_column = data['petallength']
petallenth_median = petallength_column.median()
print('Median of petallength is ', petallenth_median)

Median of petallength is  4.4


In [8]:
# Fill the null slots in petallength with the column median using fillna().
# The median is a common replacement value because, unlike the mean, it is not
# pulled towards extreme values.
# The filled column is assigned back to the frame on purpose: calling
# fillna(..., inplace=True) on data['petallength'] is chained assignment, which
# newer pandas versions warn about and which no longer updates `data` once
# Copy-on-Write is enabled.

data['petallength'] = data['petallength'].fillna(petallenth_median)

# check if all null values have been replaced in the data
print(data.isnull().sum())

sepallength    0
sepalwidth     0
petallength    0
petalwidth     0
class          0
dtype: int64


In [9]:
# How to solve the "?" placeholders in sepalwidth.
# The '?' entries make pandas load the column as text, so the clean-up is:
#   1. replace '?' with NaN,
#   2. convert the column to float,
#   3. fill the NaN slots with the column median.
# numpy provides the NaN value.

import numpy

# numpy.nan is the supported spelling; the numpy.NaN alias was removed in NumPy 2.0
data['sepalwidth'] = data['sepalwidth'].replace('?', numpy.nan)

# make sepalwidth a float type *before* computing the median, so the statistic
# is calculated on numbers rather than on a column of strings
data['sepalwidth'] = data['sepalwidth'].astype(float)

# get median and replace null inputs
sepalwidth_median = data['sepalwidth'].median()
print('\nSepalwidth median is,',sepalwidth_median)

# assign the filled column back instead of using fillna(inplace=True) on a
# column selection, which newer pandas versions warn about and which does not
# update `data` under Copy-on-Write
data['sepalwidth'] = data['sepalwidth'].fillna(sepalwidth_median)

# confirm sepalwidth now appears among the numeric columns that describe() summarises
print('\n\n', data.describe())


Sepalwidth median is, 3.0


        sepallength  sepalwidth  petallength  petalwidth
count   150.000000  150.000000   150.000000  150.000000
mean      5.843333    3.052667     3.799333    1.198667
std       0.828066    0.433450     1.743309    0.763161
min       4.300000    2.000000     1.000000    0.100000
25%       5.100000    2.800000     1.600000    0.300000
50%       5.800000    3.000000     4.400000    1.300000
75%       6.400000    3.300000     5.100000    1.800000
max       7.900000    4.400000     6.900000    2.500000


In [10]:
# How to train the model to predict a flower class.
# STEP 1: convert the DataFrame into an array so it can be sliced by column position.

data_array = data.values
print('Data in form of an array:\n\n',data_array)

# Separate the array into model inputs and the expected output:
#   x -> the decimal measurements in columns 0 to 3
#   y -> the class label in column 4
# The leading ':' keeps every row. The column slice stops at 4 rather than 3
# because the end index of a slice is exclusive.
# Both parts are printed to confirm the split worked.

x, y = data_array[:, :4], data_array[:, 4]
print('\nInput x:\n\n', x)
print('\nOutput y:\n\n', y)

Data in form of an array:

 [[5.1 3.5 1.4 0.2 'Iris-setosa']
 [4.9 3.0 1.4 0.2 'Iris-setosa']
 [4.7 3.2 1.3 0.2 'Iris-setosa']
 [4.6 3.1 1.5 0.2 'Iris-setosa']
 [5.0 3.6 1.4 0.2 'Iris-setosa']
 [5.4 3.9 1.7 0.4 'Iris-setosa']
 [4.6 3.4 1.4 0.3 'Iris-setosa']
 [5.0 3.4 1.5 0.2 'Iris-setosa']
 [4.4 2.9 1.4 0.2 'Iris-setosa']
 [4.9 3.1 1.5 0.1 'Iris-setosa']
 [5.4 3.7 1.5 0.2 'Iris-setosa']
 [4.8 3.4 1.6 0.2 'Iris-setosa']
 [4.8 3.0 1.4 0.1 'Iris-setosa']
 [4.3 3.0 1.1 0.1 'Iris-setosa']
 [5.8 4.0 1.2 0.2 'Iris-setosa']
 [5.7 4.4 1.5 0.4 'Iris-setosa']
 [5.4 3.9 1.3 0.4 'Iris-setosa']
 [5.1 3.5 1.4 0.3 'Iris-setosa']
 [5.7 3.8 1.7 0.3 'Iris-setosa']
 [5.1 3.8 1.5 0.3 'Iris-setosa']
 [5.4 3.4 1.7 0.2 'Iris-setosa']
 [5.1 3.7 1.5 0.4 'Iris-setosa']
 [4.6 3.6 1.0 0.2 'Iris-setosa']
 [5.1 3.3 1.7 0.5 'Iris-setosa']
 [4.8 3.4 1.9 0.2 'Iris-setosa']
 [5.0 3.0 1.6 0.2 'Iris-setosa']
 [5.0 3.4 1.6 0.4 'Iris-setosa']
 [5.2 3.5 1.5 0.2 'Iris-setosa']
 [5.2 3.4 4.4 0.2 'Iris-setosa']
 [4.7 3.2 1.6 0

In [None]:
# STEP 2: split the array into a training part and a testing part, so the model
# can later be scored on rows it was not trained on.
# Typical choices are 70% training / 30% testing or 60% training / 40% testing;
# this notebook uses 67% training and 33% testing.
# sklearn is the machine learning library for Python that provides the splitter.
# random_state fixes the seed used for the split so it is the same on every run
# (42 is simply a popular seed value).

import sklearn
from sklearn import model_selection

x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)
print(
    'Data array shape:', data_array.shape,
    '\nx_train shape:', x_train.shape,
    '\nx_test shape:', x_test.shape,
    '\ny_train shape:', y_train.shape,
    '\ny_test shape:', y_test.shape,
)

# STEP 3: TRAIN with logistic regression. It is a classification algorithm and
# is useful for understanding how several independent variables influence a
# single outcome variable.
# FutureWarning messages (such as the solver warning) are silenced first, using
# the warnings library.

import warnings
warnings.simplefilter('ignore', FutureWarning)


from sklearn.linear_model import LogisticRegression

# fit() returns the fitted estimator, so creation and training fit on one line
lr = LogisticRegression().fit(x_train, y_train)
# note: this message is printed once fit() has already returned
print('\n\nUsing Logistic Regression pipeline, model is training...')

# STEP 5: TEST the model
# The goal is to predict the class of a flower, so the model is given the
# held-back inputs (x_test); y_test is kept aside as the answer key.
# STEP 6: ACCURACY of the model
# accuracy_score compares the known classes with the predicted ones.

from sklearn.metrics import accuracy_score

y_prediction = lr.predict(x_test)
model_accuracy = accuracy_score(y_test, y_prediction)
print('\n\nAccuracy of logistic regression model:',model_accuracy)

# STEP 7 : Get input from user to predict class of flower
# Feature names in the column order the model was trained on. The prediction
# step feeds the dictionary's values to the model by position, so the
# dictionary is always built in this order, whatever order the user types in.
feature_names = ['sepallength', 'sepalwidth', 'petallength', 'petalwidth']

print('\nPlease use the following format for prompted input: e.g. \nsepallength: 2.3 \nsepalwidth: 1.0 \npetallength: 2.5 \npetalwidth: 2.9\n')

# Keep prompting until every feature has a valid value. Malformed lines are
# rejected with a message instead of crashing, and re-entering a feature
# overwrites its earlier value without leaving another feature missing.
entered = {}
while len(entered) < len(feature_names):
    # split the line into the variable name and its value
    element = input('Input variables of unknown flower: ').split()
    if len(element) != 2:
        print('Expected a name and a value separated by a space, e.g. sepallength: 2.3')
        continue

    # accept "sepallength: 2.3" as well as "sepallength 2.3", in any letter case
    name = element[0].rstrip(':').lower()
    if name not in feature_names:
        print('Unknown variable', repr(element[0]), '- expected one of:', ', '.join(feature_names))
        continue

    # make sure the value is numeric now, so the conversion below cannot fail
    try:
        float(element[1])
    except ValueError:
        print(repr(element[1]), 'is not a number, please try again')
        continue

    entered[name] = element[1]

# create the dictionary in training column order
x_new = {name: entered[name] for name in feature_names}
print('\n\nDictionary of unknown flower:\n', x_new)

# convert string type to float type for every value in the dictionary
x_new = {key: float(value) for key, value in x_new.items()}
print('\nValue of dictionary converted to float type:\n',x_new)

# STEP 8: PREDICT from user's input
# The dictionary values are pulled out into a list of measurements. That list
# is wrapped in an outer list so the model receives a single-row batch,
# in the same form as lr.predict([[1.5, 2.5, 3.6, 2.2]]).

x_newvariables = [*x_new.values()]
print('\nExtracted list of unknown flower variables: ',x_newvariables)
single_flower_batch = [x_newvariables]
newflower_prediction = lr.predict(single_flower_batch)
print('\n\nFlower prediction is: ', newflower_prediction)

Data array shape: (150, 5) 
x_train shape: (100, 4) 
x_test shape: (50, 4) 
y_train shape: (100,) 
y_test shape: (50,)


Using Logistic Regression pipeline, model is training...


Accuracy of logistic regression model: 1.0

Please use the following format for prompted input: e.g. 
sepallength: 2.3 
sepalwidth: 1.0 
petallength: 2.5 
petalwidth: 2.9

Input variables of unknown flower: sepallength: 6.2
Input variables of unknown flower: sepalwidth: 3.2
Input variables of unknown flower: petallength: 5.9
