# Iris Flower Species Dataset 

use the Iris Flower Species Dataset.

The Iris Flower Dataset involves predicting the flower species given measurements of iris flowers.
 
There are 150 observations with 4 input variables and 1 output variable. The variable names are as follows:
* Sepal length in cm.
* *Sepal width in cm.
* Petal length in cm.
* Petal width in cm.
* Class

In [1]:
from sklearn.datasets import load_iris 

In [2]:
import pandas as pd
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['species'] = iris.target
df['species_name'] = df['species'].map(dict(enumerate(iris.target_names)))

df.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,species_name
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa


# k- Nearest Neighbors  

Step 1: Calculate Euclidean Distance.

Step 2: Get Nearest Neighbors.

Step 3: Make Predictions.

In [3]:
# calculate the Euclidean distance between two vectors 

from math import sqrt
def euclidean_distance(vector1, vector2):
    distance = 0.0 
    for i in range(len(vector1) - 1):  # exclude the last element (label)
        distance += (vector1[i] - vector2[i]) ** 2
    return sqrt(distance) 


In [4]:
# get the k nearest neighbors 

def get_neighbors(train, test_row, num_neighbors):
    distance = [] 
    for train_row in train: 
        dist = euclidean_distance(test_row, train_row)
        distance.append((train_row, dist))  
    distance.sort(key=lambda x: x[1])  # sort by distance
    neighbors = []
    for i in range(num_neighbors):
        neighbors.append(distance[i][0])  # get the k nearest neighbors
    return neighbors

In [5]:
# make a classification prediction 

def predict_classification(train, test_row, num_neighbors):
    neighbors = get_neighbors(train, test_row, num_neighbors) 
    output_values = [row[-1] for row in neighbors]  # get the labels of the neighbors
    prediction = max(set(output_values), key=output_values.count)  # get the most common label
    return prediction

# Pre-processing 

In [6]:
# Find the min and max values for each column
def dataset_minmax(dataset):
	minmax = list()
	for i in range(len(dataset[0])):
		col_values = [row[i] for row in dataset]
		value_min = min(col_values)
		value_max = max(col_values)
		minmax.append([value_min, value_max])
	return minmax

In [7]:
# Rescale dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
	for row in dataset:
		for i in range(len(row)):
			row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])

# Prediction

[Iris-virginica] => 0 

[Iris-setosa] => 1 

[Iris-versicolor] => 2

In [8]:
iris = load_iris()
dataset = [list(iris.data[i]) + [iris.target[i]] for i in range(len(iris.data))] 
minmax = dataset_minmax(dataset)
normalize_dataset(dataset, minmax)


In [9]:
# define model parameter
num_neighbors = 5

In [10]:
# define a new record
row = [5.7,2.9,4.2,1.3]
# predict the label
label = predict_classification(dataset, row, num_neighbors)
print('Data=%s, Predicted: %s' % (row, label))

Data=[5.7, 2.9, 4.2, 1.3], Predicted: 1.0
