This exercise is to cluster seeds according to a 1-NN algorithm, using holdout sets to create training and test data.

# Script

In [1]:
import os
import pandas as pd
import numpy as np
import math
import sys
import matplotlib.pyplot as plt
import requests as req
from io import StringIO
from scipy.spatial.distance import euclidean
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from matplotlib.colors import ListedColormap
% matplotlib inline

In [2]:
#set number of folds for cross validation
num_folds=5

#URL data source
url = 'https://raw.github.com/luispedro/BuildingMachineLearningSystemsWithPython/master/ch02/data/seeds.tsv'

In [None]:
#Store data in a StringIO object which mimics a file
response = req.get(url)
bytesstr = response.content
sio = StringIO()
sio.write(bytesstr.decode(response.encoding))
sio.seek(0)

In [3]:
#Temporary code to read from local file
data = []
target = []
with open('seeds.tsv','r') as sio:
    lines = [line.strip() for line in sio.readlines()]
for line in lines:
    tokens = line.split('\t')
    target.append(tokens[-1])
    data.append([float(t) for t in tokens[:-1]])
X = normalize(np.array(data)) #Store normalised features as X
Y = np.array(target) #Store labels/targets as Y

In [None]:
#Read the features and label into an array each
data = []
target = []
lines = [line.strip() for line in sio.readlines()]
for line in lines:
    tokens = line.split('\t')
    target.append(tokens[-1])
    data.append([float(t) for t in tokens[:-1]])
X = normalize(np.array(data)) #Store features as X
Y = np.array(target) #Store labels/targets as Y

In [4]:
#returns the label of the nearest neighbour
def get_nearest_neighbour(test_point,train_features,train_labels):
    distances = [euclidean(test_point,tf) for tf in train_features]
    closest = np.array(distances).argmin()
    return train_labels[closest]

In [28]:
#across each holdout set calculate model accuracy
import pdb
NNs = []
model_accuracies = []
for fold in range(num_folds):
    #pdb.set_trace()
    X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=1.0/num_folds)
    NNs = [get_nearest_neighbour(data_point,X_train,Y_train) for data_point in X_test]
    accuracy =[1 if nn==y_tst else 0 for nn, y_tst in zip(NNs,Y_test)]
    model_accuracies.append(np.mean(accuracy))
model_accuracies, 'Mean accuracy is %4.4f' % np.mean(model_accuracies)

([0.9285714285714286,
  0.9285714285714286,
  0.88095238095238093,
  0.8571428571428571,
  0.95238095238095233],
 'Mean accuracy is 0.9095')

Let's have a look at a plot of nearest-neighbour using two dimensions, say compactness and area, indexed by 0 and 2 respectively in the data 

In [29]:
#Create a dataframe to store the data for plotting and a meshgrid to plot NN algorithm
df = pd.DataFrame(data)
df = df[[0,2]].rename(columns={0:'area',2:'compactness'})
#Identify the minimum and maximum and use these plus a margin for the plot range
xmin, ymin = df.apply(min,axis=0)
xmax, ymax = df.apply(max,axis=0)
X = np.linspace(xmin*0.9,xmax*1.1,100)
Y = np.linspace(ymin*0.9,ymax*1.1,100)
X,Y = np.meshgrid(X,Y)

In [30]:
#create a set of coordinates
gridvals = np.vstack([X.ravel(),Y.ravel()]).T
#create a set of points to plot
plt_features = data[:,(0,2)]
#use every point in the seeds set as a training point
all_train = [True for dp in data]
predictions = [get_nearest_neighbour(dp, all_train,plt_features) for dp in gridvals]

#set colour maps for existing features and predicted labels  
feature_cmap = ListedColormap([(1., .0, .0), (.0, 1., .0), (.0, .0, 1.)])
prediction_cmap = ListedColormap([(1., .6, .6), (.6, 1., .6), (.6, .6, 1.)])

#map predictions to numeric labels for colormap
colmapper = {'Canadian':1,'Kama':2,'Rosa':3}
predicted_labels = list(map(colmapper.get,predictions))
target_labels = list(map(colmapper.get,target))

predicted_labels = np.array(predicted_labels).reshape(X.shape)

TypeError: list indices must be integers or slices, not tuple

In [32]:
data[0]

[15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]

In [None]:
#Set up NN plots
fig, ax = plt.subplots()
ax.set_xlim(X.min(),X.max())
ax.set_ylim(Y.min(),Y.max())
ax.set_xlabel('Area')
ax.set_ylabel('Compactness')
#plot mesh and scatter
ax.pcolormesh(X,Y,predicted_labels,cmap=prediction_cmap)
ax.scatter(data[:, 0], data[:, 2], c=target_labels, cmap=feature_cmap)
plt.show()

In [None]:
#Normalise values and plot again
norm_plt_features = (plt_features - plt_features.mean(axis=0))/plt_features.std(axis=0)

In [None]:
#Recreate the grid values normalised
df = pd.DataFrame(norm_plt_features)
xmin, ymin = df.apply(min,axis=0)
xmax, ymax = df.apply(max,axis=0)
X = np.linspace(xmin*0.9,xmax*1.1,100)
Y = np.linspace(ymin*0.9,ymax*1.1,100)
X,Y = np.meshgrid(X,Y)
gridvals = np.vstack([X.ravel(),Y.ravel()]).T

In [None]:
#rerun predictions
predictions = [get_nearest_neighbour(dp, all_train,norm_plt_features) for dp in gridvals]
predicted_labels = list(map(colmapper.get,predictions))

In [None]:
#Set up NN plots
fig, ax = plt.subplots()
ax.set_xlim(X.min(),X.max())
ax.set_ylim(Y.min(),Y.max())
ax.set_xlabel('Area')
ax.set_ylabel('Compactness')
#plot mesh and scatter
ax.pcolormesh(X,Y,predicted_labels,cmap=prediction_cmap)
ax.scatter(norm_plt_features[:, 0], norm_plt_features[:, 1], c=target_labels, cmap=feature_cmap)
plt.show()

# Visualisation
This section plots the 1-NN algorithm above

In [None]:
def plotNN(NN_function,Xfeature,Yfeature,Xname,Yname,target,predCMap,featCMap,colourmapper):
    
    #create grid values to plot
    data = np.array([Xfeature,Yfeature]).T
    df = pd.DataFrame(data)
    try:
        xmin, ymin = df.apply(min,axis=0)
        xmax, ymax = df.apply(max,axis=0)
        X = np.linspace(xmin*0.9,xmax*1.1,100)
        Y = np.linspace(ymin*0.9,ymax*1.1,100)
        X,Y = np.meshgrid(X,Y)
        gridvals = np.vstack([X.ravel(),Y.ravel()]).T
    except ValueError as e:
        print(e)

    #Use every point as a training point
    all_train = [True for dp in data]
    
    #Predict nearest neighbour of each grid point
    predictions = [NN_function(dp, all_train,data) for dp in gridvals]
    
    #Get numeric colour labels for predictions    
    colour_labels = list(map(colourmapper.get,predictions))
    #reshape labels to 2 dimensional grid for colormesh
    colour_labels = np.array(colour_labels).reshape(X.shape)
        
    #Map target labels to colours
    target_labels = list(map(colourmapper.get,target))
    
    #set up plot
    fig, ax = plt.subplots()
    ax.set_xlim(X.min(),X.max())
    ax.set_ylim(Y.min(),Y.max())
    ax.set_xlabel(Xname)
    ax.set_ylabel(Yname)
    
    #plot mesh and scatter
    ax.pcolormesh(X,Y,colour_labels,cmap=predCMap)
    ax.scatter(Xfeature, Yfeature, c=target_labels, cmap=featCMap)
    plt.show()

In [None]:
plotNN(get_nearest_neighbour,data[:,0],data[:,2],'Area','Compactness',target,prediction_cmap,feature_cmap,colmapper)

In [None]:
plt_features = data[:,(0,2)]
norm_plt_features = (plt_features - plt_features.mean(axis=0))/plt_features.std(axis=0)
plotNN(get_nearest_neighbour,norm_plt_features[:,0],norm_plt_features[:,1],'Area','Compactness',target, /
       prediction_cmap,feature_cmap,colmapper)