In [None]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the fruit measurements from the tab-delimited text file into a DataFrame.
fruits = pd.read_csv('fruit_data_with_colors.txt', sep='\t')

In [None]:
# Preview the first five rows to check that the file was parsed as expected.
fruits.head(n=5)

In [None]:
# Build a {fruit_label: fruit_name} dictionary by pairing the distinct labels with the
# distinct names, each taken in the order in which they first appear in the dataframe.
lookup_fruit_name = {
    label: name
    for label, name in zip(fruits['fruit_label'].unique(), fruits['fruit_name'].unique())
}
# show the resulting mapping
lookup_fruit_name

In [None]:
# Features: the four numeric measurements recorded for every fruit.
X = fruits.loc[:, ['mass', 'width', 'height', 'color_score']]
# Target: the numeric fruit label the classifier has to predict.
y = fruits.loc[:, 'fruit_label']

# Split the features and labels into a training set and a testing set.
# test_size=0.25 (the scikit-learn default, spelled out here) gives a 75%/25% split,
# and the fixed random_state seed makes the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

In [None]:
from matplotlib import cm
# Look up the 'gnuplot' colormap through pyplot. `matplotlib.cm.get_cmap` was deprecated
# in Matplotlib 3.7 and removed in 3.9, so calling it fails on current versions;
# `plt.get_cmap` returns the same colormap and is still supported.
cmap = plt.get_cmap('gnuplot')
# This is a feature pair plot: it draws a scatter plot for every pair of features.
# Along the diagonal there are histograms showing how the values of each single
# feature are distributed.
scatter = pd.plotting.scatter_matrix(
    # the dataframe to be plotted
    X_train,
    # c determines the color of the points. Since it is y_train, each category
    # of fruit gets a different color on the plot
    c=y_train,
    # how the data points are marked
    marker='o',
    # the size of the points on the plot
    s=40,
    # keyword arguments forwarded to the histogram drawn on the diagonal;
    # bins=15 means each histogram is divided into 15 bars
    hist_kwds={'bins': 15},
    # the size of the figure
    figsize=(6, 6),
    # the colormap used to translate the labels in `c` into colors
    cmap=cmap
)

In [None]:
from mpl_toolkits.mplot3d import Axes3D
# 3D scatter plot of three of the training features, colored by fruit label.
fig = plt.figure()
# 111 means a 1x1 grid of subplots, of which this is the first (and only) one;
# it is equivalent to add_subplot(1, 1, 1). projection='3d' makes the axes three-dimensional.
ax = fig.add_subplot(111, projection='3d')
# create the scatter plot
ax.scatter(
    # x-axis
    X_train['width'],
    # y-axis
    X_train['height'],
    # z-axis
    X_train['color_score'],
    # color coding: one color per fruit label
    c=y_train,
    # marker used for points
    marker='o',
    # size of data points
    s=100
)
ax.set_xlabel('width')
# (a stray trailing comma used to follow this call, wrapping its result in a throwaway tuple)
ax.set_ylabel('height')
ax.set_zlabel('color_score')
plt.show()

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbors is a machine learning algorithm that memorizes the locations of
# the training data points. When the model is asked to classify an unknown data point,
# it finds the k training points that are closest to it and predicts the label that is
# most common among those neighbors. For example, if an unknown data point lands in the
# middle of a cluster of points labeled as apples, most of its nearest neighbors are
# apples, so the model predicts that the unknown data point is an apple.

# here we specify that the number of neighbors consulted for each prediction is 5
knn = KNeighborsClassifier(n_neighbors=5)

In [None]:
# Fit the classifier on the training features and their labels.
knn.fit(X=X_train, y=y_train)

In [None]:
# Mean accuracy of the fitted model on the held-out test split (data it has not seen).
knn.score(X=X_test, y=y_test)

In [None]:
# Make up our own fruit (mass, width, height, color_score) and see what the model predicts.
# The classifier was fitted on a DataFrame with named columns, so the sample is wrapped in
# a DataFrame with the same columns; passing a bare nested list makes scikit-learn warn
# that "X does not have valid feature names".
new_fruit = pd.DataFrame([[20, 4.3, 5.5, 0.80]], columns=X.columns)
fruit_prediction = knn.predict(new_fruit)
# lookup_fruit_name is the label -> name dictionary we created earlier
lookup_fruit_name[fruit_prediction[0]]

In [None]:
# Testing how the k value affects the accuracy of the model on the test split.
# The author's observation from the resulting plot is that accuracy decreases as k increases.

scores = []
k_range = range(1, 20)

for k in k_range:
    # Use a dedicated name for the per-k model so the notebook-level `knn`
    # (the k=5 classifier used by the fit/score/predict cells) is not silently
    # replaced by the last model of this loop (k=19).
    knn_k = KNeighborsClassifier(n_neighbors=k)
    knn_k.fit(X_train, y_train)
    scores.append(knn_k.score(X_test, y_test))

plt.figure()
plt.title('k-NN test accuracy for different k')
plt.xlabel('k values')
plt.ylabel('accuracy')
plt.scatter(k_range, scores)
# the trailing semicolon keeps the tick objects from being echoed as cell output
plt.xticks([0, 5, 10, 15, 20]);

In [None]:
# Testing how the accuracy is affected by the train/test split: for each training
# proportion, average the test accuracy over many different random splits.

# training set proportions to try (fractions of the whole dataset)
t = [0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2]

knn = KNeighborsClassifier(n_neighbors = 5)

plt.figure()

for s in t:

    scores = []
    # 99 repetitions per proportion. The repetition index doubles as the random seed,
    # so every run of this cell evaluates the same splits and draws the same plot.
    for i in range(1, 100):
        # Local split names are used on purpose: reusing X_train/X_test/y_train/y_test
        # would overwrite the fixed random_state=0 split that the other cells rely on.
        X_tr, X_te, y_tr, y_te = train_test_split(
            X, y, test_size=1 - s, random_state=i
        )
        knn.fit(X_tr, y_tr)
        scores.append(knn.score(X_te, y_te))
    # one blue dot per proportion: the mean accuracy over all repetitions
    plt.plot(s, np.mean(scores), 'bo')

plt.title('Mean k-NN accuracy by training set proportion')
# the plotted x values are fractions (0.2-0.8), so the label no longer claims percent
plt.xlabel('Training set proportion')
plt.ylabel('accuracy');