In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import validation_curve
import pandas as pd
import numpy as np

In [None]:
# Load the fruit measurements and keep only two features (height, width)
# together with the integer class label.
fruits = pd.read_table('fruit_data_with_colors.txt')
X_fruits_2d = fruits[['height', 'width']]
y_fruits_2d = fruits['fruit_label']

clf = KNeighborsClassifier(n_neighbors=5)
X = X_fruits_2d.to_numpy()
y = y_fruits_2d.to_numpy()

# Cross-validation does not make a model more accurate; it gives a more
# reliable estimate of how well the model generalizes to unseen data than a
# single train/test split would. The general k-fold procedure is:
#   1. (Optionally) shuffle the dataset.
#   2. Split the dataset into k groups (folds).
#   3. For each fold:
#        - hold that fold out as the test set
#        - train on the remaining k-1 folds
#        - fit a fresh model on the training folds, score it on the held-out fold
#        - keep the score and discard the model
#   4. Summarize the model's skill from the k scores (e.g. their mean).
#
# With an integer `cv`, sklearn uses stratified k-fold for classifiers and
# plain k-fold otherwise. Neither shuffles by default, so plain k-fold takes
# consecutive slices of the data: with 5 folds the first 20% of the rows is the
# first test set, the next 20% the second, and so on. If the data is sorted by
# class, a slice may miss some classes entirely. Stratified k-fold avoids this
# by keeping the class proportions of every test fold indicative of the
# overall dataset.

# Pin the number of folds explicitly: the default of `cv` changed from 3 to 5
# in scikit-learn 0.22, so relying on it made the "3-fold" labels below wrong
# (and `cv_scores[:3]` silently hid two of the five scores).
n_folds = 3
cv_scores = cross_val_score(clf, X, y, cv=n_folds)

print('Cross-validation scores ({}-fold):'.format(n_folds), cv_scores)
print('Mean cross-validation score ({}-fold): {:.3f}'.format(n_folds, np.mean(cv_scores)))

In [None]:
# validation_curve sweeps a single hyper-parameter of an estimator and reports
# the cross-validated train and test scores obtained for every value tried,
# which makes it easy to see how that parameter affects accuracy.

# four candidate gamma values spaced evenly on a log scale: 1e-3, 1e-1, 1e1, 1e3
param_range = np.logspace(-3, 3, num=4)

# SVC is the estimator, X / y are the features and labels, 'gamma' is the
# parameter being swept and cv=3 requests three folds per gamma value.
# Each returned array holds one row per gamma value and one column per fold.
train_scores, test_scores = validation_curve(
    SVC(), X, y, param_name='gamma', param_range=param_range, cv=3
)

print(train_scores)
print(test_scores)