In [2]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets
rng = np.random.default_rng(seed=42)

In [3]:
 dataset =  datasets.fetch_california_housing()

ONE-NEAREST NEIGHBOURS ALGORITHM

In [4]:
def NN1(traindata, trainlabel, query):
  diff  = traindata - query  # find the difference between features. Numpy automatically takes care of the size here
  sq = diff*diff # square the differences
  dist = sq.sum(1) # add up the squares
  label = trainlabel[np.argmin(dist)] # our predicted label is the label of the training data which has the least distance from the query
  return label

def NN(traindata, trainlabel, testdata):
  # we will run nearest neighbour for each sample in the test data and collect the predicted classes in an array using list comprehension
  predlabel = np.array([NN1(traindata, trainlabel, i) for i in testdata])
  return predlabel

RANDOM CLASSIFIER ALGORITHM

In [5]:
def RandomClassifier(traindata, trainlabel, testdata):
  classes = np.unique(trainlabel)
  rints = rng.integers(low=0, high=len(classes), size=len(testdata))
  predlabel = classes[rints]
  print("No. of classes are ",len(classes))
  return predlabel

ACCURACY FUNCTION: TO CHECK HOW GOOD OUR ALGORITHM

In [6]:
def Accuracy(gtlabel, predlabel):
  assert len(gtlabel)==len(predlabel), "Length of the groundtruth labels and predicted labels should be the same"
  correct = (gtlabel==predlabel).sum() # count the number of times the groundtruth label is equal to the predicted label.
  return correct/len(gtlabel)

SPLIT FUNCTION:A function to split the dataset with the desired probability.

In [7]:
def split(data, label, percent):
  # generate a random number for each sample
  rnd = rng.random(len(label))
  split1 = rnd<percent
  split2 = rnd>=percent
  split1data = data[split1,:]
  split1label = label[split1]
  split2data = data[split2,:]
  split2label = label[split2]
  return split1data, split1label, split2data, split2label

Reserving 20% of our data for testing

In [8]:
testdata, testlabel, alltraindata, alltrainlabel = split(dataset.data, dataset.target, 20/100)
print('Number of test samples = ', len(testlabel))
print('Number of other samples = ', len(alltrainlabel))
print('Percent of test data = ', len(testlabel)*100/len(dataset.target),'%')

Number of test samples =  4144
Number of other samples =  16496
Percent of test data =  20.07751937984496 %


**EXPERIMENTS WITH SPLITS**

Q1 How is the accuracy of the validation set affected if we increase the percentage of validation set? What happens when we reduce it?

In [9]:
# Reserving n% of our train data as a validation set
def solve(n):
    traindata, trainlabel, valdata, vallabel = split(alltraindata, alltrainlabel, n/100)
    trainpred = NN(traindata, trainlabel, traindata)
    trainAccuracy = Accuracy(trainlabel, trainpred)
    print("Train accuracy using nearest neighbour is ", trainAccuracy)

    trainpred = RandomClassifier(traindata, trainlabel, traindata)
    trainAccuracy = Accuracy(trainlabel, trainpred)
    print("Train accuracy using random classifier is ", trainAccuracy)

# Predicting the labels for our validation set and get the accuracy

    valpred = NN(traindata, trainlabel, valdata)
    valAccuracy = Accuracy(vallabel, valpred)
    print("Validation accuracy using nearest neighbour is ", valAccuracy)

    valpred = RandomClassifier(traindata, trainlabel, valdata)
    valAccuracy = Accuracy(vallabel, valpred)
    print("Validation accuracy using random classifier is ", valAccuracy)


In [10]:
solve(60)

Train accuracy using nearest neighbour is  1.0
No. of classes are  3195
Train accuracy using random classifier is  0.00030385900941962927
Validation accuracy using nearest neighbour is  0.008304393779254115
No. of classes are  3195
Validation accuracy using random classifier is  0.0006039559112184811


In [11]:
solve(65)

Train accuracy using nearest neighbour is  1.0
No. of classes are  3252
Train accuracy using random classifier is  0.00037330844610359307
Validation accuracy using nearest neighbour is  0.00864902266043937
No. of classes are  3252
Validation accuracy using random classifier is  0.0001729804532087874


In [12]:
solve(70)

Train accuracy using nearest neighbour is  1.0
No. of classes are  3347
Train accuracy using random classifier is  0.0004314436103201312
Validation accuracy using nearest neighbour is  0.008966782147951905
No. of classes are  3347
Validation accuracy using random classifier is  0.0002037905033625433


In [13]:
solve(71)

Train accuracy using nearest neighbour is  1.0
No. of classes are  3349
Train accuracy using random classifier is  0.00025722369887678983
Validation accuracy using nearest neighbour is  0.009104076143182288
No. of classes are  3349
Validation accuracy using random classifier is  0.0002069108214359611


In [14]:
solve(73)

Train accuracy using nearest neighbour is  1.0
No. of classes are  3376
Train accuracy using random classifier is  0.0002478110028085247
Validation accuracy using nearest neighbour is  0.009567198177676537
No. of classes are  3376
Validation accuracy using random classifier is  0.00022779043280182233


In [15]:
solve(75)


Train accuracy using nearest neighbour is  1.0
No. of classes are  3421
Train accuracy using random classifier is  0.00016273393002441008
Validation accuracy using nearest neighbour is  0.009272467902995721
No. of classes are  3421
Validation accuracy using random classifier is  0.0


In [16]:
solve(77)

Train accuracy using nearest neighbour is  1.0
No. of classes are  3422
Train accuracy using random classifier is  0.0002363507445048452
Validation accuracy using nearest neighbour is  0.011043912700499606
No. of classes are  3422
Validation accuracy using random classifier is  0.0005259006047856955


In [17]:
solve(79)

Train accuracy using nearest neighbour is  1.0
No. of classes are  3454
Train accuracy using random classifier is  7.674008134448623e-05
Validation accuracy using nearest neighbour is  0.008658008658008658
No. of classes are  3454
Validation accuracy using random classifier is  0.0


In [18]:
solve(81)

Train accuracy using nearest neighbour is  1.0
No. of classes are  3472
Train accuracy using random classifier is  0.00029982759913049995
Validation accuracy using nearest neighbour is  0.00919175911251981
No. of classes are  3472
Validation accuracy using random classifier is  0.0003169572107765452


In [19]:
solve(83)

Train accuracy using nearest neighbour is  1.0
No. of classes are  3517
Train accuracy using random classifier is  0.0005104645227156713
Validation accuracy using nearest neighbour is  0.01006108515989939
No. of classes are  3517
Validation accuracy using random classifier is  0.00035932446999640676


In [20]:
solve(85)

Train accuracy using nearest neighbour is  1.0
No. of classes are  3533
Train accuracy using random classifier is  0.0004969826056088036
Validation accuracy using nearest neighbour is  0.00829531314807134
No. of classes are  3533
Validation accuracy using random classifier is  0.00041476565740356696


In [21]:
solve(87)

Train accuracy using nearest neighbour is  1.0
No. of classes are  3540
Train accuracy using random classifier is  0.00013867702121758425
Validation accuracy using nearest neighbour is  0.00626808100289296
No. of classes are  3540
Validation accuracy using random classifier is  0.0


In [22]:
solve(89)

Train accuracy using nearest neighbour is  1.0
No. of classes are  3565
Train accuracy using random classifier is  0.0002035002035002035
Validation accuracy using nearest neighbour is  0.0068415051311288486
No. of classes are  3565
Validation accuracy using random classifier is  0.0005701254275940707


In [23]:
solve(90)

Train accuracy using nearest neighbour is  1.0
No. of classes are  3563
Train accuracy using random classifier is  0.0003369045212586753
Validation accuracy using nearest neighbour is  0.01027190332326284
No. of classes are  3563
Validation accuracy using random classifier is  0.0


In [24]:
solve(95)

Train accuracy using nearest neighbour is  1.0
No. of classes are  3614
Train accuracy using random classifier is  0.00038299502106472615
Validation accuracy using nearest neighbour is  0.016867469879518072
No. of classes are  3614
Validation accuracy using random classifier is  0.0012048192771084338


In [25]:
solve(99)

Train accuracy using nearest neighbour is  1.0
No. of classes are  3650
Train accuracy using random classifier is  0.0005508630187293427
Validation accuracy using nearest neighbour is  0.006329113924050633
No. of classes are  3650
Validation accuracy using random classifier is  0.0


ANS : a) Validation accuracy of 1- Nearest Neighbours is highest around the 75% mark . It decreases gradually below it. However the behaviour above 80% seems not too obvious to predict , with the accuracy oscillating during each random split .It increases drastically around 99%.    
 b) For random classifiers the value increases as we increase percentage of validation set.However, the accuracy remains lower than one nearest neighbour.

Q2)How does the size of the train and validation set affect how well we can predict the accuracy on the test set using the validation set?



In [26]:
def solve1(l,n):
#Reserving l% of our data for testing
    testdata, testlabel, alltraindata, alltrainlabel = split(dataset.data, dataset.target, l/100)
    print('Number of test samples = ', len(testlabel))
    print('Number of other samples = ', len(alltrainlabel))
    print('Percent of test data = ', len(testlabel)*100/len(dataset.target),'%')

 #saving n% of data as validation set
    traindata, trainlabel, valdata, vallabel = split(alltraindata, alltrainlabel, n/100)
    trainpred = NN(traindata, trainlabel, traindata)
    trainAccuracy = Accuracy(trainlabel, trainpred)
    print("Train accuracy using nearest neighbour is ", trainAccuracy)

    trainpred = RandomClassifier(traindata, trainlabel, traindata)
    trainAccuracy = Accuracy(trainlabel, trainpred)
    print("Train accuracy using random classifier is ", trainAccuracy)

    # Predicting the labels for our validation set and get the accuracy

    valpred = NN(traindata, trainlabel, valdata)
    valAccuracy = Accuracy(vallabel, valpred)
    print("Validation accuracy using nearest neighbour is ", valAccuracy)

    valpred = RandomClassifier(traindata, trainlabel, valdata)
    valAccuracy = Accuracy(vallabel, valpred)
    print("Validation accuracy using random classifier is ", valAccuracy)



In [27]:
solve1(20,75)

Number of test samples =  4127
Number of other samples =  16513
Percent of test data =  19.99515503875969 %
Train accuracy using nearest neighbour is  1.0
No. of classes are  3419
Train accuracy using random classifier is  0.00024127392633102783
Validation accuracy using nearest neighbour is  0.007354743809757294
No. of classes are  3419
Validation accuracy using random classifier is  0.00024515812699190976


In [28]:
solve1(20,78)

Number of test samples =  4277
Number of other samples =  16363
Percent of test data =  20.7218992248062 %
Train accuracy using nearest neighbour is  1.0
No. of classes are  3444
Train accuracy using random classifier is  0.0004710315591144607
Validation accuracy using nearest neighbour is  0.009931034482758621
No. of classes are  3444
Validation accuracy using random classifier is  0.0008275862068965517


In [40]:
solve1(20,69)

Number of test samples =  4179
Number of other samples =  16461
Percent of test data =  20.247093023255815 %
Train accuracy using nearest neighbour is  1.0
No. of classes are  3330
Train accuracy using random classifier is  0.00017542320849048328
Validation accuracy using nearest neighbour is  0.008102766798418972
No. of classes are  3330
Validation accuracy using random classifier is  0.0003952569169960474


In [30]:
solve1(15,75)

Number of test samples =  3105
Number of other samples =  17535
Percent of test data =  15.043604651162791 %
Train accuracy using nearest neighbour is  1.0
No. of classes are  3472
Train accuracy using random classifier is  0.00022779043280182233
Validation accuracy using nearest neighbour is  0.00870561282932417
No. of classes are  3472
Validation accuracy using random classifier is  0.0002290950744558992


In [38]:
solve1(15,81)

Number of test samples =  3122
Number of other samples =  17518
Percent of test data =  15.125968992248062 %
Train accuracy using nearest neighbour is  1.0
No. of classes are  3530
Train accuracy using random classifier is  7.049700387733522e-05
Validation accuracy using nearest neighbour is  0.006900690069006901
No. of classes are  3530
Validation accuracy using random classifier is  0.00030003000300030005


In [39]:
solve1(15,68)

Number of test samples =  3079
Number of other samples =  17561
Percent of test data =  14.917635658914728 %
Train accuracy using nearest neighbour is  1.0
No. of classes are  3368
Train accuracy using random classifier is  0.0004206629648325761
Validation accuracy using nearest neighbour is  0.00828193832599119
No. of classes are  3368
Validation accuracy using random classifier is  0.0001762114537444934


In [33]:
solve1(25,75)

Number of test samples =  5234
Number of other samples =  15406
Percent of test data =  25.358527131782946 %
Train accuracy using nearest neighbour is  1.0
No. of classes are  3330
Train accuracy using random classifier is  0.00017207261464337952
Validation accuracy using nearest neighbour is  0.007401533174729051
No. of classes are  3330
Validation accuracy using random classifier is  0.000528680941052075


In [34]:
solve1(25,80)

Number of test samples =  5171
Number of other samples =  15469
Percent of test data =  25.05329457364341 %
Train accuracy using nearest neighbour is  1.0
No. of classes are  3405
Train accuracy using random classifier is  0.0002439421044072207
Validation accuracy using nearest neighbour is  0.005991800693787449
No. of classes are  3405
Validation accuracy using random classifier is  0.000946073793755913


In [35]:
solve1(25,69)

Number of test samples =  5182
Number of other samples =  15458
Percent of test data =  25.106589147286822 %
Train accuracy using nearest neighbour is  1.0
No. of classes are  3286
Train accuracy using random classifier is  0.00037604587759706683
Validation accuracy using nearest neighbour is  0.01099356979879693
No. of classes are  3286
Validation accuracy using random classifier is  0.00020742584526031943


ANS:    *A*) **Same % data for testing** : For one-Nearest neighbour, the accuracy is lower at 75% validation set reservation .For Random classifier, The accuracy increases as %age validation increases.           
    B)**Same % data for validation** :For random Claassifiers , at lower % data allocation for validation , It decreases as we increase the % testing set . However, at higher % validation data allocation , it increases with increase in % testing set. For one nearest neighbours, It is minimum for the intermediate data around 75% .


Q3) What do you think is a good percentage to reserve for the validation set so that thest two factors are balanced?

In [41]:
solve1(30,70)

Number of test samples =  6292
Number of other samples =  14348
Percent of test data =  30.484496124031008 %
Train accuracy using nearest neighbour is  1.0
No. of classes are  3205
Train accuracy using random classifier is  0.00029943108094620223
Validation accuracy using nearest neighbour is  0.009009009009009009
No. of classes are  3205
Validation accuracy using random classifier is  0.0


In [42]:
solve1(25,75)


Number of test samples =  5102
Number of other samples =  15538
Percent of test data =  24.718992248062015 %
Train accuracy using nearest neighbour is  1.0
No. of classes are  3369
Train accuracy using random classifier is  0.0005143591941705958
Validation accuracy using nearest neighbour is  0.010586108959462949
No. of classes are  3369
Validation accuracy using random classifier is  0.0010327911179963851


In [44]:
solve1(20,80)

Number of test samples =  4154
Number of other samples =  16486
Percent of test data =  20.125968992248062 %
Train accuracy using nearest neighbour is  1.0
No. of classes are  3496
Train accuracy using random classifier is  0.00030289262456459186
Validation accuracy using nearest neighbour is  0.01097560975609756
No. of classes are  3496
Validation accuracy using random classifier is  0.0006097560975609756


In [45]:
solve1(35,65)

Number of test samples =  7230
Number of other samples =  13410
Percent of test data =  35.02906976744186 %
Train accuracy using nearest neighbour is  1.0
No. of classes are  3058
Train accuracy using random classifier is  0.00034403669724770644
Validation accuracy using nearest neighbour is  0.008742004264392323
No. of classes are  3058
Validation accuracy using random classifier is  0.00042643923240938164


In [46]:
solve1(27,73)

Number of test samples =  5462
Number of other samples =  15178
Percent of test data =  26.463178294573645 %
Train accuracy using nearest neighbour is  1.0
No. of classes are  3292
Train accuracy using random classifier is  0.00027247956403269756
Validation accuracy using nearest neighbour is  0.008157389635316698
No. of classes are  3292
Validation accuracy using random classifier is  0.0004798464491362764


Hence , it can be seen that accuracy achieved is optimum when 25% testing data and 75% validation data is used.


**MULTIPLE SPLITS**

Average of the Nearest Neighbours Algorithm Accuracies

In [48]:
def AverageAccuracy(alldata, alllabel, splitpercent, iterations, classifier=NN):
  accuracy = 0
  for ii in range(iterations):
    traindata, trainlabel, valdata, vallabel = split(alldata, alllabel, splitpercent)
    valpred = classifier(traindata, trainlabel, valdata)
    accuracy += Accuracy(vallabel, valpred)
  return accuracy/iterations
def solve3(n):
  print('Average validation accuracy is ', AverageAccuracy(alltraindata, alltrainlabel,n/100, 10, classifier=NN))
  testpred = NN(alltraindata, alltrainlabel, testdata)
  print('test accuracy is ',Accuracy(testlabel, testpred) )

Average of the Random Classifier Algorithm accuracies

In [49]:
def AverageAccuracy(alldata, alllabel, splitpercent, iterations, classifier=RandomClassifier):
  accuracy = 0
  for ii in range(iterations):
    traindata, trainlabel, valdata, vallabel = split(alldata, alllabel, splitpercent)
    valpred = classifier(traindata, trainlabel, valdata)
    accuracy += Accuracy(vallabel, valpred)
  return accuracy/iterations
def solve4(n):
  print('Average validation accuracy is ', AverageAccuracy(alltraindata, alltrainlabel,n/100, 10, classifier=NN))
  testpred = RandomClassifier(alltraindata, alltrainlabel, testdata)
  print('test accuracy is ',Accuracy(testlabel, testpred) )

Q1)Does averaging the validation accuracy across multiple splits give more consistent results?

In [51]:
solve3(75)

Average validation accuracy is  0.008087105114826229
test accuracy is  0.009893822393822393


In [52]:
solve4(75)

Average validation accuracy is  0.008742311454647569
No. of classes are  3661
test accuracy is  0.00048262548262548264


In [61]:
solve3(81)

Average validation accuracy is  0.008909069338798373
test accuracy is  0.009893822393822393


In [62]:
solve4(81)

Average validation accuracy is  0.008585447055482504
No. of classes are  3661
test accuracy is  0.0


In [55]:
solve3(73)

Average validation accuracy is  0.008877461136172998
test accuracy is  0.009893822393822393


In [56]:
solve4(73)

Average validation accuracy is  0.00903182279521009
No. of classes are  3661
test accuracy is  0.00024131274131274132


In [57]:
solve3(78)

Average validation accuracy is  0.008493488810010667
test accuracy is  0.009893822393822393


In [58]:
solve4(78)

Average validation accuracy is  0.00901083596255365
No. of classes are  3661
test accuracy is  0.0


In [59]:
solve3(70)


Average validation accuracy is  0.009277810168608824
test accuracy is  0.009893822393822393


In [60]:
solve4(70)

Average validation accuracy is  0.008757962298083646
No. of classes are  3661
test accuracy is  0.0


Yes , averaging the validation accuracy across multiple splits give more consistent results, especially when the %age allocation of validation dataset is lower .

Q2)Does it give more accurate estimate of test accuracy?

No, It doesn't give much info about the test accuracy . But ,it instead provides a fair idea of how the model might work on Unseen data from same dataset as is used for validation.


Q3)What is the effect of the number of iterations on the estimate? Do we get a better estimate with higher iterations?

Yes, It helps to nullify the err in accuracy caused owing to outlier datapoints in the dataset.

Q4)Consider the results you got for the previous questions. Can we deal with a very small train dataset or validation dataset by increasing the iterations?


This might help to a certain extent , but still it might not be able to overcome the unavailability of enough data points , depending upon the application that the ML model is being used for.