In [1]:
"""
Test the KNN learner.  (c) 2015 Tucker Balch
"""

import numpy as np
import math
import KNNLearner as knn
import pandas as pd
import matplotlib.pyplot as plt

if __name__=="__main__":
    inf = open('Data/3_groups.csv')
    data = np.array([map(float,s.strip().split(',')) for s in inf.readlines()])

    # compute how much of the data is training and testing
    train_rows = int(math.floor(0.6* data.shape[0]))
    test_rows = int(data.shape[0] - train_rows)

    # separate out training and testing data
    trainX = data[:train_rows,0:-1]
    trainY = data[:train_rows,-1]
    testX = data[train_rows:,0:-1]
    testY = data[train_rows:,-1]

    # create a learner and train it
    learner = knn.KNNLearner(k = 3, verbose = False) # create a KNNLearner
    learner.addEvidence(trainX, trainY) # train it

    # evaluate in sample
    predY = learner.query(trainX) # get the predictions
    rmse = math.sqrt(((trainY - predY) ** 2).sum()/trainY.shape[0])
    print
    print "In sample results"
    print "RMSE: ", rmse
    c = np.corrcoef(predY, y=trainY)
    print "corr: ", c[0,1]

    # evaluate out of sample
    predY = learner.query(testX) # get the predictions
    rmse = math.sqrt(((testY - predY) ** 2).sum()/testY.shape[0])
    print
    print "Out of sample results"
    print "RMSE: ", rmse
    c = np.corrcoef(predY, y=testY)
    print "corr: ", c[0,1]

    learnerOutput = pd.DataFrame(np.zeros((10,4)),columns=['in.rmse', 'in.corr','out.rmse', 'out.corr'])
    
    print learnerOutput
    print "================"
        
    #learners = []
    for i in range(1,10):
        learner = knn.KNNLearner(k = i, verbose = False) # create a KNNLearner
        learner.addEvidence(trainX, trainY) # train it
        
        predY = learner.query(trainX) # get the predictions of in sample data
        learnerOutput.set_value(i, 'in.rmse', math.sqrt(((trainY - predY) ** 2).sum()/trainY.shape[0]))
        learnerOutput.set_value(i, 'in.corr', np.corrcoef(predY, y=trainY)[0,1])
        
        predY = learner.query(testX) # get the predictions of out of sample data
        learnerOutput.set_value(i, 'out.rmse', math.sqrt(((testY - predY) ** 2).sum()/testY.shape[0]))
        learnerOutput.set_value(i, 'out.corr', np.corrcoef(predY, y=testY)[0,1])

    print "================"
    
    # Remove the first row
    learnerOutput = learnerOutput.ix[1:,:]
    
    print learnerOutput
    

    %matplotlib inline
    # add code to plot here
    plt.clf()
    plt.plot(learnerOutput.index, learnerOutput)
    plt.legend(learnerOutput.keys())
    plt.ylabel('Value')
    plt.xlabel('KNN Size')
    plt.savefig('KNNLearner.png')



In sample results
RMSE:  0.301539261002
corr:  0.92930046242

Out of sample results
RMSE:  0.408928138213
corr:  0.868256909602
   in.rmse  in.corr  out.rmse  out.corr
0      0.0      0.0       0.0       0.0
1      0.0      0.0       0.0       0.0
2      0.0      0.0       0.0       0.0
3      0.0      0.0       0.0       0.0
4      0.0      0.0       0.0       0.0
5      0.0      0.0       0.0       0.0
6      0.0      0.0       0.0       0.0
7      0.0      0.0       0.0       0.0
8      0.0      0.0       0.0       0.0
9      0.0      0.0       0.0       0.0
    in.rmse   in.corr  out.rmse  out.corr
1  0.000000  1.000000  0.497494  0.811801
2  0.259005  0.948409  0.433013  0.852947
3  0.301539  0.929300  0.408928  0.868257
4  0.312583  0.923809  0.393899  0.877559
5  0.323316  0.918252  0.392428  0.878815
6  0.329843  0.914759  0.382063  0.885239
7  0.334318  0.912319  0.383459  0.884350
8  0.336457  0.911138  0.384413  0.883471
9  0.337230  0.910715  0.382689  0.884441


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
