# Chapter 2 - Exercise 8: ZIPCode

Compare the classification performance of linear regression and k– nearest neighbor classification on the zipcode data. 
In particular, consider only the 2’s and 3’s, and k = 1, 3, 5, 7 and 15. Show both the training and test error for each choice. 

The zipcode data are available from the book website www-stat.stanford.edu/ElemStatLearn.

In [1]:
import pandas as pd
import numpy as np

In [2]:
dfTrain = pd.read_table("zip.train", sep = ' ', header = None)
dfTest = pd.read_table("zip.test", sep = ' ', header = None)

In [3]:
columnNames = ['Y'] + ['X' + str(u) for u in range(1,dfTrain.shape[1])]

dfTrain.columns = columnNames
dfTest.columns = columnNames[:len(columnNames)-1]

del dfTrain[columnNames[len(columnNames)-1]]

In [4]:
columnNames = columnNames[:-1]

In [5]:
#uncomment next line in order to see the data

#dfTrain

In [6]:
#uncomment next line in order to see the data

#dfTest

In [7]:
#dfTrain[np.logical_and(dfTrain.Y != 2.,dfTrain.Y != 3.) ]

In [8]:
dfTrain2 = dfTrain.drop(dfTrain[dfTrain.Y != 2.].index)
dfTrain3 = dfTrain.drop(dfTrain[dfTrain.Y != 3.].index)

In [9]:
dfTrain2.ix[dfTrain2.Y == 2, 'Y'] = -1   ### digit 2 <=>  -1
dfTrain3.ix[dfTrain3.Y == 3, 'Y'] = 1  ### digit 3 <=> 1

In [10]:
dfTrain2 = dfTrain2.reset_index(drop=True)
dfTrain3 = dfTrain3.reset_index(drop=True)
dfTrain = pd.concat([dfTrain2, dfTrain3])
dfTrain = dfTrain.reset_index(drop=True)
#dfTrain

In [11]:
l2 = len(dfTrain2)
l3 = len(dfTrain3)
ltrain = len(dfTrain)

In [12]:
Xtrain = dfTrain[columnNames[1:]]
Ytrain = dfTrain['Y']

In [13]:
Xtrain = np.matrix(Xtrain)
Ytrain = np.matrix(Ytrain)
Ytrain = Ytrain.transpose()

In [14]:
#Ytrain

In [15]:
dfTest2 = dfTest.drop(dfTest[dfTest.Y != 2.].index)
dfTest3 = dfTest.drop(dfTest[dfTest.Y != 3.].index)

In [16]:
dfTest2.ix[dfTest2.Y == 2, 'Y'] = -1   ### digit 2 <=>  -1
dfTest3.ix[dfTest3.Y == 3, 'Y'] = 1  ### digit 3 <=> 1

In [17]:
dfTest2 = dfTest2.reset_index(drop=True)
dfTest3 = dfTest3.reset_index(drop=True)
dfTest = pd.concat([dfTest2, dfTest3])
dfTest = dfTest.reset_index(drop=True)
#dfTest

In [18]:
ltest2 = len(dfTest2)
ltest3 =  len(dfTest3)

In [19]:
Ytest = np.matrix(dfTest['Y'])
Ytest.resize(len(dfTest),1)
Xtest = np.matrix(dfTest[columnNames[1:]])

## Linear regression: estimate parameters

In [20]:
betaHat = np.dot(np.dot(np.linalg.pinv(np.dot(Xtrain.transpose(), Xtrain)), Xtrain.transpose()), Ytrain)

In [21]:
Yest = np.dot(Xtrain, betaHat)
trainError = float(sum(np.multiply((Ytrain - Yest),(Ytrain - Yest))) / len(Ytrain))

In [22]:
Yest.resize(ltrain,1)
#Yest

In [23]:
trainError

0.09925047911387538

In [24]:
residuals = Ytrain - Yest
residuals2 = residuals[0:l2]
residuals3 = residuals[l2:]
#residuals

In [25]:
wrong2 = len([u for u in residuals2 if u > 0])/float(l2)
wrong2

0.40355677154582764

In [26]:
wrong3 = len([u for u in residuals3 if u < 0])/float(l3)
wrong3

0.3966565349544073

In [27]:
#dfTest

## Computing error on test set

In [28]:
Ypredict = np.dot(Xtest, betaHat)

testError = float(sum(np.multiply((Ytest - Ypredict),(Ytest - Ypredict))) / len(Ytest))
testError

0.2183774942344406

In [29]:
residualTest = Ytest - Ypredict
residualTest2 = residualTest[0:ltest2]
residualTest3 = residualTest[ltest2:]
#residualTest

In [30]:
wrong2 = len([u for u in residualTest2 if u > 0])/float(ltest2)
wrong2

0.36363636363636365

In [31]:
wrong3 = len([u for u in residualTest3 if u < 0])/float(ltest3)
wrong3

0.3674698795180723

## k-Nearest neighbors on same data

In [32]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets

In [33]:
# import some data to play with
Xtrain = np.asarray(Xtrain)
Xtest = np.asarray(Xtest)

yTrain = Ytrain.ravel()
yTrain = np.ravel(yTrain)

In [34]:
def knn(n_neighbors):
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights='uniform')
    clf.fit(Xtrain, yTrain)
    Yest = clf.predict(Xtrain)

    false2 = 0
    false3 = 0
    for i in range(len(yTrain)):
        if yTrain[i] == 1 and Yest[i] == -1:
            false2 +=1
        elif yTrain[i] == -1 and Yest[i] == 1:
            false3 +=1 
        
    percFalse2 = false2 / float(len(yTrain))
    print ("% of false 2 digits on training set, number neighbors = ", n_neighbors,": ", percFalse2)

    percFalse3 = false3 / float(len(yTrain))
    print ("% of false 3 digits on training set, number neighbors = ", n_neighbors,": ", percFalse3)

    Yest2 =  clf.predict(Xtest)

    false2Test = 0
    false3Test = 0
    for i in range(len(Ytest)):
        if Ytest[i] == 1 and Yest2[i] == -1:
            false2Test +=1
        elif Ytest[i] == -1 and Yest2[i] == 1:
            false3Test +=1

    percFalse2Test = false2Test / float(len(Ytest))
    print ("% of false 2 digits on test set, number neighbors = ", n_neighbors,": ", float(percFalse2Test))

    percFalse3Test = false3Test / float(len(Ytest))
    print ("% of false 3 digits on test set, number neighbors = ", n_neighbors,": ", float(percFalse3Test))
    
    return

In [35]:
knn(1)

('% of false 2 digits on training set, number neighbors = ', 1, ': ', 0.0)
('% of false 3 digits on training set, number neighbors = ', 1, ': ', 0.0)
('% of false 2 digits on test set, number neighbors = ', 1, ': ', 0.008241758241758242)
('% of false 3 digits on test set, number neighbors = ', 1, ': ', 0.016483516483516484)


In [36]:
knn(3)

('% of false 2 digits on training set, number neighbors = ', 3, ': ', 0.0028797696184305254)
('% of false 3 digits on training set, number neighbors = ', 3, ': ', 0.0021598272138228943)
('% of false 2 digits on test set, number neighbors = ', 3, ': ', 0.01098901098901099)
('% of false 3 digits on test set, number neighbors = ', 3, ': ', 0.019230769230769232)


In [37]:
knn(5)

('% of false 2 digits on training set, number neighbors = ', 5, ': ', 0.0021598272138228943)
('% of false 3 digits on training set, number neighbors = ', 5, ': ', 0.003599712023038157)
('% of false 2 digits on test set, number neighbors = ', 5, ': ', 0.01098901098901099)
('% of false 3 digits on test set, number neighbors = ', 5, ': ', 0.019230769230769232)


In [38]:
knn(7)

('% of false 2 digits on training set, number neighbors = ', 7, ': ', 0.0021598272138228943)
('% of false 3 digits on training set, number neighbors = ', 7, ': ', 0.004319654427645789)
('% of false 2 digits on test set, number neighbors = ', 7, ': ', 0.008241758241758242)
('% of false 3 digits on test set, number neighbors = ', 7, ': ', 0.024725274725274724)


In [39]:
knn(15)

('% of false 2 digits on training set, number neighbors = ', 15, ': ', 0.0021598272138228943)
('% of false 3 digits on training set, number neighbors = ', 15, ': ', 0.007199424046076314)
('% of false 2 digits on test set, number neighbors = ', 15, ': ', 0.008241758241758242)
('% of false 3 digits on test set, number neighbors = ', 15, ': ', 0.03021978021978022)
