In [1]:
import numpy as np
import pandas as pd
import time

### AdaBoost-Stump

In [2]:
#Training set: https://d396qusza40orc.cloudfront.net/ntumltwo/hw2_data/hw2_adaboost_train.dat
#Testing set: https://d396qusza40orc.cloudfront.net/ntumltwo/hw2_data/hw2_adaboost_test.dat
Train = pd.read_csv('hw2_adaboost_train.dat',sep=' ',header=None)
Train.head()

Unnamed: 0,0,1,2
0,0.757222,0.633831,-1
1,0.847382,0.281581,-1
2,0.24931,0.618635,1
3,0.538526,0.144259,-1
4,0.474435,0.414558,-1


In [3]:
def memo(f): 
    """Memoization decorator, Used to accelerate the retrieval"""
    cache = {}
    def _f(*args):
        try:
            return cache[args]
        except KeyError:
            cache[args] = result = f(*args)
            return result
        except TypeError: #Some elements of args unhashable
            return f(args)
    _f.cache = cache
    return _f

@memo
def stump(s,i,t):
    """Decision stump for given direction s, dimension i, and threshold t"""
    return Train.apply(lambda x: s*((x[i] > t)*2-1),axis=1)

def Accuracy(s,i,theta,w):
    """Calculate accuracy on training set for given decision stump"""
    index = stump(s,i,theta)==Train[2]
    return (np.dot(np.array(index*1),w),index)

def make_thresholds(L):
    """Given values of one dimension, let midpoints as thresholds"""
    LS = [min(L)-1]+sorted(L)
    return [(LS[i]+LS[i+1])/2 for i in range(len(LS)-1)]

def AdaBoost_Training(Train,T):
    """Given training set as a pandas dataframe and the iterations, train an AdaBoost binary classifer"""
    #Initialize weight vector
    Train['w'] = np.ones((100,))/100  
    alpha = []; g = []; Thr = []

    #Compute threshold
    for i in range(2):
        Thr.append(make_thresholds(Train[i])) 

    for r in range(T):
        Max_Weighted_Accu = 0; index = []; w0 = Train['w'].values
        for i in range(2):
            for t in Thr[i]:
                for s in [1,-1]:
                    A,ind = Accuracy(s,i,t,w0)

                    if A > Max_Weighted_Accu:
                        Max_Weighted_Accu, index = A, ind
                        best = s, i, t

        Rescale_Factor = np.sqrt(Max_Weighted_Accu/(sum(w0)-Max_Weighted_Accu))
        Train['w'][index] /= Rescale_Factor   #Rescaling the weight vector
        Train['w'][~index] *= Rescale_Factor
        alpha.append(np.log(Rescale_Factor))
        g.append(best)
        
    return g,alpha

def Predict_Accu_Train(g,alpha,T):
    G = np.zeros((len(Train),))
    for i in range(T):
        G += np.array(stump(*g[i]))*alpha[i]
    return sum(((G>0)*2-1)==Train[2])/len(Train)

@memo
def predict_stump(s,i,t):
    return Test.apply(lambda x: s*((x[i] > t)*2-1),axis=1)

def Predict_Accu_Test(g,alpha,T):
    G = np.zeros((len(Test),))
    for i in range(T):
        G += np.array(predict_stump(*g[i]))*alpha[i]
    return sum(((G>0)*2-1)==Test[2])/len(Test)

In [4]:
start = time.clock()
T = 300
g,alpha = AdaBoost_Training(Train,T)
print('Done Training, %f seconds.'%(time.clock()-start))

Done Training, 33.015375 seconds.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [5]:
Test = pd.read_csv('hw2_adaboost_test.dat',sep=' ',header=None)
Test.head()

Unnamed: 0,0,1,2
0,0.98425,0.71261,-1
1,0.901491,0.462824,-1
2,0.872418,0.365547,-1
3,0.810913,0.058338,-1
4,0.57723,0.203007,-1


In [6]:
print('Accuracy on Training set: %.2f %%'%100*Predict_Accu_Train(g,alpha,T))
print('Accuracy on Testing set: %.2f %%'%(100*Predict_Accu_Test(g,alpha,T)))

Accuracy on Training set: 100.00 %
Accuracy on Testing set: 86.80 %


  if __name__ == '__main__':


In [7]:
print('Smallest error rate of all stumps %.4f %%' % (100*min(list(map(lambda x:1/(np.exp(2*x)+1),alpha)))))
print('Accuracy on Testing set of one stump: %.2f %%'%(sum(predict_stump(*g[0])==Test[2])/10))

Smallest error rate of all stumps 17.8728 %
Accuracy on Testing set of one stump: 71.00 %


### Least Square Support Vector Machine(LSSVM)
#### Kernel Ridge Regression for Classification

In [8]:
#Data: https://d396qusza40orc.cloudfront.net/ntumltwo/hw2_data/hw2_lssvm_all.dat
#first 400 rows as training set and last 100 rows as testing set
LS_Data = pd.read_csv('hw2_lssvm_all.dat',sep=' ',header=None,skipinitialspace=True)
LS_Data.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,4.115,5.02,-7.879,-11.78,2.004,-0.353,-0.735,3.561,2.441,-9.822,1
1,-3.557,0.997,2.932,7.672,5.43,-0.137,1.635,-5.19,-0.394,-7.667,1
2,6.417,5.878,5.066,-7.209,-6.953,7.639,-2.937,-1.023,3.963,-11.069,1
3,-2.247,6.532,6.437,2.293,6.302,2.187,3.429,-3.453,9.172,-4.548,1
4,3.708,5.834,3.676,-4.403,-5.296,9.08,-3.11,-3.294,3.189,-8.51,1
5,-1.586,1.96,-5.506,-8.767,7.871,0.613,-4.693,4.302,-1.219,-8.478,-1
6,-4.181,-6.797,-4.187,8.622,0.771,5.851,-3.893,3.779,4.47,-9.433,1
7,-5.589,-7.01,-5.297,7.329,1.872,3.953,-3.425,3.097,1.677,-11.32,1
8,8.124,4.813,3.519,-7.539,-4.723,8.129,-5.165,-3.411,3.552,-9.192,-1
9,-7.536,-0.448,-10.633,3.777,0.728,-1.386,-7.756,8.166,-2.979,0.629,-1


In [9]:
Train = LS_Data[:400]
Test = LS_Data[400:]

In [10]:
def memo(f): 
    """Memoization decorator, Used to accelerate the retrieval"""
    cache = {}
    def _f(*args):
        try:
            return cache[args]
        except KeyError:
            cache[args] = result = f(*args)
            return result
        except TypeError: #Some elements of args unhashable
            return f(args)
    _f.cache = cache
    return _f

def kernel(x1,x2,gamma):
    """Guassian-RBF kernel"""
    return np.exp(-gamma*sum((x1-x2)**2))

def g(beta,gamma,x):
    """One hypothesis"""
    val = sum([beta[i]*kernel(X[i],x,gamma) for i in range(len(beta))])
    return 1 if val >= 0 else -1

@memo
def kernel_matrix(N,gamma):
    """Compute the kernel matrix K(xi,xj)"""
    K = np.zeros((N,N))
    for i in range(N):
        for j in range(i,N):
            K[i][j] = kernel(X[i],X[j],gamma)
            K[j][i] = K[i][j]
    return K

def beta_star(lamb_identity,K):
    """Calculate the optimal beta_star"""
    return np.dot(np.linalg.inv(lamb_identity+K),Train[10].values)
    
def predicts(beta,gamma,X):
    """Predict by beta, gamma, and Data"""
    return np.array([g(beta,gamma,x) for x in X])

In [11]:
start = time.clock()

X = np.array(Train[list(range(10))].values)
X_test = np.array(Test[list(range(10))].values)
N = len(X); N_t = len(X_test)
Max_in = 0; Max_out = 0

for lamb in [0.001,1,1000]:
    for gamma in [32,2,0.125]:
        lamb_identity = np.identity(N)*lamb
        K = kernel_matrix(N,gamma)
        beta = beta_star(lamb_identity,K)
        
        predict_in = predicts(beta,gamma,X)
        predict_out = predicts(beta,gamma,X_test)
        Accu_in = sum(predict_in==Train[10].values)/N
        Accu_out = sum(predict_out==Test[10].values)/N_t
        
        if Accu_in > Max_in:
            best_in = lamb,gamma,beta
            Max_in = Accu_in
        
        if Accu_out > Max_out:
            best_out = lamb,gamma,beta
            Max_out = Accu_out
            
print('Best Train Accuracy %.2f %%, with lambda %f and gamma %f.'%(100*Max_in,best_in[0],best_in[1]))
print('Best Test Accuracy %.2f %%, with lambda %f and gamma %f.'%(100*Max_out,best_out[0],best_out[1]))
print('Used %.2f seconds'%(time.clock()-start))

Best Train Accuracy 100.00 %, with lambda 0.001000 and gamma 32.000000.
Best Test Accuracy 61.00 %, with lambda 1000.000000 and gamma 0.125000.
Used 13.04 seconds
