In [1]:
import numpy as np
import pandas as pd
import time

### AdaBoost-Stump

In [2]:
def memo(f):
    """Memoization decorator that caches results keyed by positional arguments.

    Results are stored in a dict keyed by the tuple of positional arguments;
    the dict is exposed as the ``cache`` attribute of the returned wrapper.

    If any argument is unhashable (e.g. a NumPy array) the call cannot be
    cached, so ``f`` is simply invoked with the same arguments. Both paths
    call ``f`` with an identical signature.
    """
    from functools import wraps

    cache = {}

    @wraps(f)
    def _f(*args):
        try:
            return cache[args]
        except KeyError:
            # Hashable arguments seen for the first time: compute below.
            pass
        except TypeError:
            # Some elements of args unhashable: skip the cache entirely.
            return f(*args)

        cache[args] = result = f(*args)
        return result

    _f.cache = cache
    return _f


@memo
def stump(s, i, t, X):
    """Decision stump for given direction s, dimension i, and threshold t.

    Parameters
    ----------
    s : direction multiplier applied to the raw stump output (+1 or -1 here).
    i : index of the column of ``X`` that is thresholded.
    t : threshold; entries strictly greater than ``t`` map to +1, others to -1.
    X : 2-D array-like with one row per sample.

    Returns
    -------
    1-D array with one value per row of ``X``: ``s * (+1 if X[row, i] > t else -1)``.

    Note: calls that pass a NumPy array for ``X`` are never cached, because
    arrays are unhashable and ``memo`` falls back to a direct call.
    """
    return s * ((np.asarray(X)[:, i] > t) * 2 - 1)


def accuracy(s, i, theta, w, X, y):
    """Weighted training accuracy of the stump (s, i, theta).

    Returns a pair: the sum of the weights ``w`` over the samples the stump
    classifies correctly, and the boolean mask of those correct samples.
    """
    predictions = stump(s, i, theta, X)
    correct = predictions == y
    weighted_accuracy = np.dot(correct * 1, w)
    return weighted_accuracy, correct


def make_thresholds(L):
    """Build candidate thresholds for one feature from its values.

    The sorted values are preceded by ``min(L) - 1``; the thresholds are the
    midpoints of each pair of neighbours in that padded sequence, so the
    result has exactly ``len(L)`` entries.
    """
    padded = [min(L) - 1]
    padded.extend(sorted(L))
    return [(low + high) / 2 for low, high in zip(padded, padded[1:])]


def AdaBoost_Training(X, y, T):
    """Train an AdaBoost binary classifier whose weak learners are decision stumps.

    Parameters
    ----------
    X : 2-D array, one row per sample and one column per feature. Every
        column is searched for stumps.
    y : 1-D array of labels, compared element-wise with the stump outputs.
    T : number of boosting iterations.

    Returns
    -------
    g : list of ``T`` tuples ``(s, i, t)`` describing the stump chosen in each round.
    alpha : list of ``T`` floats, the vote weight of each chosen stump.
    weights : the (unnormalized) sample weights after the last round.

    Raises
    ------
    ValueError
        If in some round no stump reaches a positive weighted accuracy.
    """
    n_samples, n_features = X.shape

    # Initialize weight vector (uniform over the samples)
    weights = np.ones((n_samples,)) / n_samples
    alpha = []
    g = []

    # Which samples a stump gets right does not depend on the weights, so
    # evaluate every candidate once instead of once per boosting round.
    candidates = []
    for i in range(n_features):
        for t in make_thresholds(X[:, i]):
            for s in [1, -1]:
                candidates.append(((s, i, t), stump(s, i, t, X) == y))

    eps = np.finfo(float).eps

    for r in range(T):
        Max_Weighted_Accu = 0
        best = index = None

        # Keep the first stump with the strictly largest weighted accuracy
        for params, correct in candidates:
            A = np.dot(correct * 1, weights)

            if A > Max_Weighted_Accu:
                Max_Weighted_Accu, index, best = A, correct, params

        if best is None:
            raise ValueError('no decision stump with positive weighted '
                             'accuracy; check X, y and the sample weights')

        total = weights.sum()
        # A perfect stump has zero weighted error; clamp it so the scaling
        # factor (and therefore alpha) stays finite.
        wrong = max(total - Max_Weighted_Accu, eps * total)
        Rescale_Factor = np.sqrt(Max_Weighted_Accu / wrong)

        # Rescaling the weight vector: shrink correct samples, grow wrong ones
        weights[index] /= Rescale_Factor
        weights[~index] *= Rescale_Factor

        alpha.append(np.log(Rescale_Factor))
        g.append(best)

        if r % 10 == 9:
            print('\tNow is the %d-th iteration.' % (r + 1))

    return g, alpha, weights


def model_accuracy(g, alpha, T, X, y):
    """Fraction of rows of X that the first T weighted stumps classify as y.

    Each stump ``g[k]`` votes with weight ``alpha[k]``; a row is predicted +1
    when the weighted vote is strictly positive and -1 otherwise.
    """
    n_rows = X.shape[0]
    votes = np.zeros((n_rows,))

    for k in range(T):
        votes += alpha[k] * np.array(stump(*g[k], X))

    predicted = (votes > 0) * 2 - 1
    return sum(predicted == y) / n_rows

In [3]:
# Data files (expected under Data/ relative to the notebook):
#   training set: https://d396qusza40orc.cloudfront.net/ntumltwo/hw2_data/hw2_adaboost_train.dat
#   testing set:  https://d396qusza40orc.cloudfront.net/ntumltwo/hw2_data/hw2_adaboost_test.dat

train_data = pd.read_csv('Data/hw2_adaboost_train.dat', sep=' ', header=None)
test_data = pd.read_csv('Data/hw2_adaboost_test.dat', sep=' ', header=None)

# Every column except the last holds a feature; the last column is the label.
X, y = train_data.iloc[:, :-1].values, train_data.iloc[:, -1].values
X_test, y_test = test_data.iloc[:, :-1].values, test_data.iloc[:, -1].values

train_data.head()

Unnamed: 0,0,1,2
0,0.757222,0.633831,-1
1,0.847382,0.281581,-1
2,0.24931,0.618635,1
3,0.538526,0.144259,-1
4,0.474435,0.414558,-1


In [4]:
# Number of boosting rounds
T = 300

print('Start Training...\n')
# time.clock() was removed in Python 3.8; perf_counter() is the timer meant
# for measuring elapsed intervals.
start = time.perf_counter()

g, alpha, weights = AdaBoost_Training(X, y, T)

train_accu = model_accuracy(g, alpha, T, X, y)
print('\n\tAccuracy on Training set: %.2f %%' % (100 * train_accu))

test_accu = model_accuracy(g, alpha, T, X_test, y_test)
print('\tAccuracy on Testing set: %.2f %%' % (100 * test_accu))

# Training sets alpha = ln(sqrt(acc / err)), so the weighted error fraction
# of a stump is err / (acc + err) = 1 / (exp(2 * alpha) + 1).
min_err = min(1 / (np.exp(2 * a) + 1) for a in alpha)
print('\tSmallest error of all stumps (train) %.2f %%' % (100 * min_err))

# Accuracy of the first stump alone, as a percentage of the actual number of
# test rows (rather than dividing by a constant tied to one test-set size).
params = list(g[0]) + [X_test]
one_accu = 100.0 * np.mean(stump(*params) == y_test)
print('\tAccuracy on Testing set of one stump: %.2f %%' % (one_accu))

print('\nDone. Using %f seconds.' % (time.perf_counter() - start))

Start Training...

	Now is the 10-th iteration.
	Now is the 20-th iteration.
	Now is the 30-th iteration.
	Now is the 40-th iteration.
	Now is the 50-th iteration.
	Now is the 60-th iteration.
	Now is the 70-th iteration.
	Now is the 80-th iteration.
	Now is the 90-th iteration.
	Now is the 100-th iteration.
	Now is the 110-th iteration.
	Now is the 120-th iteration.
	Now is the 130-th iteration.
	Now is the 140-th iteration.
	Now is the 150-th iteration.
	Now is the 160-th iteration.
	Now is the 170-th iteration.
	Now is the 180-th iteration.
	Now is the 190-th iteration.
	Now is the 200-th iteration.
	Now is the 210-th iteration.
	Now is the 220-th iteration.
	Now is the 230-th iteration.
	Now is the 240-th iteration.
	Now is the 250-th iteration.
	Now is the 260-th iteration.
	Now is the 270-th iteration.
	Now is the 280-th iteration.
	Now is the 290-th iteration.
	Now is the 300-th iteration.

	Accuracy on Training set: 100.00 %
	Accuracy on Testing set: 86.80 %
	Smallest error of 