In [43]:
%matplotlib notebook

import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
import sparsipy
import copy


# Two minor differences compared to AdaBoost
# 1. computation of alpha_t
# 2. final hypothesis have alphas normalized and does not use sign. 

# The algorithm takes "two parameters": 
# T: # hypothesis
# v: desired accuracy
# where T = 2*log(n)/v^2 ;; in this we only give T and then compute v by v=\sqrt(lg(n)/t)
class AdaBoostV():
    """
        AdaBoostV:    http://www.jmlr.org/papers/volume6/ratsch05a/ratsch05a.pdf
        Two minor differences compared to the original AdaBoost algorithm. 
        - Computation of alphas_t 
        - The final hypothesis is normalized as does not use sign. 
    
    
        Core functions:
            fit
            predict
            sparsify
            minimal_margin
    """
    
    def __init__(self, debug=False, plot=False, pre_normalize=False, post_normalize=False, limit=80):
        self.reset()
        self.debug           = debug
        self.plot            = plot 
        self.pre_normalize   = pre_normalize
        self.post_normalize  = post_normalize
        self.train_margins   = None
        self.v               = None
        self.limit           = limit
        
    def reset(self):
        """ Resets field variables. This is called by fit before training to ensure a fresh/clean training. """
        self.estimators      = []
        self.alphas          = []
        self.sample_weight   = None
        self.edges = []
    
    def fit(self, X, y, T, plot=None, color='-go'):
        """
        
        Parameters
        ----------
        X:     Matrix of size (n, d) with data point x_i as the i'th row. 
        y:     Array of size (n, ) with label y_i as the i'th entry. 
        T:     Iterations to run the algorithm for. In the JLMR paper, the user supplies both approximation
               parameter v and iterations T. Here we compute approximation parameter as v=sqrt(2*ln(n)/T).
        
        TODO: make visualize a function that is called by step. 
        plot:  Holds an array of (fig, ax)
        color: 
        
        Returns
        -------
        margins: A list containing minimal margin of each iteration. 
        
        """
        n, d = X.shape
        
        # Check labels are {-1, +1} and not {0, 1}
        assert set(y) == {-1, +1}, ("Labels should be {-1, +1} and not ", set(y))
        
        # Make sure we didn't store any estimators/weights from previous iterations.
        self.reset()
        
        # differ; TODO: read paper and understand how we set approximation parameter. 
        self.v = np.sqrt(2*np.log(n) / T)
        print(self.v)
        
        # Start with uniform distribution as sample weight
        self.sample_weight = np.ones(n) / n 
        
        # Take T steps of AdaBoostV; TODO: change return margin to margin being a field variable. 
        margins = self.step_to(X, y, T, plot=plot, color=color)
        
        return margins

    def step(self, X, y):
        """ Perform one iteration of AdaBoostV. This will add 
        - one hypothesis h_t to self.estimators,
        - one weight alpha_t to self.alphas 
        - one edge    edge_t to self.edges
        
         Parameters
        ----------
        X:     Matrix of size (n, d) with data point x_i as the i'th row. 
        y:     Array of size (n, ) with label y_i as the i'th entry. 
        
        """
        # Make sure that sample weight is a distribution; sum to 1 and >=0. 
        assert np.allclose(np.sum(self.sample_weight), 1.0), np.sum(self.sample_weight)
        assert sum(self.sample_weight >= 0) == self.sample_weight.shape[0]
        
        # Learn weak DT classifier on weighed data weighed according to sample_weight
        h_t  = DecisionTreeClassifier(max_depth=2)
        h_t.fit(X, y, sample_weight=self.sample_weight)
        
        # Compute the edge of hypothesis h_t and add to list (step b in Algorithm 2)
        edge_t = np.sum( self.sample_weight * y * h_t.predict(X)) 
        self.edges.append(edge_t)
       
        # Find the minimal edge so far and compute current estimate 'rho_t' of optimal 
        # minimal margin (step d in Algorithm 2)
        edge_min = np.min(self.edges)
        rho_t = edge_min - self.v
        
        # Compute the weight for hypothesis h_t (step e in Algorithm 2)
        edge_part =  1/2 * np.log( (1+edge_t) / (1-edge_t) ) 
        rho_part  =  1/2 * np.log( (1+rho_t)  / (1-rho_t)  )
        alpha_t = edge_part - rho_part
    
        # Append estimator and weight to list. 
        self.estimators.append(h_t)
        self.alphas.append(alpha_t)

        # Update sample weight (step f in Algorithm 2)
        self.sample_weight = self.sample_weight * np.exp(- alpha_t * y * h_t.predict(X) )
        self.sample_weight = self.sample_weight / np.sum(self.sample_weight)
        
        # If DT perfectly classifies the weighed data set we are done (step c in Algorithm 3)
        # Moved to end because this is not really the case we are interested in. 
        if np.abs(edge_t) == 1.0: 
            assert False, "Edge was 1.0, perfect classification, this actually happens?"
        
         
    
    def step_to(self, X, y, T, plot=None, color='-go', title=None):
        """ Performs AdaBoostV steps until we reach T hypothesis. 
        
        """
        n, d = X.shape
        
        # Compute how many hypothesis we need to add. 
        current_hypothesis = len(self.alphas)
        remaining = T - current_hypothesis
 
        # Visualization stuff
        minimal_margins = []
        exp_loss = []
        xs = []
        if plot is not None and current_hypothesis != 0: 
            xs.append(current_hypothesis)
            
            # TODO: optimize so margins isn't computed twice. 
            exp_loss.append(self.exponential_loss(X, y))
            minimal_margins.append(self.minimal_margin(X, y))
            
            
        # Take the AdaBoostV steps that remain before we reach T hypothesis. 
        for i in range(remaining): 
            self.step(X, y) 
            
            # Visualization; 
            if plot is not None:
                xs.append(len(self.alphas))
                
                # TODO: optimize so margins isn't computed twice. 
                exp_loss.append(self.exponential_loss(X, y))
                minimal_margins.append(self.minimal_margin(X, y))
                
                plot[1][0].plot(xs, minimal_margins, color)
                plot[1][0].set_xlabel("# hypothesis")
                plot[1][0].set_ylabel("Minimal Margin")
                plot[1][1].plot(xs, exp_loss, color)
                plot[1][1].set_xlabel("# hypothesis")
                plot[1][1].set_ylabel("Minimal Edge")
                plot[0].canvas.draw()
                
        self.train_margins = minimal_margins
        return minimal_margins, exp_loss
        
    def predict(self, X):
        """ Returns the predicted labels for the given data. 
        
        Parameters
        ----------
        X:     Matrix of size (n, d) with data point x_i as the i'th row. 
        
        Returns
        -------
        pred:  Array of size (n, ) where the i'th entry has the prediction of x_i. 
        
        """
        n, d = X.shape
        
        # Predictions is similar to original AdaBoost, but unsigned with normalized weights (alpha). 
        pred = [self.alphas[t]*self.estimators[t].predict(X) for t in range(len(self.estimators))]
        pred = np.sum(pred, axis=0)
        
        assert pred.shape == (n, ), pred.shape
        return pred / np.sum(self.alphas)
    
    def margins(self, X, y):
        """ Compute margins. The margin of a data point x_i is defined as
        
            margin(x_i) := y_i * H(x) = y_i * sum_t alpha_t h_t(x) / sum_t alpha_t
        
        where the last part is just normalizing alphas. Notice predict takes care of 
        normalization so we do not have to do it here. 
        
        Parameters
        ----------
        X:     Matrix of size (n, d) with data point x_i as the i'th row. 
        y:     Array of size (n, ) with label y_i as the i'th entry. 
        
        Returns
        -------
        pred:  Array of size (n, ) where the i'th entry has the prediction of x_i. 
        """
        return y * self.predict(X)

    def minimal_margin(self, X, y):
        """ Returns the minimal margin. """
        return np.min(self.margins(X, y))
    
    def sparsify(self, X, y, target, plot=None, color='bo'):
        """ Assumes fit has already been called. Uses data to sparsify the AdaBoostV ensemble
        into fewer hypothesis while preserving minimal margin up to \sqrt(\log(n/t)/t). 
                                                         
        Parameters
        ----------
        X:     Matrix of size (n, d) with data point x_i as the i'th row. 
        y:     Array of size (n, ) with label y_i as the i'th entry. 
        """
        # Compute the game matrix U such that U_ij = h_j(x_i)
        U = self.compute_matrix(X, y)
        n, T = U.shape
        
        # Normalize before sparisfying 
        # self.alphas = self.alphas / np.sum(self.alphas)
        
        # Call the sparisifaction code with game matrix U and weights
        w = sparsipy.sparsify(U, self.alphas, target)
        
        # TODO: How should we set theese guys?
        #self.v = np.sqrt(2*np.log(n/T) / T) ;; This value is not what we would expect, wrong calculation.
        self.edges = []
    
        # Each iteration of the above sparsification yields a set of weights. 
        # For each set of weights try train up to T hypothesis and see how well it performs. 
        for i in range(len(w)-1, 0, -1):
            
            # Count how many hypothesis with non-zero weight after sparsification.
            # If less than 80 just stop. 
            non_zero = sum(w[i] != 0)
            if non_zero > self.limit: break
            
            # Copy the pre-trained AdaBoostV object we while overwrite with new weights. 
            current_ada = copy.deepcopy(self)
            
            # Pair the non-zero weight entries with their corresponding estimator.  
            lists = []
            for j in range(w.shape[1]):
                lists.append( (w[i][j], current_ada.estimators[j]) )
            lists = sorted(lists, key = lambda x: x[0], reverse=True)
            
            # Update AdaBoostV copy to hold new weights and corresponding estimators. 
            current_ada.estimators = [tupple[1] for tupple in lists if tupple[0] > 0.0] 
            current_ada.alphas     = [tupple[0] for tupple in lists if tupple[0] > 0.0]

            # Remove. 
            #if current_ada.post_normalize == True: 
            #current_ada.alphas = list(current_ada.alphas / np.sum(current_ada.alphas))
            
            # Update sample_weight; not that self.margins(X, y) uses self.predict(X) which
            # normalizes the alphas. The JLMR article does not normalizate! 
            # (if we normalize above it maes no difference, the normalization step of predict
            # will just do nothing because self.alpha already sums to one. )
            unnormalized_predictions = [current_ada.alphas[t]*current_ada.estimators[t].predict(X) for t in range(len(current_ada.estimators))]
            unnormalized_predictions = np.sum(unnormalized_predictions, axis=0)
            unnormalized_margins = y * unnormalized_predictions
            
            current_ada.sample_weight = np.exp(- unnormalized_margins) 
            current_ada.sample_weight = current_ada.sample_weight / np.sum(current_ada.sample_weight)
            
            # Would this make a difference?
            if False:
                current_ada.sample_weight = np.ones(n)/n # I don't think this make any difference when we normalize afterwards?
                current_ada.sample_weight = current_ada.sample_weight * np.exp(- unnormalized_margins) 
                current_ada.sample_weight = current_ada.sample_weight / np.sum(current_ada.sample_weight)
           
            # Print number of hypothesis after sparsification, initial minimal margin, and
            # corresponding minimal if we trained directly to this number of hypothesis. 
            print("Sparsify %i gave margin: \t"%(non_zero), current_ada.minimal_margin(X, y), "\t(%f)"%self.train_margins[non_zero])
            
            # Train from current number of hypothesis up to T hypothesis. 
            minimal_margins, _ = current_ada.step_to(X, y, T, plot=plot, color=color)
            
            # Print the results. 
            print("Train %i -> %i gave margin: \t"%(non_zero, T), minimal_margins[-1])   
            
            # Print who won. 
            diff = np.abs(self.train_margins[-1] - minimal_margins[-1])
            if self.train_margins[-1] < minimal_margins[-1]: 
                print("Winner: OUR ALGORITHM! \tAbsolute Difference: %f"%diff)
            else:
                print("Winnder: AdaBoostV!\tAbsolute Difference: %f"%diff)
            
            print("")
        
    def compute_matrix(self, X, y): 
        '''
        Compute matrix U such that U_ij = h_j(x_i)
        Notice U will have size n x T
        '''
        T = len(self.alphas)
        n, d = X.shape
        U = np.zeros( (n, T) )
        
        for i in range(T):
            U[:, i] = self.estimators[i].predict(X)
            
        return U
    
    def score(self, X, y):
        n, d = X.shape
        pred = np.sign(self.predict(X))
        return sum(pred == y) / n

    
    # TODO: Remove. 
    def exponential_loss(self, X, y):
        if self.edges != []: return np.min(self.edges)
        else: return 1
    
    
    
def initialize_experiment(X, y, T):
    
    ada = AdaBoostV(debug=False, pre_normalize=False, post_normalize=False)
    
    fig_ax_tupple = plt.subplots(1, 2, figsize=(10,5)) 
    fig_ax_tupple[1][0].set_xscale("log")
    fig_ax_tupple[1][1].set_xscale("log")
    
    title = "Initial AdaBoost, T=" + str(T)
    fig_ax_tupple[1][0].set_title("Margins: " +title)
    fig_ax_tupple[1][1].set_title("Exponential Loss: " + title)
    
    margins_exp_loss = ada.fit(X, y, T, plot=fig_ax_tupple, color='-g')
    
    return ada, margins_exp_loss
    
# reuse ada to be more fair between experiments. 
def experiment(ada, margins_exp_loss, pre_normalize, post_normalize, T):
    margins  = margins_exp_loss[0]
    exp_loss = margins_exp_loss[1]
    
    fig_ax_tupple = plt.subplots(1, 2, figsize=(10,5)) 
    fig_ax_tupple[1][0].set_xscale("log")
    fig_ax_tupple[1][1].set_xscale("log")
    
    title = "T=" + str(T)
    fig_ax_tupple[1][0].set_title("Margins: " + title)
    fig_ax_tupple[1][1].set_title("Minimal Edge: " + title)
    
    fig_ax_tupple[1][0].plot(range(1, T+1), margins, '-g')
    fig_ax_tupple[1][1].plot(range(1, T+1), exp_loss, '-g')

    print("Fit %i directly: \t\t"%T, margins[-1]) 
    ada.sparsify(X, y, 10, plot=fig_ax_tupple, color='-b')
    fig_ax_tupple[0].savefig("figures/" + title)
    fig_ax_tupple[0].canvas.draw()
    

# Sanity check: compare AdaBoostV with SKlearns AdaBoost

In [46]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification

X, y = make_classification(1000, 5, n_classes=2)
y = y*2-1

T = 100
sklearn_ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),  algorithm="SAMME", n_estimators=T)
sklearn_ada.fit(X, y)

alex_ada = AdaBoostV()
alex_ada.fit(X, y, T)

print("Sklearn: \t", sklearn_ada.score(X, y))
print("Alex: \t\t", alex_ada.score(X, y)) 

0.371692218885
Sklearn: 	 0.891
Alex: 		 0.974


# Plot minimal margin during fit

In [6]:
from sklearn.datasets import make_classification

X, y = make_classification(100, 5, n_classes=2)
y = y*2-1
T = 100

alex_ada, margins = initialize_experiment(X, y, T) # init my adaboost with plot, then fit. 

<IPython.core.display.Javascript object>

in singular transformations; automatically expanding.
left=1.0, right=1.0
  'left=%s, right=%s') % (left, right))


# Simple sparify experiment

In [27]:
T = 100

experiment(ada, margins, True, True, T)

<IPython.core.display.Javascript object>

Fit 100 directly: 		 -2.50743738622e-05

Sparsify 8 gave margin: 	 -0.0202878334469 	(-0.147905)
Train 8 -> 100 gave margin: 	 -0.00305832396007

Sparsify 15 gave margin: 	 -0.00824889638855 	(-0.028762)
Train 15 -> 100 gave margin: 	 -0.00185926731591

Sparsify 21 gave margin: 	 -0.00701300948566 	(-0.011175)
Train 21 -> 100 gave margin: 	 -0.00155729879124

Sparsify 48 gave margin: 	 -0.00513200729758 	(-0.000693)
Train 48 -> 100 gave margin: 	 -0.00157778144175


# Big experiment

In [45]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification

X, y = make_classification(1000, 5, n_classes=2)
y = y*2-1

T = 50

ada, margins = initialize_experiment(X, y, T)

# OBS: I don't re-use sparsification between guys sharing pre_norm=True/False 
experiment(ada, margins, False, False, T) 

<IPython.core.display.Javascript object>

0.525652176976


in singular transformations; automatically expanding.
left=1.0, right=1.0
  'left=%s, right=%s') % (left, right))


<IPython.core.display.Javascript object>

Fit 50 directly: 		 0.0116520198428
Sparsify 10 gave margin: 	 -0.400039976421 	(-0.264213)
Train 10 -> 50 gave margin: 	 0.0141633425772
Winner: OUR ALGORITHM! 	Absolute Difference: 0.002511

Sparsify 24 gave margin: 	 -0.174923535404 	(-0.072208)
Train 24 -> 50 gave margin: 	 0.0294626740307
Winner: OUR ALGORITHM! 	Absolute Difference: 0.017811

