In [104]:
%matplotlib notebook

import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
import sparsipy
import copy
import time

plt.style.use('ggplot')

# Two minor differences compared to AdaBoost:
# 1. the computation of alpha_t
# 2. the final hypothesis uses normalized alphas and does not apply sign.

# The algorithm takes "two parameters":
# T: number of hypotheses
# v: desired accuracy
# where T = 2*ln(n)/v^2 ;; here we only give T and then compute v by v = sqrt(2*ln(n)/T)

# NOTE (former TODO, now implemented):
# the constructor parameter 'compute_margins' controls whether the minimal margin is computed at
# each iteration; it is automatically set to True if visualize is True. Leaving it False speeds up
# training significantly because the margin does not have to be computed at each iteration.
class AdaBoostV():
    """
        AdaBoostV:    http://www.jmlr.org/papers/volume6/ratsch05a/ratsch05a.pdf
        Two minor differences compared to the original AdaBoost algorithm.
        - Computation of the hypothesis weights alpha_t.
        - The final hypothesis uses normalized alphas and does not apply sign.

        Labels are expected to be in {-1, +1} (checked by fit).

        Core functions:
            fit
            predict
            sparsify
            minimal_margin
    """

    # Number of characters used to draw the textual progress bar.
    _BAR_WIDTH = 60

    def __init__(self, debug=False, visualize=False, print_progress = False, timing=False,
                 compute_margins = False, pre_normalize=False,
                 post_normalize=False, limit=80, color='g',
                 max_depth=2, random_state=None):
        """
        Parameters
        ----------
        debug:            Stored on the instance; not read by any method of this class.
        visualize:        If True, create a 1x2 matplotlib figure and plot the minimal margin and
                          the rho estimate while training. Forces compute_margins to True.
        print_progress:   If True, print a textual progress bar during step_to.
        timing:           If True, record wall-clock time in step_to (time0 / time1) and show the
                          elapsed time next to the progress bar.
        compute_margins:  If True, record the minimal margin after every boosting step in
                          self.minimal_margins (slower: needs a full prediction per step).
        pre_normalize:    Stored on the instance; not read by any method of this class.
        post_normalize:   Stored on the instance; not read by any method of this class.
        limit:            sparsify stops looking at weight vectors once one has more than
                          'limit' non-zero entries.
        color:            matplotlib format string used when plotting.
        max_depth:        Depth of the decision trees used as weak learners.
        random_state:     Passed to every DecisionTreeClassifier; set it for reproducible runs.
        """
        self.reset()
        self.debug           = debug
        self.visualize       = visualize
        self.pre_normalize   = pre_normalize
        self.post_normalize  = post_normalize
        self.v               = None
        self.limit           = limit
        self.compute_margins = compute_margins
        self.print_progress  = print_progress
        self.timing          = timing
        self.color           = color
        self.max_depth       = max_depth
        self.random_state    = random_state

        if self.visualize:
            self.fig_ax_tupple = plt.subplots(1, 2, figsize=(10,5))
            # Plotting the minimal margin requires it to be computed at every step.
            self.compute_margins = True

    def reset(self):
        """ Resets field variables. This is called by fit before training to ensure a fresh/clean training. """
        self.estimators      = []
        self.alphas          = []
        self.sample_weight   = None
        self.edges           = []

        # Per-iteration diagnostics. They are cleared here so that calling fit twice does not
        # mix the margins of two different trainings (index k <-> k+1 hypotheses).
        self.minimal_margins = []
        self.xs              = []
        self.rho_estimates   = []

        # Candidates produced by the most recent call to sparsify.
        self.retrained                 = []
        self.retrained_minimal_margins = []

    def fit_sparsify(self, X, y, T, repeat, target=5):
        """ Fit AdaBoostV normally to T hypothesis.
            Sparsify down as far as possible, retrain to T hypothesis.
            Pick the best and repeat 'repeat' times.

        Parameters
        ----------
        X:       Matrix of size (n, d) with data point x_i as the i'th row.
        y:       Array of size (n, ) with label y_i as the i'th entry.
        T:       Number of hypothesis to train to.
        repeat:  Number of sparsify/retrain rounds.
        target:  Target number of hypothesis handed to sparsify.

        After each round the retrained candidate with the LARGEST minimal margin replaces the
        current ensemble (estimators, alphas, sample_weight and edges).
        """
        # Train initially to T hypothesis.
        self.fit(X, y, T)

        for _ in range(repeat):
            # Sparsify and retrain; candidates end up in self.retrained.
            self.sparsify(X, y, target)

            # sparsify may produce no candidate (e.g. every weight vector exceeds self.limit).
            if len(self.retrained) == 0:
                break

            # A larger minimal margin is better, hence argmax.
            best_index = int(np.argmax(self.retrained_minimal_margins))
            best_retrained = self.retrained[best_index]

            # Rebinding 'self' has no effect outside this method, so copy the state over instead.
            self.estimators    = best_retrained.estimators
            self.alphas        = best_retrained.alphas
            self.sample_weight = best_retrained.sample_weight
            self.edges         = best_retrained.edges

    def fit(self, X, y, T):
        """ Train a fresh ensemble of T hypothesis.

        Parameters
        ----------
        X:     Matrix of size (n, d) with data point x_i as the i'th row.
        y:     Array of size (n, ) with label y_i as the i'th entry. Labels must be {-1, +1}.
        T:     Iterations to run the algorithm for. In the JMLR paper, the user supplies both approximation
               parameter v and iterations T. Here we compute approximation parameter as v=sqrt(2*ln(n)/T).

        Returns
        -------
        None. The trained state is stored on the instance; if compute_margins is enabled the
        minimal margin after each iteration is available in self.minimal_margins.

        Raises
        ------
        ValueError:      if T is not positive.
        AssertionError:  if the labels are not exactly {-1, +1}.
        """
        n, d = X.shape

        if T <= 0:
            raise ValueError("T must be a positive number of iterations, got %r" % (T,))

        # Check labels are {-1, +1} and not {0, 1}
        assert set(y) == {-1, +1}, ("Labels should be {-1, +1} and not ", set(y))

        # Make sure we didn't store any estimators/weights from previous iterations.
        self.reset()

        # differ; TODO: read paper and understand how we set approximation parameter.
        self.v = np.sqrt(2*np.log(n) / T)
        print("Setting approximation parameter v=\\sqrt(2\\log(n) / T): ", self.v, "\n")

        # Start with uniform distribution as sample weight
        self.sample_weight = np.ones(n) / n

        # Take T steps of AdaBoostV;
        self.step_to(X, y, T)

    def step(self, X, y):
        """ Perform one iteration of AdaBoostV. This will add
        - one hypothesis h_t to self.estimators,
        - one weight alpha_t to self.alphas
        - one edge    edge_t to self.edges
        and update self.sample_weight.

        Parameters
        ----------
        X:     Matrix of size (n, d) with data point x_i as the i'th row.
        y:     Array of size (n, ) with label y_i as the i'th entry.
        """
        # Make sure that sample weight is a distribution; sum to 1 and >=0.
        assert np.allclose(np.sum(self.sample_weight), 1.0), np.sum(self.sample_weight)
        assert np.all(self.sample_weight >= 0)

        # Learn weak DT classifier on weighed data weighed according to sample_weight
        h_t = DecisionTreeClassifier(max_depth=self.max_depth, random_state=self.random_state)
        h_t.fit(X, y, sample_weight=self.sample_weight)

        # Predict once; the result is needed for both the edge and the weight update.
        votes = h_t.predict(X)

        # Compute the edge of hypothesis h_t and add to list (step b in Algorithm 2)
        edge_t = np.sum(self.sample_weight * y * votes)
        self.edges.append(edge_t)

        # A perfect (anti-)classifier makes the log below divide by zero. Fail before alpha_t
        # and the sample weights are overwritten with inf/nan.
        if np.abs(edge_t) == 1.0:
            assert False, "Edge was 1.0, perfect classification, this actually happens?"

        # Find the minimal edge so far and compute current estimate 'rho_t' of optimal
        # minimal margin (step d in Algorithm 2)
        edge_min = np.min(self.edges)
        rho_t = edge_min - self.v

        # Compute the weight for hypothesis h_t (step e in Algorithm 2)
        edge_part = 1/2 * np.log( (1+edge_t) / (1-edge_t) )
        rho_part  = 1/2 * np.log( (1+rho_t)  / (1-rho_t)  )
        alpha_t = edge_part - rho_part

        # Append estimator and weight to list.
        self.estimators.append(h_t)
        self.alphas.append(alpha_t)

        # Update sample weight (step f in Algorithm 2) and renormalize to a distribution.
        updated = self.sample_weight * np.exp(- alpha_t * y * votes)
        self.sample_weight = updated / np.sum(updated)

    def step_to(self, X, y, T):
        """ Performs AdaBoostV steps until we reach T hypothesis.

        If the ensemble already holds T or more hypothesis no step is taken; the final
        margin/plot/progress bookkeeping still runs.
        """
        # Compute how many hypothesis we need to add (never negative).
        remaining = max(T - len(self.alphas), 0)

        # Start timing if enabled.
        if self.timing: self.time0 = time.time()

        # Take the AdaBoostV steps that remain before we reach T hypothesis.
        for i in range(remaining):
            have_hypothesis = len(self.alphas) > 0
            # Compute and add minimal margin if enabled
            if self.compute_margins and have_hypothesis: self.minimal_margins.append(self.minimal_margin(X, y))
            # Visualize if enabled
            if self.visualize and have_hypothesis: self.visualize_step(X, y)
            # Print progress
            if self.print_progress: self.do_print_progress(i, remaining)

            # Boost one step
            self.step(X, y)

        # Stop timing if enabled
        if self.timing: self.time1 = time.time()

        have_hypothesis = len(self.alphas) > 0
        # Compute and add minimal margin if enabled
        if self.compute_margins and have_hypothesis: self.minimal_margins.append(self.minimal_margin(X, y))
        # Visualize final iteration if enabled, save figure because last step.
        # Skipped without hypothesis: no margin was recorded, so xs and minimal_margins would differ in length.
        if self.visualize and have_hypothesis: self.visualize_step(X, y, save=True)
        # Print done. The loop variable does not exist when no step was taken, so derive the index.
        if self.print_progress:
            last_index = remaining - 1 if remaining > 0 else 0
            self.do_print_progress(last_index, remaining, True)

    def do_print_progress(self, i, remaining, last=False):
        """ Print a one-line progress bar for iteration i out of 'remaining'.

        Parameters
        ----------
        i:          Zero-based index of the current iteration.
        remaining:  Total number of iterations; 0 is shown as 100%.
        last:       If True, overwrite the bar with a final 'DONE!' line.
        """
        width = self._BAR_WIDTH

        # Fraction done; guard against remaining == 0 (nothing to do counts as complete).
        percentages = i / remaining if remaining > 0 else 1.0
        done_num = int(percentages * width)
        space_num = width - done_num - 2

        progress_string = "\r[" + "="*done_num + ">" + " "*space_num + "]\t" + str(round(percentages*100, 3)) + "%"

        print(progress_string, end='')

        # Elapsed time is only known when timing is enabled (time0 is set by step_to).
        time_consumption = ""
        if self.timing:
            time_consumption = str(round(time.time() - self.time0, 3)) + "s"
            print("\t" + time_consumption, end='')

        if last:
            print("\r" + " "*120, end='')
            done_string = "\r[" + "="*done_num + ">]\tDONE!"
            if time_consumption:
                done_string += "\t" + time_consumption
            print(done_string)

    def visualize_step(self, X, y, save=False):
        """ Plot the minimal margin and current rho estimate at an iteration of AdaBoostV.

        Expects that a minimal margin was appended to self.minimal_margins for the current
        number of hypothesis (step_to does this) so that xs and minimal_margins align.

        Parameters
        ----------
        X, y:  Unused here; kept so the signature matches the other per-step helpers.
        save:  Currently unused. TODO: save the figure when True.
        """
        # Add current hypothesis count as x value for plotting.
        self.xs.append(len(self.alphas))

        # Compute current estimate of rho; add the [1] in min
        # because this could be called when self.edges is empty
        # and 1 is the worst value of edge.
        rho_t = np.min(self.edges + [1]) - self.v
        self.rho_estimates.append(rho_t)

        # Update the plot.
        fig, axes = self.fig_ax_tupple
        margin_ax, rho_ax = axes[0], axes[1]

        margin_ax.plot(self.xs, self.minimal_margins, self.color)
        margin_ax.set_xlabel("# hypothesis")
        margin_ax.set_ylabel("Minimal Margin")
        margin_ax.set_title("AdaBoostV w/ Sparsification")

        rho_ax.plot(self.xs, self.rho_estimates, self.color)
        rho_ax.set_xlabel("# hypothesis")
        rho_ax.set_ylabel("Rho Estimate")
        rho_ax.set_title("AdaBoostV 'rho' estimate")

        fig.canvas.draw()

    def _weighted_votes(self, X):
        """ Unnormalized ensemble output sum_t alpha_t * h_t(x_i) for every row x_i of X. """
        votes = [alpha * h.predict(X) for alpha, h in zip(self.alphas, self.estimators)]
        return np.sum(votes, axis=0)

    def predict(self, X):
        """ Returns the real-valued (unsigned) ensemble prediction for the given data.

        Parameters
        ----------
        X:     Matrix of size (n, d) with data point x_i as the i'th row.

        Returns
        -------
        pred:  Array of size (n, ) where the i'th entry is
               sum_t alpha_t h_t(x_i) / sum_t alpha_t. Use np.sign on it to get labels.
        """
        n, d = X.shape

        # Predictions is similar to original AdaBoost, but unsigned with normalized weights (alpha).
        pred = self._weighted_votes(X)

        # Also trips when there are no estimators (the sum is then a scalar).
        assert pred.shape == (n, ), pred.shape
        return pred / np.sum(self.alphas)

    def margins(self, X, y):
        """ Compute margins. The margin of a data point x_i is defined as

            margin(x_i) := y_i * H(x) = y_i * sum_t alpha_t h_t(x) / sum_t alpha_t

        where the last part is just normalizing alphas. Notice predict takes care of
        normalization so we do not have to do it here.

        Parameters
        ----------
        X:     Matrix of size (n, d) with data point x_i as the i'th row.
        y:     Array of size (n, ) with label y_i as the i'th entry.

        Returns
        -------
        margins:  Array of size (n, ) where the i'th entry is the margin of x_i.
        """
        return y * self.predict(X)

    def minimal_margin(self, X, y):
        """ Returns the minimal margin over all rows of X. """
        return np.min(self.margins(X, y))

    def sparsify(self, X, y, target):
        r""" Assumes fit has already been called. Uses data to sparsify the AdaBoostV ensemble
        into fewer hypothesis while preserving minimal margin up to \sqrt(\log(n/t)/t).

        For every weight vector returned by sparsipy.sparsify (visited from the last one down to
        index 1) that has at most self.limit non-zero entries, a copy of this ensemble is
        restricted to the positively weighted hypothesis, retrained up to the original number of
        hypothesis and compared with the direct training. Results are printed, and the retrained
        copies and their minimal margins are stored in self.retrained and
        self.retrained_minimal_margins.

        Side effect: self.edges is cleared.

        Parameters
        ----------
        X:       Matrix of size (n, d) with data point x_i as the i'th row.
        y:       Array of size (n, ) with label y_i as the i'th entry.
        target:  Target passed on to sparsipy.sparsify.
        """
        # Compute the game matrix U such that U_ij = h_j(x_i)
        U = self.compute_matrix(X, y)
        n, T = U.shape

        # Call the sparsification code with game matrix U and weights
        w = sparsipy.sparsify(U, self.alphas, target)

        # TODO: How should we set these guys?
        #self.v = np.sqrt(2*np.log(n/T) / T) ;; This value is not what we would expect, wrong calculation.
        self.edges = []

        # Fresh result containers for this call.
        self.retrained = []
        self.retrained_minimal_margins = []

        # Get the minimal margin of training direct;
        # Re-compute because it will only be stored in self.minimal_margins if compute_margins=True!
        direct_mm = self.minimal_margin(X, y)
        print("Direct AdaBoostV: \t\t", direct_mm, "\n")

        # Each iteration of the above sparsification yields a set of weights.
        # For each set of weights try train up to T hypothesis and see how well it performs.
        for i in range(len(w)-1, 0, -1):

            # Count how many hypothesis with non-zero weight after sparsification.
            # Once a weight vector has more than self.limit of them, stop.
            non_zero = int(np.count_nonzero(w[i]))
            if non_zero > self.limit: break

            # Copy the pre-trained AdaBoostV object which we overwrite with new weights.
            current_ada = copy.deepcopy(self)

            # if we draw new guys make them different color
            if self.visualize: current_ada.color = '-b'  # TODO: change to different colors and add legends

            # Pair every weight with its estimator, heaviest first, and keep the positive ones.
            weighted = [(w[i][j], current_ada.estimators[j]) for j in range(w.shape[1])]
            weighted.sort(key=lambda pair: pair[0], reverse=True)
            kept = [pair for pair in weighted if pair[0] > 0.0]

            # Update AdaBoostV copy to hold new weights and corresponding estimators.
            current_ada.estimators = [pair[1] for pair in kept]
            current_ada.alphas     = [pair[0] for pair in kept]

            # Update sample_weight; note that self.margins(X, y) uses self.predict(X) which
            # normalizes the alphas. The JMLR article does not normalize, so use the
            # unnormalized ensemble output here.
            unnormalized_margins = y * current_ada._weighted_votes(X)
            new_weight = np.exp(- unnormalized_margins)
            current_ada.sample_weight = new_weight / np.sum(new_weight)

            # Print number of hypothesis after sparsification, initial minimal margin, and
            # corresponding minimal margin if we trained directly to this number of hypothesis.
            # minimal_margins[k] is the margin with k+1 hypothesis, hence the index non_zero-1.
            sparse_mm = current_ada.minimal_margin(X, y)
            if self.compute_margins and 0 < non_zero <= len(self.minimal_margins):
                print("Sparsify %i gave margin: \t"%(non_zero), sparse_mm, "\t(%f)"%self.minimal_margins[non_zero - 1])
            else:
                print("Sparsify %i gave margin: \t"%(non_zero), sparse_mm)

            # Train from current number of hypothesis up to T hypothesis.
            current_ada.step_to(X, y, T)
            current_mm = current_ada.minimal_margin(X, y)

            # Keep the candidate so callers (e.g. fit_sparsify) can pick the best one.
            self.retrained.append(current_ada)
            self.retrained_minimal_margins.append(current_mm)

            # Print the results.
            print("Train %i -> %i gave margin: \t"%(non_zero, T), current_mm)

            # Print who won.
            diff = np.abs(direct_mm - current_mm)
            print("Absolute Difference: \t\t", diff)
            if direct_mm < current_mm:
                print("Winner: OUR ALGORITHM!")
            else:
                print("Winner: AdaBoostV!")

            print("")

    def compute_matrix(self, X, y):
        '''
        Compute matrix U such that U_ij = h_j(x_i)
        Notice U will have size n x T. The argument y is not used.
        '''
        T = len(self.alphas)
        n, d = X.shape
        U = np.zeros( (n, T) )

        for i in range(T):
            U[:, i] = self.estimators[i].predict(X)

        return U

    def score(self, X, y):
        """ Fraction of rows of X whose sign(predict) equals the label in y. """
        pred = np.sign(self.predict(X))
        return np.mean(pred == y)

    def __deepcopy__(self, memo):
        """ Copy used by sparsify: estimators and alphas are deep-copied, diagnostics start
        empty, and everything else (figure, settings, v, sample_weight) is shared with the
        original through a shallow copy.
        """
        shallow_copy = copy.copy(self)
        shallow_copy.alphas = copy.deepcopy(self.alphas)
        shallow_copy.estimators = copy.deepcopy(self.estimators)

        # override xs and minimal_margins
        shallow_copy.xs = []
        shallow_copy.minimal_margins = []
        shallow_copy.rho_estimates = []
        shallow_copy.edges = []

        # A copy must not hold on to the candidates of the object it was copied from.
        shallow_copy.retrained = []
        shallow_copy.retrained_minimal_margins = []

        return shallow_copy
        
        
from sklearn.datasets import make_classification

# Synthetic binary problem. The seed is fixed so the data is the same on every
# Restart & Run All.
X, y = make_classification(n_samples=4000, n_features=5, n_classes=2, random_state=0)
y = 2 * y - 1  # map labels {0, 1} -> {-1, +1}, which AdaBoostV.fit asserts on

T = 100
ada = AdaBoostV(print_progress=True, timing=True)

# Train directly to T hypothesis, then sparsify (target 10) and retrain for comparison.
ada.fit(X, y, T)
ada.sparsify(X, y, 10)


Setting approximation parameter v=\sqrt(2\log(n) / T):  0.407284903725 

Direct AdaBoostV: 		 -0.054658839741 

Sparsify 7 gave margin: 	 -0.39797462787
Train 7 -> 100 gave margin: 	 -0.0318111171054
Absolute Difference: 		 0.0228477226356
Winner: OUR ALGORITHM!

Sparsify 13 gave margin: 	 -0.256869075662
Train 13 -> 100 gave margin: 	 -0.0308314723422
Absolute Difference: 		 0.0238273673988
Winner: OUR ALGORITHM!

Sparsify 29 gave margin: 	 -0.225034444594
Train 29 -> 100 gave margin: 	 -0.0314566476046
Absolute Difference: 		 0.0232021921364
Winner: OUR ALGORITHM!

Sparsify 50 gave margin: 	 -0.108216996425
Train 50 -> 100 gave margin: 	 -0.039333294879
Absolute Difference: 		 0.015325544862
Winner: OUR ALGORITHM!



# Sanity check: compare AdaBoostV with scikit-learn's AdaBoost

In [46]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification

# Fixed seeds so both the data and the scikit-learn baseline are reproducible.
X, y = make_classification(n_samples=1000, n_features=5, n_classes=2, random_state=0)
y = 2 * y - 1  # map labels {0, 1} -> {-1, +1}

T = 100
sklearn_ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),  algorithm="SAMME", n_estimators=T, random_state=0)
sklearn_ada.fit(X, y)

alex_ada = AdaBoostV()
alex_ada.fit(X, y, T)

# Training accuracy of both ensembles on the same data.
print("Sklearn: \t", sklearn_ada.score(X, y))
print("Alex: \t\t", alex_ada.score(X, y))

0.371692218885
Sklearn: 	 0.891
Alex: 		 0.974


# Plot minimal margin during fit

In [6]:
from sklearn.datasets import make_classification

# Small synthetic problem; fixed seed so the plotted margins are reproducible.
X, y = make_classification(n_samples=100, n_features=5, n_classes=2, random_state=0)
y = 2 * y - 1  # map labels {0, 1} -> {-1, +1}
T = 100

alex_ada, margins = initialize_experiment(X, y, T) # init my adaboost with plot, then fit.

<IPython.core.display.Javascript object>

in singular transformations; automatically expanding.
left=1.0, right=1.0
  'left=%s, right=%s') % (left, right))


# Simple sparsify experiment

In [27]:
T = 100

# Use the model fitted in the previous cell (bound to `alex_ada` together with `margins`);
# `ada` at this point would be a different, unrelated model.
experiment(alex_ada, margins, True, True, T)

<IPython.core.display.Javascript object>

Fit 100 directly: 		 -2.50743738622e-05

Sparsify 8 gave margin: 	 -0.0202878334469 	(-0.147905)
Train 8 -> 100 gave margin: 	 -0.00305832396007

Sparsify 15 gave margin: 	 -0.00824889638855 	(-0.028762)
Train 15 -> 100 gave margin: 	 -0.00185926731591

Sparsify 21 gave margin: 	 -0.00701300948566 	(-0.011175)
Train 21 -> 100 gave margin: 	 -0.00155729879124

Sparsify 48 gave margin: 	 -0.00513200729758 	(-0.000693)
Train 48 -> 100 gave margin: 	 -0.00157778144175


# Big experiment

In [53]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification

# Fixed seed so the experiment runs on the same data every time.
X, y = make_classification(n_samples=4000, n_features=5, n_classes=2, random_state=0)
y = 2 * y - 1  # map labels {0, 1} -> {-1, +1}

T = 50

ada, margins = initialize_experiment(X, y, T)

# NOTE: the sparsification is not re-used between runs sharing pre_norm=True/False
experiment(ada, margins, False, False, T)

<IPython.core.display.Javascript object>

0.525652176976


in singular transformations; automatically expanding.
left=1.0, right=1.0
  'left=%s, right=%s') % (left, right))


<IPython.core.display.Javascript object>

Fit 50 directly: 		 -0.0452859126263
Sparsify 8 gave margin: 	 -0.500075564491 	(-0.372800)
Train 8 -> 50 gave margin: 	 -0.0446890526881
Winner: OUR ALGORITHM! 	Absolute Difference: 0.000597

Sparsify 12 gave margin: 	 -0.360871160731 	(-0.264433)
Train 12 -> 50 gave margin: 	 -0.0426845860646
Winner: OUR ALGORITHM! 	Absolute Difference: 0.002601

Sparsify 18 gave margin: 	 -0.244081203208 	(-0.166959)
Train 18 -> 50 gave margin: 	 -0.038348674175
Winner: OUR ALGORITHM! 	Absolute Difference: 0.006937



In [59]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification

# Fixed seed so the experiment runs on the same data every time.
X, y = make_classification(n_samples=4000, n_features=5, n_classes=2, random_state=0)
y = 2 * y - 1  # map labels {0, 1} -> {-1, +1}

T = 1024

ada, margins = initialize_experiment(X, y, T)

# NOTE: the sparsification is not re-used between runs sharing pre_norm=True/False
experiment(ada, margins, False, False, T)

<IPython.core.display.Javascript object>

0.127276532414


in singular transformations; automatically expanding.
left=1.0, right=1.0
  'left=%s, right=%s') % (left, right))


<IPython.core.display.Javascript object>

Fit 1024 directly: 		 0.00111371156964
Sparsify 8 gave margin: 	 -0.511260570722 	(-1.000000)
Train 8 -> 1024 gave margin: 	 0.000907303279698
Winnder: AdaBoostV!	Absolute Difference: 0.000206

Sparsify 12 gave margin: 	 -0.397459375519 	(-0.891168)
Train 12 -> 1024 gave margin: 	 -0.00088415865321
Winnder: AdaBoostV!	Absolute Difference: 0.001998

Sparsify 24 gave margin: 	 -0.270795752485 	(-0.574873)
Train 24 -> 1024 gave margin: 	 0.00312501919299
Winner: OUR ALGORITHM! 	Absolute Difference: 0.002011

Sparsify 36 gave margin: 	 -0.163245277183 	(-0.415542)
Train 36 -> 1024 gave margin: 	 0.00178042949816
Winner: OUR ALGORITHM! 	Absolute Difference: 0.000667

Sparsify 71 gave margin: 	 -0.119811552406 	(-0.160797)
Train 71 -> 1024 gave margin: 	 0.00404324350844
Winner: OUR ALGORITHM! 	Absolute Difference: 0.002930

