### PLEASE comment/document what each function is doing in your own words to demonstrate your understanding. Of course, push it up to your GitHub on completion.

In [3]:
# importing packages
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_hastie_10_2
import matplotlib.pyplot as plt


In [10]:
""" HELPER FUNCTION: GET ERROR RATE ========================================="""
def get_error_rate(pred, Y):
    """Return the fraction of predictions that differ from the true labels.

    Parameters
    ----------
    pred : array-like
        Predicted labels (1-D).
    Y : array-like
        True labels, same length as ``pred``.

    Returns
    -------
    float
        Number of mismatching positions divided by ``len(Y)``.

    Raises
    ------
    ValueError
        If ``pred`` and ``Y`` do not have the same shape.
    ZeroDivisionError
        If ``Y`` is empty.
    """
    # Convert first: comparing two plain lists with != yields a single bool,
    # not an element-wise result, which made the original fail on lists.
    pred_arr = np.asarray(pred)
    y_arr = np.asarray(Y)
    if pred_arr.shape != y_arr.shape:
        raise ValueError(
            'pred and Y must have the same shape, got %s and %s'
            % (pred_arr.shape, y_arr.shape))
    n_wrong = np.count_nonzero(pred_arr != y_arr)
    # Plain Python division keeps the ZeroDivisionError on empty input.
    return float(n_wrong) / len(y_arr)

""" HELPER FUNCTION: PRINT ERROR RATE ======================================="""
def print_error_rate(err):
    """Print a (training error, test error) pair with four decimal places.

    Parameters
    ----------
    err : sequence of two numbers
        ``(train_error, test_error)``. Any two-element iterable is accepted
        (tuple, list, array), not only a tuple.
    """
    # Unpack explicitly: '%' formatting treats a non-tuple as ONE argument,
    # so passing a list or array directly would raise TypeError.
    train_err, test_err = err
    print('Error rate: Training: %.4f - Test: %.4f' % (train_err, test_err))

""" HELPER FUNCTION: GENERIC CLASSIFIER ====================================="""
def generic_clf(Y_train, X_train, Y_test, X_test, clf):
    """Fit ``clf`` on the training split and report its error rates.

    Parameters
    ----------
    Y_train, X_train : training labels and features.
    Y_test, X_test : test labels and features.
    clf : estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    tuple
        ``(training_error, test_error)`` as computed by ``get_error_rate``.
    """
    # Train once on the training split only.
    clf.fit(X_train, Y_train)

    # Predict on both splits with the fitted model.
    train_predictions = clf.predict(X_train)
    test_predictions = clf.predict(X_test)

    train_error = get_error_rate(train_predictions, Y_train)
    test_error = get_error_rate(test_predictions, Y_test)
    return train_error, test_error

In [5]:
""" ADABOOST IMPLEMENTATION ================================================="""
def adaboost_clf(Y_train, X_train, Y_test, X_test, M, clf):
    """Run ``M`` rounds of AdaBoost with ``clf`` as weak learner.

    Each round refits ``clf`` on the training data using the current sample
    weights, measures its weighted training error, derives a vote weight
    ``alpha`` from that error, increases the weights of misclassified samples
    (and decreases those of correctly classified ones), and adds the
    alpha-weighted predictions to a running score. The final prediction is
    the sign of the accumulated score, so labels are expected to be encoded
    as -1 / +1.

    Parameters
    ----------
    Y_train, X_train : training labels and features.
    Y_test, X_test : test labels and features.
    M : int
        Number of boosting rounds.
    clf : estimator whose ``fit`` accepts ``sample_weight`` and which
        exposes ``predict``. It is refitted in place every round.

    Returns
    -------
    tuple
        ``(training_error, test_error)`` of the boosted ensemble.
    """
    n_train, n_test = len(X_train), len(X_test)
    y_train = np.asarray(Y_train)

    # Start with uniform sample weights that sum to 1.
    w = np.ones(n_train) / n_train
    # Running alpha-weighted vote totals for every sample.
    pred_train, pred_test = np.zeros(n_train), np.zeros(n_test)

    # Bounds for the weighted error. A weak learner with exactly 0 (or 1)
    # error would otherwise give alpha = +/-inf, zero out every weight and
    # turn all later rounds into NaN.
    eps = 1e-10

    for _ in range(M):
        # Fit the weak learner on the re-weighted training set.
        clf.fit(X_train, Y_train, sample_weight=w)
        pred_train_i = np.asarray(clf.predict(X_train))
        pred_test_i = np.asarray(clf.predict(X_test))

        # Indicator: 1.0 where this round's prediction is WRONG, else 0.0.
        miss = (pred_train_i != y_train).astype(float)
        # Same indicator as +1 (misclassified) / -1 (correct) for the update.
        miss2 = 2.0 * miss - 1.0

        # Weighted training error of this round's learner.
        err_m = np.clip(np.dot(w, miss) / np.sum(w), eps, 1.0 - eps)
        # Vote weight: large when the error is small, 0 at err = 0.5.
        alpha_m = 0.5 * np.log((1.0 - err_m) / err_m)

        # Up-weight misclassified samples, down-weight correct ones, then
        # renormalise so the weights cannot drift towards 0 or overflow.
        w = w * np.exp(alpha_m * miss2)
        w = w / np.sum(w)

        # Accumulate this learner's weighted vote.
        pred_train = pred_train + alpha_m * pred_train_i
        pred_test = pred_test + alpha_m * pred_test_i

    # The ensemble prediction is the sign of the accumulated vote.
    pred_train, pred_test = np.sign(pred_train), np.sign(pred_test)
    return get_error_rate(pred_train, Y_train), \
           get_error_rate(pred_test, Y_test)

In [6]:

""" PLOT FUNCTION ==========================================================="""
def plot_error_rate(er_train, er_test, iterations=None):
    """Plot training and test error rates as two lines in one figure.

    Parameters
    ----------
    er_train, er_test : sequence of float
        Error rates, one entry per evaluated model. ``er_test[0]`` is also
        drawn as a dashed red horizontal reference line.
    iterations : sequence, optional
        Number of boosting iterations behind each entry. When given, it is
        used as the x values so the axis shows the real iteration counts.
        When omitted, the legacy tick labels 0, 50, ..., 400 are spread
        evenly over the plotted points; these are only meaningful if the
        data really spans 0-400 iterations, so pass ``iterations`` otherwise.
    """
    # One column per curve: rows are the successive evaluations.
    df_error = pd.DataFrame([er_train, er_test]).T
    df_error.columns = ['Training', 'Test']
    if iterations is not None:
        df_error.index = list(iterations)

    plot1 = df_error.plot(linewidth = 3, figsize = (8,6),
            color = ['lightblue', 'darkblue'], grid = True)
    plot1.set_xlabel('Number of iterations', fontsize = 12)
    if iterations is None:
        # Pin the tick POSITIONS before setting labels; labels alone would be
        # attached to whatever ticks matplotlib happened to choose.
        legacy_labels = list(range(0,450,50))
        last_position = max(len(df_error) - 1, 1)
        plot1.set_xticks(np.linspace(0, last_position, len(legacy_labels)))
        plot1.set_xticklabels(legacy_labels)
    plot1.set_ylabel('Error rate', fontsize = 12)
    plot1.set_title('Error rate vs number of iterations', fontsize = 16)
    # Draw on the axes created above rather than on pyplot's "current" axes.
    plot1.axhline(y=er_test[0], linewidth=1, color = 'red', ls = 'dashed')



