In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [38]:
# Helper functions for a single boosting round.
# compute_error returns the weighted error rate of the weak classifier.
# Together the helpers compute the error rate, alpha and the updated weights w.
def compute_error(y, y_pred, w_i):
    '''
    Calculate the weighted error rate of a weak classifier m. Arguments:
    y: actual target value
    y_pred: predicted value by weak classifier
    w_i: individual weights for each observation

    Returns the total weight of the misclassified observations divided by the total weight.

    Note that all arrays should be the same length. The inputs are converted to plain NumPy
    arrays first, so observations are matched by position even when y or w_i are pandas
    objects that carry different indexes.
    '''
    y = np.asarray(y)
    y_pred = np.asarray(y_pred)
    w_i = np.asarray(w_i, dtype = float)

    misclassified = np.not_equal(y, y_pred)  # True where the weak classifier is wrong

    # np.sum is vectorised; the built-in sum() would iterate over the elements one at a time
    return np.sum(w_i * misclassified) / np.sum(w_i)

def compute_alpha(error):
    '''
    Calculate the weight of a weak classifier m in the majority vote of the final classifier. This is called
    alpha in chapter 10.1 of The Elements of Statistical Learning. Arguments:
    error: error rate from weak classifier m

    Returns log((1 - error) / error). The error is first clamped to [eps, 1 - eps] so that a
    weak classifier with an error of exactly 0 or 1 gets a large but finite alpha instead of
    a division by zero or log(0).
    '''
    eps = 1e-10  # Only changes the result for errors closer than eps to 0 or 1
    error = np.clip(error, eps, 1 - eps)
    return np.log((1 - error) / error)

def update_weights(w_i, alpha, y, y_pred):
    '''
    Compute the observation weights for the next boosting iteration. Arguments:
    w_i: individual weights for each observation
    alpha: weight of weak classifier used to estimate y_pred
    y: actual target value
    y_pred: predicted value by weak classifier

    Misclassified observations have their weight multiplied by exp(alpha);
    correctly classified observations keep their weight (factor exp(0) = 1).
    '''
    # 1 where the weak classifier was wrong, 0 where it was right
    missed = np.not_equal(y, y_pred).astype(int)
    scale = np.exp(alpha * missed)
    return w_i * scale

In [65]:
# Define AdaBoost class
class AdaBoost:
    '''
    AdaBoost ensemble of decision stumps for binary classification. The individual boosting
    steps are delegated to compute_error, compute_alpha and update_weights.

    Labels are expected to be coded as -1 / +1, because predict() takes the sign of the
    alpha-weighted sum of the weak predictions.

    Attributes:
    alphas: vote weight of each weak classifier, in fitting order
    G_M: fitted weak classifiers, in fitting order
    M: number of boosting rounds requested in the last call to fit (None before fitting)
    training_errors: weighted training error of each weak classifier
    prediction_errors: initialised empty; not filled by this class
    '''

    def __init__(self):
        self.alphas = []
        self.G_M = []
        self.M = None
        self.training_errors = []
        self.prediction_errors = []

    def fit(self, X, y, M = 100):
        '''
        Fit M decision stumps sequentially, re-weighting the observations after each one. Arguments:
        X: independent variables - array-like
        y: target variable - array-like
        M: number of boosting rounds - default = 100
        '''

        # Clear state from any previous call, so that refitting starts from scratch.
        # G_M has to be reset too, otherwise the stumps of an earlier fit stay in the list.
        self.alphas = []
        self.G_M = []
        self.training_errors = []
        self.M = M

        # Plain array: weights and labels are then combined by position, never by pandas index
        y = np.asarray(y)

        # Iterate over M weak classifiers
        for m in range(M):

            # Set weights for current boosting iteration
            if m == 0:
                w_i = np.ones(len(y)) / len(y)  # At m = 0, weights are all the same and equal to 1 / N
            else:
                # (d) Update w_i using alpha and the predictions of the previous round
                w_i = update_weights(w_i, alpha_m, y, y_pred)

            # (a) Fit weak classifier and predict labels
            G_m = DecisionTreeClassifier(max_depth = 1)     # Stump: Two terminal-node classification tree
            G_m.fit(X, y, sample_weight = w_i)
            y_pred = G_m.predict(X)

            self.G_M.append(G_m) # Save to list of weak classifiers

            # (b) Compute error
            error_m = compute_error(y, y_pred, w_i)
            self.training_errors.append(error_m)

            # (c) Compute alpha
            alpha_m = compute_alpha(error_m)
            self.alphas.append(alpha_m)

        assert len(self.G_M) == len(self.alphas)

    def predict(self, X):
        '''
        Predict using fitted model. Arguments:
        X: independent variables - array-like

        Returns a pandas Series (index 0 .. len(X) - 1) with the sign of the alpha-weighted
        sum of the weak predictions; an exact tie therefore yields 0.
        Raises ValueError if the model has not been fitted.
        '''

        if self.M is None:
            raise ValueError('This AdaBoost instance is not fitted yet; call fit before predict.')

        n_obs = len(X)

        # Accumulate the alpha-weighted votes in a single float array
        votes = np.zeros(n_obs)
        for G_m, alpha_m in zip(self.G_M, self.alphas):
            votes += alpha_m * G_m.predict(X)

        # Calculate final predictions
        y_pred = pd.Series(np.sign(votes).astype(int), index = range(n_obs))

        return y_pred

In [6]:
# Load the loan data; low_memory = False lets pandas infer each column's dtype from the whole file
loans = pd.read_csv('lending-club-data.csv', low_memory = False)

In [7]:
features = ['grade',              # grade of the loan
            'term',               # the term of the loan
            'home_ownership',     # home ownership status: own, mortgage or rent
            'emp_length',         # number of years of employment
           ]
target = 'safe_loans'

# Recode the label: bad_loans == 0 becomes +1, every other value becomes -1.
# The comparison is vectorised (no row-wise apply) and the frame is not mutated in place;
# selecting features + [target] already leaves the 'bad_loans' column behind.
loans = (
    loans
    .assign(**{target: loans['bad_loans'].eq(0).map({True: +1, False: -1})})
    [features + [target]]
)

In [8]:
# Columns stored with dtype 'object' are the ones that get one-hot encoded
cat_cols = [col for col, dtype in loans.dtypes.items() if dtype == 'object']
onehot = pd.get_dummies(loans[cat_cols], prefix = cat_cols)

# Replace the raw categorical columns by their indicator columns
loans_data = pd.concat([loans.drop(columns = cat_cols), onehot], axis = 1)
loans_data.columns

Index(['safe_loans', 'grade_A', 'grade_B', 'grade_C', 'grade_D', 'grade_E',
       'grade_F', 'grade_G', 'term_ 36 months', 'term_ 60 months',
       'home_ownership_MORTGAGE', 'home_ownership_OTHER', 'home_ownership_OWN',
       'home_ownership_RENT', 'emp_length_1 year', 'emp_length_10+ years',
       'emp_length_2 years', 'emp_length_3 years', 'emp_length_4 years',
       'emp_length_5 years', 'emp_length_6 years', 'emp_length_7 years',
       'emp_length_8 years', 'emp_length_9 years', 'emp_length_< 1 year'],
      dtype='object')

In [9]:
import json

# Row indices of the validation split (read from the *test-idx* file)
with open('module-8-assignment-2-test-idx.json', 'r') as js:
    valid_lst = list(json.load(js))

# Row indices of the training split
with open('module-8-assignment-2-train-idx.json', 'r') as js:
    train_lst = list(json.load(js))

In [10]:
# Select the rows of each split by integer position (iloc), using the index lists read from the JSON files
train_data = loans_data.iloc[train_lst]
test_data = loans_data.iloc[valid_lst]

In [11]:
# Rebind `features` (until now the raw column names) to the one-hot encoded column names
features = loans_data.columns.to_list()
features.remove('safe_loans')  # Remove the response variable
features

['grade_A',
 'grade_B',
 'grade_C',
 'grade_D',
 'grade_E',
 'grade_F',
 'grade_G',
 'term_ 36 months',
 'term_ 60 months',
 'home_ownership_MORTGAGE',
 'home_ownership_OTHER',
 'home_ownership_OWN',
 'home_ownership_RENT',
 'emp_length_1 year',
 'emp_length_10+ years',
 'emp_length_2 years',
 'emp_length_3 years',
 'emp_length_4 years',
 'emp_length_5 years',
 'emp_length_6 years',
 'emp_length_7 years',
 'emp_length_8 years',
 'emp_length_9 years',
 'emp_length_< 1 year']

In [66]:
# Create an unfitted instance of the AdaBoost class defined above
adaboost = AdaBoost()

In [67]:
# Fit on the training split; M is left at its default of 100 boosting rounds
adaboost.fit(train_data[features], train_data[target])

In [68]:
# Predict the labels of the held-out split with the 100-round model
y_pred = adaboost.predict(test_data[features])

In [69]:
# Accuracy of the from-scratch model on the held-out split
accuracy_score(test_data[target], y_pred)

0.6233304610081861

In [81]:
# Repeat the experiment with 400 boosting rounds; note that y_pred from the 100-round model is overwritten
adaboost2 = AdaBoost()
adaboost2.fit(train_data[features], train_data[target], M = 400)
y_pred = adaboost2.predict(test_data[features])

In [73]:
# Scores whatever y_pred currently holds.
# NOTE(review): this cell's execution count (73) is lower than that of the cell above (81), so the
# displayed value may predate the 400-round predictions - re-run after the cell above to confirm.
accuracy_score(test_data[target], y_pred)

0.6233304610081861

In [74]:
# scikit-learn's AdaBoost with 100 estimators, as a reference for the from-scratch 100-round model
from sklearn.ensemble import AdaBoostClassifier
adaboost_sklearn = AdaBoostClassifier(n_estimators = 100)

In [75]:
# Fit the reference model on the same training split
adaboost_sklearn.fit(train_data[features], train_data[target])

AdaBoostClassifier(n_estimators=100)

In [76]:
# Predict the held-out split with the reference model
y_pred_sklearn = adaboost_sklearn.predict(test_data[features])

In [77]:
# Accuracy of the 100-estimator reference model on the held-out split
accuracy_score(test_data[target], y_pred_sklearn)

0.6228996122361051

In [84]:
# Reference model with 400 estimators, matching the 400-round from-scratch run
adaboost_sklearn_2 = AdaBoostClassifier(n_estimators = 400)
adaboost_sklearn_2.fit(train_data[features], train_data[target])

AdaBoostClassifier(n_estimators=400)

In [85]:
# y_pred_sklearn is overwritten with the predictions of the 400-estimator reference model before scoring
y_pred_sklearn = adaboost_sklearn_2.predict(test_data[features])
accuracy_score(test_data[target], y_pred_sklearn)

0.6241921585523481