In [177]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import time
import seaborn as sns
from pylab import rcParams
from sklearn.model_selection import KFold
from concurrent.futures import ThreadPoolExecutor

In [178]:
class adabooster:
    def __init__(self,debug = False):
        self.debug = debug
        self.marginDF = None
        return
    def saveClassifier(self, file="Adabooster_single.csv"):
        self.classifier_df = pd.DataFrame(np.concatenate((self.h,np.expand_dims(self.alpha, axis=1)), axis = 1) , columns = ["Threshold","Feature","Direction","Alpha"])
        self.classifier_df.to_csv("Adabooster_single.csv")
    def loadClassifier(self,file = "Adabooster_single.csv"):
        self.classifier_df = pd.read_csv(file,index_col = 0)
    def reset_params(self,Tt, datat):
        'Reset the parameters for the booster round'
        
        self.T = Tt
        self.h  = np.zeros([self.T, 3], dtype=np.float64)
        self.alpha  = np.zeros(self.T, dtype=np.float64)
        self.err  = np.ones(self.T, dtype=np.float64) * np.inf
        self.weight  = np.ones(datat.shape[0], dtype=np.float64) / datat.shape[0]
        self.dim = datat.shape[1]
    def calculate_decision_stump(self,data, feature, weight, label):
        'Calculate the desicion stump for the next booster round'
        Tp=np.float64(0); #T+ total sum of positive examples weights
        Tn=np.float64(0) #T- total sum of negative examples weights
        Sp=np.float64(0) #S+ sum of positive weights below the cuurent threshold
        Sn=np.float64(0) #S- sum of negative weights below the current threshold
        error1=np.float64(0)
        error2=np.float64(0)
        min_error=np.float64(2.0) 
        min_thresh=np.float64(0) 
        direction=1

        y = np.zeros(data.shape[0], dtype=np.int64)

        #get all positive weights    
        temp  = (label == 1)
        temp = np.int64(temp)
        Tp = np.sum(temp * weight)

        #get all negative weights  
        temp  = (label == -1)
        temp = np.int64(temp)
        Tn = np.sum(temp * weight)

        #sort feature values
        sorted_labels = data[:, feature].argsort()
        sorted_vector =  data[sorted_labels]

        length = len(sorted_vector)
        for i in range(length):

            #RIGHT DIRECTION THRESHOLD
            #error1 is the sum of positives up to that point + total negatives minus the sum of negatives so far
            error1 = Sp + (Tn - Sn) 
            if label[sorted_labels[i]] == -1 : 
                Sn = Sn +  weight[sorted_labels[i]]
            else :
                Sp = Sp + weight[sorted_labels[i]]

            #LEFT DIRECTION THRESHOLD
            error2 = Sn + (Tp - Sp) 

            if(min_error > error1) :
                min_error = error1
                min_thresh = sorted_vector[i, feature]
                direction = 1
            if(min_error > error2) :
                min_error = error2
                min_thresh = sorted_vector[i, feature]
                direction = -1           

        return min_thresh, direction, min_error
    def calculate_alpha(self,weighted_error):    
        #========================
        #YOUR CODE HERE
        #========================
        
        return  0.5 * np.log( (1.0 - weighted_error) / weighted_error )
    def classify_dataset_against_weak_classifier(self,x, thresh, direction):
        classification = np.zeros(len(x))

        #classifiy all samples based on the last feature
        #get actual classification
        for i in range(len(x)):
            #========================
            #YOUR CODE HERE
            #========================
            if direction == -1:
                if x[i] < thresh: classification[i] = 1
                else : classification[i] = -1
            else:
                if x[i] < thresh: classification[i] = -1
                else : classification[i] = 1    


        return classification 
    def update_weights(self,weight, alpha, classification, label):

        for i in range(len(weight)):
            #========================
            #YOUR CODE HERE
            #========================
            weight[i] =  weight[i] * np.exp( -1.0 * alpha * classification[i] * label[i] ) 

        return weight
    def normalise_weights(self,weight):

        #========================
        #YOUR CODE HERE
        #========================
        weight = weight / np.sum(weight)

        return weight 
    def fit(self, x , label ):
        # This is for future use
        T = self.T
        h = self.h
        alpha = self.alpha
        err = self.err
        weight = self.weight
    
        #0 - for each boosting round
        for t in range(T): 
            #1 - iterate through every feature  

                
            for feature in range(self.dim): 
                weighted_error = np.float64(0)

                #========================
                #2 - GENERATE A DECISION STUMP FOR A FEATURE
                #YOUR CODE HERE
                #========================
                threshold, sign, weighted_error = self.calculate_decision_stump(x , feature, weight, label)
                #========================
                #3 - KEEP TRACK OF THE FEATURE WITH THE LOWEST WEIGHTED ERROR
                #YOUR CODE HERE
                #========================        
                if weighted_error < err[t] :
                    err[t] = weighted_error
                    h[t][0] = threshold
                    h[t][1] = feature
                    h[t][2] = sign


            #========================
            #4 - CALCULATE ALPHA FOR BOOSTING ROUND t
            #YOUR CODE HERE
            #========================            
            alpha[t] = self.calculate_alpha(err[t])

            #========================
            #5 - CLASSIFY ALL SAMPLES BASED ON THE SELECTED FEATURE FOR BOOSTING ROUND t
            #YOUR CODE HERE
            #======================== 
            #print(x[:, int(h[t][1]) ])
            classification = self.classify_dataset_against_weak_classifier(x[:, int(h[t][1]) ], h[t][0], h[t][2] )

            #========================
            #6 - UPDATE WEIGHTS BASED ON THE CORRECTNESS OF THE CLASSIFICATION
            #YOUR CODE HERE
            #========================   
            weight = self.update_weights(weight, alpha[t], classification, label)

            #========================
            #7 - NORMALISE REASSIGNED WEIGHTS
            #YOUR CODE HERE
            #========================  
            weight = self.normalise_weights(weight )

            #--------------------------------------------
            #BOOSTING ALGORITHM DONE
            #--------------------------------------------
            if (self.debug):
               print("Round ",t, " Done!")
    def classify_sample(self,xi ):
        boost_classif = self.classifier_df
        boost_classif = boost_classif.values
        classification_sum = np.float64(0)

        for thresh, feat, sign, alpha in boost_classif:
            feat = np.int64(feat)
            temp = np.float64(0)
            if(sign == 1):
                temp = (xi[feat] >= thresh)
            else:
                temp =  (xi[feat]< thresh)

            temp = alpha*(-1 if temp == 0 else temp)


            classification_sum = classification_sum + temp


        if classification_sum >= 0:
            return 1
        else:
            return -1
    def score(self,test_x,test_y):
        results = []
        for i in range(len(test_x)):
            results.append(self.classify_sample(test_x[i]))
        results = np.array(results)
        return  len(results[results == test_y])/len(results*100)
    def sum_classifier_votes_for_each_sample(self, dataset, df):
        classifier_df = self.classifier_df
        for i in range(len(dataset)):
            classification_sum = np.float64(0)
            neg_votes = np.float64(0)
            pos_votes = np.float64(0)
            for idx, thresh, feat, sign, alpha in classifier_df.itertuples():
                #========================
                #YOUR CODE HERE
                #========================  
                feat = np.int64(feat)
                temp = np.float64(0)
                if(sign == 1):
                    temp = (dataset[i][feat] >= thresh)
                else:
                    temp =  (dataset[i][feat]< thresh)

                temp = alpha*(-1 if temp == 0 else temp)
                if temp < 0:
                    neg_votes = neg_votes+temp
                else : 
                    pos_votes = pos_votes +temp



                classification_sum = classification_sum + temp


            #========================
            #YOUR CODE HERE
            #========================  
            df['sum_alpha'].iloc[i] = classification_sum
            df['pos_votes'].iloc[i] = pos_votes
            df['neg_votes'].iloc[i] = neg_votes

        return df
    def margin_calculation(self,sign, pos, neg, tot_votes):
        #========================
        #YOUR CODE HERE
        #========================   
        margin = (pos/tot_votes if sign>0 else neg/tot_votes)

        return margin
    def margin_calculation_for_training_samples(self, sign, pos, neg, tot_votes ):  
        if np.sign(sign) < 0:
            return np.abs(neg) / tot_votes, -1
        else:
            return pos / tot_votes, 1
    def sign_of_margin(self, margin, classification, true_class_label):
        #========================
        #YOUR CODE HERE
        #========================      
        return (margin if (classification == true_class_label) else -margin)
    def calculate_margins(self,x,label):
        testing_set_df = pd.DataFrame(x)
        testing_set_df['sum_alpha'] = 0 
        testing_set_df['pos_votes'] = 0 
        testing_set_df['neg_votes'] = 0 

        testing_set_df = self.sum_classifier_votes_for_each_sample(x, testing_set_df)
        total_alpha_votes = np.sum(self.classifier_df.Alpha)
        testing_set_df['classification'] = 0
        testing_set_df['margin'] = 0
        testing_set_df['total_alpha_votes'] = total_alpha_votes
        
        result = testing_set_df[['sum_alpha','pos_votes','neg_votes','total_alpha_votes']].apply(lambda x: self.margin_calculation_for_training_samples(*x), axis=1)
        testing_set_df['margin'] = result.apply(lambda x: x[0])
        testing_set_df['classification'] = result.apply(lambda x: x[1])
        testing_set_df['true_class_label'] = label
        
        testing_set_df['sign_of_margin'] = testing_set_df[['margin', 'classification', 'true_class_label']].apply(lambda x: self.sign_of_margin(*x), axis=1)
        self.marginDF = testing_set_df[['sign_of_margin']]
        return self.marginDF
    def plotMargins(self):
        margin_30 = self.marginDF[["sign_of_margin"]]
        sns.kdeplot(margin_30.sign_of_margin, cumulative=True, label='classifier size 30')


In [179]:
# download the dataset and k-fold it
spamDF = pd.read_csv("spambase.data", header = None,index_col=False)
spamDF[:5]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [180]:
# Replace NAN values
for col in spamDF.columns:
    spamDF[col].fillna(spamDF[col].median(),inplace = True)


In [181]:
X = spamDF[list(range(len(spamDF.columns)-1))].values
X.shape

(4601, 57)

In [182]:
Y = spamDF[len(spamDF.columns)-1].apply(lambda a: 1 if a==1 else -1).values
print("0 Identifier: ",len(Y[Y==-1])/len(Y)*100)
print("1 Identifier: ",len(Y[Y==1])/len(Y)*100)
Y.shape
# Clearly the data is skewed, we will need to check that this is the same in the final

0 Identifier:  60.59552271245382
1 Identifier:  39.404477287546186


(4601,)

In [183]:
# Define the folder
kf = KFold(n_splits=3)
# Create the testing sets
Y_correct = Y[Y==1]
Y_incorrect = Y[Y==-1]
X_correct = X[Y==1]
X_incorrect = X[Y==-1]

print(Y_correct[:3])
print(Y_incorrect[:3])

[1 1 1]
[-1 -1 -1]


In [184]:

train_x = []
test_x = []
train_y = []
test_y = []

# Add the correct rows
for train_index, test_index in kf.split(Y_correct):
    X_train, X_test = X_correct[train_index], X_correct[test_index]
    y_train, y_test = Y_correct[train_index], Y_correct[test_index]
    
    train_x.append(X_train)
    test_x.append(X_test)
    train_y.append(y_train)
    test_y.append(y_test)
#     Add the incorrect rows
i = 0
for train_index, test_index in kf.split(Y_incorrect):
    X_train, X_test = X_incorrect[train_index], X_incorrect[test_index]
    y_train, y_test = Y_incorrect[train_index], Y_incorrect[test_index]
    
    train_x[i] = np.append(train_x[i], X_train, axis=0)   
    test_x[i] = np.append(test_x[i],X_test, axis=0)
    train_y[i] = np.append(train_y[i],y_train, axis=0)
    test_y[i]= np.append(test_y[i],y_test, axis=0)
    i = i + 1


In [185]:
min_rows = 1000000000
for i in range(len(train_x)):
    if train_x[i].shape[0] < min_rows:
        min_rows = train_x[i].shape[0]
    print(train_x[i].shape)
print(min_rows)
for i in range(len(train_x)):
    train_x[i] = train_x[i][:min_rows]
    train_y[i] = train_y[i][:min_rows]
    


(3066, 57)
(3068, 57)
(3068, 57)
3066


In [186]:
booster = adabooster(False)
# Create a booster and initialize it with 30 rounds
booster.reset_params(30,train_x[0])

In [187]:
# If a booster already exists, dont worry about classifying (for time constraint)
import os.path
from os import path
boosterFile = "Adabooster_"
for i in range(len(train_x)):
    booster.fit(train_x[i],train_y[i])
    booster.saveClassifier(boosterFile+str(i)+".csv")
    score = booster.score(test_x[i],test_y[i])
    print(score)


0.647557003257329
0.7045009784735812
0.6829745596868885
