In [1]:
# Libraries
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
#import seaborn as sns
#%matplotlib inline
import random
from pprint import pprint
import time
from joblib import Parallel, delayed, parallel_backend, Memory
import math

In [2]:
# Helper functions
# Split the data into training and test sets
def train_test_split(data, test_size):
    """Randomly partition ``data`` into a training frame and a test frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split; rows are picked and dropped by index label.
    test_size : float or int
        A float is taken as a fraction of ``len(data)`` and rounded to the
        nearest integer; any other value is used as-is as the number of
        test rows.

    Returns
    -------
    tuple
        ``(train_data, test_data)`` where ``test_data`` holds the sampled
        rows and ``train_data`` holds every remaining row.
    """
    n_test = round(test_size * len(data)) if isinstance(test_size, float) else test_size

    # Draw the test labels without replacement from the frame's index.
    sampled_labels = random.sample(population=data.index.tolist(), k=n_test)

    return data.drop(sampled_labels), data.loc[sampled_labels]

# residuals calculation
def compute_residuals(datas):
    """Append a residual column (target minus the current last column).

    The new column is named ``residual_<k>`` with
    ``k = number_of_columns - 2 - INDEX_TARGET``. The target is the column at
    position ``INDEX_TARGET`` (module-level global). The frame is modified in
    place and also returned.
    """
    names = datas.columns.tolist()
    step = len(names) - 2 - INDEX_TARGET
    target_name = names[INDEX_TARGET]
    latest_name = names[-1]

    datas["residual_" + str(step)] = datas[target_name] - datas[latest_name]
    return datas

# compute similarity score
def similarity(data):
    """Return the similarity score of the rows in ``data``.

    ``data`` is a 2-D NumPy array. With ``r`` its last column and ``p`` its
    second-to-last column (presumably residuals and previous predicted
    probabilities -- confirm against the caller), the score is
    ``sum(r)**2 / (sum(p * (1 - p)) + 1)``.
    """
    last_col = data[:, -1]
    prev_col = data[:, -2]

    numerator = np.sum(last_col) ** 2
    denominator = np.sum(prev_col * (1 - prev_col)) + 1

    return numerator / denominator

# compute output for each leaf
def output(data):
    """Return the output value of a leaf holding the rows in ``data``.

    ``data`` is a 2-D NumPy array. With ``r`` its last column and ``p`` its
    second-to-last column, the value is ``sum(r) / (sum(p * (1 - p)) + 1)``.
    """
    last_col = data[:, -1]
    prev_col = data[:, -2]

    numerator = np.sum(last_col)
    denominator = np.sum(prev_col * (1 - prev_col)) + 1

    return numerator / denominator
    
#compute gain
def get_gain(data, data_inf, data_sup):
    """Return the gain of splitting ``data`` into ``data_inf`` and ``data_sup``.

    The gain is the sum of the two children's similarity scores minus the
    similarity score of the parent.
    """
    children_score = similarity(data_inf) + similarity(data_sup)
    return children_score - similarity(data)
    
#Normal classification using majority
def classify(data):
    """Return the most frequent value in the last column of ``data``.

    ``data`` is a 2-D NumPy array. On a tie, the smallest of the tied values
    wins, because ``np.unique`` returns sorted values and ``argmax`` keeps the
    first maximum.
    """
    labels, occurrences = np.unique(data[:, -1], return_counts=True)
    return labels[occurrences.argmax()]

#Calculate all potential splits
def get_splits(data):
    """Collect the candidate split values of every feature column.

    Columns ``0 .. INDEX_TARGET - 1`` of the 2-D array ``data`` are scanned,
    except column 13, which is always skipped (hard-coded). For a column whose
    entry in the global ``FEATURE_TYPES`` is ``"Continous"`` the candidates are
    the midpoints between consecutive sorted unique values (a list); for any
    other column they are the unique values themselves (a NumPy array).

    Returns
    -------
    dict
        Maps column index to its candidate split values.
    """
    splits = {}

    for col in range(INDEX_TARGET):
        if col == 13:
            continue

        uniques = np.unique(data[:, col])

        if FEATURE_TYPES[col] == "Continous":
            # Midpoint of each pair of neighbouring unique values.
            splits[col] = [
                np.mean([upper, lower])
                for lower, upper in zip(uniques[:-1], uniques[1:])
            ]
        else:
            splits[col] = uniques

    return splits

#Spliting data
def split_data(data, feature_col, value):
    """Split the rows of ``data`` on one feature.

    When the global ``FEATURE_TYPES[feature_col]`` is ``"Continous"`` the
    first part holds rows with ``feature <= value`` and the second part rows
    with ``feature > value``. Otherwise the first part holds rows with
    ``feature == value`` and the second part rows with ``feature != value``.

    Returns
    -------
    tuple
        ``(data_inf, data_sup)`` as NumPy arrays.
    """
    is_continuous = FEATURE_TYPES[feature_col] == "Continous"
    column = data[:, feature_col]

    if is_continuous:
        keep_inf, keep_sup = column <= value, column > value
    else:
        keep_inf, keep_sup = column == value, column != value

    return data[keep_inf], data[keep_sup]


#determine best split attribute and value
def determine_best_split(data, potential_splits):
    """Find the (column, value) candidate with the highest gain.

    Every candidate in ``potential_splits`` (a dict mapping column index to
    candidate values) is evaluated with ``split_data`` and ``get_gain``. Ties
    go to the candidate examined last, because the comparison is ``>=``.

    Returns
    -------
    tuple
        ``(best_split_column, best_split_value, best_gain)``.

    Notes
    -----
    The running best starts at ``-300000000``; if no candidate reaches that
    value (for instance when there are no candidates at all) the best column
    and value are never assigned and the return statement raises
    ``UnboundLocalError``.
    """
    best_gain = -300000000

    for column, candidates in potential_splits.items():
        for candidate in candidates:
            part_inf, part_sup = split_data(data, column, candidate)
            gain = get_gain(data, part_inf, part_sup)

            if gain >= best_gain:
                best_gain, best_column, best_value = gain, column, candidate

    return best_column, best_value, best_gain

#building decision Tree
def decision_tree(df, counter=0, min_samples=5, max_depth=5, metric="entropy"):
    """Recursively build one regression tree for the boosting loop.

    Parameters
    ----------
    df : pandas.DataFrame or numpy.ndarray
        A DataFrame on the initial call (``counter == 0``); the recursive
        calls pass the NumPy array of the rows that reached the node.
    counter : int
        Current depth; 0 marks the initial call, which also (re)initialises
        the module-level globals ``COLUMNS_NAMES``, ``FEATURE_TYPES``,
        ``METRIC`` and ``GAIN``.
    min_samples : int
        A node with fewer rows than this becomes a leaf.
    max_depth : int
        A node at this depth becomes a leaf.
    metric : str
        Stored in the global ``METRIC`` and forwarded to recursive calls; it
        is not otherwise used by this function.

    Returns
    -------
    dict or float
        A leaf is the value returned by ``output``. An internal node is
        ``{question: [yes_subtree, no_subtree]}`` where ``question`` is
        ``"<column> <= <value> <n_rows> <gain>"`` for a ``"Continous"``
        feature and ``"<column> == <value> <n_rows> <gain>"`` otherwise.
    """
    global COLUMNS_NAMES, FEATURE_TYPES, METRIC, GAIN

    if counter == 0:
        data = df.values
        GAIN = 0
        COLUMNS_NAMES = df.columns[:INDEX_TARGET]
        FEATURE_TYPES = determine_feature_types(df)
        METRIC = metric
    else:
        data = df

    # Base case: too few rows, depth limit reached, or the split that created
    # this node had a negative gain (GAIN is set by the parent just before it
    # recurses into each child).
    if (len(data) < min_samples) or (counter == max_depth) or (GAIN < 0):
        return output(data)

    counter += 1
    potential_splits = get_splits(data)

    # If every feature is constant on this node there is nothing to split on;
    # without this guard determine_best_split would fail with
    # UnboundLocalError, so make the node a leaf instead.
    if not any(len(candidates) for candidates in potential_splits.values()):
        return output(data)

    best_split_column, best_split_value, overall_metric_value = determine_best_split(data, potential_splits)
    data_inf, data_sup = split_data(data, best_split_column, best_split_value)

    # Build the node label; classify_sample later parses this string.
    feature_type = FEATURE_TYPES[best_split_column]
    if feature_type == "Continous":
        question = "{} <= {}".format(COLUMNS_NAMES[best_split_column], best_split_value)
    else:
        question = "{} == {}".format(COLUMNS_NAMES[best_split_column], best_split_value)
    # Append the number of rows at the node and the gain of the split.
    question += " " + str(len(data_inf) + len(data_sup)) + " " + str(overall_metric_value)

    # GAIN is a global that every nested call overwrites, so it has to be
    # reset before *each* child. Otherwise the right child's stop test would
    # read whatever value the left subtree left behind.
    GAIN = overall_metric_value
    yes_answer = decision_tree(data_inf, counter, min_samples, max_depth, metric=METRIC)
    GAIN = overall_metric_value
    no_answer = decision_tree(data_sup, counter, min_samples, max_depth, metric=METRIC)

    # Two identical children carry no information: collapse the node.
    if yes_answer == no_answer:
        sub_tree = yes_answer
    else:
        sub_tree = {question: [yes_answer, no_answer]}

    return sub_tree
        
#classify a sample
def _parse_question(key):
    """Split a node key ``"<feature> <op> <value> <n_rows> <gain>"`` into
    ``(feature, op, value)`` where ``op`` is ``"<="`` or ``"=="``.

    The feature name and the value may themselves contain spaces.

    Raises
    ------
    ValueError
        If the key contains neither ``" <= "`` nor ``" == "``.
    """
    # The two trailing tokens (row count and gain) are produced with str() on
    # numbers and contain no spaces, so they can be stripped from the right.
    question = key.rsplit(" ", 2)[0]

    # The operator is the first " <= " / " == " in what remains; everything
    # before it is the feature name, everything after it is the value.
    hits = [(question.find(sep), sep) for sep in (" <= ", " == ")]
    hits = [hit for hit in hits if hit[0] >= 0]
    if not hits:
        raise ValueError("malformed tree node key: {!r}".format(key))
    position, separator = min(hits)

    return question[:position], separator.strip(), question[position + len(separator):]


def classify_sample(sample, tree):
    """Return the leaf value that ``tree`` assigns to ``sample``.

    Parameters
    ----------
    sample : mapping
        Anything indexable by feature name (e.g. a DataFrame row).
    tree : dict or leaf value
        A node ``{question: [yes_subtree, no_subtree]}`` or a leaf; a
        non-dict ``tree`` is returned unchanged.

    For a ``<=`` question the sample goes to the "yes" branch when
    ``sample[feature] <= float(value)``; for a ``==`` question when
    ``str(sample[feature]) == value``.
    """
    node = tree

    # Walk down until a leaf (anything that is not a dict) is reached.
    while isinstance(node, dict):
        key = list(node.keys())[0]
        feature, comp_op, value = _parse_question(key)

        if comp_op == "<=":
            goes_yes = sample[feature] <= float(value)
        else:
            goes_yes = str(sample[feature]) == value

        node = node[key][0] if goes_yes else node[key][1]

    return node
    
# Pruning tree
def pruning(tree, datas):
    """Unfinished pruning stub.

    Looks at the root node of ``tree`` and prints ``'yes'`` when neither of
    its two children is a dict (i.e. both are leaves). ``datas`` is not used
    and nothing is returned.
    """
    root_key = list(tree)[0]
    left_leaf, right_leaf = tree[root_key]

    has_subtree = isinstance(left_leaf, dict) or isinstance(right_leaf, dict)
    if not has_subtree:
        print('yes')
    

#compute the updated predictions (and, in test mode, the final classification)
def compute_predictions(df, tree):
    """Update the predicted probabilities of ``df`` with one tree.

    The previous probability is read from the second-to-last column when the
    global ``TEST`` is ``False`` and from the last column otherwise. Its
    log-odds are shifted by ``0.3 * leaf_value`` (0.3 is the hard-coded
    learning rate) and mapped back to a probability.

    * ``TEST == False``: the result is stored in a new column
      ``pred_<len(columns) + 1>``.
    * otherwise: the result is thresholded with ``convert_predictions`` into
      the column ``classification``, and ``classification_correct`` records
      whether it equals column ``A15``.

    ``df`` is modified in place and also returned.
    """
    learning_rate = 0.3

    column_names = df.columns.tolist()
    training = TEST == False

    # Column holding the most recent probability estimate.
    previous_name = column_names[-2] if training else column_names[-1]

    # Suffix of the prediction column created in training mode.
    next_index = len(column_names) + 1

    # Leaf value of the tree for every row.
    leaf_values = df.apply(classify_sample, axis=1, args=(tree,))

    previous = df[previous_name]
    log_odds = np.log((previous / (1 - previous))) + leaf_values * learning_rate
    final_pred = np.exp(log_odds) / (1 + np.exp(log_odds))

    if training:
        df["pred_" + str(next_index)] = final_pred
    else:
        df["classification"] = final_pred
        df["classification"] = df["classification"].apply(convert_predictions)
        df["classification_correct"] = df.classification == df.A15

    return df
    
#convert predictions
def convert_predictions(val):
    """Threshold a probability: return 0 when ``val <= 0.5``, otherwise 1."""
    return 0 if val <= 0.5 else 1

# To handle non-continuous values,
# we have to identify the type of every feature in the dataset
def determine_feature_types(data):
    """Label every column of ``data`` as ``"Categorical"`` or ``"Continous"``.

    A column is ``"Categorical"`` when its first unique value is a ``str``;
    every other column is ``"Continous"``. The result is a list with one
    entry per column, in column order.
    """
    def kind_of(col):
        first_value = data[col].unique()[0]
        return "Categorical" if isinstance(first_value, str) else "Continous"

    return [kind_of(col) for col in data.columns]

#boosting algorithm
def boosting(train, test, n_estimators, sample_size, validation_size, early_stopping):
    """Run the boosting loop and keep the tree that scores best on validation.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Training and test frames. ``test`` is modified in place by the final
        call to ``compute_predictions``.
    n_estimators : int
        Maximum number of trees to build.
    sample_size : float
        Fraction of ``len(train)`` (measured *before* the validation split)
        drawn with replacement to fit each tree.
    validation_size : float or int
        Passed to ``train_test_split`` to carve a validation set out of
        ``train``.
    early_stopping : int
        Stop once this many consecutive trees fail to improve the best
        validation accuracy.

    Returns
    -------
    tuple
        ``(best_tree, train, test)``: the best tree (or ``0.5`` if no tree
        ever improved on an accuracy of 0), the training frame with its
        accumulated residual/prediction columns, and ``test`` with the
        classification columns added.

    Notes
    -----
    Each tree is scored on a fresh copy of the validation set, so the
    validation accuracy reflects that tree alone. The global ``TEST`` is
    toggled to tell ``compute_predictions`` which mode to use and is left at
    ``True`` on return.
    """
    # Set sampling size
    size = round(len(train) * sample_size)

    # Hold out a validation set
    train, val = train_test_split(train, validation_size)

    # Rounds since the last improvement. This must be initialised under the
    # same name the loop uses, otherwise `early_in += 1` fails with
    # UnboundLocalError when the first tree does not beat `best_acc`.
    early_in = 0
    best_acc = 0
    best_tree = 0.5

    global TEST

    # Training phase, set test operation to false
    TEST = False

    for k in range(n_estimators):

        print("step: {} left: {}".format(k+1, n_estimators-k-1))

        # residuals computation, then bootstrap sample for this tree
        train = compute_residuals(train)
        train_sample = train.sample(size, replace=True)

        # build tree
        my_tree = decision_tree(train_sample, min_samples=5, max_depth=5)

        # build new predictions on the full training set
        train = compute_predictions(train, my_tree)

        # Validation set evaluation
        TEST = True
        val_test = compute_predictions(val.copy(), my_tree)

        res = compute_metrics(val_test)
        acc = res[0][0]

        if acc > best_acc:
            best_acc = acc
            best_tree = my_tree
            early_in = 0
        else:
            early_in += 1

        if early_in == early_stopping:
            break
        print(" acc: {} early: {}".format(best_acc, early_in))

        TEST = False

    # Testing phase, set test operation to true
    TEST = True
    test = compute_predictions(test, best_tree)

    return best_tree, train, test

def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is 0."""
    return numerator / denominator if denominator else 0.0


def compute_metrics(df):
    """Compute evaluation metrics from a classified frame.

    ``df`` must contain the columns ``A15`` (true label, 0/1),
    ``classification`` (predicted label, 0/1), ``classification_correct``,
    ``A13``, ``A3`` and ``A5``. ``df`` is not modified.

    Returns
    -------
    list
        ``[(accuracy, Precision, Recall, error), (P, M), (TP, FP, FN, TN)]``

        * ``accuracy``: mean of ``classification_correct``.
        * ``Precision``: ``TP / (TP + FP)``; ``Recall``: ``TP / (TP + FN)``.
        * ``error``: ``FP / (TP + FP) + FN / (FN + TN)``.
        * ``P``: sum of ``A13`` over false positives.
        * ``M``: sum of ``A13 * (A3 / 100) * A5 / 12`` over false negatives.

        A ratio whose denominator is zero is reported as 0.0 instead of
        raising ``ZeroDivisionError``.
    """
    actual_pos = df.A15 == 1
    actual_neg = df.A15 == 0
    pred_pos = df.classification == 1
    pred_neg = df.classification == 0

    # 1- Accuracy
    accuracy = df.classification_correct.mean()

    # 2- Confusion matrix, precision, recall (plain ints so that zero
    #    denominators can be detected reliably)
    TP = int((actual_pos & pred_pos).sum())
    FP = int((actual_neg & pred_pos).sum())
    FN = int((actual_pos & pred_neg).sum())
    TN = int((actual_neg & pred_neg).sum())

    Precision = _safe_ratio(TP, TP + FP)
    Recall = _safe_ratio(TP, TP + FN)

    # 3- Lost (P) and no-win (M) amounts
    P = df.A13[actual_neg & pred_pos].sum()

    # Work on the selected rows only; A3 is divided by 100 before use.
    df_no_win = df.loc[actual_pos & pred_neg, ["A13", "A3", "A5"]]
    no_win = df_no_win.A13 * (df_no_win.A3 / 100) * df_no_win.A5 / 12
    M = no_win.sum()

    # 4- Error. The second term is FN / (FN + TN); the parentheses matter,
    #    `FN / FN + TN` would evaluate to 1 + TN.
    error = _safe_ratio(FP, TP + FP) + _safe_ratio(FN, FN + TN)

    # resulting metric list
    result = [(accuracy, Precision, Recall, error), (P, M), (TP, FP, FN, TN)]

    return result

In [None]:
#Execution
# Seed both generators: train_test_split draws from `random`, while the
# bootstrap in boosting uses DataFrame.sample, which draws from NumPy's
# global generator. Seeding only `random` leaves the run non-reproducible.
random.seed(15)
np.random.seed(15)

datas = pd.read_csv('australian.dat',sep=' ')

# Position of the target column; read as a global by the helper functions.
INDEX_TARGET = 14

# Initial prediction: probability 0.5 for every row.
datas['pred_0'] = 0.5
train, test = train_test_split(datas, 0.3)
my_tree, new_train, new_test = boosting(train, test, 100, 0.5, 0.3, 10)

# Show the test metrics; a bare call in the middle of a cell displays nothing.
metrics = compute_metrics(new_test)
pprint(metrics)

new_test.to_excel('classic_boosting_2.xlsx', index=False)