In [1]:
# %matplotlib qt
import copy
import numpy as np
from collections import Counter
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
def load_data(set_type, test_size):
    """Load one of the bundled text datasets and split it into train and test parts.

    Parameters
    ----------
    set_type : str
        Dataset name: 'BreastTissue', 'Diabetes', 'Glass', 'Ionosphere',
        'Sonar' or 'Wine'. The file ``Datasets/<set_type>.txt`` is read.
    test_size : float or int
        Forwarded unchanged to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, y_train, m, X_test, y_test, m_test, k, n, y_unique)`` where
        ``m`` / ``m_test`` are the row counts of the train / test parts,
        ``k`` is the number of distinct labels, ``n`` the number of feature
        columns and ``y_unique`` the sorted distinct labels (as strings).

    Raises
    ------
    ValueError
        If ``set_type`` is not one of the known dataset names.
    """
    # dataset name -> (column delimiter, number of feature columns).
    # The label is stored in the column right after the features (index n).
    specs = {
        'BreastTissue': ('\t', 9),
        'Diabetes': ('\t', 8),
        'Glass': ('\t', 9),
        'Ionosphere': (',', 34),
        'Sonar': (',', 60),
        'Wine': (', ', 13),
    }
    if set_type not in specs:
        raise ValueError(
            'Unknown set_type %r; expected one of %s' % (set_type, sorted(specs))
        )
    delimiter, n = specs[set_type]

    # A forward slash resolves on every platform; the former 'Datasets\X.txt'
    # literals relied on invalid escape sequences and on Windows separators.
    dataset = np.loadtxt('Datasets/' + set_type + '.txt', delimiter=delimiter, dtype='str')

    # Everything is read as text; the builtin float replaces the removed np.float alias.
    x = dataset[:, :n].astype(float)
    y = dataset[:, n]

    y_unique = np.unique(y)
    k = len(y_unique)
    # random_state is fixed, so repeated calls return the same split.
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=10)

    m = X_train.shape[0]
    m_test = X_test.shape[0]

    return X_train, y_train, m, X_test, y_test, m_test, k, n, y_unique

In [3]:
def add_noise(data, mu=0, sigma=1, percent=0):
    """Return ``data`` with Gaussian noise added to a random subset of its columns.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array; columns are treated as features.
    mu, sigma : float
        Mean and standard deviation passed to ``np.random.normal``.
    percent : float
        Percentage of columns to perturb. The column count is
        ``round(n_columns * percent / 100)``; the columns are drawn without
        replacement.

    Returns
    -------
    numpy.ndarray
        ``data`` itself when ``percent == 0``; otherwise a new array. The input
        array is never modified.
    """
    if percent == 0:
        return data

    number_of_features = np.round(data.shape[1]*percent/100).astype(int)
    features_idx = np.random.choice(data.shape[1], replace=False, size=number_of_features, p=None)

    # Work on a copy so the caller's array is left untouched.
    noisy = data.copy()
    noise = np.random.normal(mu, sigma, noisy[:, features_idx].shape)
    noisy[:, features_idx] = noisy[:, features_idx] + noise
    return noisy

In [4]:
def print_dataset_specification(dataset, X_train, y_train, m, X_test, y_test, m_test, c, n, y_unique):
    """Print a banner with the dataset name, then one ``label = value`` line per property.

    The lines report the train/test array shapes, the sample counts ``m`` and
    ``m_test``, the feature count ``n``, the class count ``c`` and the class
    labels ``y_unique``.
    """
    print('---------------------------DataSet : '+ dataset +'---------------------------')

    # (label, value) pairs in the order they are reported.
    report = (
        ('x_train.shape =', X_train.shape),
        ('y_train.shape =', y_train.shape),
        ('m =', m),
        ('n =', n),
        ('x_test.shape =', X_test.shape),
        ('y_test.shape =', y_test.shape),
        ('m_test =', m_test),
        ('c =', c),
        ('y_unique =', y_unique),
    )
    for label, value in report:
        print(label, value)

In [5]:
def resample_imp(x, y, replace=True, n_samples=None, probabilies=None):
    """Sample rows of ``x`` and the matching entries of ``y`` with ``np.random.choice``.

    ``replace``, ``n_samples`` and ``probabilies`` are forwarded to
    ``np.random.choice`` as ``replace``, ``size`` and ``p``; indices are drawn
    from ``range(x.shape[0])``. Returns the pair ``(x[idx], y[idx])``.
    """
    row_count = x.shape[0]
    picked = np.random.choice(row_count, replace=replace, size=n_samples, p=probabilies)
    return x[picked], y[picked]

In [6]:
def majority_vote(data_list):
    """Return the most frequent value in ``data_list``.

    Ties go to the smallest value, because ``np.unique`` returns the distinct
    values sorted and ``argmax`` picks the first maximum.
    """
    labels, tallies = np.unique(data_list, return_counts=True)
    winner = tallies.argmax()
    return labels[winner]

# Train Data Without Noise:

In [7]:
# Each entry is [dataset name, max_depth for the single tree and the boosted trees].
datasets = [
    ['BreastTissue', 4],
    ['Diabetes', 1],
    ['Glass', 7],
    ['Ionosphere', 1],
    ['Sonar', 1],
    ['Wine', 1],
]
# For a quick check, shrink the list to a single entry such as ['BreastTissue', 4].

In [10]:
# Experiment on clean training data: for every dataset, compare a single
# decision tree, a bagged ensemble of trees and AdaBoost.M1, and report the
# test accuracy averaged over several runs.
runs = 10          # repetitions per dataset
T_bagging = 21     # number of trees in the bagging ensemble
T_boosting = 51    # AdaBoost.M1 keeps T_boosting - 1 trees

for dataset, tree_max_dept in datasets:

    # One row per run; columns are (single tree, bagging, boosting).
    accuracy_for_each_run = np.zeros((runs, 3))
    for run in range(runs):

        # Load Data. load_data passes a fixed random_state to train_test_split,
        # so the train/test split is the same in every run.
        X_train, Y_train, m, X_test, Y_test, m_test, c, n, y_unique = load_data(set_type=dataset, test_size=0.3)

        # ----------------Normal---------------
        normal_model = DecisionTreeClassifier(max_depth=tree_max_dept)
        normal_model.fit(X_train, Y_train)
        normal_y_pred = normal_model.predict(X_test)
        # -------------------------------------

        # ---------------BAGGING---------------
        Models = []
        for _ in range(T_bagging):

            # bootstrap sample (uniform, with replacement)
            x_train, y_train = resample_imp(X_train, Y_train, replace=True, n_samples=m, probabilies=None)

            # A fresh estimator is built every iteration, so it can be stored
            # directly without copying.
            model_i = DecisionTreeClassifier(max_depth=None)
            model_i.fit(x_train, y_train)
            Models.append(model_i)

        # Votes[t, i] is the label tree t predicts for test sample i.
        Votes = np.array([model.predict(X_test) for model in Models])

        # Majority Vote
        bagging_y_pred_final = [majority_vote(Votes[:, i]) for i in range(m_test)]
        # -------------------------------------

        # -------------AdaBoost.M1-------------
        Models = []
        w = np.zeros((T_boosting, m), dtype='float64')   # w[k]: sample weights of round k
        w[0, :] = 1/m
        e = np.zeros((T_boosting))                       # e[k]: weighted training error of round k
        l = np.zeros((T_boosting, m))                    # l[k, j]: 1 if round k misclassifies sample j
        beta = np.zeros((T_boosting-1))
        k = 0

        # NOTE: a rejected round is retried with uniform weights and there is
        # no bound on the number of retries.
        while k <= T_boosting-2:

            # bootstrap sample drawn according to the current weights
            x_train, y_train = resample_imp(X_train, Y_train, replace=True, n_samples=m, probabilies=w[k])

            # create model
            model_k = DecisionTreeClassifier(max_depth=tree_max_dept)
            model_k.fit(x_train, y_train)
            Y_train_pred = model_k.predict(X_train)

            # Weighted error on the full training set
            l[k] = (Y_train_pred != Y_train).astype(int)
            e[k] = w[k] @ l[k]

            if (e[k] == 0) or (e[k] >= 0.5):
                # Discard this tree, reset the weights and retry the same round.
                w[k, :] = 1/m
                continue

            beta[k] = e[k]/(1-e[k])
            # Correctly classified samples (l == 0) are scaled by beta < 1,
            # misclassified ones keep their weight; then renormalise to sum 1.
            scaled = w[k] * beta[k] ** (1 - l[k])
            w[k+1] = scaled / scaled.sum()

            Models.append(model_k)
            k = k + 1

        # Classification phase
        Votes = np.array([model.predict(X_test) for model in Models])

        # Support for class t on sample i: sum of log(1/beta) over the trees voting t.
        log_inv_beta = np.log(1/beta)
        mu = np.zeros((m_test, c))
        for t in range(c):
            mu[:, t] = np.where(Votes == y_unique[t], log_inv_beta[:, np.newaxis], 0.0).sum(axis=0)

        # Maximum Support Vote
        boosting_y_pred_final = y_unique[np.argmax(mu, axis=1)]
        # -------------------------------------

        # Results
        normal_test_accuracy = accuracy_score(Y_test, normal_y_pred)
        bagging_test_accuracy = accuracy_score(Y_test, bagging_y_pred_final)
        boosting_test_accuracy = accuracy_score(Y_test, boosting_y_pred_final)

        accuracy_for_each_run[run] = (normal_test_accuracy, bagging_test_accuracy, boosting_test_accuracy)

    # Per-method accuracy averaged over the runs, rounded for display.
    mean_accuracy = np.round(np.mean(accuracy_for_each_run, axis=0), 2)

    print_dataset_specification(dataset, X_train, Y_train, m, X_test, Y_test, m_test, c, n, y_unique)
    print('')
    print('T(Bagging) = ',T_bagging)
    print('T(Boosting) = ',T_boosting)
    print('tree_max_dept for (Normal, Boosting) = ',tree_max_dept)
    print('Runs = ',runs)
    print('')
    print('Normal Test Accuracy = ',mean_accuracy[0] )
    print('Bagging Test Accuracy = ',mean_accuracy[1] )
    print('Boosting Test Accuracy = ',mean_accuracy[2] )
    print('-----------------------------------------------------------------------------')
    print('')

---------------------------DataSet : BreastTissue---------------------------
x_train.shape = (74, 9)
y_train.shape = (74,)
m = 74
n = 9
x_test.shape = (32, 9)
y_test.shape = (32,)
m_test = 32
c = 6
y_unique = ['1' '2' '3' '4' '5' '6']

T(Bagging) =  21
T(Boosting) =  51
tree_max_dept for (Normal, Boosting) =  4
Runs =  10

Normal Test Accuracy =  0.58
Bagging Test Accuracy =  0.68
Boosting Test Accuracy =  0.69
-----------------------------------------------------------------------------

---------------------------DataSet : Diabetes---------------------------
x_train.shape = (537, 8)
y_train.shape = (537,)
m = 537
n = 8
x_test.shape = (231, 8)
y_test.shape = (231,)
m_test = 231
c = 2
y_unique = ['-1' '1']

T(Bagging) =  21
T(Boosting) =  51
tree_max_dept for (Normal, Boosting) =  1
Runs =  10

Normal Test Accuracy =  0.75
Bagging Test Accuracy =  0.76
Boosting Test Accuracy =  0.78
-----------------------------------------------------------------------------

-------------------------

# Train Data With Noise:

In [8]:
# Each entry is [dataset name, max_depth for the single tree and the boosted trees].
datasets = [
    ['BreastTissue', 4],
    ['Diabetes', 1],
    ['Glass', 7],
    ['Ionosphere', 1],
    ['Sonar', 1],
    ['Wine', 1],
]
# For a quick check, shrink the list to a single entry such as ['BreastTissue', 4].

In [9]:
# Experiment on noisy training data: for every dataset and noise level,
# Gaussian noise is added to a share of the training features, then a single
# decision tree, a bagged ensemble and AdaBoost.M1 are compared on the
# untouched test set. Accuracy is averaged over several runs.
noise_percents = [10, 20, 30]   # percentage of feature columns that receive noise
runs = 10                       # repetitions per (dataset, noise level)
T_bagging = 21                  # number of trees in the bagging ensemble
T_boosting = 51                 # AdaBoost.M1 keeps T_boosting - 1 trees

for dataset, tree_max_dept in datasets:

    for noise_percent in noise_percents:

        # One row per run; columns are (single tree, bagging, boosting).
        accuracy_for_each_run = np.zeros((runs, 3))
        for run in range(runs):

            # Load Data. load_data passes a fixed random_state to train_test_split,
            # so the split is the same in every run; the noisy columns are
            # redrawn on each call to add_noise.
            X_train, Y_train, m, X_test, Y_test, m_test, c, n, y_unique = load_data(set_type=dataset, test_size=0.3)
            X_train = add_noise(data=X_train, mu=0, sigma=1, percent=noise_percent)

            # ----------------Normal---------------
            normal_model = DecisionTreeClassifier(max_depth=tree_max_dept)
            normal_model.fit(X_train, Y_train)
            normal_y_pred = normal_model.predict(X_test)
            # -------------------------------------

            # ---------------BAGGING---------------
            Models = []
            for _ in range(T_bagging):

                # bootstrap sample (uniform, with replacement)
                x_train, y_train = resample_imp(X_train, Y_train, replace=True, n_samples=m, probabilies=None)

                # A fresh estimator is built every iteration, so it can be
                # stored directly without copying.
                model_i = DecisionTreeClassifier(max_depth=None)
                model_i.fit(x_train, y_train)
                Models.append(model_i)

            # Votes[t, i] is the label tree t predicts for test sample i.
            Votes = np.array([model.predict(X_test) for model in Models])

            # Majority Vote
            bagging_y_pred_final = [majority_vote(Votes[:, i]) for i in range(m_test)]
            # -------------------------------------

            # -------------AdaBoost.M1-------------
            Models = []
            w = np.zeros((T_boosting, m))      # w[k]: sample weights of round k
            w[0, :] = 1/m
            e = np.zeros((T_boosting))         # e[k]: weighted training error of round k
            l = np.zeros((T_boosting, m))      # l[k, j]: 1 if round k misclassifies sample j
            beta = np.zeros((T_boosting-1))
            k = 0

            # NOTE: a rejected round is retried with uniform weights and there
            # is no bound on the number of retries.
            while k <= T_boosting-2:

                # bootstrap sample drawn according to the current weights
                x_train, y_train = resample_imp(X_train, Y_train, replace=True, n_samples=m, probabilies=w[k])

                # create model
                model_k = DecisionTreeClassifier(max_depth=tree_max_dept)
                model_k.fit(x_train, y_train)
                Y_train_pred = model_k.predict(X_train)

                # Weighted error on the full training set
                l[k] = (Y_train_pred != Y_train).astype(int)
                e[k] = w[k] @ l[k]

                if (e[k] == 0) or (e[k] >= 0.5):
                    # Discard this tree, reset the weights and retry the same round.
                    w[k, :] = 1/m
                    continue

                beta[k] = e[k]/(1-e[k])
                # Correctly classified samples (l == 0) are scaled by beta < 1,
                # misclassified ones keep their weight; then renormalise to sum 1.
                scaled = w[k] * beta[k] ** (1 - l[k])
                w[k+1] = scaled / scaled.sum()

                Models.append(model_k)
                k = k + 1

            # Classification phase
            Votes = np.array([model.predict(X_test) for model in Models])

            # Support for class t on sample i: sum of log(1/beta) over the trees voting t.
            log_inv_beta = np.log(1/beta)
            mu = np.zeros((m_test, c))
            for t in range(c):
                mu[:, t] = np.where(Votes == y_unique[t], log_inv_beta[:, np.newaxis], 0.0).sum(axis=0)

            # Maximum Support Vote
            boosting_y_pred_final = y_unique[np.argmax(mu, axis=1)]
            # -------------------------------------

            # Results
            normal_test_accuracy = accuracy_score(Y_test, normal_y_pred)
            bagging_test_accuracy = accuracy_score(Y_test, bagging_y_pred_final)
            boosting_test_accuracy = accuracy_score(Y_test, boosting_y_pred_final)

            accuracy_for_each_run[run] = (normal_test_accuracy, bagging_test_accuracy, boosting_test_accuracy)

        # Per-method accuracy averaged over the runs, rounded for display.
        mean_accuracy = np.round(np.mean(accuracy_for_each_run, axis=0), 2)

        print_dataset_specification(dataset, X_train, Y_train, m, X_test, Y_test, m_test, c, n, y_unique)
        print('')
        print('Noise(%) = ', noise_percent)
        print('')
        print('T(Bagging) = ',T_bagging)
        print('T(Boosting) = ',T_boosting)
        print('tree_max_dept for (Normal, Boosting) = ',tree_max_dept)
        print('Runs = ',runs)
        print('')
        print('Normal Test Accuracy = ',mean_accuracy[0] )
        print('Bagging Test Accuracy = ',mean_accuracy[1] )
        print('Boosting Test Accuracy = ',mean_accuracy[2] )
        print('-----------------------------------------------------------------------------')
        print('')

---------------------------DataSet : BreastTissue---------------------------
x_train.shape = (74, 9)
y_train.shape = (74,)
m = 74
n = 9
x_test.shape = (32, 9)
y_test.shape = (32,)
m_test = 32
c = 6
y_unique = ['1' '2' '3' '4' '5' '6']

Noise(%) =  10

T(Bagging) =  21
T(Boosting) =  51
tree_max_dept for (Normal, Boosting) =  4
Runs =  10

Normal Test Accuracy =  0.58
Bagging Test Accuracy =  0.69
Boosting Test Accuracy =  0.72
-----------------------------------------------------------------------------

---------------------------DataSet : BreastTissue---------------------------
x_train.shape = (74, 9)
y_train.shape = (74,)
m = 74
n = 9
x_test.shape = (32, 9)
y_test.shape = (32,)
m_test = 32
c = 6
y_unique = ['1' '2' '3' '4' '5' '6']

Noise(%) =  20

T(Bagging) =  21
T(Boosting) =  51
tree_max_dept for (Normal, Boosting) =  4
Runs =  10

Normal Test Accuracy =  0.58
Bagging Test Accuracy =  0.69
Boosting Test Accuracy =  0.68
-----------------------------------------------------------

---------------------------DataSet : Wine---------------------------
x_train.shape = (124, 13)
y_train.shape = (124,)
m = 124
n = 13
x_test.shape = (54, 13)
y_test.shape = (54,)
m_test = 54
c = 3
y_unique = ['1' '2' '3']

Noise(%) =  30

T(Bagging) =  21
T(Boosting) =  51
tree_max_dept for (Normal, Boosting) =  1
Runs =  10

Normal Test Accuracy =  0.69
Bagging Test Accuracy =  0.91
Boosting Test Accuracy =  0.92
-----------------------------------------------------------------------------

