In [1]:
class Sample:
    """Representation of a single data sample.

    A sample bundles a class label, its attribute values and an identifier.

    Parameters
    ----------
    xclass
        Class label of the sample.
    values
        Sequence of attribute values.
    identity
        Identifier of the sample (for example a row id or a row index).
    attributes : optional
        Sequence of attribute names. When given, the values are stored as a
        dict pairing each name with the value at the same position (pairing
        is done with ``zip``, so surplus names or values are dropped). When
        omitted, ``values`` is stored unchanged.
    """
    def __init__(self, xclass, values, identity, attributes = None):
        self.xclass = xclass
        # Identity test rather than ``== None``: array-like objects (such as
        # numpy arrays) apply ``==`` element-wise, and the resulting array
        # cannot be used as an ``if`` condition.
        if attributes is None:
            self.attribute = values
        else:
            self.attribute = dict(zip(attributes, values))
        self.identity = identity

    def getClass(self):
        """Return the class label given at construction."""
        return self.xclass

    def getAttributes(self):
        """Return the stored attributes (the raw values, or a name -> value dict)."""
        return self.attribute

    def getNbrAttributes(self):
        """Return the number of stored attributes."""
        return len(self.attribute)

    def getAttributeValue(self,attribute):
        """Return one attribute value.

        ``attribute`` is used directly as the key/index into the stored
        attributes: a name when attribute names were supplied, otherwise
        whatever index the stored ``values`` object accepts.
        """
        return self.attribute[attribute]

    def getIdentity(self):
        """Return the identifier given at construction."""
        return self.identity

In [2]:
import numpy as np
import pandas as pd

def chooseData(data):
    """Load a benchmark data set from ``datasets/`` as a list of ``Sample``.

    Parameters
    ----------
    data : str
        Name of the data set: one of "glass", "ecoli", "diabetes", "sonar",
        "vowel", "ionosphere", "vehicle", "german", "image", "cancer",
        "votes" or "liver".

    Returns
    -------
    list of Sample
        One sample per row kept from the file. The last column of a row is
        used as the class label; the columns from the data set's first
        attribute column up to (but excluding) the last column are the
        attribute values. The identity is either the value in column 0 or
        the row number, depending on the data set.

    Raises
    ------
    ValueError
        If ``data`` is not one of the known names. (Such names used to fall
        through every branch and silently return ``None``.)
    """
    prefix = 'datasets/'

    # name -> (file name, np.genfromtxt keyword arguments,
    #          index of the first attribute column,
    #          True when column 0 holds the sample identity)
    specs = {
        "glass": ('glass.data', dict(dtype=None, delimiter=","), 1, True),
        "ecoli": ('ecoli.data', dict(dtype=None), 1, True),
        "diabetes": ('pima-indians-diabetes.data', dict(dtype=None, delimiter=","), 0, False),
        "sonar": ('sonar.all-data', dict(dtype=None, delimiter=","), 0, False),
        "vowel": ('vowel-context.data', dict(dtype=None), 3, False),
        "ionosphere": ('ionosphere.data', dict(dtype=None, delimiter=','), 0, False),
        "vehicle": ('vehicle.data', dict(dtype=None, delimiter=','), 0, False),
        "german": ('german.data-numeric', dict(dtype=None), 0, False),
        "image": ('segment.dat', dict(dtype=None), 0, False),
        "cancer": ('breast-cancer-wisconsin.data', dict(delimiter=",", dtype=int), 1, True),
        "votes": ("house-votes-84.data", dict(dtype=str, delimiter=','), 0, False),
        "liver": ("bupa.data", dict(dtype=None, delimiter=','), 0, False),
    }

    # Validate the name before touching the file system.
    spec = specs.get(data) if isinstance(data, str) else None
    if spec is None:
        raise ValueError("unknown data set %r; expected one of: %s"
                         % (data, ", ".join(sorted(specs))))
    filename, options, first_attr, id_in_first_col = spec

    rows = np.genfromtxt(prefix + filename, **options)

    if data == "votes":
        # Convert to numerical values: 'y' / 'democrat' -> 1, '?' -> -1,
        # anything else -> 0.
        encoded = [[int(y == 'y' or y == 'democrat') - int(y == '?') for y in row]
                   for row in rows]
        # Keep only the rows whose last column (used as the class below) is
        # known, i.e. was not '?'.
        # NOTE(review): the class is taken from the LAST column here; in the
        # UCI distribution of this file the party name is believed to be the
        # first column - confirm that predicting the last vote is intended.
        rows = [row for row in encoded if row[-1] > -1]

    samples = []
    for index, row in enumerate(rows):
        identity = row[0] if id_in_first_col else index
        # Index element by element (instead of slicing) so that both plain
        # rows and structured records returned by genfromtxt are handled.
        values = [row[i] for i in range(first_attr, len(row) - 1)]
        samples.append(Sample(row[-1], values, identity))
    return samples

In [13]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

def AdaBoosterror(data, Nruns = 100, train_fraction = 0.9, n_estimators = 50, show_individual = False):
    """Print the AdaBoost test error of a data set over repeated random splits.

    For each run the samples are shuffled in place, split into a training and
    a test part, and two AdaBoost ensembles of entropy-based decision trees
    are fitted: one with ``max_features=1`` and one with
    ``max_features='log2'``. The smaller of the two test errors of each run is
    accumulated, and the mean over all runs is printed as a percentage.

    NOTE(review): the better of the two models is chosen per run on the test
    split itself, so the printed figure is likely optimistic compared with a
    selection made on separate (e.g. validation) data.

    Parameters
    ----------
    data : str
        Data set name, passed to ``chooseData``.
    Nruns : int, optional
        Number of shuffle/split/fit repetitions (default 100).
    train_fraction : float, optional
        Fraction of the samples used for training (default 0.9).
    n_estimators : int, optional
        Number of boosting rounds of each ensemble (default 50).
    show_individual : bool, optional
        When true, also print the mean error of each of the two ensembles.

    Returns
    -------
    None
        The results are printed only.

    Raises
    ------
    ValueError
        If no samples were loaded or ``Nruns`` is smaller than 1.
    """
    S = chooseData(data)
    if not S:
        raise ValueError("no samples loaded for data set %r" % (data,))
    if Nruns < 1:
        raise ValueError("Nruns must be at least 1")

    # The size of the data set does not change between runs.
    N = len(S)
    split = int(train_fraction*N)

    culError1 = 0
    culError2 = 0
    cumulErrorSelection = 0

    for _ in range(Nruns):
        # Fresh, unfitted ensembles for every run.
        # NOTE(review): newer scikit-learn releases appear to deprecate the
        # 'SAMME.R' option - confirm against the installed version.
        AdaBoost1 = AdaBoostClassifier(DecisionTreeClassifier(criterion = 'entropy', splitter = 'best', max_features = 1), n_estimators = n_estimators, algorithm = 'SAMME.R')
        AdaBoost2 = AdaBoostClassifier(DecisionTreeClassifier(criterion = 'entropy', splitter = 'best', max_features = 'log2'), n_estimators = n_estimators, algorithm = 'SAMME.R')

        # Shuffles the sample list in place (global numpy random state).
        np.random.shuffle(S)
        train, test = S[:split], S[split:]

        Xtrain = [o.attribute for o in train]
        Ytrain = [o.xclass for o in train]

        AdaBoost1.fit(Xtrain, Ytrain)
        AdaBoost2.fit(Xtrain, Ytrain)

        Xtest = [o.attribute for o in test]
        Ytest = [o.xclass for o in test]

        # score() is the mean accuracy, so 1 - score is the error rate.
        Error1 = 1 - AdaBoost1.score(Xtest,Ytest)
        culError1 += Error1
        Error2 = 1 - AdaBoost2.score(Xtest,Ytest)
        culError2 += Error2
        cumulErrorSelection += min(Error1,Error2)

    print("Data :", data)
    print("Number of input :", str(S[0].getNbrAttributes()))
    print("Number of data point :", str(len(S)))
    if show_individual:
        print("Error percentage of Adaboost F = 1 :", str(round(culError1/Nruns *100,1)))
        print("Error percentage of Adaboost F = log2:", str(round(culError2/Nruns *100,1)))
    print("Error percentage:", str(round(cumulErrorSelection/Nruns *100,1)))
    print()
    


In [14]:
# Evaluate every benchmark data set, one after the other, in a fixed order.
for dataset_name in (
    "glass",
    "cancer",
    "diabetes",
    "sonar",
    "vowel",
    "ionosphere",
    "vehicle",
    "german",
    "image",
    "ecoli",
    "votes",
    "liver",
):
    AdaBoosterror(dataset_name)

Data : glass
Number of input : 9
Number of data point : 214
Error percentage: 27.5

Data : cancer
Number of input : 9
Number of data point : 699
Error percentage: 4.4

Data : diabetes
Number of input : 8
Number of data point : 768
Error percentage: 28.6

Data : sonar
Number of input : 60
Number of data point : 208
Error percentage: 24.9

Data : vowel
Number of input : 10
Number of data point : 990
Error percentage: 22.2

Data : ionosphere
Number of input : 34
Number of data point : 351
Error percentage: 10.2

Data : vehicle
Number of input : 18
Number of data point : 846
Error percentage: 30.0

Data : german
Number of input : 24
Number of data point : 1000
Error percentage: 31.1

Data : image
Number of input : 19
Number of data point : 2310
Error percentage: 4.8

Data : ecoli
Number of input : 7
Number of data point : 336
Error percentage: 21.4

Data : votes
Number of input : 16
Number of data point : 331
Error percentage: 20.5

Data : liver
Number of input : 6
Number of data point : 3

In [5]:
# Sanity check: load the glass data and show the attribute values of the
# first sample.
S = chooseData('glass')
print(S[0].getAttributes())

[1.52101, 13.640000000000001, 4.4900000000000002, 1.1000000000000001, 71.780000000000001, 0.059999999999999998, 8.75, 0.0, 0.0]


In [6]:
# Evaluate every benchmark data set, one after the other, in a fixed order.
# The output recorded under this cell lists separate "F = 1" / "F = log2"
# errors, so it was apparently produced by a different revision of
# AdaBoosterror than the one that prints a single "Error percentage:" line.
for dataset_name in (
    "glass",
    "cancer",
    "diabetes",
    "sonar",
    "vowel",
    "ionosphere",
    "vehicle",
    "german",
    "image",
    "ecoli",
    "votes",
    "liver",
):
    AdaBoosterror(dataset_name)

Data : glass
Number of input : 9
Number of data point : 214
Error percentage of Adaboost F = 1 : 36.4
Error percentage of Adaboost F = log2: 32.6

Data : cancer
Number of input : 9
Number of data point : 699
Error percentage of Adaboost F = 1 : 5.9
Error percentage of Adaboost F = log2: 5.0

Data : diabetes
Number of input : 8
Number of data point : 768
Error percentage of Adaboost F = 1 : 33.5
Error percentage of Adaboost F = log2: 30.9

Data : sonar
Number of input : 60
Number of data point : 208
Error percentage of Adaboost F = 1 : 33.3
Error percentage of Adaboost F = log2: 30.0

Data : vowel
Number of input : 10
Number of data point : 990
Error percentage of Adaboost F = 1 : 31.9
Error percentage of Adaboost F = log2: 22.0

Data : ionosphere
Number of input : 34
Number of data point : 351
Error percentage of Adaboost F = 1 : 13.2
Error percentage of Adaboost F = log2: 10.9

Data : vehicle
Number of input : 18
Number of data point : 846
Error percentage of Adaboost F = 1 : 36.3
Err