In [521]:
import scipy.io as sio  
import matplotlib.pyplot as plt  
import numpy as np 
from numpy.linalg import *
import pandas as pd

In [805]:
def decisionTreeTrain( x , y , mtype = 'C4.5' ,dtype = 'continuous' , quantile = 20 ):
    """Recursively grow a binary decision tree over continuous attributes.

    Parameters
    ----------
    x : 2-D array
        One row per sample, one column per attribute.
    y : 1-D array
        Class label of every row of ``x``.  Labels are counted with
        ``np.bincount`` and therefore have to be non-negative integers.
        ``'Class' == 0`` marks a node that is not a leaf (``decisionTreeTest``
        relies on this), so a label of 0 cannot be represented by a leaf.
    mtype : {'C4.5', 'ID3', 'CART'}
        Split criterion: gain ratio, information gain or Gini index.
    dtype : str
        Forwarded to the scoring function.  Only ``'continuous'`` is grown
        here; for any other value the untouched placeholder root is returned.
    quantile : int
        Forwarded to the scoring function (number of candidate thresholds).

    Returns
    -------
    (node, k)
        ``node`` is a flat list of node dicts in pre-order (root, left subtree,
        right subtree); ``k`` is the number of nodes in it.  ``'num'``,
        ``'Left Child'`` and ``'Right Child'`` are 1-based positions in
        ``node``; the child positions stored on leaves carry no meaning.

    Raises
    ------
    ValueError
        If ``mtype`` is not one of the supported criteria, or if there is
        nothing to train on (no samples and no labels).
    """
    root = {'Name':'NULL' , 'num':1,
            'Left Child':0 , 'Left Name':'NULL',
            'Right Child':0 , 'Right Name': 'NULL',
            'bestattribute':-1 , 'bestvalue':-1 , 'Class' :0}
    node = [root]
    k = 1

    # Missing values are not handled: hand back the placeholder root.
    if pd.isnull(x).any():
        return node,k

    # Pure node -> leaf carrying the single remaining label.
    labels = np.unique(y)
    if len(labels)==1:
        root['Name'] = 'The class is ' + str(labels[0])
        root['Class'] = labels[0]
        return node,k

    # Nothing left to split on -> majority-vote leaf.
    if x.size==0:
        if len(y)==0:
            # argmax of an empty bincount would fail with an opaque message.
            raise ValueError('decisionTreeTrain needs at least one labelled sample.')
        majority = np.bincount( y ).argmax()
        root['Name'] = 'The class is ' + str(majority)
        root['Class'] = majority
        return node,k

    if dtype != 'continuous':
        # Only continuous attributes are implemented; keep returning the
        # placeholder root exactly as before.
        return node,k

    # Pick the scoring function and whether a larger or a smaller score wins.
    # The names are looked up only in the branch that is taken.
    if mtype == 'C4.5':
        criterion , maximise = Gain_ratio , True
    elif mtype == 'ID3':
        criterion , maximise = Gain , True
    elif mtype == 'CART':
        criterion , maximise = Gini_index , False
    else:
        raise ValueError("Unknown mtype %r; expected 'C4.5', 'ID3' or 'CART'." % (mtype,))

    _ , d = x.shape
    score = np.zeros(d)
    value = np.zeros(d)
    for i in range(d):
        # Every criterion returns (score, threshold, ...); extras are ignored.
        result = criterion( y , x[:,i] , dtype , quantile )
        score[i] , value[i] = result[0] , result[1]

    # A NaN score (e.g. from a constant attribute) must never win the vote.
    if maximise:
        bestattribute = np.where(np.isnan(score) , -np.inf , score).argmax()
    else:
        bestattribute = np.where(np.isnan(score) , np.inf , score).argmin()
    bestvalue = value[bestattribute]

    goes_left = x[:,bestattribute] <= bestvalue
    goes_right = x[:,bestattribute] > bestvalue

    # If the best split sends every sample to one side the data cannot be
    # separated any further; recursing would never terminate, so emit a
    # majority-vote leaf instead.
    if not goes_left.any() or not goes_right.any():
        majority = np.bincount( y ).argmax()
        root['Name'] = 'The class is ' + str(majority)
        root['Class'] = majority
        return node,k

    root['bestattribute'] = bestattribute
    root['bestvalue'] = bestvalue
    root['Name'] = 'The bestattribute is ' + str(bestattribute) + ' and the bestvalue is ' + str(bestvalue)
    print(root['Name'])

    for side , mask in (('Left' , goes_left) , ('Right' , goes_right)):
        subtree , size = decisionTreeTrain( x[ mask , : ] , y[ mask ] , mtype , dtype , quantile)
        root[side + ' Name'] = subtree[0]['Name']
        # The subtree is appended right after the k nodes collected so far.
        root[side + ' Child'] = k + 1
        for child in subtree:
            # Re-base the subtree's local 1-based positions onto this list.
            child['num'] = child['num'] + k
            child['Left Child'] = child['Left Child'] + k
            child['Right Child'] = child['Right Child'] + k
        k = k + size
        node.extend(subtree)
    return node,k

In [806]:
def decisionTreeTest(node , x , y):
    """Classify every row of ``x`` with a trained tree and print the accuracy.

    Parameters
    ----------
    node : list of dict
        Flat tree as returned by ``decisionTreeTrain``.  A node whose
        ``'Class'`` is 0 is treated as an internal node; its
        ``'Left Child'`` / ``'Right Child'`` are 1-based positions in ``node``.
    x : 2-D array
        One row per sample to classify.
    y : 1-D array
        Reference labels, used for the accuracy and for the dtype of the result.

    Returns
    -------
    ndarray
        Predicted label of every row, same shape and dtype as ``y``.

    Raises
    ------
    ValueError
        If the walk reaches an untrained placeholder node (``'Class' == 0``
        with ``'bestattribute' < 0``), which previously looped forever.
    """
    m = x.shape[0]
    t = np.zeros_like(y)
    for i in range(m):
        sample = x[i,:]
        j = 0
        # Descend until a leaf (non-zero class) is reached.
        while node[j]['Class']==0:
            current = node[j]
            if current['bestattribute'] < 0:
                raise ValueError('The tree contains an untrained node; it cannot be used for prediction.')
            if sample[current['bestattribute']] <= current['bestvalue']:
                j = current['Left Child']-1
            else:
                j = current['Right Child']-1
        t[i] = node[j]['Class']

    acc = np.equal(t,y).mean() * 100
    # The message used to say "Linear Regression", a copy-paste leftover.
    text = "The decision tree's accuracy is %.2f%%." %(acc)
    print(text)
    return t

In [807]:
def divideset( D , a , subclass ):
    """Partition ``D`` according to where the matching entries of ``a`` fall.

    When ``subclass`` holds a single value the result is the two-way split
    ``[D[a <= subclass], D[a > subclass]]``.  Otherwise ``subclass`` is read
    as a sequence of upper bounds: the first part takes everything up to
    ``subclass[0]`` and each following part takes the half-open interval
    ``(subclass[i-1], subclass[i]]``.

    Returns a list of the selected pieces of ``D``.
    """
    if subclass.size==1:
        return [ D[a<=subclass] , D[a>subclass] ]

    parts = [ D[a<=subclass[0]] ]
    # Walk over consecutive (lower, upper) bound pairs.
    for lower , upper in zip(subclass[:-1] , subclass[1:]):
        parts.append( D[ (a>lower) & (a<=upper) ] )
    return parts

In [808]:
def Ent(D):
    """Return the Shannon entropy (base 2) of the labels in ``D``.

    ``D`` is counted with ``np.bincount``.  An empty ``D`` yields 0.
    """
    if len(D)==0:
        return 0
    counts = np.bincount(D)
    observed = counts[counts>0]          # drop labels that never occur
    probs = observed/observed.sum()
    return -np.sum(probs * np.log2(probs))

In [809]:
def Gain(D,a,dtype='continuous',quantile = 20):
    """Information gain obtained by splitting the labels ``D`` on attribute ``a``.

    Parameters
    ----------
    D : 1-D array of class labels (counted by ``Ent``).
    a : 1-D array with the attribute value of every entry of ``D``.
    dtype : {'continuous', 'discrete'}
        ``'continuous'``: try a two-way split at each distinct percentile of
        ``a`` and keep the one with the largest gain.
        ``'discrete'``: split on every distinct value of ``a`` at once.
    quantile : int
        Number of percentile steps between 0 and 100 (exclusive) that are
        tried as thresholds in the continuous case.

    Returns
    -------
    (gain, value, parts)
        ``gain`` is the information gain, ``value`` the chosen threshold
        (continuous) or the array of distinct values (discrete), and
        ``parts`` the list of label subsets produced by the split.

    Raises
    ------
    ValueError
        If ``dtype`` is neither ``'continuous'`` nor ``'discrete'``.
    """
    if dtype not in ('continuous' , 'discrete'):
        # Used to reference the undefined name `null` and crash with NameError.
        raise ValueError("The type you set does not exist: %r (expected 'continuous' or 'discrete')." % (dtype,))

    ent = Ent(D)
    m = len(D)

    if dtype=='continuous':
        section = np.arange(0.00,100.00,100.00/quantile)
        subclass = np.unique(np.percentile(a,section))
        # Keep every candidate split so the winner is not computed twice.
        splits = [divideset(D,a,threshold) for threshold in subclass]
        gain = np.zeros(len(subclass))
        for i , (lower , upper) in enumerate(splits):
            gain[i] = ent - len(lower)*Ent(lower)/m - len(upper)*Ent(upper)/m
        best = gain.argmax()
        return gain[best] , subclass[best] , splits[best]

    # Discrete attribute: one subset per distinct value.
    subclass = np.unique(a)
    # For a single distinct value divideset returns a two-way split whose
    # second half is empty; keep only one subset per value.
    Dv = divideset(D,a,subclass)[:len(subclass)]
    gain = np.zeros(len(subclass))
    for i in range(len(subclass)):
        gain[i] = len(Dv[i])*Ent(Dv[i])/m
    return ent-gain.sum() , subclass , Dv
    

In [810]:
def Gain_ratio(D,a,dtype='continuous',quantile = 20):
    """Gain ratio of splitting the labels ``D`` on attribute ``a``.

    The information gain and the split itself come from ``Gain``; the gain
    is then divided by the split information
    ``-sum(|Dv|/|D| * log2(|Dv|/|D|))`` over the non-empty subsets ``Dv``.

    Returns
    -------
    (gain_ratio, bestvalue)
        ``bestvalue`` is the second value returned by ``Gain``.  The ratio is
        0.0 when the split information is zero, i.e. when the split leaves
        all samples in a single subset.
    """
    gain , bestvalue , Dv = Gain(D,a,dtype,quantile)
    m = len(D)
    # Empty subsets contribute nothing; evaluating 0 * log2(0) would give NaN.
    fractions = np.array([len(part)/m for part in Dv if len(part) > 0])
    iv = -(fractions * np.log2(fractions)).sum()
    if iv <= 0:
        # Nothing was separated: avoid dividing by zero.
        return 0.0 , bestvalue
    return gain/iv , bestvalue


In [811]:
def Gini(D):
    """Return the Gini impurity ``1 - sum(p_k ** 2)`` of the labels in ``D``.

    ``D`` is counted with ``np.bincount``; ``p_k`` is the share of label ``k``.
    """
    counts = np.bincount(D)
    observed = counts[counts>0]          # drop labels that never occur
    probs = observed/observed.sum()
    return 1 - np.sum(probs**2)


In [812]:
def Gini_index(D,a,dtype='continuous',quantile = 20):
    """Size-weighted Gini impurity of splitting the labels ``D`` on attribute ``a``.

    Parameters
    ----------
    D : 1-D array of class labels (counted by ``Gini``).
    a : 1-D array with the attribute value of every entry of ``D``.
    dtype : {'continuous', 'discrete'}
        ``'continuous'``: try a two-way split at each distinct percentile of
        ``a`` and keep the one with the smallest index.
        ``'discrete'``: split on every distinct value of ``a`` at once.
    quantile : int
        Number of percentile steps between 0 and 100 (exclusive) that are
        tried as thresholds in the continuous case.

    Returns
    -------
    (gini_index, value)
        ``value`` is the chosen threshold (continuous) or the array of
        distinct values (discrete).

    Raises
    ------
    ValueError
        If ``dtype`` is neither ``'continuous'`` nor ``'discrete'``.
    """
    if dtype not in ('continuous' , 'discrete'):
        # Used to reference the undefined name `null` and crash with NameError.
        raise ValueError("The type you set does not exist: %r (expected 'continuous' or 'discrete')." % (dtype,))

    m = len(D)

    if dtype=='continuous':
        section = np.arange(0.00,100.00,100.00/quantile)
        subclass = np.unique(np.percentile(a,section))
        gini_index = np.zeros(len(subclass))
        for i , threshold in enumerate(subclass):
            lower , upper = divideset(D,a,threshold)
            gini_index[i] = len(lower)*Gini(lower)/m + len(upper)*Gini(upper)/m
        best = gini_index.argmin()
        return gini_index[best] , subclass[best]

    # Discrete attribute: one subset per distinct value.
    subclass = np.unique(a)
    Dv = divideset(D,a,subclass)
    gini_index = np.zeros(len(subclass))
    for i in range(len(subclass)):
        gini_index[i] = len(Dv[i])*Gini(Dv[i])/m
    return gini_index.sum() , subclass


In [813]:
# Folder holding the two MATLAB .mat files read below.
data_path = "E://yuwoliang/three set/SpectralClassification/No removal of continuous spectrum/"

# Training split: attribute matrix plus the first column of the label array.
data = sio.loadmat(data_path + "SpectralClassificationTrain.mat")
train_x, train_y = data['train_x'], data['train_y'][:,0]

# Test split, read the same way.
data = sio.loadmat(data_path + "SpectralClassificationTest.mat")
test_x, test_y = data['test_x'], data['test_y'][:,0]

# The raw loadmat dictionary is not needed any more.
del data

In [800]:
# Grow a tree with the C4.5 (gain ratio) criterion, then score it on the test split.
node, k = decisionTreeTrain(train_x, train_y, mtype='C4.5')
t = decisionTreeTest(node, test_x, test_y)

The bestattribute is 278 and the bestvalue is 0.0398812
The bestattribute is 362 and the bestvalue is 0.021801
The bestattribute is 10 and the bestvalue is 0.081867
The bestattribute is 0 and the bestvalue is 0.01544
The bestattribute is 328 and the bestvalue is 0.047291
The bestattribute is 68 and the bestvalue is 0.052907
The bestattribute is 162 and the bestvalue is 0.0570645
The bestattribute is 236 and the bestvalue is 0.040832
The bestattribute is 67 and the bestvalue is 0.0018921
The bestattribute is 27 and the bestvalue is 0.0086117
The bestattribute is 22 and the bestvalue is 0.02877745
The bestattribute is 67 and the bestvalue is 0.0137858
The bestattribute is 0 and the bestvalue is 0.02379
The Linear Regression's accuracy is 81.00%.


In [801]:
# Grow a tree with the ID3 (information gain) criterion, then score it on the test split.
node, k = decisionTreeTrain(train_x, train_y, mtype='ID3')
t = decisionTreeTest(node, test_x, test_y)

The bestattribute is 65 and the bestvalue is 0.0607039
The bestattribute is 28 and the bestvalue is 0.0268082
The bestattribute is 27 and the bestvalue is 0.00897044
The bestattribute is 234 and the bestvalue is 0.039715
The bestattribute is 67 and the bestvalue is 0.0018921
The bestattribute is 67 and the bestvalue is 0.0138986
The bestattribute is 0 and the bestvalue is 0.02379
The bestattribute is 150 and the bestvalue is 0.0446376
The bestattribute is 39 and the bestvalue is 0.0361248
The bestattribute is 24 and the bestvalue is 0.028624
The bestattribute is 1 and the bestvalue is 0.0353671
The bestattribute is 52 and the bestvalue is 0.0762532
The bestattribute is 10 and the bestvalue is 0.0829748
The bestattribute is 2 and the bestvalue is 0.081887
The Linear Regression's accuracy is 79.50%.


In [814]:
# Grow a tree with the CART (Gini index) criterion, then score it on the test split.
node, k = decisionTreeTrain(train_x, train_y, mtype='CART')
t = decisionTreeTest(node, test_x, test_y)

The bestattribute is 278 and the bestvalue is 0.0398812
The bestattribute is 10 and the bestvalue is 0.083105
The bestattribute is 0 and the bestvalue is 0.01544
The bestattribute is 2 and the bestvalue is 0.081887
The bestattribute is 328 and the bestvalue is 0.047291
The bestattribute is 68 and the bestvalue is 0.052907
The bestattribute is 162 and the bestvalue is 0.0570645
The bestattribute is 45 and the bestvalue is 0.0071129
The bestattribute is 238 and the bestvalue is 0.0379532
The bestattribute is 22 and the bestvalue is 0.0287734
The bestattribute is 0 and the bestvalue is 0.0088277
The bestattribute is 67 and the bestvalue is 0.0137858
The bestattribute is 0 and the bestvalue is 0.02379
The Linear Regression's accuracy is 76.50%.
