In [1]:
import gzip
import numpy as np

In [3]:
from sklearn.ensemble import RandomForestClassifier

#tree_depth_size = [10,15,25]
#metoda koja trenira Random Forest i vraća šumu (listu stabla)
def createRandomForest(X,y,depth_size = 25):
    clf = RandomForestClassifier(
            n_estimators=100, max_features="sqrt", max_depth=depth_size,
            min_samples_split=5)

    clf.fit(X,y)
    forest = clf.estimators_
    return forest

In [8]:
def mapToIndexesLeaves(tree):
    mapa = {}
    counter = 0
    mapa,counter = rekurzivno(tree,mapa,counter,0)
    return mapa

def rekurzivno(tree, mapa,counter,node):
        if tree.children_left[node] == -1:
            mapa[node] = counter
            counter +=1
            return mapa, counter
        mapa, counter = rekurzivno(tree, mapa, counter, tree.children_left[node])
        mapa, counter = rekurzivno(tree, mapa, counter, tree.children_right[node])
        return mapa, counter


In [6]:
from scipy.sparse import csr_matrix

def leafIndexesOfLeaves(treeDecision,mapa, X, offset = 0):
    return np.array([mapa[x] + offset for x in treeDecision.apply(X)])

def createFI(X, forest, forestMap):
    col = np.array([])
    row = np.array([])
    offset = 0
    for i,(f,mapa) in enumerate(zip(forest,forestMap)):
        indexes = leafIndexesOfLeaves(f,mapa, X,offset=offset)
        col = np.concatenate((col,indexes))
        row = np.concatenate((row,np.arange(0,X.shape[0])))
        offset += len(mapa)
    FI = csr_matrix(
        (np.ones(col.shape,dtype=np.int8), (row, col)), shape=(X.shape[0],offset))
    return FI

In [90]:
from liblinear.liblinearutil import *

def leafVectorOptimizator(X,y,s,c):
    prob  = problem(y, X)
    param = parameter('-s '+str(s)+' -c '+str(c)+' -q')
    m = train(prob, param)
    [W, _] = m.get_decfun()
    return W,m
def getScore(X,y,m):
    _,score,_ = predict(y, X, m)
    return score

In [9]:
def getNeighbors(forest, forestMap,W):
    neighbors = []
    offset = 0
    for i,(mapa,f) in enumerate(zip(forestMap,forest)):
        prev_node_index = None
        prev_leaf_index = None
        for (node_index, leaf_index) in mapa.items():
            if prev_node_index is None:
                prev_node_index = node_index
                prev_leaf_index = leaf_index
                continue
            
            if f.tree_.children_right[prev_node_index - 1] == node_index:
                singf = abs(W[offset + prev_leaf_index]) + abs (W[offset + leaf_index])
                neighbors.append([singf,i,prev_node_index-1])
                prev_node_index = None
                prev_leaf_index = None
                continue
            prev_node_index = node_index
            prev_leaf_index = leaf_index
        offset += len(mapa)
    return neighbors

In [10]:
from sklearn.tree._tree import TREE_LEAF

def pruneTrees(neighbors, forest,  pruneSize):
    neighbors.sort(key = lambda x : x[0])
    for _,f_index,parent in neighbors[:pruneSize]:
        forest[f_index].tree_.children_left[parent] = TREE_LEAF
        forest[f_index].tree_.children_right[parent] = TREE_LEAF

In [97]:
problemTypes = {
    "classifation":2,
    "regression":11 
}

def refinedA(X_train,y_train, X_test, y_test,problemType,
    iteration = 10, c = 1,depth_size_index = 2):
    s = problemTypes[problemType]
    bestScore = 0
    bestM = None
    tree_depth_size = [10,15,25]
    forest = createRandomForest(X_train,y_train,tree_depth_size[depth_size_index])
    counter = 0
    while True:
        forestMap = [mapToIndexesLeaves(f.tree_) for f in forest]
        FI_train = createFI(X_train,forest,forestMap)
        FI_test = createFI(X_test,forest,forestMap)
        W, m = leafVectorOptimizator(FI_train,y_train, s, c)
        score = getScore(FI_test,y_test, m)
        if s == 2:
            score = score[0]
        else:
            score = score[1]
        print("iteracija: " + str(counter) + ", score: " + 
            str(score) +", len(W): "+ str(len(W)))

        counter += 1
        if score > bestScore:
            bestScore = score
            bestM = m

        if counter > iteration:
            break
        neighbors = getNeighbors(forest, forestMap, W)
        pruneTrees(neighbors, forest, int(len(W) * 0.1))
    return bestM, bestScore

#Treniramo model dok se broj listova u stablu 
# ne smanji na "leafes_size" početnog broja listova
def refinedE(X_train,y_train, X_test, y_test,problemType,
    leafes_size = 0.5, c = 1,depth_size_index = 2):
    s = problemTypes[problemType]
    bestScore = 0
    bestM = None
    tree_depth_size = [10,15,25]
    forest = createRandomForest(X_train,y_train,tree_depth_size[depth_size_index])
    size = 1
    counter = 0
    while True:
        forestMap = [mapToIndexesLeaves(f.tree_) for f in forest]
        FI_train = createFI(X_train,forest,forestMap)
        FI_test = createFI(X_test,forest,forestMap)
        W, m = leafVectorOptimizator(FI_train,y_train, s, c)
        score = getScore(FI_test,y_test, m)
        if s == 2:
            score = score[0]
        else:
            score = score[1]
        print("iteracija: " + str(counter) + ", score: " + 
            str(score) +", len(W): "+ str(len(W)))
        size *= 0.9
        if score > bestScore:
            bestScore = score
            bestM = m

        if leafes_size > size:
            break
        neighbors = getNeighbors(forest, forestMap, W)
        pruneTrees(neighbors, forest, int(len(W) * 0.1))
    return bestM, bestScore

## Klasifikacija MINIST seta 

In [2]:
#https://archive.ics.uci.edu/ml/datasets/covertype
#http://yann.lecun.com/exdb/mnist/

image_size = 28
train_data_count = 60_000
test_data_count = 10_000

def readGzImages(path,number_of_images):
    input_X = gzip.open(path,'r')
    input_X.read(16)
    buf_X = input_X.read(image_size * image_size * number_of_images)
    X = np.frombuffer(buf_X, dtype=np.uint8).astype(np.float32)
    X = X.reshape(number_of_images ,image_size * image_size)
    return X

def readGzLabels(path, number_of_labels):
    input_y = gzip.open(path,'r')
    input_y.read(8)
    buf_y = input_y.read(1 * number_of_labels)
    y = np.frombuffer(buf_y, dtype=np.uint8).astype(np.int32)
    y = y.reshape(number_of_labels)
    return y

X = readGzImages('./MNIST_set/train-images-idx3-ubyte.gz', train_data_count)
y = readGzLabels('./MNIST_set/train-labels-idx1-ubyte.gz', train_data_count)

X_test = readGzImages('./MNIST_set/t10k-images-idx3-ubyte.gz', test_data_count)
y_test = readGzLabels('./MNIST_set/t10k-labels-idx1-ubyte.gz', test_data_count)

In [1]:
_, score = refinedA(X,y,X_test, y_test,"classifation",iteration=10,)
print("Najbolja preciznost: " + str(score))


NameError: name 'refinedA' is not defined

In [65]:
_, score = refinedE(X,y,X_test, y_test,"classifation",leafes_size=0.3,)
print("Najbolja preciznost: " + str(score))

Accuracy = 96.4% (4820/5000) (classification)
iteracija: 0, score: 96.39999999999999, len(W): 143601
Accuracy = 96.42% (4821/5000) (classification)
iteracija: 0, score: 96.41999999999999, len(W): 129241
Accuracy = 96.34% (4817/5000) (classification)
iteracija: 0, score: 96.34, len(W): 116317
Accuracy = 96.3% (4815/5000) (classification)
iteracija: 0, score: 96.3, len(W): 104686
Accuracy = 96.42% (4821/5000) (classification)
iteracija: 0, score: 96.41999999999999, len(W): 94218
Accuracy = 96.48% (4824/5000) (classification)
iteracija: 0, score: 96.48, len(W): 84797
Accuracy = 96.4% (4820/5000) (classification)
iteracija: 0, score: 96.39999999999999, len(W): 76318
Accuracy = 96.36% (4818/5000) (classification)
iteracija: 0, score: 96.36, len(W): 68687
Accuracy = 96.44% (4822/5000) (classification)
iteracija: 0, score: 96.44, len(W): 61819
Accuracy = 96.44% (4822/5000) (classification)
iteracija: 0, score: 96.44, len(W): 55638
Accuracy = 96.34% (4817/5000) (classification)
iteracija: 0, s

## Klasifikacija Letter seta

In [69]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

#https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/

data = np.genfromtxt('./letter_set/letter-recognition.data', delimiter=',', dtype=np.str,encoding="utf8")
X = data[:,1:].astype('int32')
y = data[:,0]

le = preprocessing.LabelEncoder()
le.fit(y)
y = le.transform(y)
X, X_test, y, y_test = train_test_split(X, y, test_size=0.25)

In [70]:
_, score = refinedA(X,y,X_test, y_test,"classifation",iteration=10,)
print("Najbolja preciznost: " + str(score))


Accuracy = 96.04% (4802/5000) (classification)
iteracija: 0, score: 96.04, len(W): 142894
Accuracy = 96.06% (4803/5000) (classification)
iteracija: 1, score: 96.06, len(W): 128605
Accuracy = 95.98% (4799/5000) (classification)
iteracija: 2, score: 95.98, len(W): 115745
Accuracy = 95.94% (4797/5000) (classification)
iteracija: 3, score: 95.94, len(W): 104171
Accuracy = 96.1% (4805/5000) (classification)
iteracija: 4, score: 96.1, len(W): 93754
Accuracy = 96.12% (4806/5000) (classification)
iteracija: 5, score: 96.12, len(W): 84379
Accuracy = 96.02% (4801/5000) (classification)
iteracija: 6, score: 96.02000000000001, len(W): 75942
Accuracy = 95.94% (4797/5000) (classification)
iteracija: 7, score: 95.94, len(W): 68348
Accuracy = 95.98% (4799/5000) (classification)
iteracija: 8, score: 95.98, len(W): 61514
Accuracy = 95.96% (4798/5000) (classification)
iteracija: 9, score: 95.96000000000001, len(W): 55363
Accuracy = 95.74% (4787/5000) (classification)
iteracija: 10, score: 95.740000000000

In [71]:
_, score = refinedE(X,y,X_test, y_test,"classifation",leafes_size=0.3,)
print("Najbolja preciznost: " + str(score))

Accuracy = 96.2% (4810/5000) (classification)
velicina: 1, score: 96.2, len(W): 142407
Accuracy = 96.2% (4810/5000) (classification)
velicina: 0.9, score: 96.2, len(W): 128167
Accuracy = 96.22% (4811/5000) (classification)
velicina: 0.81, score: 96.22, len(W): 115351
Accuracy = 96.3% (4815/5000) (classification)
velicina: 0.7290000000000001, score: 96.3, len(W): 103816
Accuracy = 96.32% (4816/5000) (classification)
velicina: 0.6561000000000001, score: 96.32, len(W): 93435
Accuracy = 96.28% (4814/5000) (classification)
velicina: 0.5904900000000002, score: 96.28, len(W): 84092
Accuracy = 96.3% (4815/5000) (classification)
velicina: 0.5314410000000002, score: 96.3, len(W): 75683
Accuracy = 96.24% (4812/5000) (classification)
velicina: 0.47829690000000014, score: 96.24000000000001, len(W): 68115
Accuracy = 96.24% (4812/5000) (classification)
velicina: 0.43046721000000016, score: 96.24000000000001, len(W): 61304
Accuracy = 96.14% (4807/5000) (classification)
velicina: 0.38742048900000015, s

##  regresija CPUSMALL seta

In [76]:
# https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression.html
data = np.genfromtxt('./cpusmall/cpusmall_scale.txt', delimiter=',',encoding="utf8")
X = data[:,1:]
y = data[:,0]
X, X_test, y, y_test = train_test_split(X, y, test_size=0.4)

In [91]:
_, score = refinedA(X,y,X_test, y_test,"regression",iteration=20,c = 0.01,depth_size_index=2)
print("Najbolja preciznost: " + str(score))

Mean squared error = 39.2906 (regression)
Squared correlation coefficient = 0.895588 (regression)
iteracija: 0, score: 0.8955875041878025, len(W): 25932
Mean squared error = 37.5573 (regression)
Squared correlation coefficient = 0.89988 (regression)
iteracija: 1, score: 0.8998802858206845, len(W): 23339
Mean squared error = 35.0086 (regression)
Squared correlation coefficient = 0.90631 (regression)
iteracija: 2, score: 0.9063101177288384, len(W): 21006
Mean squared error = 31.9924 (regression)
Squared correlation coefficient = 0.913794 (regression)
iteracija: 3, score: 0.9137935954967531, len(W): 18906
Mean squared error = 28.8491 (regression)
Squared correlation coefficient = 0.921812 (regression)
iteracija: 4, score: 0.9218118234750702, len(W): 17016
Mean squared error = 26.0391 (regression)
Squared correlation coefficient = 0.928986 (regression)
iteracija: 5, score: 0.9289862679530775, len(W): 15315
Mean squared error = 23.0142 (regression)
Squared correlation coefficient = 0.93671 

In [92]:
_, score = refinedE(X,y,X_test, y_test,"regression",leafes_size=0.3,)
print("Najbolja preciznost: " + str(score))

Mean squared error = 27.3473 (regression)
Squared correlation coefficient = 0.930739 (regression)
iteracija: 0, score: 0.9307391159964287, len(W): 115240
Mean squared error = 23.0954 (regression)
Squared correlation coefficient = 0.94095 (regression)
iteracija: 0, score: 0.9409503676690726, len(W): 103716
Mean squared error = 21.4512 (regression)
Squared correlation coefficient = 0.943903 (regression)
iteracija: 0, score: 0.943902590128311, len(W): 93345
Mean squared error = 20.3237 (regression)
Squared correlation coefficient = 0.945922 (regression)
iteracija: 0, score: 0.9459221747713878, len(W): 84011
Mean squared error = 18.2199 (regression)
Squared correlation coefficient = 0.951041 (regression)
iteracija: 0, score: 0.9510412422926204, len(W): 75610
Mean squared error = 17.6039 (regression)
Squared correlation coefficient = 0.952243 (regression)
iteracija: 0, score: 0.9522425289092203, len(W): 68049
Mean squared error = 17.622 (regression)
Squared correlation coefficient = 0.95176

## Abalone reggresion set

In [93]:
# https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression.html
data = np.genfromtxt('./abalone_set/abalone.txt', delimiter=',',encoding="utf8")
X = data[:,1:]
y = data[:,0]
X, X_test, y, y_test = train_test_split(X, y, test_size=0.4)

In [104]:
_, score = refinedA(X,y,X_test, y_test,"regression",iteration=10,c = 0.01,depth_size_index=2)
print("Najbolja preciznost: " + str(score))

Mean squared error = 6.85881 (regression)
Squared correlation coefficient = 0.497767 (regression)
iteracija: 0, score: 6.858812982817409, len(W): 60728
Mean squared error = 6.67283 (regression)
Squared correlation coefficient = 0.50116 (regression)
iteracija: 1, score: 6.672829328891643, len(W): 54656
Mean squared error = 6.52107 (regression)
Squared correlation coefficient = 0.502743 (regression)
iteracija: 2, score: 6.521069663240621, len(W): 49191
Mean squared error = 6.37414 (regression)
Squared correlation coefficient = 0.50217 (regression)
iteracija: 3, score: 6.374139135329044, len(W): 44272
Mean squared error = 6.20235 (regression)
Squared correlation coefficient = 0.506919 (regression)
iteracija: 4, score: 6.202351797417601, len(W): 39845
Mean squared error = 6.12033 (regression)
Squared correlation coefficient = 0.50194 (regression)
iteracija: 5, score: 6.120326327126437, len(W): 35861
Mean squared error = 6.02148 (regression)
Squared correlation coefficient = 0.499893 (regre

In [99]:
_, score = refinedE(X,y,X_test, y_test,"regression",leafes_size=0.3,)
print("Najbolja preciznost: " + str(score))

Mean squared error = 5.40909 (regression)
Squared correlation coefficient = 0.509967 (regression)
iteracija: 0, score: 5.409089426576987, len(W): 60891
Mean squared error = 5.38249 (regression)
Squared correlation coefficient = 0.510902 (regression)
iteracija: 0, score: 5.382492664932072, len(W): 54802
Mean squared error = 5.37784 (regression)
Squared correlation coefficient = 0.509785 (regression)
iteracija: 0, score: 5.37783546793041, len(W): 49322
Mean squared error = 5.3359 (regression)
Squared correlation coefficient = 0.513058 (regression)
iteracija: 0, score: 5.335896764334138, len(W): 44390
Mean squared error = 5.34769 (regression)
Squared correlation coefficient = 0.510643 (regression)
iteracija: 0, score: 5.347693778901994, len(W): 39951
Mean squared error = 5.35039 (regression)
Squared correlation coefficient = 0.509306 (regression)
iteracija: 0, score: 5.350391035189264, len(W): 35956
Mean squared error = 5.41993 (regression)
Squared correlation coefficient = 0.503351 (regr