In [1]:
import gzip
import numpy as np

In [2]:
#https://archive.ics.uci.edu/ml/datasets/covertype
#http://yann.lecun.com/exdb/mnist/

image_size = 28
train_data_count = 60_000
test_data_count = 10_000

def readGzImages(path,number_of_images):
    input_X = gzip.open(path,'r')
    input_X.read(16)
    buf_X = input_X.read(image_size * image_size * number_of_images)
    X = np.frombuffer(buf_X, dtype=np.uint8).astype(np.float32)
    X = X.reshape(number_of_images ,image_size * image_size)
    return X

def readGzLabels(path, number_of_labels):
    input_y = gzip.open(path,'r')
    input_y.read(8)
    buf_y = input_y.read(1 * number_of_labels)
    y = np.frombuffer(buf_y, dtype=np.uint8).astype(np.int32)
    y = y.reshape(number_of_labels)
    return y

X = readGzImages('./MNIST_set/train-images-idx3-ubyte.gz', train_data_count)
y = readGzLabels('./MNIST_set/train-labels-idx1-ubyte.gz', train_data_count)

X_test = readGzImages('./MNIST_set/t10k-images-idx3-ubyte.gz', test_data_count)
y_test = readGzLabels('./MNIST_set/t10k-labels-idx1-ubyte.gz', test_data_count)

In [3]:
tree_depth_size = [10,15,25]
from sklearn.ensemble import RandomForestClassifier

In [259]:
clf = RandomForestClassifier(
         n_estimators=100, max_features="sqrt", max_depth=tree_depth_size[2],min_samples_split=5
         )
def indexing(index, size):
    arr = np.zeros(size, dtype = np.int8)
    arr[index] = 1
    return arr

clf.fit(X,y)
forest = clf.estimators_
FI_X = np.array([f.predict(X) for f in forest]).T
FI_X_test = np.array([f.predict(X_test) for f in forest]).T

In [253]:
def mapToIndexesLeaves1(tree):
    mapa = {}
    counter = 0
    mapa,counter = rekurzivno(tree,mapa,counter,0)
    return mapa

def rekurzivno(tree, mapa,counter,node):
    if tree.children_left[node] == -1:
        mapa[node] = counter
        counter +=1
        return mapa, counter
    mapa, counter = rekurzivno(tree, mapa, counter, tree.children_left[node])
    mapa, counter = rekurzivno(tree, mapa, counter, tree.children_right[node])
    return mapa, counter




In [256]:
def mapToIndexesLeaves(tree):
    mapa = {}
    counter = 0
    for i in range(tree.node_count):
        if tree.children_left[i] == -1:
            mapa[i] = counter
            counter +=1
            i = tree.children_right[i-1]
    return mapa

In [281]:
forestMap = [mapToIndexesLeaves1(f.tree_) for f in forest]

In [258]:
len(mapToIndexesLeaves1(forest[0].tree_))

3476

In [85]:
print(forest[0].tree_.node_count)
forest[65].get_n_leaves()
print(list(zip(forest[8].tree_.children_left[1270:1280],forest[8].tree_.children_right[1270:1280])))


7181
[(1271, 1274), (1272, 1273), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1)]


In [136]:
forest[0].decision_path([X[0]]).toarray().shape

(1, 7195)

In [87]:
forestMap[8][1274]

630

In [9]:
from scipy.sparse import csr_matrix, hstack
n_non_leaves = forest[0].tree_.node_count - forest[0].get_n_leaves()
indexes = np.subtract(forest[0].apply(X),n_non_leaves)
fi = csr_matrix([indexing(i,forest[0].get_n_leaves()) for i in indexes])
n_non_leaves1 = forest[1].tree_.node_count - forest[1].get_n_leaves()
indexes1 = np.subtract(forest[1].apply(X),n_non_leaves1)
fi1 = csr_matrix([indexing(i,forest[1].get_n_leaves()) for i in indexes1])
fi2 = hstack([fi,fi1])

In [236]:
print(len(forest[0].tree_.children_left))
forest[0].tree_.children_left = np.delete(forest[0].tree_.children_left,10)
print(len(forest[0].tree_.children_left))

7429


AttributeError: attribute 'children_left' of 'sklearn.tree._tree.Tree' objects is not writable

In [32]:
forest[0].tree_.node_count

7555

In [50]:

print(forest[0].decision_path(X))

  (0, 0)	1
  (0, 1)	1
  (0, 2)	1
  (0, 3)	1
  (0, 531)	1
  (0, 532)	1
  (0, 533)	1
  (0, 597)	1
  (0, 627)	1
  (0, 628)	1
  (0, 629)	1
  (0, 630)	1
  (0, 640)	1
  (0, 641)	1
  (0, 642)	1
  (0, 643)	1
  (0, 644)	1
  (0, 645)	1
  (1, 0)	1
  (1, 4172)	1
  (1, 6938)	1
  (1, 7140)	1
  (1, 7302)	1
  (1, 7303)	1
  (1, 7304)	1
  :	:
  (59998, 7142)	1
  (59998, 7178)	1
  (59998, 7179)	1
  (59998, 7180)	1
  (59998, 7181)	1
  (59998, 7182)	1
  (59998, 7183)	1
  (59998, 7184)	1
  (59998, 7186)	1
  (59998, 7187)	1
  (59998, 7188)	1
  (59999, 0)	1
  (59999, 4172)	1
  (59999, 4173)	1
  (59999, 5385)	1
  (59999, 5865)	1
  (59999, 5866)	1
  (59999, 6214)	1
  (59999, 6215)	1
  (59999, 6216)	1
  (59999, 6260)	1
  (59999, 6286)	1
  (59999, 6287)	1
  (59999, 6288)	1
  (59999, 6289)	1


In [5]:
def leafIndexesOfLeaves(treeDecision,X, offset = 0):
    indexes = np.array([])
    indices = treeDecision.decision_path(X).indices
    tmp = 0
    for i in range(len(indices[:-1])):
        if indices[i+1] - indices[i] == 1: continue
        if indices[i+1] == 0:
            indexes = np.append(indexes,tmp + offset)
            tmp = 0
            continue
        tmp = tmp  + (indices[i+1] - indices[i]) // 2
    indexes = np.append(indexes,tmp + offset)
    return indexes

In [11]:
def leafIndexesOfLeaves1(treeDecision,mapa, X, offset = 0):
    return np.array([mapa[x] + offset for x in treeDecision.apply(X)])

In [57]:
leafIndexesOfLeaves(forest[0])

array([ 316., 3675.,  924., ...,  328., 3589., 3141.])

In [56]:
for i,f in enumerate(forest):
    arr = leafIndexesOfLeaves(f)
    if(min(arr) < 0): 
        print("Greška u "+ i + "min je: " + min(arr))
    if(max(arr) >= f.get_n_leaves()): 
        print("Greška u "+ i + "max je: " + max(arr) + ", broj listova: " + f.get_n_leaves())


In [10]:
from scipy.sparse import csr_matrix
def getFI(X, forest):
    indptr = np.array([0])
    indices = np.array([])
    offset = 0
    for f in forest:
        indexes = leafIndexesOfLeaves(f,X,offset=offset)
        indices = np.concatenate((indices,indexes))
        offset += f.get_n_leaves()
    indptr = np.concatenate((indptr,np.full(60_000,offset)))
    FI = csr_matrix((np.ones(indices.shape), indices, indptr), shape=(60_000, np.sum(indptr)))
    return FI

In [175]:
from scipy.sparse import csr_matrix
def getFI1(X, forest):
    col = np.array([])
    row = np.array([])
    offset = 0
    for i,(f,mapa) in enumerate(zip(forest,forestMap)):
        indexes = leafIndexesOfLeaves1(f,mapa, X,offset=offset)
        col = np.concatenate((col,indexes))
        row = np.concatenate((row,np.arange(0,X.shape[0])))
        offset += len(mapa)
    FI = csr_matrix((np.ones(col.shape,dtype=np.int8), (row, col)), shape=(X.shape[0],offset))
    return FI

In [91]:
np.arange(0,(5+1) * 3 ,3)
offset
indptr = np.arange(0,(X.shape[0] + 1) * offset,offset,dtype=np.float64)
indptr[-1]
offset

  indptr = np.arange(0,(X.shape[0] + 1) * offset,offset,dtype=np.float64)


368503

In [282]:
FI1 = getFI1(X,forest)

In [189]:
FI1

<60000x330464 sparse matrix of type '<class 'numpy.int8'>'
	with 6000000 stored elements in Compressed Sparse Row format>

In [114]:
a = forest[1].apply([X[1]])[0]
forestMap[1][a] + forest[0].tree_.n_leaves

5176

In [8]:
FI = csr_matrix((np.ones(indices.shape), indices, indptr), shape=(60_000, np.sum(indptr)))

In [283]:
FI_test = getFI1(X_test,forest)

In [46]:
FI_test

<10000x368503 sparse matrix of type '<class 'numpy.float64'>'
	with 368503 stored elements in Compressed Sparse Row format>

In [102]:
np.arange(0,5)

array([0, 1, 2, 3, 4])

In [20]:
indexes

array([ 3278,   197, -3093, ...,  2110,  2786,  1349], dtype=int64)

In [153]:
fi.shape

(60000, 3598)

In [34]:
sum([x.get_n_leaves() for x in forest])

368503

In [148]:
forest[0].tree_.node_count
forest[0].get_n_leaves()
n_non_leaves = forest[0].tree_.node_count - forest[0].get_n_leaves()
n_non_leaves

3597

In [124]:
a = forest[0].predict_proba(X)
for f in forest[1:]:
    a = np.concatenate((a,f.predict_proba(X)),axis =1)

a_test = forest[0].predict_proba(X_test)
for f in forest[1:]:
    a_test = np.concatenate((a_test,f.predict_proba(X_test)),axis =1)

In [128]:
forest[0].predict_proba(X).shape

(60000, 10)

In [7]:
FI_X.shape

(60000, 100)

In [5]:
FI_X1 = np.array([[indexing(a.astype(int),10) for a in b] for b in FI_X]).reshape(60000,1000)
FI_X1_test = np.array([[indexing(a.astype(int),10) for a in b] for b in FI_X_test]).reshape(10000,1000)

In [9]:
y1 = np.array([indexing(a.astype(int),10) for a in y])

In [10]:
y1.shape

(60000, 10)

In [284]:
from liblinear.liblinearutil import *
y_indexing = np.array([indexing(i,10) for i in y])
prob  = problem(y, FI1)
param = parameter('-s 2 -c 1 ')
m = train(prob, param)
[W, b] = m.get_decfun()

In [285]:
y_pred,_,_ = predict(y_test, FI_test, m)

Accuracy = 97.85% (9785/10000) (classification)


In [232]:
import copy
forest_tmp = copy.deepcopy(forest)
forestMap_tmp = copy.deepcopy(forestMap)


In [279]:
from sklearn.tree._tree import TREE_LEAF
neighbors = []
offset = 0
for i,(mapa,f) in enumerate(zip(forestMap,forest)):
    prev_node_index = None
    prev_leaf_index = None
    for (node_index, leaf_index) in mapa.items():
        if prev_node_index is None:
            prev_node_index = node_index
            prev_leaf_index = leaf_index
            continue
        
        if f.tree_.children_right[prev_node_index - 1] == node_index:
            #print(i,prev_node_index,node_index)
            singf = abs(W[offset + prev_leaf_index]) + abs (W[offset + leaf_index])
            neighbors.append([singf,i,prev_node_index-1])
            prev_node_index = None
            prev_leaf_index = None
            continue
        prev_node_index = node_index
        prev_leaf_index = leaf_index
    offset += len(mapa)

In [209]:
neighbors.sort(key = lambda x : x[0])
neighbors


[[3.5787434916477935e-05, 95, 3801],
 [5.4814448631998414e-05, 98, 2713],
 [0.00010112330063747299, 40, 7040],
 [0.00015286387015271712, 54, 1724],
 [0.00015356758569464907, 81, 1191],
 [0.0001984228155173963, 32, 4713],
 [0.0002034996759221203, 52, 479],
 [0.0002121913856303953, 56, 5258],
 [0.00021592820282613883, 91, 6109],
 [0.0002411791312161632, 61, 6973],
 [0.000261870819018929, 59, 729],
 [0.00026434684053572787, 49, 6543],
 [0.0002764699514984959, 45, 2189],
 [0.00028886129082041236, 98, 524],
 [0.00029093549429038383, 50, 1297],
 [0.0003050386694608301, 61, 5935],
 [0.0003118802465352752, 53, 2455],
 [0.0003281782587131175, 96, 7165],
 [0.0003472477362411722, 40, 1288],
 [0.00036561291263741206, 67, 4283],
 [0.0003684495694339031, 67, 1264],
 [0.0003700452324990657, 54, 6714],
 [0.0003819694326399446, 68, 1848],
 [0.0003897844627443133, 32, 4439],
 [0.0003960551261719433, 73, 6161],
 [0.0004002734347649724, 34, 979],
 [0.0004084245644472415, 69, 5992],
 [0.0004193680319272849

In [187]:
sum([len(x) for x in forestMap])

330464

In [238]:
import copy
tmp_FI = copy.deepcopy(FI1)

In [234]:
for singf,f_index,parent in neighbors:
    a = forest[f_index].tree_.children_left[parent]
    b = forest[f_index].tree_.children_right[parent]
    if a ==-1 or b == -1:
        print(singf,f_index,parent)
    c = forest[f_index].tree_.children_left[a]
    d = forest[f_index].tree_.children_left[b]
    if c != -1 or d != -1:
        print(singf,f_index,parent,c,d)

In [280]:
from sklearn.tree._tree import TREE_LEAF
#neighbors.sort(key = lambda x : x[0])
delete_mapa = {}
for singf,f_index,parent in neighbors[:int(len(W) * 0.1)]:
#for singf,f_index,parent in neighbors:
    forest[f_index].tree_.children_left[parent] = TREE_LEAF
    forest[f_index].tree_.children_right[parent] = TREE_LEAF
#     if f_index not in delete_mapa.keys():
#         delete_mapa[f_index] = [parent]
#     else: 
#         delete_mapa[f_index] = delete_mapa[f_index] + [parent]

#     #print(singf,f_index,parent)
#     # a = forest[f_index].tree_.children_left[parent]
#     # b = forest[f_index].tree_.children_right[parent]
#     # if a ==-1 or b == -1:
#     #     print(singf,f_index,parent)
#     # c = forest[f_index].tree_.children_left[a]
#     # d = forest[f_index].tree_.children_left[b]
#     # if c != -1 or d != -1:
#     #     print(singf,f_index,parent,c,d)
#     # print(forest[f_index].tree_.children_left[parent])
#     # print(forest[f_index].tree_.children_right[parent])
#     # print(forest[f_index].tree_.children_left[a])
#     # print(forest[f_index].tree_.children_left[b])
# for key,values in delete_mapa.items():
#     print(key,len(values))
#     # node_indexes = list(forestMap[key].keys())
    
#     # for v in values:
#     #     node_indexes.append(v)
#     #     node_indexes.remove(v+1)
#     #     node_indexes.remove(v+2)
#     # node_indexes.sort()
#     # forestMap[key] = dict(zip(node_indexes,list(range(len(node_indexes)))))






In [145]:
sum(np.array(list(delete_mapa.values()))[:,1])/ 2


  sum(np.array(list(delete_mapa.values()))[:,1])/ 2


36981.5

In [237]:
a =  [1,3,4,6]
a.append(23)
a

[1, 3, 4, 6, 23]

In [98]:
type(forest[0].tree_.children_left)

numpy.ndarray

In [173]:
FI[:,0] = FI.getcol(0) + FI.getcol(1)

In [190]:
row = np.array([0, 0, 1, 2, 2, 2])
col = np.array([0, 2, 2, 0, 1, 2])
data = np.array([1, 2, 3, 4, 5, 6])
a = csr_matrix((data, (row, col)), shape=(3, 3))
print(a.toarray())
a[:,1] = a.getcol(1) + a.getcol(2)
print(a.toarray())
print(a.tolil()[:,[1,2]].toarray())

[[1 0 2]
 [0 0 3]
 [4 5 6]]
[[ 1  2  2]
 [ 0  3  3]
 [ 4 11  6]]
[[ 2  2]
 [ 3  3]
 [11  6]]


In [116]:
m.get_decfun_coef(feat_idx=7, label_idx=0)

-0.020985986039304214

In [246]:
W1 = np.array(W).reshape(100,10)
W1[1]
norms = [np.linalg.norm(w) for w in W1]
sum_norms = []

for i in range(len(norms) - 1):
    sum_norms.append(norms[i] + norms[i+1])
sum_enumerate = list(enumerate(sum_norms))
sum_enumerate.sort(key= lambda x : x[1])
only10 = sum_enumerate[:int(len(sum_enumerate) * 0.1)] 
for index,_ in only10:
    a = W1[index]
    b = W1[index+1]
    W1[index] = a + b
W1 = np.delete(W1,[x[0] for x in only10],axis=0)

print(W1.shape)


ValueError: cannot reshape array of size 369792 into shape (100,10)

In [90]:
np.linalg.norm(W1[99])
#np.linalg.norm(W1[98])


0.05131417664551044

In [8]:
def predict_y(W,X):
    a = np.array([f.predict(X) for f in forest]).T
    b = np.array([[indexing(a.astype(int),10) for a in b] for b in a])
    predic = np.array([W @ x for x in b])
    return predic

In [16]:
a = predict_y(W,X_test)

In [17]:
a.shape

(10000, 10)

In [11]:
a1 = np.array([[np.argmax(np.absolute(x)) for x in b] for b in a])

In [19]:
a3 = np.array([np.argmax(np.absolute(b)) for b in a])

In [12]:
from statistics import mode
a2 = np.array([mode(r) for r in a1])

In [20]:
a3[0]


7

In [42]:
y_test[0]

7

In [33]:
a1[576]

3

In [48]:
from sklearn.metrics import zero_one_loss
error_rate = zero_one_loss(y_test, y_pred)
error_rate

ValueError: Classification metrics can't handle a mix of multiclass and continuous targets