In [111]:
import numpy as np
from rtree import index
from sklearn.neighbors import KDTree
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import normalize

class Motley():
    """Greedy search for spatially-near neighbours with diverse attributes.

    Points are indexed by their spatial coordinates (R-tree or kd-tree).
    A query walks the candidates nearest-first and keeps a candidate only
    when its weighted attribute difference ("MinDiv") to every neighbour
    already kept is strictly above ``threshold``.
    """

    def __init__(self, threshold=0.2, alpha=0.1, idx_method="rtree"):
        """Store the search parameters.

        threshold:  a candidate is rejected when its weighted difference to
                    any kept neighbour is <= this value.
        alpha:      base of the geometric weights applied to the sorted
                    attribute differences.
        idx_method: "kdtree" selects the kd-tree; any other value falls
                    back to "rtree".
        """
        self.threshold = threshold
        self.alpha = alpha
        self.idx_method = idx_method if idx_method == "kdtree" else "rtree"

    # X means data point in the spatial space
    # Z means corresponding attribute representation
    def datafeed(self, X, Z):
        """Index the points ``X`` and remember their attributes ``Z``.

        X is a 2-D array (points x spatial dimensions) and Z a 2-D array
        (points x attributes) with the same number of rows.  When the row
        counts differ a message is printed and nothing is stored.
        """
        # rownum: number of points, colnum: spatial dimension
        rownum, colnum = X.shape

        if rownum != Z.shape[0]:
            print("Number of input data doesn't match")
            return

        self.attributeset = Z
        num_attrs = self.attributeset.shape[1]

        # Build up index (kd-tree / R-tree)
        if self.idx_method == "kdtree":
            # Note: kd-tree doesn't support additional object linkage, so
            # attributes are looked up in self.attributeset by row number.
            self.index = KDTree(X)
        else:
            p = index.Property()
            p.dimension = colnum
            self.index = index.Index(properties=p)
            for idx, row in enumerate(X):
                # (1) Index based on row number
                # (2) Store point for the bounding box (min corner == max corner)
                # (3) Store attribute representation as inner object
                self.index.insert(idx, np.append(row, row), Z[idx])

        # Weights used for computing MinDiv
        # Number of weights depends on dimension of attribute space.
        self.weight = self._mindiv_weights(num_attrs)

    def _mindiv_weights(self, num_attrs):
        """Return the (1, num_attrs) geometric weight row used by MinDiv.

        The j-th weight is alpha**j * (1 - alpha) / (1 - alpha**num_attrs).
        """
        a = self.alpha
        if a == 1:
            # The formula is 0/0 at alpha == 1; its limit is uniform weights.
            return np.full((1, num_attrs), 1.0 / num_attrs)
        exponents = np.arange(num_attrs, dtype=float).reshape(1, -1)
        return (a ** exponents) * (1 - a) / (1 - a ** num_attrs)

    def _nearest_candidates(self, qs_space, amount):
        """Yield ``(point_id, attributes)`` for the ``amount`` nearest points."""
        if self.idx_method == "kdtree":
            # KDTree.query expects a 2-D batch of queries and returns one row
            # of neighbour indices per query; there is a single query here.
            query = np.asarray(qs_space).reshape(1, -1)
            neighbour_ids = self.index.query(
                query, k=amount, return_distance=False)[0]
            for point_id in neighbour_ids:
                point_id = int(point_id)
                yield point_id, self.attributeset[point_id]
        else:
            hits = self.index.nearest(
                np.append(qs_space, qs_space), amount, objects=True)
            for hit in hits:
                yield hit.id, hit.object

    # Query a point and find its diverse neighbors
    # aggress is set for next round's search, if k neighbors are not found
    # max_iter is set to avoid whole-document scanning
    def search(self, qs_space, k=10, aggress=5, approach="greedy", max_iter=5,
               verbose=True):
        """Return the ids of up to ``k`` diverse neighbours of ``qs_space``.

        qs_space: 1-D spatial coordinates of the query point.
        k:        number of neighbours wanted.
        aggress:  growth factor of the candidate window; the first round
                  inspects the ``k * aggress`` nearest points.
        approach: accepted for compatibility; only the greedy test exists.
        max_iter: maximum number of times the window is enlarged.
        verbose:  print progress messages for each round.

        The list may be shorter than ``k`` when ``max_iter`` is reached or
        every indexed point has been examined.
        """
        num_points, num_attrs = self.attributeset.shape
        ret = []
        if num_points == 0:
            return ret

        # Initial search: nearest (k * aggress) points, never more than exist
        s_amount = min(k * aggress, num_points)
        num_iter = 0
        # Ids examined in earlier rounds; they are not tested again.
        seen = set()

        # Initial result contains zero row, so the nearest neighbor is
        # guaranteed to be in the result set.
        res = np.empty((0, num_attrs))

        while len(ret) < k and num_iter < max_iter:
            if verbose:
                print("round %d..." % num_iter)
                print("search size = %d" % s_amount)
            candidates = self._nearest_candidates(qs_space, s_amount)
            if verbose:
                print("Returned. Handling...")

            for cand_id, cand_attr in candidates:
                if cand_id in seen:
                    continue
                seen.add(cand_id)
                # Add if pass the diversity test
                if self.diversity_check_greedy(res, cand_attr):
                    ret.append(cand_id)
                    res = np.vstack([res, cand_attr])
                    if verbose:
                        print("%d-th neighbor found!" % len(res))
                    if len(ret) == k:
                        break

            # Start the next round with a larger window
            if len(ret) < k:
                if s_amount >= num_points:
                    # The whole dataset has been examined; nothing is left.
                    break
                num_iter += 1
                s_amount = min(s_amount * aggress, num_points)
        return ret

    def diversity_check_greedy(self, X, q):
        """Tell whether ``q`` is diverse enough from every row of ``X``.

        For each row the absolute attribute differences are sorted in
        ascending order, multiplied by ``self.weight`` and summed.  Returns
        False as soon as one sum is <= ``self.threshold``, True otherwise
        (including when ``X`` has no rows).
        """
        if X.shape[0] == 0:
            return True
        # Sort 1-D differences of every row (in ascending order)
        diff_sorted = np.sort(np.absolute(X - q), axis=1)
        # Weighting, one MinDiv value per already-kept neighbour
        div_dists = (diff_sorted * self.weight).sum(axis=1)
        # If any difference is too small, dispose the candidate
        return not bool(np.any(div_dists <= self.threshold))

                
                

In [48]:
# Copy the first `max_line` rows of the Forest Cover data into a smaller
# file, then load that subset as a NumPy array.
max_line = 4000
dataset_path = "./../dataset/Forest_Cover/covtype.data.txt"
dataset_w_path = "./../dataset/Forest_Cover/covtype_s.data.txt"

ct = 0  # number of rows copied so far
# The context manager closes both files even if the copy fails part-way,
# and the cell no longer depends on handles left over from an earlier run.
with open(dataset_path) as fd, open(dataset_w_path, "w") as fd_w:
    # Iterate lazily instead of readlines(): only the first rows are needed.
    for line in fd:
        if ct >= max_line:
            break
        # Each line read from the file already ends with its newline.
        fd_w.write(line)
        ct += 1

r = np.genfromtxt(dataset_w_path, delimiter=',', dtype=None, names=None)
r.shape

(4000, 55)

In [126]:
# Columns 0-1 are used as the spatial coordinates and columns 2-4 as the
# attribute vector; each column is rescaled independently by minmax_scale.
data_space = minmax_scale(r[::, (0, 1)], copy=False)
data_attributes = minmax_scale(r[::, (2, 3, 4)], copy=False)

print(data_space.shape)
print(data_attributes.shape)

(4000, 2)
(4000, 3)




In [129]:

# Index the data; the stricter threshold of 0.05 is passed at construction
# instead of being patched onto the instance afterwards.
finder = Motley(threshold=0.05)
finder.datafeed(data_space, data_attributes)

space1_max, space1_min = data_space[::, 0].max(), data_space[::, 0].min()
space2_max, space2_min = data_space[::, 1].max(), data_space[::, 1].min()

# Seed the generator so the ten random query points, and therefore the
# neighbours found for them, are the same on every run.
SEED = 42
np.random.seed(SEED)

# Draw 10 query points uniformly inside the bounding box of the data.
rand1 = np.random.uniform(space1_min, space1_max, size=(10, 1))
rand2 = np.random.uniform(space2_min, space2_max, size=(10, 1))

test_input_space = np.hstack([rand1, rand2])

# One list of neighbour ids per query point.
res = []

for i, q in enumerate(test_input_space):
    print("Query no. %d" % (i+1))
    res.append(finder.search(q))

    

Query no. 1
round 0...
search size = 50
Returned. Handling...
1-th neighbor found!
2-th neighbor found!
3-th neighbor found!
4-th neighbor found!
round 1...
search size = 250
Returned. Handling...
5-th neighbor found!
6-th neighbor found!
7-th neighbor found!
round 2...
search size = 1250
Returned. Handling...
8-th neighbor found!
9-th neighbor found!
10-th neighbor found!
Query no. 2
round 0...
search size = 50
Returned. Handling...
1-th neighbor found!
2-th neighbor found!
3-th neighbor found!
4-th neighbor found!
5-th neighbor found!
round 1...
search size = 250
Returned. Handling...
6-th neighbor found!
7-th neighbor found!
8-th neighbor found!
9-th neighbor found!
round 2...
search size = 1250
Returned. Handling...
10-th neighbor found!
Query no. 3
round 0...
search size = 50
Returned. Handling...
1-th neighbor found!
2-th neighbor found!
3-th neighbor found!
4-th neighbor found!
round 1...
search size = 250
Returned. Handling...
5-th neighbor found!
6-th neighbor found!
round 2..

In [130]:
# Show the ids returned by finder.search, one list per query point.
res

[[2612, 83, 87, 2311, 1816, 1232, 1414, 1341, 1646, 2507],
 [964, 250, 800, 1893, 200, 2088, 1954, 1803, 292, 2049],
 [2669, 2291, 2456, 2791, 2484, 1871, 2939, 3653, 3320, 1954],
 [1738, 416, 212, 962, 758, 1803, 1954, 2121, 3871, 3320],
 [3274, 497, 268, 1972, 229, 2537, 3589, 2172, 1803, 2035],
 [80, 1528, 1312, 2045, 1573, 1423, 2435, 1283, 279, 2537],
 [3157, 26, 3556, 3320, 3563, 3654, 3766, 3998, 1892, 1954],
 [2184, 2559, 2092, 3812, 3513, 3082, 3085, 3550, 1954, 1892],
 [547, 862, 2311, 2581, 2312, 1357, 1290, 1705, 243, 1803],
 [26, 3251, 3556, 3562, 3666, 3654, 3544, 3055, 520, 1892]]