In [1]:
import random
import numpy as np
import matplotlib.pyplot as plt
from KDTree import Datapoint
from KDTree import KDTree
import pandas as pd

In [2]:
# Load the dataset from the local Feather file and show it
# (pandas renders a truncated head/tail view for a frame this long).
df = pd.read_feather(path='data.feather')
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
48838,64,,321403,HS-grad,9,Widowed,,Other-relative,Black,Male,0,0,40,United-States,<=50K.
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


In [3]:
# Running tally of rows per race+sex group, keyed by the concatenated
# string (e.g. 'WhiteMale'). Filled in by create_color.
raceSexCounter = dict()


def create_color(row):
    """Count one row towards its race+sex group in ``raceSexCounter``.

    ``row`` must expose ``race`` and ``sex`` attributes that can be joined
    with ``+``. The function only updates the module-level tally and
    implicitly returns ``None``.
    """
    group = row.race + row.sex
    raceSexCounter[group] = raceSexCounter.get(group, 0) + 1

In [4]:
# Reset the tally first: create_color only ever increments raceSexCounter,
# so re-running this cell without clearing would double-count every row.
raceSexCounter.clear()
# create_color returns None for each row, so the displayed Series is all None;
# the useful result is the populated raceSexCounter.
df.apply(create_color, axis=1)

0        None
1        None
2        None
3        None
4        None
         ... 
48837    None
48838    None
48839    None
48840    None
48841    None
Length: 48842, dtype: object

In [5]:
# Build two weighting schemes per race+sex group. Both share the same integer
# colour id (assigned in first-seen order of the tally):
#   colors1 / dp_colors1 -> weight = the group's share of all counted rows
#   colors2 / dp_colors2 -> weight = 1 - that share
# colors* are keyed by group name; dp_colors* are keyed by the colour id as a string.
colors1 = dict()
colors2 = dict()
dp_colors1 = dict()
dp_colors2 = dict()
# Derive the denominator from the tally itself instead of hard-coding the row
# count, so the shares stay correct if the data (or the tally) changes.
total_rows = sum(raceSexCounter.values())
for color_id, (key, val) in enumerate(raceSexCounter.items()):
    share = val / total_rows
    colors1[key] = {'color': color_id, 'weight': share}
    colors2[key] = {'color': color_id, 'weight': 1 - share}
    dp_colors1[str(color_id)] = share
    dp_colors2[str(color_id)] = 1 - share
# Kept for backward compatibility: the previous hand-rolled loop left `counter`
# equal to the number of groups once it finished.
counter = len(raceSexCounter)

In [6]:
# Inspect colors1: group name -> colour id and weight (the group's fraction of rows).
colors1

{'WhiteMale': {'color': 0, 'weight': 0.5883256213914254},
 'BlackMale': {'color': 1, 'weight': 0.04866713074812661},
 'BlackFemale': {'color': 2, 'weight': 0.04725441218623316},
 'WhiteFemale': {'color': 3, 'weight': 0.2667171696490725},
 'Asian-Pac-IslanderMale': {'color': 4, 'weight': 0.0205151304205397},
 'Amer-Indian-EskimoMale': {'color': 5, 'weight': 0.005835141886081651},
 'OtherFemale': {'color': 6, 'weight': 0.0031734982187461612},
 'Asian-Pac-IslanderFemale': {'color': 7, 'weight': 0.010585152123172679},
 'Amer-Indian-EskimoFemale': {'color': 8, 'weight': 0.0037877236804389667},
 'OtherMale': {'color': 9, 'weight': 0.005139019696163138}}

In [7]:
# Inspect colors2: same colour ids as colors1, weight = 1 - the group's fraction of rows.
colors2

{'WhiteMale': {'color': 0, 'weight': 0.4116743786085746},
 'BlackMale': {'color': 1, 'weight': 0.9513328692518734},
 'BlackFemale': {'color': 2, 'weight': 0.9527455878137668},
 'WhiteFemale': {'color': 3, 'weight': 0.7332828303509276},
 'Asian-Pac-IslanderMale': {'color': 4, 'weight': 0.9794848695794603},
 'Amer-Indian-EskimoMale': {'color': 5, 'weight': 0.9941648581139183},
 'OtherFemale': {'color': 6, 'weight': 0.9968265017812539},
 'Asian-Pac-IslanderFemale': {'color': 7, 'weight': 0.9894148478768273},
 'Amer-Indian-EskimoFemale': {'color': 8, 'weight': 0.996212276319561},
 'OtherMale': {'color': 9, 'weight': 0.9948609803038369}}

In [19]:
# Columns used as KD-tree coordinates, in coordinate order. Selecting them by
# name avoids the positional itertuples aliases (row._5, row._11, ...) that the
# hyphenated column names would otherwise force, and which silently point at
# the wrong column if the frame's column order ever changes.
feature_columns = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

datapoints1 = list()
datapoints2 = list()
coordinate_rows = df[feature_columns].itertuples(index=False, name=None)
for coords, race, sex in zip(coordinate_rows, df['race'], df['sex']):
    raceSex = race + sex
    # One Datapoint per tree so the two trees never share point objects.
    # colors1 and colors2 give a group the same colour id; only weights differ.
    dp1 = Datapoint(coordinate=list(coords), color=str(colors1[raceSex]['color']))
    dp2 = Datapoint(coordinate=list(coords), color=str(colors2[raceSex]['color']))
    datapoints1.append(dp1)
    datapoints2.append(dp2)

# Per-dimension bounds used later to sample random query rectangles.
# Series.min()/max() are vectorised; int() converts the numpy scalar to a
# plain Python int, which is what random.randint is given downstream.
min_age = int(df['age'].min())
min_fnlwgt = int(df['fnlwgt'].min())
min_education = int(df['education-num'].min())
min_cgain = int(df['capital-gain'].min())
min_closs = int(df['capital-loss'].min())
min_hrs = int(df['hours-per-week'].min())

max_age = int(df['age'].max())
max_fnlwgt = int(df['fnlwgt'].max())
max_education = int(df['education-num'].max())
max_cgain = int(df['capital-gain'].max())
max_closs = int(df['capital-loss'].max())
max_hrs = int(df['hours-per-week'].max())

In [9]:
# Two independent, still-empty trees: one per weighting scheme.
tree1, tree2 = KDTree(), KDTree()

In [10]:
# Build the first tree from datapoints1. dp_colors1 maps each colour id (as a
# string) to that group's share of rows; how build_tree uses these weights is
# defined in the KDTree module, not in this notebook.
tree1.build_tree(datapoints1, dp_colors1)

In [11]:
# Build the second tree from datapoints2. dp_colors2 maps each colour id (as a
# string) to 1 - that group's share of rows; how build_tree uses these weights
# is defined in the KDTree module, not in this notebook.
tree2.build_tree(datapoints2, dp_colors2)

In [15]:
def _random_query_rectangle(bounds):
    """Draw a random axis-aligned query rectangle.

    For each ``(low, high)`` pair in ``bounds`` a start value is drawn
    uniformly from ``[low, high]`` and then an end value uniformly from
    ``[start, high]``, so ``start <= end`` in every dimension.

    Returns a ``(starts, ends)`` tuple of two lists with one entry per
    dimension, in the order of ``bounds``.
    """
    starts, ends = [], []
    for low, high in bounds:
        start = random.randint(low, high)
        starts.append(start)
        ends.append(random.randint(start, high))
    return starts, ends


# (min, max) per dimension, in the same order as the Datapoint coordinates:
# age, final weight, education level, capital gain, capital loss, hours per week.
query_bounds = [
    (min_age, max_age),
    (min_fnlwgt, max_fnlwgt),
    (min_education, max_education),
    (min_cgain, max_cgain),
    (min_closs, max_closs),
    (min_hrs, max_hrs),
]

# Number of non-None results wanted. Decrementing the index of a
# `for i in range(...)` loop has no effect in Python (the next iteration
# overwrites it), so retries are implemented by counting successes instead.
target_results = 1000
# Upper bound on total queries so the cell terminates even if almost every
# random rectangle comes back empty.
max_attempts = 10 * target_results

successes = 0
attempts = 0
while successes < target_results and attempts < max_attempts:
    attempts += 1
    # Get the result of the level1 random selection.
    # NOTE(review): a None result appears to mean the rectangle matched no
    # points (see the "returned 0 nodes" messages) -- confirm against KDTree.
    result = tree1.level1_random_selection(_random_query_rectangle(query_bounds))
    if result is None:
        continue
    successes += 1
    print(result)

if successes < target_results:
    print(f"Stopped after {attempts} attempts with only {successes} non-empty results.")

The query rectangle returned 0 nodes.  ([82, 317906, 14, 50539, 860, 7], [90, 894084, 16, 57818, 3829, 71])
The query rectangle returned 0 nodes.  ([35, 1106732, 7, 35921, 2579, 18], [85, 1119695, 15, 56500, 3125, 89])
The query rectangle returned 0 nodes.  ([81, 549081, 2, 36398, 3382, 87], [87, 1082059, 13, 72238, 4345, 91])
The query rectangle returned 0 nodes.  ([86, 985170, 8, 97797, 3855, 76], [89, 1299418, 11, 99807, 3997, 90])
The query rectangle returned 0 nodes.  ([63, 1007811, 11, 1348, 1998, 4], [81, 1469046, 11, 13603, 3775, 79])
The query rectangle returned 0 nodes.  ([47, 954945, 8, 70334, 1909, 16], [80, 1483432, 16, 98933, 3146, 79])
The query rectangle returned 0 nodes.  ([72, 730237, 12, 34203, 3434, 41], [80, 1035498, 15, 91881, 4144, 82])
The query rectangle returned 0 nodes.  ([85, 46022, 4, 63059, 3783, 12], [87, 483941, 13, 73803, 3946, 41])
The query rectangle returned 0 nodes.  ([65, 1192957, 6, 81031, 3037, 72], [80, 1266016, 13, 82791, 3667, 86])
The query r

In [22]:
# Manual cross-check of the first empty query rectangle printed above:
# keep rows at or above its lower age bound (82).
df_test = df.loc[df['age'] >= 82]
df_test

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
222,90,Private,51744,HS-grad,9,Never-married,Other-service,Not-in-family,Black,Male,0,2206,40,United-States,<=50K
1040,90,Private,137018,HS-grad,9,Never-married,Other-service,Not-in-family,White,Female,0,0,40,United-States,<=50K
1168,88,Self-emp-not-inc,206291,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,<=50K
1935,90,Private,221832,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,<=50K
2303,90,Private,52386,Some-college,10,Never-married,Other-service,Not-in-family,Asian-Pac-Islander,Male,0,0,35,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47649,90,Private,197613,HS-grad,9,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,>50K.
47659,88,Private,30102,Bachelors,13,Never-married,Exec-managerial,Not-in-family,White,Male,0,1816,50,,<=50K.
47969,89,,29106,10th,6,Married-civ-spouse,,Husband,White,Male,0,0,20,United-States,<=50K.
48495,84,Private,65478,HS-grad,9,Widowed,Priv-house-serv,Not-in-family,White,Female,0,0,40,England,<=50K.


In [25]:
# Narrow further to rows at or above that rectangle's lower fnlwgt bound (317906).
df_test2 = df_test.loc[df_test['fnlwgt'] >= 317906]
df_test2

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
13025,84,?,368925,5th-6th,3,Widowed,?,Not-in-family,White,Male,0,0,15,United-States,<=50K
27795,84,Private,388384,7th-8th,4,Married-civ-spouse,Prof-specialty,Husband,Black,Male,0,0,10,United-States,<=50K
32494,82,?,403910,HS-grad,9,Never-married,?,Not-in-family,White,Male,0,0,3,United-States,<=50K
45007,90,Private,347074,Some-college,10,Never-married,Adm-clerical,Own-child,White,Female,0,1944,12,United-States,<=50K.


In [27]:
# Apply that rectangle's upper fnlwgt bound (894084) as well.
df_test3 = df_test2.loc[df_test2['fnlwgt'] <= 894084]
df_test3

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
13025,84,?,368925,5th-6th,3,Widowed,?,Not-in-family,White,Male,0,0,15,United-States,<=50K
27795,84,Private,388384,7th-8th,4,Married-civ-spouse,Prof-specialty,Husband,Black,Male,0,0,10,United-States,<=50K
32494,82,?,403910,HS-grad,9,Never-married,?,Not-in-family,White,Male,0,0,3,United-States,<=50K
45007,90,Private,347074,Some-college,10,Never-married,Adm-clerical,Own-child,White,Female,0,1944,12,United-States,<=50K.
