In [1]:
import random
import numpy as np
import matplotlib.pyplot as plt
from KDTree import Datapoint
from KDTree import KDTree
import pandas as pd

In [2]:
# Read the dataset from the Feather file into `df`; the bare name on the last
# line makes the frame this cell's displayed output.
df = pd.read_feather(path='data.feather')
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
48838,64,,321403,HS-grad,9,Widowed,,Other-relative,Black,Male,0,0,40,United-States,<=50K.
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


In [3]:
# Running tally of rows per race+sex group, keyed by the concatenated label
# (e.g. 'WhiteMale'); keys appear in first-seen order.
raceSexCounter = {}


def create_color(row):
    """Count one row towards its race+sex group in ``raceSexCounter``.

    Despite the name, no color is produced here: the function only increments
    the module-level tally under the key ``row.race + row.sex`` and implicitly
    returns ``None``.

    Parameters
    ----------
    row
        Any object exposing ``race`` and ``sex`` attributes that can be
        concatenated with ``+``.
    """
    group_key = row.race + row.sex
    raceSexCounter[group_key] = raceSexCounter.get(group_key, 0) + 1

In [4]:
# Tally every row of df into raceSexCounter. create_color works purely by side
# effect and returns None, which is why the displayed Series is all None.
# Empty the tally first so that re-running this cell does not count each row a
# second time (which would inflate every group count derived from it).
raceSexCounter.clear()
df.apply(create_color, axis=1)

0        None
1        None
2        None
3        None
4        None
         ... 
48837    None
48838    None
48839    None
48840    None
48841    None
Length: 48842, dtype: object

In [5]:
# Give every race+sex group an integer color id (in the insertion order of
# raceSexCounter) together with two alternative weights:
#   colors1 / dp_colors1 -> the group's share of all counted rows
#   colors2 / dp_colors2 -> the complement of that share (1 - share)
# colors* are keyed by group label; dp_colors* are keyed by the color id as a
# string and hold only the weight.
colors1 = dict()
colors2 = dict()
dp_colors1 = dict()
dp_colors2 = dict()

# Derive the denominator from the tally itself rather than hard-coding the
# dataset size, so the shares always sum to 1 for whatever data was counted.
total_rows = sum(raceSexCounter.values())

counter = 0
for key, val in raceSexCounter.items():
    share = val / total_rows
    colors1[key] = {'color': counter, 'weight': share}
    colors2[key] = {'color': counter, 'weight': 1 - share}
    dp_colors1[str(counter)] = share
    dp_colors2[str(counter)] = 1 - share
    counter += 1

In [6]:
# Show the group -> {color id, weight} table whose weight is the group's share of rows.
colors1

{'WhiteMale': {'color': 0, 'weight': 0.5883256213914254},
 'BlackMale': {'color': 1, 'weight': 0.04866713074812661},
 'BlackFemale': {'color': 2, 'weight': 0.04725441218623316},
 'WhiteFemale': {'color': 3, 'weight': 0.2667171696490725},
 'Asian-Pac-IslanderMale': {'color': 4, 'weight': 0.0205151304205397},
 'Amer-Indian-EskimoMale': {'color': 5, 'weight': 0.005835141886081651},
 'OtherFemale': {'color': 6, 'weight': 0.0031734982187461612},
 'Asian-Pac-IslanderFemale': {'color': 7, 'weight': 0.010585152123172679},
 'Amer-Indian-EskimoFemale': {'color': 8, 'weight': 0.0037877236804389667},
 'OtherMale': {'color': 9, 'weight': 0.005139019696163138}}

In [7]:
# Show the group -> {color id, weight} table whose weight is 1 minus the group's share of rows.
colors2

{'WhiteMale': {'color': 0, 'weight': 0.4116743786085746},
 'BlackMale': {'color': 1, 'weight': 0.9513328692518734},
 'BlackFemale': {'color': 2, 'weight': 0.9527455878137668},
 'WhiteFemale': {'color': 3, 'weight': 0.7332828303509276},
 'Asian-Pac-IslanderMale': {'color': 4, 'weight': 0.9794848695794603},
 'Amer-Indian-EskimoMale': {'color': 5, 'weight': 0.9941648581139183},
 'OtherFemale': {'color': 6, 'weight': 0.9968265017812539},
 'Asian-Pac-IslanderFemale': {'color': 7, 'weight': 0.9894148478768273},
 'Amer-Indian-EskimoFemale': {'color': 8, 'weight': 0.996212276319561},
 'OtherMale': {'color': 9, 'weight': 0.9948609803038369}}

In [8]:
# Build one Datapoint per row for each of the two trees. The coordinate is the
# six numeric columns listed below; the color is the row's race+sex color id
# rendered as a string (matching the string keys of dp_colors1 / dp_colors2).
#
# Columns are read by their real names. itertuples() replaces names that are not
# valid identifiers (e.g. 'education-num') with positional ones such as `_5`,
# which silently point at a different column if the column order ever changes.
coord_columns = ['age', 'fnlwgt', 'education-num',
                 'capital-gain', 'capital-loss', 'hours-per-week']

datapoints1 = list()
datapoints2 = list()
for race, sex, *coords in zip(df['race'], df['sex'],
                              *(df[col] for col in coord_columns)):
    raceSex = race + sex
    # Each Datapoint gets its own coordinate list so the two trees never share
    # a mutable object.
    dp1 = Datapoint(coordinate=list(coords), color=str(colors1[raceSex]['color']))
    dp2 = Datapoint(coordinate=list(coords), color=str(colors2[raceSex]['color']))
    datapoints1.append(dp1)
    datapoints2.append(dp2)

# Observed lower/upper bound of each coordinate column, used later as the
# inclusive limits for random.randint. Series.min()/max() are vectorised (the
# built-in min()/max() walk the column element by element); int(...) turns the
# NumPy scalar they return into a plain Python int.
min_age, max_age = int(df['age'].min()), int(df['age'].max())
min_fnlwgt, max_fnlwgt = int(df['fnlwgt'].min()), int(df['fnlwgt'].max())
min_education, max_education = (int(df['education-num'].min()),
                                int(df['education-num'].max()))
min_cgain, max_cgain = (int(df['capital-gain'].min()),
                        int(df['capital-gain'].max()))
min_closs, max_closs = (int(df['capital-loss'].min()),
                        int(df['capital-loss'].max()))
min_hrs, max_hrs = (int(df['hours-per-week'].min()),
                    int(df['hours-per-week'].max()))

In [9]:
# Two independent, still-empty KDTree instances (constructed left to right);
# each is populated by its own build_tree call.
tree1, tree2 = KDTree(), KDTree()

In [10]:
# Populate tree1 from datapoints1, passing dp_colors1 (color id -> the group's
# share of rows). NOTE(review): how build_tree uses the weights is defined in
# the KDTree module, not in this notebook.
tree1.build_tree(datapoints1, dp_colors1)

In [11]:
# Populate tree2 from datapoints2, passing dp_colors2 (color id -> 1 minus the
# group's share of rows). NOTE(review): how build_tree uses the weights is
# defined in the KDTree module, not in this notebook.
tree2.build_tree(datapoints2, dp_colors2)

In [19]:
# Draw one random integer per coordinate, each within that column's observed
# [min, max] range (random.randint includes both endpoints). Draws happen in
# the order: age, fnlwgt, education, capital gain, capital loss, hours.
# NOTE(review): every pass rebinds the same six names, so only the final draw
# is still available after the loop — confirm that 1000 passes are intended.
for i in range(1000):
    (
        random_age1,
        random_fnlwgt1,
        random_education1,
        random_cgain1,
        random_closs1,
        random_hrs,
    ) = (
        random.randint(low, high)
        for low, high in (
            (min_age, max_age),
            (min_fnlwgt, max_fnlwgt),
            (min_education, max_education),
            (min_cgain, max_cgain),
            (min_closs, max_closs),
            (min_hrs, max_hrs),
        )
    )

In [12]:
# Query tree1 with a pair of six-element vectors, one entry per coordinate
# (age, fnlwgt, education-num, capital-gain, capital-loss, hours-per-week).
# Only the age entry of the first vector is randomised; the rest are literals.
# NOTE(review): presumably the pair describes a query region (e.g. two opposite
# corners) — confirm against KDTree.level1_random_selection.
query_first = [random_age1, 0, 0, 0, 0, 0]
query_second = [100, 100000, 10, 100, 100, 50]
tree1.level1_random_selection((query_first, query_second))

'0'