In [None]:
# Mount Google Drive at /content/drive so that files stored there are reachable
# from this Colab runtime. force_remount=True remounts even when the drive is
# already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib

import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import skew
from scipy.stats import norm
from scipy.stats.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import preprocessing
%matplotlib inline
import sys
sys.path.append("/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/Code")
np.set_printoptions(precision=5, suppress=True)

In [None]:
!pip install distython



In [None]:
# Example code of how the HEOM metric can be used together with Scikit-Learn
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.datasets import load_boston
# Importing a custom metric class
from distython import HEOM

# Load the dataset from sklearn.
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so
# this example needs scikit-learn < 1.2 to run as written.
boston = load_boston()
boston_data = boston["data"]
# Column indices of the categorical variables in the data
categorical_ix = [3, 8]
# NearestNeighbors can't handle np.nan, so missing entries are encoded with a
# sentinel value that is then declared to HEOM as a NaN equivalent.
nan_eqv = 12345

# Introduce some missingness to the data for the purpose of the example:
# every entry is independently replaced with probability 1/20. A seeded
# generator keeps the example reproducible, and the mask is drawn in one
# vectorised call instead of one random draw per matrix element.
row_cnt, col_cnt = boston_data.shape
missing_rng = np.random.default_rng(0)
missing_mask = missing_rng.integers(20, size=(row_cnt, col_cnt)) == 10
boston_data[missing_mask] = nan_eqv

# Declare the HEOM with a correct NaN equivalent value
heom_metric = HEOM(boston_data, categorical_ix, nan_equivalents = [nan_eqv])

# Declare NearestNeighbor and link the metric
neighbor = NearestNeighbors(metric = heom_metric.heom)

# Fit the model which uses the custom distance metric
neighbor.fit(boston_data)

# Return 5-Nearest Neighbors to the 1st instance (row 1); kneighbors expects a
# 2-D array, hence the reshape of the single row to (1, n_features).
result = neighbor.kneighbors(boston_data[0].reshape(1, -1), n_neighbors = 5)
print(result)


(array([[0.     , 0.     , 0.00001, 0.00001, 0.00001]]), array([[  0, 341, 501, 502, 505]]))


In [None]:
# Display the first sample (row 0) as a 1-D feature vector.
boston_data[0]

array([  0.00632,  18.     ,   2.31   ,   0.     ,   0.538  ,   6.575  ,
        65.2    ,   4.09   ,   1.     , 296.     ,  15.3    , 396.9    ,
         4.98   ])

In [None]:
# Same row reshaped to a single-row 2-D array, the form passed to kneighbors
# in the HEOM example cell.
boston_data[0].reshape(1, -1)

array([[  0.00632,  18.     ,   2.31   ,   0.     ,   0.538  ,   6.575  ,
         65.2    ,   4.09   ,   1.     , 296.     ,  15.3    , 396.9    ,
          4.98   ]])

In [None]:
# Display row 0 again (same expression as the earlier display cell).
boston_data[0]

array([  0.00632,  18.     ,   2.31   ,   0.     ,   0.538  ,   6.575  ,
        65.2    ,   4.09   ,   1.     , 296.     ,  15.3    , 396.9    ,
         4.98   ])

In [None]:
% cd /content/drive/MyDrive/MIDA2/IntelligentMobilityProject/Code/DistanceMetrics

/content/drive/.shortcut-targets-by-id/1W4CaD115k2Q9O4E9IXqedlMV2TZCy_9J/MIDA2/IntelligentMobilityProject/Code/DistanceMetrics


In [None]:
# Compile the Cython extension described by setup.py in the current directory.
# --inplace places the built module next to the sources instead of leaving it
# only under build/.
! python setup.py build_ext --inplace

Compiling heom_c.pyx because it depends on heom_c.pxd.
[1/1] Cythonizing heom_c.pyx
  tree = Parsing.p_module(s, pxd, full_module_name)
running build_ext
building 'heom_c1' extension
x86_64-linux-gnu-gcc -pthread -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fdebug-prefix-map=/build/python3.7-LSlbJj/python3.7-3.7.11=. -fstack-protector-strong -Wformat -Werror=format-security -g -fdebug-prefix-map=/build/python3.7-LSlbJj/python3.7-3.7.11=. -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -fPIC -I/usr/local/lib/python3.7/dist-packages/numpy/core/include -I/usr/include/python3.7m -c heom_c.c -o build/temp.linux-x86_64-3.7/heom_c.o
x86_64-linux-gnu-gcc -pthread -shared -Wl,-O1 -Wl,-Bsymbolic-functions -Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-Bsymbolic-functions -Wl,-z,relro -g -fdebug-prefix-map=/build/python3.7-LSlbJj/python3.7-3.7.11=. -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2

In [None]:
# Copy the compiled extension to heom_c.so in the working directory.
# NOTE(review): the recorded output shows this copy failing with "No such file
# or directory". The source path hard-codes the CPython 3.7 build directory and
# ABI tag, and the build log above reports the extension as 'heom_c1' built
# with --inplace -- confirm where the .so actually lands and whether this step
# is still needed.
!cp build/lib.linux-x86_64-3.7/Code/DistanceMetrics/heom_c.cpython-37m-x86_64-linux-gnu.so heom_c.so

cp: cannot stat 'build/lib.linux-x86_64-3.7/Code/DistanceMetrics/heom_c.cpython-37m-x86_64-linux-gnu.so': No such file or directory


In [None]:
from DistanceMetrics.heom_c import HEOM_C

# Test of weighted HEOM

In [None]:
import numpy as np
from sklearn import preprocessing

# Simulation parameters
# r: number of distinct answer levels passed to generate_random_answers
r = 5
# iterations: number of random answers drawn per experiment
iterations = 10000
# seed: seed of the random generator, so that runs are reproducible
seed = 10

# Shared objects used inside generate_random_answers:
# a seeded NumPy random generator and a min-max scaler.
rng = np.random.default_rng(seed)
min_max_scaler = preprocessing.MinMaxScaler()

def generate_random_answers(r, iterations=10000):
  """Sample random answers from ``r`` min-max scaled levels.

  The integer levels ``0 .. r-1`` are rescaled with the module-level
  ``min_max_scaler`` and ``iterations`` values are then drawn from them with
  the module-level ``rng``.

  Args:
    r: number of distinct answer levels.
    iterations: number of answers to draw.

  Returns:
    1-D array of length ``iterations`` containing the sampled scaled levels.
  """
  # The scaler expects a 2-D column, so the levels are shaped (r, 1) first
  # and flattened back to a 1-D vector afterwards.
  levels = np.arange(r).reshape(-1, 1)
  scaled_levels = min_max_scaler.fit_transform(levels).reshape(1, r)[0]
  return rng.choice(scaled_levels, size=iterations)

# Draw the sample of random answers that the following cells analyse.
random_answers = generate_random_answers(r, iterations=iterations)

[0.   0.25 0.5  0.75 1.  ]


In [None]:
# random_answers

In [None]:
# Overlap-style (categorical) distance from the answer 0: every answer that
# differs from 0 counts as 1, every answer equal to 0 counts as 0.
# If the measure were unbiased, this mean would be 0.5.
mean_heom = np.count_nonzero(random_answers) / iterations
mean_heom

0.8038

In [None]:
# Numeric distance from the answer 0: the scaled answer itself, averaged over
# all draws. The builtin sum adds the values one after another, starting at 0.
mean_euclidean_distance = sum(random_answers) / iterations
mean_euclidean_distance

0.500275

We can see a strong bias towards categorical variables when using standard HEOM: the mean categorical distance is about 0.80, versus about 0.50 for the numeric distance. Let's try to normalize this bias.

In [None]:
# Re-weight the categorical (overlap) distance so that its expected value is 0.5.
# With r equally likely levels an answer differs from 0 with probability
# (r-1)/r, so each mismatch is weighted by 0.5*r/(r-1):
#   (r-1)/r * 0.5*r/(r-1) = 0.5
for r in range(10, 100, 10):
  random_answers = generate_random_answers(r, iterations=iterations)

  # The weight is constant for a given r, so the mismatches are counted once
  # and multiplied by it. Adding the weight element by element in a Python
  # loop is slower and accumulates floating-point rounding error.
  mismatch_weight = 0.5*r/(r-1)
  mean_weighted_heom = np.count_nonzero(random_answers) * mismatch_weight / iterations
  print(r, mean_weighted_heom)

[0.         0.11111111 0.22222222 0.33333333 0.44444444 0.55555556
 0.66666667 0.77777778 0.88888889 1.        ]
10 0.4996666666667369
[0.         0.05263158 0.10526316 0.15789474 0.21052632 0.26315789
 0.31578947 0.36842105 0.42105263 0.47368421 0.52631579 0.57894737
 0.63157895 0.68421053 0.73684211 0.78947368 0.84210526 0.89473684
 0.94736842 1.        ]
20 0.49705263157892193
[0.         0.03448276 0.06896552 0.10344828 0.13793103 0.17241379
 0.20689655 0.24137931 0.27586207 0.31034483 0.34482759 0.37931034
 0.4137931  0.44827586 0.48275862 0.51724138 0.55172414 0.5862069
 0.62068966 0.65517241 0.68965517 0.72413793 0.75862069 0.79310345
 0.82758621 0.86206897 0.89655172 0.93103448 0.96551724 1.        ]
30 0.5007931034482722
[0.         0.02564103 0.05128205 0.07692308 0.1025641  0.12820513
 0.15384615 0.17948718 0.20512821 0.23076923 0.25641026 0.28205128
 0.30769231 0.33333333 0.35897436 0.38461538 0.41025641 0.43589744
 0.46153846 0.48717949 0.51282051 0.53846154 0.56410256 0.5