<a href="https://colab.research.google.com/github/DepartmentOfStatisticsPUE/ann-for-survey-sampling/blob/main/notebooks/ann_paper_sim_study_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install required packages for the simulation study

In [None]:
!pip install annoy
!pip install pynndescent
!pip install hnswlib

Load standard modules

In [44]:
import pandas as pd
import numpy as np
import plotnine as ggplot

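Load nearest-neighbour search modules (exact KD-tree from scikit-learn; the Annoy import is currently commented out)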

In [16]:
from sklearn.neighbors import KDTree ## exact search with ['euclidean'] Other see: KDTree.valid_metrics
#from annoy import AnnoyIndex

Functions for the study

In [75]:
## exact nn
def exact_nn_numpy(sample, data, y, x):
  """Nearest-neighbour mean estimate via brute-force search in NumPy.

  For every row of ``sample`` the closest row of ``data`` is located using
  the Euclidean distance computed on columns ``x``; the ``y`` columns of
  the matched ``data`` rows are then averaged over the sample.

  Parameters
  ----------
  sample : 2-D array; only its columns ``x`` are read.
  data : 2-D array that is searched for neighbours and supplies the
    ``y`` values of the matches.
  y : column index (or list of indices) of the values to average.
  x : list of column indices used to measure distance.

  Returns
  -------
  The mean, taken over the sample rows, of the matched ``data[:, y]``
  values (one entry per column in ``y``).
  """
  # The candidate matrix does not change between sample rows, so slice it once.
  candidates = data[:, x]
  donors = []
  for point in sample[:, x]:
    dist = np.sqrt(np.sum((point - candidates)**2, axis=1))
    # argmin returns the first minimiser, i.e. the lowest row index on ties
    donors.append(data[np.argmin(dist)])
  matched_y = np.array(donors)[:, y]
  return np.mean(matched_y, axis=0)

## kdtree
def kdtree_impute(sample, data, y, x):
  """Nearest-neighbour mean estimate using a scikit-learn KD-tree.

  A KD-tree (Euclidean metric, leaf size 40) is built on columns ``x`` of
  ``data`` and queried with columns ``x`` of ``sample``; the ``data`` rows
  returned by the query are averaged over the sample and the ``y`` columns
  of that average are returned.

  Parameters
  ----------
  sample : 2-D array; only its columns ``x`` are read.
  data : 2-D array indexed by the tree and used as the source of donors.
  y : column index (or list of indices) of the values to return.
  x : list of column indices used to build and query the tree.

  Returns
  -------
  The ``y`` columns of the sample-averaged donor rows.
  NOTE(review): ``query`` is called without ``k``, so presumably one
  neighbour per sample row is returned (sklearn default - confirm). The
  neighbour axis is kept, so the result is 2-D, unlike ``exact_nn_numpy``.
  """
  index = KDTree(data[:, x], metric='euclidean', leaf_size=40)
  neighbour_ids = index.query(sample[:, x], return_distance=False)
  # Average over the sample axis first, then pick the target columns.
  donor_means = np.mean(data[neighbour_ids], axis=0)
  return donor_means[:, y]

Generate data for the simulation study

In [19]:
# Fix the legacy global RNG so the generated population is reproducible.
np.random.seed(123)
N = 1_000_000  # population size

# Auxiliary variables and noise (the draw order determines the values).
x1 = np.random.normal(loc=1.0, scale=1.0, size=N)
x2 = np.random.exponential(scale=1.0, size=N)
epsilon = np.random.normal(size=N)

# Target variables: y1 is linear in (x1, x2); y2 is quadratic in x1.
y1 = 1 + x1 + x2 + epsilon
y2 = 0.5*(x1 - 1.5)**2 + x2 + epsilon

# Propensity scores of the form exp(z) / (1 + exp(z)):
# z = x2 for p1 and z = -0.5 + 0.5*(x2 - 2)^2 for p2.
odds1 = np.exp(x2)
p1 = odds1 / (1 + odds1)
odds2 = np.exp(-0.5 + 0.5*(x2 - 2)**2)
p2 = odds2 / (1 + odds2)

# Column layout of `data`: 0 = x1, 1 = x2, 2 = y1, 3 = y2, 4 = p1, 5 = p2
data = np.column_stack([x1, x2, y1, y2, p1, p2])
data[:5]

array([[-0.0856306 ,  0.55466463,  0.90054107,  1.24328388,  0.63521714,
         0.63285795],
       [ 1.99734545,  1.19758235,  4.45522212,  1.58155292,  0.76809442,
         0.45560151],
       [ 1.2829785 ,  0.84342942,  3.025612  ,  0.76618266,  0.699187  ,
         0.54210697],
       [-0.50629471,  4.28335756,  4.09344198,  5.61234593,  0.98639148,
         0.89156823],
       [ 0.42139975,  1.56705153,  2.21403432,  1.37432383,  0.82736287,
         0.39980497]])

Simulation study

In [51]:
# NOTE(review): R is presumably the number of simulation replications;
# it is not used in the cells shown here.
R = 2000

# Column positions inside `data` (stacked as x1, x2, y1, y2, p1, p2).
xs = [0,1]  # auxiliary variables x1, x2
ys = [2,3]  # target variables y1, y2
ps = [4,5]  # propensity scores p1, p2

# Row indices of two samples drawn without replacement from the population.
s500 = np.random.choice(a = data.shape[0], size = 500, replace = False)
# Fixed: this sample was previously drawn with size = 500 despite its name.
s1000 = np.random.choice(a = data.shape[0], size = 1000, replace = False)

In [73]:
%%time
result = exact_nn_numpy(data[s500], data, ys, xs)

CPU times: user 10 s, sys: 106 ms, total: 10.1 s
Wall time: 10.1 s


In [56]:
# Compare the population means of the target variables with the NN estimate.
# `result` already holds one mean per target (exact_nn_numpy averages over the
# sample), so it is printed as is; averaging it again along axis 0 would
# collapse the two estimates into a single number.
print(np.mean(data[:,ys], axis=0))
print(result)

[3.00157899 1.62589353]
[2.98569788 1.6752459 ]


In [74]:
%%time
kkkk = KDTree(data[:,xs], leaf_size = 40, metric = 'euclidean')
kdtree_res = kkkk.query(data[s500][:,xs], return_distance = False)
np.mean(data[kdtree_res], axis=0)[:, ys]

CPU times: user 1.13 s, sys: 0 ns, total: 1.13 s
Wall time: 1.13 s
