<a href="https://colab.research.google.com/github/DepartmentOfStatisticsPUE/ann-for-survey-sampling/blob/main/notebooks/ann_paper_sim_study_2_pynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installation required packages

In [None]:
!pip install nearpy ## do we need it?

In [None]:
!pip install annoy

In [None]:
!pip install pynndescent

In [None]:
!pip install hnswlib

In [2]:
from sklearn.neighbors import KDTree

In [3]:
import pandas as pd
import numpy as np
from numba import jit

In [2]:
from annoy import AnnoyIndex

In [None]:
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections ## przez to jest bez sensu
from nearpy.hashes import PCABinaryProjections
from nearpy.distances import EuclideanDistance
from nearpy.distances import CosineDistance
from nearpy.distances import ManhattanDistance
from nearpy.filters import NearestFilter

In [None]:
import plotnine as ggplot

Generate pseudo-random data based on Yang, S., & Kim, J. K. (2020). Asymptotic theory and inference of predictive mean matching imputation using a superpopulation model framework. Scandinavian Journal of Statistics, 47(3), 839–861. https://doi.org/10.1111/sjos.12429

In [None]:
## settings
rbp = RandomBinaryProjections('rbp', 2)
engine = Engine(6, lshashes=[rbp], 
                distance = CosineDistance(), 
                vector_filters = [NearestFilter(1)])

In [75]:
np.random.seed(123)
N = 50000
x1 = np.random.uniform(size=N)
x2 = np.random.uniform(size=N)
x3 = np.random.uniform(size=N)
x4 = np.random.normal(size=N)
x5 = np.random.normal(size=N)
x6 = np.random.normal(size=N)
epsilon = np.random.normal(size=N)
### target variables
y1 = -1 + x1 + x2 + epsilon
y2 = -1.167 + x1 + x2 + (x1-0.5)**2 + (x2-0.5)**2 + epsilon
y3 = -1.5 + x1 + x2 + x3 + x4 + x5 + x6 + epsilon
## non-response propensity score
pr = np.exp(0.2 + x1 + x2) / (1 + np.exp(0.2 + x1 + x2))

data = np.column_stack((x1,x2,x3,x4,x5,x6,y1,y2,y3))

In [None]:
B = 20
boots = np.zeros(shape = (B,3))

for b in range(B):
  np.random.seed(b)
  flag = np.random.binomial(n = 1, p = pr)  
  data_observed = data[flag==1]
  data_missing = data[flag!=1]

  engine = Engine(6, lshashes=[rbp], distance = CosineDistance(), vector_filters = [NearestFilter(1)])

  for index in range(data_observed.shape[0]):
    engine.store_vector(data_observed[index, [0,1,2,3,4,5]], 'data_%d' % index)
  
  print("bootstrap: ", b)

  result_indx = np.zeros(shape = data_missing.shape[0])
  for i in range(data_missing.shape[0]):
    if (np.remainder([i], [1000]) == 0):
      print("quering: ", i)
    res = engine.neighbours(data_missing[i, [0,1,2,3,4,5]])
    result_indx[i] = [int(s) for s in res[0][1].split("_") if s.isdigit()][0]

  boots[b,:] = (np.sum(data_observed[result_indx.astype(int),:][:,[6,7,8]], axis=0) + np.sum(data_observed[:, [6,7,8]], axis =0))/N


bootstrap:  0
quering:  0
quering:  1000
bootstrap:  1
quering:  0
quering:  1000
bootstrap:  2
quering:  0
quering:  1000
bootstrap:  3
quering:  0
quering:  1000


IndexError: ignored

In [None]:
print(np.mean(boots, axis=0))
print(np.mean(data[:, [6,7,8]], axis=0))

[0.03096811 0.02814859 0.02643809]
[0.01195955 0.01117289 0.00263205]


In [None]:
data_missing[i, [0,1,2,3,4,5]]

array([ 0.03030632,  0.22484043,  0.02137786,  1.36365468, -1.56913731,
        0.34398149])

# Annoy

In [12]:
data.shape[0]

5000

In [80]:
B = 100
vars = 6
boots_annoy = np.zeros(shape = (B,3))

for b in range(B):
  print("iteration: ", b)
  np.random.seed(b)
  flag = np.random.binomial(n = 1, p = pr)  
  data_observed = data[flag==1]
  data_missing = data[flag!=1]
  
  t = AnnoyIndex(vars, 'euclidean')  # Length of item vector that will be indexed
  for i in range(data_observed.shape[0]):
    t.add_item(i, data_observed[i, [0,1,2,3,4,5]])
    
  t.build(50) # 50 trees
  
  #t.save('test.ann')
  #u = AnnoyIndex(vars, 'euclidean')
  #u.load('test.ann')

  missing_inds = [int(t.get_nns_by_vector(data_missing[i,[0,1,2,3,4,5]], 1)[0]) for i in range(data_missing.shape[0])]
  #u.unload()

  boots_annoy[b,:] = (np.sum(data_observed[np.array(missing_inds).astype(int),:][:,[6,7,8]], axis=0) + np.sum(data_observed[:, [6,7,8]], axis =0))/N

iteration:  0
iteration:  1
iteration:  2
iteration:  3
iteration:  4
iteration:  5
iteration:  6
iteration:  7
iteration:  8
iteration:  9
iteration:  10
iteration:  11
iteration:  12
iteration:  13
iteration:  14
iteration:  15
iteration:  16
iteration:  17
iteration:  18
iteration:  19
iteration:  20
iteration:  21
iteration:  22
iteration:  23
iteration:  24
iteration:  25
iteration:  26
iteration:  27
iteration:  28
iteration:  29
iteration:  30
iteration:  31
iteration:  32
iteration:  33
iteration:  34
iteration:  35
iteration:  36
iteration:  37
iteration:  38
iteration:  39
iteration:  40
iteration:  41
iteration:  42
iteration:  43
iteration:  44
iteration:  45
iteration:  46
iteration:  47
iteration:  48
iteration:  49
iteration:  50
iteration:  51
iteration:  52
iteration:  53
iteration:  54
iteration:  55
iteration:  56
iteration:  57
iteration:  58
iteration:  59
iteration:  60
iteration:  61
iteration:  62
iteration:  63
iteration:  64
iteration:  65
iteration:  66
itera

In [81]:
print(
    (np.mean(boots_annoy, axis=0) - np.mean(data[:, [6,7,8]], axis = 0)) / np.mean(data[:, [6,7,8]], axis = 0)*100
    ) 
print(np.std(boots_annoy, axis=0))

[ 915.80295208 -972.11602987   43.63169436]
[0.00296375 0.00296397 0.00308664]


In [86]:
np.mean(boots_annoy, axis=0) - np.mean(data[:, [6,7,8]], axis = 0)*100

array([-0.04480352,  0.020281  , -1.00556323])