<a href="https://colab.research.google.com/github/DepartmentOfStatisticsPUE/ann-for-survey-sampling/blob/main/ann_paper_simulation_1_properties.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## About

This notebook covers a simulation study that assesses the performance of predictive mean matching imputation based on exact and approximate nearest neighbours.

**Warning**: before running this notebook, go to `Runtime` -> `Change runtime type` and set the hardware accelerator to `GPU`.

## Install requested modules

Please note that this may take some time.

In [None]:
!apt install libomp-dev
!pip install faiss-gpu
!pip install n2
!pip install scann
!pip install annoy ## takes minutes to add data and index
!pip install pyflann-py3
## !pip install pynndescent -- not suitable for PMM 1d dimension: gets error "no suitable hyperplains were found"

## the lines below clear the verbose installation log from the cell output
## comment them out if you want to see the installation progress
from IPython.display import clear_output 
clear_output()

## Import requested modules

In [None]:
## standard modules
import pandas as pd
import numpy as np
import time

## linear regression
from sklearn.linear_model import LinearRegression

## ann modules
import scann
import faiss
from pyflann import *
from annoy import AnnoyIndex
from n2 import HnswIndex
from scipy.spatial import cKDTree
from pynndescent import NNDescent

## Helper functions

In [None]:
def kdtree_impute(y_pred, y_pred_miss, y):
  """Mean of y after nearest-neighbour imputation using an exact k-d tree search.

  Parameters:
    y_pred: predicted values of the units with observed y; the tree is built on them
      (the notebook passes arrays reshaped to one column).
    y_pred_miss: predicted values of the units whose y has to be imputed.
    y: observed values, indexed by the donor positions found in y_pred.

  Returns:
    (sum of observed y + sum of donated y) / (len(y_pred) + len(y_pred_miss)).
  """
  donor_tree = cKDTree(y_pred, leafsize = 100, balanced_tree=True)
  # eps = 0 requests the exact nearest neighbour; the distances are not needed
  _, donor_idx = donor_tree.query(y_pred_miss, k = 1, eps = 0)
  observed_total = np.sum(y)
  imputed_total = np.sum(y[donor_idx])
  n_units = len(y_pred) + len(y_pred_miss)
  return (observed_total + imputed_total) / n_units

def faiss_impute(y_pred, y_pred_miss, y, gpu = True, voronoi = False):
  """Mean of y after nearest-neighbour imputation using a faiss L2 index.

  Parameters:
    y_pred: predicted values of the units with observed y; added to the index
      (one-dimensional vectors, as the index is created with dimension 1).
    y_pred_miss: predicted values of the units whose y has to be imputed.
    y: observed values, indexed by the donor positions returned by the search.
    gpu: when True the index is copied to GPU device 0 before training/adding.
    voronoi: when True an IndexIVFFlat with 1000 cells is built on top of the
      flat index and trained on y_pred; otherwise the flat index is used directly.

  Returns:
    (sum of observed y + sum of donated y) / (len(y_pred) + len(y_pred_miss)).
  """
  quantizer = faiss.IndexFlatL2(1)
  index = faiss.IndexIVFFlat(quantizer, 1, 1000) if voronoi else quantizer

  if gpu:
    resources = faiss.StandardGpuResources()
    index = faiss.index_cpu_to_gpu(resources, 0, index)

  # the inverted-file index has to be trained before vectors can be added
  if voronoi:
    index.train(y_pred)

  index.add(y_pred)
  _, donor_idx = index.search(y_pred_miss, k = 1)

  imputed_total = np.sum(y[donor_idx])
  n_units = len(y_pred) + len(y_pred_miss)
  return (np.sum(y) + imputed_total) / n_units

def annoy_impute(y_pred, y_pred_miss, y, trees = 50):
  """Mean of y after nearest-neighbour imputation using an Annoy index.

  Parameters:
    y_pred: predicted values of the units with observed y; each row is added
      to the index under its row position.
    y_pred_miss: predicted values of the units whose y has to be imputed.
    y: observed values, indexed by the donor positions returned by Annoy.
    trees: number of trees passed to AnnoyIndex.build.

  Returns:
    (sum of observed y + sum of donated y) / (len(y_pred) + len(y_pred_miss)).
  """
  forest = AnnoyIndex(1, "euclidean")
  for item_id, prediction in enumerate(y_pred):
    forest.add_item(item_id, prediction)
  forest.build(trees)

  # one query per recipient; every query returns a list holding a single donor id
  donor_idx = np.array([forest.get_nns_by_vector(query, 1) for query in y_pred_miss])

  imputed_total = np.sum(y[donor_idx])
  n_units = len(y_pred) + len(y_pred_miss)
  return (np.sum(y) + imputed_total) / n_units

def n2_impute(y_pred, y_pred_miss, y, trees = 50):
  """Mean of y after nearest-neighbour imputation using an N2 HNSW index.

  Parameters:
    y_pred: predicted values of the units with observed y; each row is added
      to the index in order.
    y_pred_miss: predicted values of the units whose y has to be imputed.
    y: observed values, indexed by the donor positions returned by the search.
    trees: not used by this function; kept so existing callers keep working.

  Returns:
    (sum of observed y + sum of donated y) / (len(y_pred) + len(y_pred_miss)).
  """
  t = HnswIndex(1, "euclidean") 
  for i in y_pred:
    t.add_data(i)

  t.build(m=5, n_threads=-1)
  # Convert the search result to an integer array before indexing, as
  # annoy_impute does: indexing y with a raw nested Python list is ambiguous
  # (older NumPy versions interpret short lists of lists as a tuple of indices).
  indx = np.asarray(t.batch_search_by_vectors(y_pred_miss, 1), dtype=np.intp)
  res = (np.sum(y) + np.sum(y[indx])) / (len(y_pred) + len(y_pred_miss))
  return res

def flann_impute(y_pred, y_pred_miss, y):
  """Mean of y after nearest-neighbour imputation using FLANN (kd-tree algorithm).

  Parameters:
    y_pred: predicted values of the units with observed y (search dataset).
    y_pred_miss: predicted values of the units whose y has to be imputed (queries).
    y: observed values, indexed by the donor positions returned by FLANN.

  Returns:
    (sum of observed y + sum of donated y) / (len(y_pred) + len(y_pred_miss)).
  """
  matcher = FLANN()
  # fixed random_seed so that repeated calls build the same index
  search_options = dict(algorithm="kdtree", branching=32, iterations=7, checks=16, random_seed = 1)
  donor_idx, _ = matcher.nn(y_pred, y_pred_miss, 1, **search_options)

  imputed_total = np.sum(y[donor_idx])
  n_units = len(y_pred) + len(y_pred_miss)
  return (np.sum(y) + imputed_total) / n_units

def scann_impute(y_pred, y_pred_miss, y):
  """Mean of y after nearest-neighbour imputation using a ScaNN searcher.

  Parameters:
    y_pred: predicted values of the units with observed y (search dataset).
    y_pred_miss: predicted values of the units whose y has to be imputed (queries).
    y: observed values, indexed by the donor positions returned by ScaNN.

  Returns:
    (sum of observed y + sum of donated y) / (len(y_pred) + len(y_pred_miss)).
  """
  searcher = scann.scann_ops_pybind.builder(y_pred, 1, "squared_l2").tree(
      num_leaves=1000, num_leaves_to_search=50, training_sample_size=5000).score_ah(
          2, anisotropic_quantization_threshold=0.2).reorder(10).build()
  # search_batched returns (neighbour indices, distances) in that order; the
  # previous unpacking used the distances as indices, which raised IndexError.
  indx, dists = searcher.search_batched(y_pred_miss)
  res = (np.sum(y) + np.sum(y[indx])) / (len(y_pred) + len(y_pred_miss))
  return res


## Simulation studies outline

Here, we conduct a simulation study based on predictive mean matching. We replicate the study from *Yang, S., & Kim, J. K. (2020). Asymptotic theory and inference of predictive mean matching imputation using a superpopulation model framework. Scandinavian Journal of Statistics, 47(3), 839-861.*; however, we only assume a missing data mechanism.



In [None]:
## fixed seed so that the generated population is reproducible
np.random.seed(123)
N = 50000

## covariates and error term; the draws are made in the order x1, ..., x6, epsilon
x1, x2, x3 = (np.random.uniform(size = N) for _ in range(3))
x4, x5, x6 = (np.random.normal(size = N) for _ in range(3))
epsilon = np.random.normal(size = N)

### target variables
y1 = -1 + x1 + x2 + epsilon
y2 = -1.167 + x1 + x2 + (x1 - 0.5)**2 + (x2 - 0.5)**2 + epsilon
y3 = -1.5 + x1 + x2 + x3 + x4 + x5 + x6 + epsilon

## response propensity (logistic in x1 and x2)
p1 = np.exp(0.2 + x1 + x2) / (1 + np.exp(0.2 + x1 + x2))

## columns 0-5: x1..x6, columns 6-8: y1..y3, column 9: p1
data = np.column_stack([x1, x2, x3, x4, x5, x6, y1, y2, y3, p1]).astype(np.float32)

## first three rows
data[:3]

array([[ 0.6964692 ,  0.36086547,  0.20932843,  0.47665665,  0.80429375,
        -0.9894137 ,  0.42543107,  0.31638962,  0.42629617,  0.77856696],
       [ 0.28613934,  0.22535679,  0.2937351 ,  0.6701647 ,  1.2931948 ,
         1.033963  ,  0.25331086,  0.20747614,  3.0443685 ,  0.67073166],
       [ 0.22685145,  0.50813043,  0.05571789,  0.5033514 ,  1.5591047 ,
         0.13504769,  0.36729714,  0.27497336,  2.1205187 ,  0.71808493]],
      dtype=float32)

### Run the simulation

In [None]:
## number of Monte Carlo replications
R = 500

## one (R x 3) table of estimated means per search method; columns: y1, y2, y3
sim1_results_ckdtree, sim1_results_faiss, sim1_results_annoy, sim1_results_n2, sim1_results_flann = (
    np.zeros(shape = (R, 3)) for _ in range(5)
)

for r in range(R):

  ## progress indicator
  if r % 10 == 0:
    print(r)

  ## replication-specific response pattern drawn with propensities p1
  np.random.seed(r)
  response_flag = np.random.binomial(n=1, p = p1, size = N)
  data_resp = data[response_flag == 1]
  data_noresp = data[response_flag != 1]

  ## predictive means: linear models fitted on respondents, predicted for both groups
  ## y1 (column 6) on x1, x2
  m1_reg_y1 = LinearRegression().fit(data_resp[:,:2], data_resp[:, 6])
  m1_resp_y1_predict = m1_reg_y1.predict(data_resp[:,:2]).reshape(-1,1)
  m1_noresp_y1_predict = m1_reg_y1.predict(data_noresp[:,:2]).reshape(-1,1)
  ## y2 (column 7) on x1, x2
  m1_reg_y2 = LinearRegression().fit(data_resp[:,:2], data_resp[:, 7])
  m1_resp_y2_predict = m1_reg_y2.predict(data_resp[:,:2]).reshape(-1,1)
  m1_noresp_y2_predict = m1_reg_y2.predict(data_noresp[:,:2]).reshape(-1,1)
  ## y3 (column 8) on x1, ..., x6
  m1_reg_y3 = LinearRegression().fit(data_resp[:,:6], data_resp[:, 8])
  m1_resp_y3_predict = m1_reg_y3.predict(data_resp[:,:6]).reshape(-1,1)
  m1_noresp_y3_predict = m1_reg_y3.predict(data_noresp[:,:6]).reshape(-1,1)

  ## imputation with every search method, run in the order
  ## cKDTree, faiss, annoy, n2, flann; each fills row r of its own table
  for sim1_imputer, sim1_table in (
      (kdtree_impute, sim1_results_ckdtree),
      (faiss_impute, sim1_results_faiss),
      (annoy_impute, sim1_results_annoy),
      (n2_impute, sim1_results_n2),
      (flann_impute, sim1_results_flann),
  ):
    sim1_table[r, 0] = sim1_imputer(m1_resp_y1_predict, m1_noresp_y1_predict, data_resp[:, 6])
    sim1_table[r, 1] = sim1_imputer(m1_resp_y2_predict, m1_noresp_y2_predict, data_resp[:, 7])
    sim1_table[r, 2] = sim1_imputer(m1_resp_y3_predict, m1_noresp_y3_predict, data_resp[:, 8])

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32


KeyboardInterrupt: ignored

In [None]:
## absolute bias of each method's Monte Carlo mean divided by the true mean of y1, y2, y3
## entries in order: cKDTree, faiss, annoy, n2, flann
## (the last entry previously repeated n2, so the flann results were never reported)
## NOTE: the mean is taken over all R rows; rows of replications that were not run stay at zero
sim1_true_means = np.mean(data[:,[6,7,8]], axis = 0)
[abs(np.mean(sim1_results, axis = 0) - sim1_true_means) / sim1_true_means
 for sim1_results in (sim1_results_ckdtree, sim1_results_faiss, sim1_results_annoy,
                      sim1_results_n2, sim1_results_flann)]

[array([ 0.59649765, -1.12227378,  0.64477209]),
 array([ 0.81641633, -0.30906736,  0.65052588]),
 array([ 0.60311325, -1.20067965,  0.64466372]),
 array([ 0.60872123, -1.5602482 ,  0.65131163])]

In [None]:
## ad-hoc check of the ScaNN-based imputation for y1; it reuses the predictions and the
## respondent data left over from the last iteration executed by the simulation loop
scann_impute(m1_resp_y1_predict, m1_noresp_y1_predict, data_resp[:, 6])

IndexError: ignored

In [None]:
## per-replication estimates obtained with cKDTree matching (rows: replications; columns: y1, y2, y3)
sim1_results_ckdtree

array([[ 0.00099632, -0.00174125,  0.01660192],
       [-0.00591695, -0.0081028 ,  0.00880808],
       [ 0.00258703,  0.00332214,  0.01562247],
       [ 0.00624923,  0.00122973,  0.00844259],
       [ 0.00298142, -0.00291746,  0.00692833],
       [ 0.0042884 ,  0.0020578 ,  0.00675807],
       [-0.00125058, -0.00289614,  0.01092934],
       [-0.00092402, -0.00142819,  0.0108194 ],
       [-0.00598042,  0.00504807,  0.00985407],
       [ 0.0032031 ,  0.00332188,  0.01405788],
       [ 0.00832067, -0.00021854,  0.01523517],
       [ 0.00312725,  0.00128403,  0.0111154 ],
       [-0.00378741, -0.00251644,  0.00161225],
       [ 0.00123471, -0.00348929,  0.00471626],
       [-0.00774299, -0.00380645,  0.00472216],
       [-0.00162043,  0.002236  ,  0.01136493],
       [ 0.00115472, -0.0031789 ,  0.00947237],
       [-0.0002643 , -0.00099903,  0.0078425 ],
       [ 0.00386419,  0.00683264,  0.01641528],
       [ 0.00237491,  0.00064832,  0.01288427],
       [-0.00275811, -0.00010406,  0.008