<a href="https://colab.research.google.com/github/DepartmentOfStatisticsPUE/ann-for-survey-sampling/blob/main/ann_paper_simulation_2_data_integration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
## Install the system library and the ANN packages used by the imputation
## functions defined later in this notebook (Colab-style shell commands).
## NOTE(review): libomp-dev is presumably required by one of the packages
## below (OpenMP runtime) -- confirm which one.
!apt install libomp-dev
!pip install faiss-gpu
!pip install n2
!pip install scann
!pip install annoy ## takes minutes to add data and index
!pip install pyflann-py3
## !pip install pynndescent -- not used: unsuitable for the 1-d PMM setting, fails with error "no suitable hyperplains were found"
## NOTE(review): the import cell still runs `from pynndescent import NNDescent`
## although the package is not installed here -- confirm it is available in the runtime.

## clear the installation log from the cell output;
## comment out the two lines below if you want to see the installation progress
from IPython.display import clear_output 
clear_output()

In [None]:
## standard modules
import pandas as pd
import numpy as np
import time

## linear regression
from sklearn.linear_model import LinearRegression

## ann modules
import scann
import faiss
from pyflann import *
from annoy import AnnoyIndex
from n2 import HnswIndex
from scipy.spatial import cKDTree
from pynndescent import NNDescent

## serialization
import pickle

In [None]:
def kdtree_impute(y_pred, y_pred_miss, y):
  """Nearest-neighbour imputed mean using an exact k-d tree search.

  Each row of `y_pred_miss` is matched to its single nearest row of `y_pred`;
  the `y` value at the matched position is used as the imputed value.

  Parameters
  ----------
  y_pred : 2-d array, shape (n_donors, n_features)
      Points the tree is built on (presumably model predictions for the
      units with observed `y` -- confirm against the caller).
  y_pred_miss : 2-d array, shape (n_missing, n_features)
      Query points, one per unit that needs an imputed value.
  y : 1-d array, length n_donors
      Observed values, positionally aligned with the rows of `y_pred`.

  Returns
  -------
  float
      (sum of observed y + sum of imputed y) / (n_donors + n_missing).
  """
  donor_tree = cKDTree(y_pred, leafsize = 100, balanced_tree = True)
  ## eps = 0 asks for the exact nearest neighbour; distances are not needed
  _, donor_idx = donor_tree.query(y_pred_miss, k = 1, eps = 0)

  observed_total = np.sum(y)
  imputed_total = np.sum(y[donor_idx])
  n_total = len(y_pred) + len(y_pred_miss)
  return (observed_total + imputed_total) / n_total

def faiss_impute(y_pred, y_pred_miss, y, gpu = True, voronoi = False):
  """Nearest-neighbour imputed mean using a 1-dimensional faiss L2 index.

  Parameters
  ----------
  y_pred : 2-d array, shape (n_donors, 1)
      Points added to the index (presumably model predictions for the units
      with observed `y` -- confirm against the caller).
  y_pred_miss : 2-d array, shape (n_missing, 1)
      Query points, one per unit that needs an imputed value.
  y : 1-d array, length n_donors
      Observed values, positionally aligned with the rows of `y_pred`.
  gpu : bool, default True
      Move the index to GPU device 0 before training / adding data.
  voronoi : bool, default False
      Wrap the flat index in an `IndexIVFFlat` with 1000 lists; the index is
      then trained on `y_pred` before the data are added.

  Returns
  -------
  float
      (sum of observed y + sum of imputed y) / (n_donors + n_missing).
  """
  flat_l2 = faiss.IndexFlatL2(1)
  ## with voronoi the flat index only serves as the coarse quantizer
  cpu_index = faiss.IndexIVFFlat(flat_l2, 1, 1000) if voronoi else flat_l2

  if gpu:
    gpu_resources = faiss.StandardGpuResources()
    index = faiss.index_cpu_to_gpu(gpu_resources, 0, cpu_index)
  else:
    index = cpu_index

  ## training happens after the optional GPU transfer, as in the original order
  if voronoi:
    index.train(y_pred)
  index.add(y_pred)

  _, donor_idx = index.search(y_pred_miss, k = 1)
  n_total = len(y_pred) + len(y_pred_miss)
  return (np.sum(y) + np.sum(y[donor_idx])) / n_total

def annoy_impute(y_pred, y_pred_miss, y, trees = 50):
  """Nearest-neighbour imputed mean using an Annoy (euclidean) index.

  Parameters
  ----------
  y_pred : indexable of length-1 vectors, length n_donors
      Points added to the index one by one (presumably model predictions for
      the units with observed `y` -- confirm against the caller).
  y_pred_miss : iterable of length-1 vectors, length n_missing
      Query points, one per unit that needs an imputed value.
  y : 1-d array, length n_donors
      Observed values, positionally aligned with `y_pred`.
  trees : int, default 50
      Number of trees passed to `AnnoyIndex.build`.

  Returns
  -------
  float
      (sum of observed y + sum of imputed y) / (n_donors + n_missing).
  """
  n_donors = len(y_pred)
  forest = AnnoyIndex(1, "euclidean")
  ## items are keyed by their row position so results index straight into y
  for row in range(n_donors):
    forest.add_item(row, y_pred[row])
  forest.build(trees)

  ## one single-element neighbour list per query point
  neighbours = []
  for query in y_pred_miss:
    neighbours.append(forest.get_nns_by_vector(query, 1))
  donor_idx = np.array(neighbours)

  n_total = n_donors + len(y_pred_miss)
  return (np.sum(y) + np.sum(y[donor_idx])) / n_total

def n2_impute(y_pred, y_pred_miss, y, trees = 50):
  """Nearest-neighbour imputed mean using an n2 HNSW (euclidean) index.

  Parameters
  ----------
  y_pred : iterable of length-1 vectors, length n_donors
      Points added to the index in order (presumably model predictions for
      the units with observed `y` -- confirm against the caller).
  y_pred_miss : sequence of length-1 vectors, length n_missing
      Query points, one per unit that needs an imputed value.
  y : 1-d array, length n_donors
      Observed values, positionally aligned with `y_pred`.
  trees : int, default 50
      Unused. HNSW has no tree count; the parameter is kept only so existing
      callers that pass it keep working.

  Returns
  -------
  float
      (sum of observed y + sum of imputed y) / (n_donors + n_missing).
  """
  t = HnswIndex(1, "euclidean") 
  for i in y_pred:
    t.add_data(i)

  t.build(m=5, n_threads=-1)
  ## The batch search returns nested Python lists. Convert them to an integer
  ## array before indexing: older NumPy releases interpret a short nested list
  ## as a tuple index, which fails on a 1-d `y`. This also matches what
  ## annoy_impute does with its neighbour lists.
  indx = np.asarray(t.batch_search_by_vectors(y_pred_miss, 1), dtype=np.intp)
  res = (np.sum(y) + np.sum(y[indx])) / (len(y_pred) + len(y_pred_miss))
  return res

def flann_impute(y_pred, y_pred_miss, y):
  """Nearest-neighbour imputed mean using FLANN's k-means tree search.

  Parameters
  ----------
  y_pred : array, n_donors rows
      Reference points (presumably model predictions for the units with
      observed `y` -- confirm against the caller).
  y_pred_miss : array, n_missing rows
      Query points, one per unit that needs an imputed value.
  y : 1-d array, length n_donors
      Observed values, positionally aligned with `y_pred`.

  Returns
  -------
  float
      (sum of observed y + sum of imputed y) / (n_donors + n_missing).
  """
  ## index / search settings forwarded unchanged to FLANN.nn
  search_params = {
      "algorithm": "kmeans",
      "branching": 32,
      "iterations": 7,
      "checks": 16,
  }
  ## nn builds the index on y_pred and queries it in one call
  donor_idx, _ = FLANN().nn(y_pred, y_pred_miss, 1, **search_params)

  n_total = len(y_pred) + len(y_pred_miss)
  return (np.sum(y) + np.sum(y[donor_idx])) / n_total

## to be corrected
def scann_impute(y_pred, y_pred_miss, y):
  """Nearest-neighbour imputed mean using a ScaNN searcher.

  NOTE(review): the notebook marks this function as "to be corrected"; the
  intended correction is not visible here, so the configuration below is
  reproduced as-is.

  Parameters
  ----------
  y_pred : 2-d array, n_donors rows
      Dataset the searcher is built on (presumably model predictions for the
      units with observed `y` -- confirm against the caller).
  y_pred_miss : 2-d array, n_missing rows
      Query points, one per unit that needs an imputed value.
  y : 1-d array, length n_donors
      Observed values, positionally aligned with the rows of `y_pred`.

  Returns
  -------
  float
      (sum of observed y + sum of imputed y) / (n_donors + n_missing).
  """
  ## one neighbour per query, squared-L2 distance
  config = scann.scann_ops_pybind.builder(y_pred, 1, "squared_l2")
  config = config.tree(
      num_leaves = 1000, num_leaves_to_search = 50, training_sample_size = 5000)
  config = config.score_ah(2, anisotropic_quantization_threshold = 0.2)
  config = config.reorder(10)
  searcher = config.build()

  ## search_batched returns (neighbour indices, distances)
  donor_idx, _ = searcher.search_batched(y_pred_miss)

  n_total = len(y_pred) + len(y_pred_miss)
  return (np.sum(y) + np.sum(y[donor_idx])) / n_total

Simulation study design taken from: Kim, J. K., & Wang, Z. (2019). Sampling Techniques for Big Data Analysis. International Statistical Review, 87(S1), S177–S191. https://doi.org/10.1111/insr.12290

In [None]:
## fixed seed so the finite population is identical on every run
np.random.seed(123)
N = 1_000_000

## auxiliary variables and noise; the order of the three draws determines the
## generated values and must not change
x1 = np.random.normal(loc = 1.0, scale = 1.0, size = N)
x2 = np.random.exponential(scale = 1.0, size = N)
epsilon = np.random.normal(size = N)

### target variables: y1 is linear in x1, y2 is quadratic in x1
y1 = 1 + x1 + x2 + epsilon
_quadratic_x1 = 0.5 * (x1 - 1.5) ** 2
y2 = _quadratic_x1 + x2 + epsilon

## propensity scores: logistic transform of x2 (p1) and of a quadratic in x2 (p2)
_odds1 = np.exp(x2)
p1 = _odds1 / (1 + _odds1)
_odds2 = np.exp(-0.5 + 0.5 * (x2 - 2) ** 2)
p2 = _odds2 / (1 + _odds2)

## population matrix, columns: x1, x2, y1, y2, p1, p2 (stored as float32)
data = np.column_stack([x1, x2, y1, y2, p1, p2]).astype(np.float32)
data[:3]

In [None]:
## number of simulation replications
R = 10
## column positions in `data` (x1, x2 | y1, y2 | p1, p2)
xs, ys, ps = [0, 1], [2, 3], [4, 5]

## result holders: one row per replication and two columns each
## (presumably one column per target variable -- confirm in the loop below)
(results_faiss_500,
 results_faiss_1000,
 results_kdtree_500,
 results_kdtree_1000) = (np.zeros((R, 2)) for _ in range(4))

for r in range(R):
  print(r)
  np.random.seed(r)
  ## big data sample
  big_p1 = np.random.binomial(n=1, p = p1, size = N)
  big_p2 = np.random.binomial(n=1, p = p2, size = N)    
  ## random samples
  s500 = np.random.choice(a = data.shape[0], size = 500, replace = False)
  s1000 = np.random.choice(a = data.shape[0], size = 1000, replace = False)