<a href="https://colab.research.google.com/github/DepartmentOfStatisticsPUE/ann-for-survey-sampling/blob/main/ann_paper_simulation_2_data_integration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!apt install libomp-dev
!pip install faiss-gpu
!pip install n2
!pip install scann
!pip install annoy ## takes minutes to add data and index
!pip install pyflann-py3
## !pip install pynndescent -- not suitable for PMM 1d dimension: gets error "no suitable hyperplains were found"

## the two lines below clear the installation output
## comment them out if you want to see the installation progress
from IPython.display import clear_output 
clear_output()

In [94]:
## standard modules
import pandas as pd
import numpy as np
import time

## vis
import matplotlib.pyplot as plt

## linear regression
from sklearn.linear_model import LinearRegression

## ann modules
import scann
import faiss
from pyflann import *
from annoy import AnnoyIndex
from n2 import HnswIndex
from scipy.spatial import cKDTree
from pynndescent import NNDescent

## serialization
import pickle

In [72]:
def kdtree_impute_di(y_pred, y_pred_miss, y):
  """Nearest-neighbour imputation estimate based on an exact SciPy k-d tree.

  A k-d tree is built on the donor points `y_pred`; each row of
  `y_pred_miss` is matched to its single closest donor, and the mean of
  the matched donors' `y` values is returned.
  """
  donor_tree = cKDTree(y_pred, leafsize=100, balanced_tree=True)
  ## only the donor positions are needed, the distances are discarded
  _, nearest = donor_tree.query(y_pred_miss, k=1, eps=0)
  return np.mean(y[nearest])

def faiss_impute_di(y_pred, y_pred_miss, y, gpu = True, voronoi = False):
  """Nearest-neighbour imputation estimate based on a faiss L2 index.

  Parameters
  ----------
  y_pred : 2-d array
      Donor points (one row per unit with an observed `y`).
  y_pred_miss : 2-d array
      Query points, with the same number of columns as `y_pred`.
  y : 1-d array
      Target values aligned with the rows of `y_pred`.
  gpu : bool
      If True, the index is moved to GPU device 0 before use.
  voronoi : bool
      If True, an `IndexIVFFlat` with 1000 lists is trained on `y_pred`
      and used instead of the flat index.

  Returns
  -------
  Mean of `y` over the nearest donor found for every query row.
  """
  ## coerce inputs to C-contiguous float32 here, so that callers do not
  ## have to prepare the arrays (e.g. with .copy()) themselves
  y_pred = np.ascontiguousarray(y_pred, dtype=np.float32)
  y_pred_miss = np.ascontiguousarray(y_pred_miss, dtype=np.float32)
  ## dimension is taken from the data instead of being fixed to 2
  dim = y_pred.shape[1]

  quantizer = faiss.IndexFlatL2(dim)
  index = faiss.IndexIVFFlat(quantizer, dim, 1000) if voronoi else quantizer

  if gpu:
    ## the resources object has to stay referenced while the GPU index is in use
    gpu_faiss = faiss.StandardGpuResources()
    index = faiss.index_cpu_to_gpu(gpu_faiss, 0, index)

  if voronoi:
    ## the inverted-file index must be trained before vectors are added
    index.train(y_pred)

  index.add(y_pred)
  _, nearest = index.search(y_pred_miss, k = 1)
  return np.mean(y[nearest])

def annoy_impute_di(y_pred, y_pred_miss, y, trees = 50):
  """Nearest-neighbour imputation estimate based on an Annoy index.

  Parameters
  ----------
  y_pred : 2-d array
      Donor points; row `i` is stored in the index under item id `i`.
  y_pred_miss : 2-d array
      Query points, with the same number of columns as `y_pred`.
  y : 1-d array
      Target values aligned with the rows of `y_pred`.
  trees : int
      Number of trees passed to `AnnoyIndex.build`.

  Returns
  -------
  Mean of `y` over the nearest donor found for every query row.
  """
  ## dimension is taken from the data instead of being fixed to 2
  index = AnnoyIndex(y_pred.shape[1], "euclidean")
  for item_id, point in enumerate(y_pred):
    index.add_item(item_id, point)

  index.build(trees)
  ## one single-element id list per query row -> array of shape (n_queries, 1)
  nearest = np.array([index.get_nns_by_vector(query, 1) for query in y_pred_miss])
  return np.mean(y[nearest])

def n2_impute_di(y_pred, y_pred_miss, y, trees = 50):
  """Nearest-neighbour imputation estimate based on an n2 HNSW index.

  Parameters
  ----------
  y_pred : 2-d array
      Donor points, added to the index row by row.
  y_pred_miss : 2-d array
      Query points, with the same number of columns as `y_pred`.
  y : 1-d array
      Target values aligned with the rows of `y_pred`.
  trees : int
      Not used by this function; kept only so that existing calls
      that pass it keep working.

  Returns
  -------
  Mean of `y` over the nearest donor found for every query row.
  """
  ## dimension is taken from the data instead of being fixed to 2
  index = HnswIndex(y_pred.shape[1], "euclidean")
  for point in y_pred:
    index.add_data(point)

  index.build(m=5, n_threads=-1)
  ## convert the search result to an array explicitly, so that indexing `y`
  ## does not depend on how numpy interprets a nested list
  nearest = np.asarray(index.batch_search_by_vectors(y_pred_miss, 1))
  return np.mean(y[nearest])

def flann_impute_di(y_pred, y_pred_miss, y):
  """Nearest-neighbour imputation estimate based on FLANN.

  Each row of `y_pred_miss` is matched to one neighbour among the donor
  points `y_pred` and the mean of the matched donors' `y` is returned.
  """
  ## further tuning options (branching, iterations, checks) are not passed
  search_options = dict(algorithm="kdtree", random_seed=1)
  nearest, _ = FLANN().nn(y_pred, y_pred_miss, 1, **search_options)
  return np.mean(y[nearest])

def scann_impute_di(y_pred, y_pred_miss, y):
  """Nearest-neighbour imputation estimate based on a ScaNN searcher.

  A searcher is configured on the donor points `y_pred` (one neighbour,
  "squared_l2" distance) with tree partitioning, asymmetric-hashing
  scoring and a reordering step; the rows of `y_pred_miss` are searched in
  one batch and the mean of the matched donors' `y` is returned.
  """
  config = scann.scann_ops_pybind.builder(y_pred, 1, "squared_l2")
  config = config.tree(
      num_leaves=1000, num_leaves_to_search=50, training_sample_size=5000)
  config = config.score_ah(2, anisotropic_quantization_threshold=0.2)
  config = config.reorder(10)
  searcher = config.build()

  ## only the first returned element (donor positions) is used
  nearest, _ = searcher.search_batched(y_pred_miss)
  return np.mean(y[nearest])

Simulation study taken from: Kim, J. K., & Wang, Z. (2018). Sampling Techniques for Big Data Analysis. International Statistical Review, 1, 1–15. https://doi.org/10.1111/insr.12290

In [110]:
np.random.seed(123)
N = 1000000

## covariates and error term (the order of the draws fixes the random stream)
x1 = np.random.normal(loc=1.0, scale=1.0, size=N)
x2 = np.random.exponential(scale=1.0, size=N)
epsilon = np.random.normal(size=N)

## target variables: y1 is linear and y2 is quadratic in x1
y1 = 1 + x1 + x2 + epsilon
y2 = 0.5*(x1 - 1.5)**2 + x2 + epsilon

## propensity scores, computed as odds / (1 + odds)
odds_p1 = np.exp(x2)
p1 = odds_p1 / (1 + odds_p1)
odds_p2 = np.exp(-0.5 + 0.5*(x2-2)**2)
p2 = odds_p2 / (1 + odds_p2)

## columns: x1, x2, y1, y2, p1, p2
data = np.column_stack([x1, x2, y1, y2, p1, p2]).astype('float32')
data[:3]

array([[-0.0856306 ,  0.5546646 ,  0.90054107,  1.2432839 ,  0.63521713,
         0.632858  ],
       [ 1.9973454 ,  1.1975824 ,  4.455222  ,  1.5815529 ,  0.7680944 ,
         0.4556015 ],
       [ 1.2829785 ,  0.84342945,  3.0256119 ,  0.76618266,  0.699187  ,
         0.542107  ]], dtype=float32)

In [111]:
R = 500
xs = [0,1]
ys = [2,3]
ps = [4,5]

## one (R x 3) matrix per method and sample size;
## columns: (p1 sample, y1), (p2 sample, y1), (p1 sample, y2)
results_naive_500 = np.zeros(shape = (R, 3))
results_naive_1000 = np.zeros(shape = (R, 3))
results_faiss_500 = np.zeros(shape = (R, 3))
results_faiss_1000 = np.zeros(shape = (R, 3))
results_faiss_v_500 = np.zeros(shape = (R, 3))
results_faiss_v_1000 = np.zeros(shape = (R, 3))
results_kdtree_500 = np.zeros(shape = (R, 3))
results_kdtree_1000 = np.zeros(shape = (R, 3))
results_scann_500 = np.zeros(shape = (R, 3))
results_scann_1000 = np.zeros(shape = (R, 3))
results_flann_500 = np.zeros(shape = (R, 3))
results_flann_1000 = np.zeros(shape = (R, 3))

## (method label, sample size) -> matrix that receives the estimates
result_store = {
  ("kdtree", 500): results_kdtree_500, ("kdtree", 1000): results_kdtree_1000,
  ("faiss", 500): results_faiss_500, ("faiss", 1000): results_faiss_1000,
  ("faiss_v", 500): results_faiss_v_500, ("faiss_v", 1000): results_faiss_v_1000,
  ("scann", 500): results_scann_500, ("scann", 1000): results_scann_1000,
  ("flann", 500): results_flann_500, ("flann", 1000): results_flann_1000,
}

## imputation methods in evaluation order:
## (label, function, pass fresh copies of the inputs?, extra keyword arguments)
imputation_methods = [
  ("kdtree", kdtree_impute_di, False, {}),
  ("faiss", faiss_impute_di, True, {}),
  ("faiss_v", faiss_impute_di, True, {"voronoi": True}),
  ("scann", scann_impute_di, False, {}),
  ("flann", flann_impute_di, False, {}),
]

for r in range(R):
  print(r)
  np.random.seed(r)
  ## big data sample
  big_p1 = np.random.binomial(n=1, p = p1, size = N)
  big_p2 = np.random.binomial(n=1, p = p2, size = N)
  ## random samples
  s500 = np.random.choice(a = data.shape[0], size = 500, replace = False)
  s1000 = np.random.choice(a = data.shape[0], size = 1000, replace = False)

  ## boolean-mask indexing copies the selected rows, so every big-data
  ## sample is extracted once per replication and then reused
  big1 = data[big_p1==1]
  big2 = data[big_p2==1]
  big1_x = big1[:, xs]
  big2_x = big2[:, xs]

  ## (donor covariates, donor target) for each of the three result columns
  scenarios = [
    (big1_x, big1[:, ys[0]]),
    (big2_x, big2[:, ys[0]]),
    (big1_x, big1[:, ys[1]]),
  ]
  ## covariates of the random samples, keyed by sample size
  queries = {500: data[s500][:, xs], 1000: data[s1000][:, xs]}

  ## naive: mean of the target in the big-data sample; it does not depend
  ## on the random sample, so both sample sizes receive the same value
  for j, scenario in enumerate(scenarios):
    naive_mean = np.mean(scenario[1])
    results_naive_500[r, j] = naive_mean
    results_naive_1000[r, j] = naive_mean

  ## imputation: method by method, n = 500 before n = 1000, column by column
  for label, impute, copy_inputs, extra in imputation_methods:
    for size, query_x in queries.items():
      for j, (donor_x, donor_y) in enumerate(scenarios):
        if copy_inputs:
          ## faiss is given fresh copies of all three arrays
          args = (donor_x.copy(), query_x.copy(), donor_y.copy())
        else:
          args = (donor_x, query_x, donor_y)
        result_store[(label, size)][r, j] = impute(*args, **extra)

0
1
2
3


KeyboardInterrupt: ignored

In [106]:
## population means matching the three result columns: y1, y1, y2
y_trues = [np.mean(y1), np.mean(y1), np.mean(y2)]

## MSE = variance + squared bias, per method (rows) and result column
## row order: naive, kdtree, faiss, flann, scann
mse_500 = [
  np.std(estimates, axis = 0)**2 + (np.mean(estimates, axis = 0) - y_trues)**2
  for estimates in (
    results_naive_500,
    results_kdtree_500,
    results_faiss_500,
    results_flann_500,
    results_scann_500,
  )
]

## root mean squared error
np.sqrt(mse_500)


array([[0.18627488, 0.09831804, 0.186431  ],
       [0.08211612, 0.07099447, 0.0813127 ],
       [0.08163982, 0.07074112, 0.0812208 ],
       [0.08133967, 0.07058238, 0.08079635],
       [0.10292541, 0.10698985, 0.11651488]])

In [108]:
## MSE = variance + squared bias, per method (rows) and result column
## row order: naive, kdtree, faiss, flann, scann
mse_1000 = [
  np.std(estimates, axis = 0)**2 + (np.mean(estimates, axis = 0) - y_trues)**2
  for estimates in (
    results_naive_1000,
    results_kdtree_1000,
    results_faiss_1000,
    results_flann_1000,
    results_scann_1000,
  )
]

## root mean squared error
np.sqrt(mse_1000)

array([[0.18627488, 0.09831804, 0.186431  ],
       [0.0444269 , 0.04747939, 0.0470238 ],
       [0.044689  , 0.04701806, 0.04703901],
       [0.04480808, 0.04678452, 0.04711393],
       [0.07262391, 0.08552078, 0.09067404]])

In [109]:
## simulated population and all result matrices, stored as data frames
results = {
    "data": pd.DataFrame(data),
    "results_naive_500" : pd.DataFrame(results_naive_500),
    "results_naive_1000" : pd.DataFrame(results_naive_1000),
    "results_faiss_500" : pd.DataFrame(results_faiss_500),
    "results_faiss_1000" : pd.DataFrame(results_faiss_1000),
    "results_faiss_v_500" : pd.DataFrame(results_faiss_v_500),
    "results_faiss_v_1000" : pd.DataFrame(results_faiss_v_1000),
    "results_kdtree_500" : pd.DataFrame(results_kdtree_500),
    "results_kdtree_1000" : pd.DataFrame(results_kdtree_1000),
    "results_scann_500" : pd.DataFrame(results_scann_500),
    "results_scann_1000" : pd.DataFrame(results_scann_1000),
    "results_flann_500" : pd.DataFrame(results_flann_500),
    "results_flann_1000" : pd.DataFrame(results_flann_1000),
}

## the context manager closes the file even if pickling raises
with open("sim2-results.pkl", "wb") as f:
  pickle.dump(results, f)