<a href="https://colab.research.google.com/github/DepartmentOfStatisticsPUE/ann-for-survey-sampling/blob/main/notebooks/ann_paper_sim_study_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab setup: install the system OpenMP package and the GPU build of faiss
# (faiss is imported in the next cell).
!apt install libomp-dev
!pip install faiss-gpu

In [61]:
import pandas as pd
import numpy as np
import pickle
import faiss
from scipy.spatial import KDTree
# Shared faiss GPU resources handle; it is passed to faiss.index_cpu_to_gpu
# each time a GPU index is built in the simulation loop.
res = faiss.StandardGpuResources() 

In [None]:
def kdtree_impute(tree, sample, data, y, x, eps = 0):
  """Nearest-neighbour imputation: average the donors' target values.

  For every row of `sample` the single nearest neighbour is looked up in
  `tree` using the covariate columns `x`; the target columns `y` of the
  matched rows of `data` are then averaged over all sample rows.

  Parameters
  ----------
  tree : object exposing a scipy ``KDTree``-style ``query(points, k, eps)``.
      The indices it returns are used to address rows of `data`, so it must
      have been built on the rows of `data` in the same order.
  sample : 2-D array whose columns `x` hold the covariates to match on.
  data : 2-D array of donor records (covariates and target columns).
  y : column index or list of column indices of the target variable(s).
  x : column index or list of column indices of the matching covariates.
  eps : approximation tolerance forwarded to ``tree.query`` (0 = exact).

  Returns
  -------
  Mean over the sample of the donors' `y` values (one entry per `y` column).
  """
  # query returns (distances, indices); only the donor row indices are needed
  _dist, donor_rows = tree.query(sample[:, x], k = 1, eps = eps)
  # select the target columns through the `y` argument, not a notebook global
  return np.mean(data[donor_rows][:, y], axis = 0)

Simulation study taken from: Kim, J. K., & Wang, Z. (2018). Sampling Techniques for Big Data Analysis. International Statistical Review, 1, 1–15. https://doi.org/10.1111/insr.12290

In [55]:
np.random.seed(123)
N = 1000000

## covariates and noise -- the draw order (x1, x2, epsilon) fixes the random stream
x1 = np.random.normal(loc = 1.0, scale = 1.0, size = N)
x2 = np.random.exponential(scale = 1.0, size = N)
epsilon = np.random.normal(size = N)

## target variables: y1 is linear in the covariates, y2 is quadratic in x1
y1 = 1 + x1 + x2 + epsilon
y2 = 0.5*(x1 - 1.5)**2 + x2 + epsilon

## propensity scores, written as odds / (1 + odds)
odds1 = np.exp(x2)
odds2 = np.exp(-0.5 + 0.5*(x2-2)**2)
p1 = odds1 / (1 + odds1)
p2 = odds2 / (1 + odds2)

## columns: 0-1 covariates, 2-3 targets, 4-5 propensities
data = np.column_stack([x1, x2, y1, y2, p1, p2]).astype(np.float32)
data[:3]

array([[-0.0856306 ,  0.5546646 ,  0.90054107,  1.2432839 ,  0.63521713,
         0.632858  ],
       [ 1.9973454 ,  1.1975824 ,  4.455222  ,  1.5815529 ,  0.7680944 ,
         0.4556015 ],
       [ 1.2829785 ,  0.84342945,  3.0256119 ,  0.76618266,  0.699187  ,
         0.542107  ]], dtype=float32)

In [None]:
## Monte Carlo comparison of exact (KDTree) and faiss nearest-neighbour imputation.
R = 500       # number of replications
xs = [0,1]    # columns of `data` holding the covariates x1, x2
ys = [2,3]    # columns of `data` holding the targets y1, y2
ps = [4,5]    # columns of `data` holding the propensities p1, p2

## one row per replication, one column per target variable
results_faiss_500 = np.zeros(shape = (R, 2))
results_faiss_1000 = np.zeros(shape = (R, 2))
results_kdtree_500 = np.zeros(shape = (R, 2))
results_kdtree_1000 = np.zeros(shape = (R, 2))

for r in range(R):
  print(r)
  np.random.seed(r)
  ## big data sample
  big_p1 = np.random.binomial(n=1, p = p1, size = N)
  ## big_p2 is not used below, but the draw is kept: removing it would shift
  ## the random stream and change s500 / s1000 (and hence all results)
  big_p2 = np.random.binomial(n=1, p = p2, size = N)
  ## random samples
  s500 = np.random.choice(a = data.shape[0], size = 500, replace = False)
  s1000 = np.random.choice(a = data.shape[0], size = 1000, replace = False)
  ## rows selected into the big data source; materialised once per replication
  ## instead of re-filtering `data` for every use
  big_sample = data[big_p1 == 1]
  ## kdtree (exact)
  kdtree = KDTree(big_sample[:, xs], leafsize = 10)
  results_kdtree_500[r, :] = kdtree_impute(kdtree, data[s500], big_sample, ys, xs)
  results_kdtree_1000[r, :] = kdtree_impute(kdtree, data[s1000], big_sample, ys, xs)
  ## faiss
  ## .copy() yields C-contiguous arrays after the fancy column selection
  big_data = big_sample[:, xs].copy()
  sam_data_500 = data[s500][:, xs].copy()
  sam_data_1000 = data[s1000][:, xs].copy()
  ## the donor set changes every replication, so the index is rebuilt each time
  index_flat = faiss.IndexFlatL2(len(xs))
  gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index_flat)
  gpu_index_flat.add(big_data)
  D_500, I_500 = gpu_index_flat.search(sam_data_500, k = 1)
  D_1000, I_1000 = gpu_index_flat.search(sam_data_1000, k = 1)
  ## with k = 1 the index matrix has a single column: the donor row in big_sample
  ind_500 = I_500[:, 0]
  ind_1000 = I_1000[:, 0]
  results_faiss_500[r,:]=np.mean(big_sample[ind_500][:, ys], axis=0)
  results_faiss_1000[r,:]=np.mean(big_sample[ind_1000][:, ys], axis=0)

In [59]:
## Row order of every summary table below:
## kdtree n=500, faiss n=500, kdtree n=1000, faiss n=1000
ordered_results = [
    results_kdtree_500,
    results_faiss_500,
    results_kdtree_1000,
    results_faiss_1000,
]

## Monte Carlo mean and standard deviation of the estimates, per target column
expected = np.stack([np.mean(estimates, axis=0) for estimates in ordered_results])
stderrs = np.stack([np.std(estimates, axis=0) for estimates in ordered_results])

## bias is measured against the population means of the target columns
population_means = np.mean(data[:,ys], axis=0)
bias = expected - population_means
mse = bias**2 + stderrs**2

for header, table in (
    ("===== bias =====", bias),
    ("===== se =====", stderrs),
    ("===== mse =====", mse),
):
  print(header)
  print(table)

===== bias =====
[[ 0.00424795 -0.00285448]
 [ 0.00378274 -0.00331977]
 [ 0.00329126  0.00082333]
 [ 0.00325391  0.00078617]]
===== se =====
[[0.07272113 0.07345612]
 [0.07295542 0.07315786]
 [0.05202406 0.04801981]
 [0.05191877 0.04749557]]
===== mse =====
[[0.00530641 0.00540395]
 [0.0053368  0.00536309]
 [0.00271734 0.00230658]
 [0.00270615 0.00225645]]


In [63]:
## save results: one DataFrame per method / sample size
## (rows = replications, columns = target variables)
results = {
    "results_kdtree_500" : pd.DataFrame(results_kdtree_500),
    "results_faiss_500" : pd.DataFrame(results_faiss_500),
    "results_kdtree_1000": pd.DataFrame(results_kdtree_1000),
    "results_faiss_1000": pd.DataFrame(results_faiss_1000),
           }

## the context manager closes the file even if pickling raises
with open("kdtree_faiss_500_1000k.pkl","wb") as f:
  pickle.dump(results,f)

Modified case from: Yang, S., & Kim, J. K. (2019). Nearest neighbour imputation for general parameter estimation in survey sampling. In The Econometrics of Complex Survey Data: Theory and Applications (Vol. 39, pp. 209–234)

Study from Yang, S., Kim, J. K., & Song, R. (2020). Doubly robust inference when combining probability and non-probability samples with high dimensional data. Journal of the Royal Statistical Society. Series B: Statistical Methodology, 82(2), 445–465. https://doi.org/10.1111/rssb.12354

R code (source: https://github.com/shuyang1987/IntegrativeFPM)

```r
## Simulation setup: a finite population with p covariates, a continuous and a
## binary outcome, a probability sample A (outcomes unobserved) and a
## non-probability sample B (outcomes observed), stacked into one dataset.
set.seed(1234)
## population size
N <- 10000
## x is a p-dimensional covariate
p <- 50
x <- matrix( rnorm(N*p,0,1),N,p)
## y is a continuous outcome 
## beta0 has length p+1 to match cbind(1,x): the intercept and the first four
## covariates have coefficient 1, all remaining covariates have coefficient 0
beta0 <- c(1,1,1,1,1,rep(0,p-4))
y <- cbind(1,x)%*%beta0 + rnorm(N,0,1)
true <- mean(y)
## y2 is a binary outcome
## drawn with success probability given by the logistic transform of ly2
ly2 <- (cbind(1,x)%*%beta0)
ply <- exp(ly2)/(1+exp(ly2))
y2 <- rbinom(N,1,ply)
true2 <- mean(y2)
## A.set is a prob sample: SRS
## sampling probability into A is known when estimation
## each unit enters A independently with probability nAexp/N, so the realised
## size nA is random
nAexp <- 1000
probA <- rep(nAexp/N,N)
A.index <- rbinom(N,size = 1,prob = probA)
A.loc <- which(A.index == 1)
nA <- sum(A.index == 1)
## weights for sample A: inverse of the inclusion probabilities
sw.A <- 1/probA[A.loc]
x.A <- x[A.loc,]
y.A <- rep(NA,nA) # y is not observed in Sample A
y2.A <- rep(NA,nA)
## B.set is a nonprob sample
## sampling probability into B is unknown when estimation
## nBexp is not referenced again in this snippet
nBexp <- 2000
## inclusion into B follows a logistic model in x with coefficients alpha0
alpha0 <- c(-2,1,1,1,1,rep(0,p-4))
probB <- (1+exp(-cbind(1,x)%*%alpha0))^(-1) 
B.index <- rbinom(N,size = 1,prob = probB)
B.loc <- which(B.index == 1)
nB <- sum(B.index)
x.B <- x[B.loc,]
y.B <- y[B.loc]
y2.B <- y2[B.loc]
## combined dataset
## rows of sample A come first, followed by rows of sample B
y.AB <- c(y.A,y.B)
y2.AB <- c(y2.A,y2.B)
x.AB <- rbind(x.A,x.B)
## deltaB flags the rows that belong to sample B
deltaB <- c(rep(0,nA),rep(1,nB))
## weights for the combined data: sw.A for A rows, 1 for B rows
sw <- c(sw.A,rep(1,nB))
```