### Rpy2 package

In [None]:
%load_ext rpy2.ipython

### Importing Python libraries

In [None]:
import pandas as pd
import numpy as np
import math
from timeit import default_timer as timer

import matplotlib.pyplot as plt
%matplotlib inline

#knockpy
import knockpy
from knockpy.knockoff_filter import KnockoffFilter
from knockpy.knockoff_stats import data_dependent_threshhold

### Importing R packages

In [None]:
import rpy2
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
from rpy2.robjects import r, pandas2ri
from rpy2.robjects.conversion import localconverter

# import R's packages
base = importr('base')
glmnet = importr('glmnet')
dplyr = importr('dplyr')
rvinecopulib = importr('rvinecopulib')
knockoff = importr('knockoff')
sn = importr('sn')
doMC = importr('doMC')
foreach = importr('foreach')
doParallel = importr('doParallel')
TSP = importr('TSP')
VineCopula = importr('VineCopula')



### Auxiliary user-defined Python functions

In [None]:
def dvine_knockoff_filter(X, y, M, M_lasso, alpha, lasso_family, vinecop_family, n_cores):
  """
  Run the derandomized (e-value) knockoff filter with D-vine knockoffs.

  The inputs are copied into R's global environment, where the R functions
  X_Xk_dvine_distributions() and ekn_dvines() are called. The R objects
  created there (e.g. `dvine_distributions`, `res`, `rej`) are left in the
  R global environment after the call.

  Parameters
  ----------
  X : numpy array --> Matrix of predictors
  y : numpy array --> Response variable
  M : int -> number of derandomized knockoff runs
  M_lasso : int -> forwarded to ekn_dvines() (number of runs used to stabilize the lasso against CV)
  alpha : float -> FDR target level for the derandomized knockoffs using e-values
                   (the knockoff threshold level gamma is set to alpha/2)
  lasso_family : str -> linear or logistic regression ("gaussian" or "binomial")
  vinecop_family : str -> family of pair copulas used in the dvine fitting.
                   Common options are "parametric", "nonparametric", "onepar". More details can be found
                   in the documentation of the R package rvinecopulib
                   https://cran.r-project.org/web/packages/rvinecopulib/rvinecopulib.pdf
  n_cores : int -> number of cores for parallel processing

  Returns
  -------
  numpy array (int32) with the selected positions, shifted by -1 on the R
  side so that they can be used as Python (0-based) indices
  """
  #pandas data frames converted to R data frames
  with localconverter(robjects.default_converter + pandas2ri.converter):
    r_X = robjects.conversion.py2rpy(pd.DataFrame(X))
    r_y = robjects.conversion.py2rpy(pd.DataFrame(y))

  #Everything the R snippet below reads is published in R's global environment
  r_inputs = {
    "X": r_X,
    "y": r_y,
    "M": M,
    "M_lasso": M_lasso,
    "alpha": alpha,
    "vinecop_family": vinecop_family,
    "lasso_family": lasso_family,
    "n_cores": n_cores,
  }
  for r_name, r_value in r_inputs.items():
    robjects.globalenv[r_name] = r_value

  #Fitting
  robjects.r('''
  
           #Dvine fitting 
           dvine_distributions <- X_Xk_dvine_distributions(X, vinecop_family, n_cores) 
            
           y <- unlist(y) #This transformation is required because the object Python conversion
           
           #Aplication of the derandomized procedure using e-values
           res <- ekn_dvines(X, y, dvine_distributions, M, M_lasso, alpha, gamma=alpha/2, 
           lasso_family, n_cores)
           
           #Vector of integers that indicates the selected non-nulls position 
           rej <- res$rej
           #To account for Python indexing, it is necessary to subtract 1 from the vector elements.
           rej <- rej-1 
       
        
        ''')

  #The R vector `rej` is read back and returned as an integer numpy array
  return np.array(robjects.globalenv['rej'], dtype=np.int32)

In [None]:
def get_dvine_order(X):
  """
  Heuristic ordering of the variables for the first tree of a D-vine.

  X is copied into R's global environment and the R function
  get_dvine_order() (which relies on the TSP R package) is called there.

  Parameters
  ----------
  X : numpy array --> Matrix of predictors

  Returns
  -------
  numpy array (int32) with the new column order; the R function already
  subtracts 1, so the values can be used as Python (0-based) indices
  """
  #pandas data frame converted to an R data frame
  with localconverter(robjects.default_converter + pandas2ri.converter):
    r_X = robjects.conversion.py2rpy(pd.DataFrame(X))

  #The R snippet below reads X from the global environment
  robjects.globalenv["X"] = r_X

  #Fitting
  robjects.r('''
  
           #Heuristic procedure to determine the order for the first tree in a D-vine structure
           dvine_order <- get_dvine_order(X)
      
        ''')

  #The R vector `dvine_order` is read back and returned as an integer numpy array
  return np.array(robjects.globalenv['dvine_order'], dtype=np.int32)

In [None]:
def gaussian_knockoff_filter(X, y, M, M_lasso, alpha, lasso_family, n_cores):
  """
  Run the derandomized (e-value) knockoff filter with Gaussian knockoffs.

  The Gaussian knockoff copies are NOT sampled here: the R function
  ekn_gaussian() reads them from the R global list `ls_Xk_norm`, which must
  have been filled beforehand with at least M knockoff matrices (element m
  is used for run m). This precondition is checked before the filter runs.

  The inputs are copied into R's global environment and the R objects
  created there (`res`, `rej`) are left in place after the call.

  Parameters
  ----------
  X : numpy array --> Matrix of predictors
  y : numpy array --> Response variable
  M : int -> number of derandomized knockoff runs
  M_lasso : int -> forwarded to ekn_gaussian() (number of runs used to stabilize the lasso against CV)
  alpha : float -> FDR target level for the derandomized knockoffs using e-values
                   (the knockoff threshold level gamma is set to alpha/2)
  lasso_family : str -> linear or logistic regression ("gaussian" or "binomial")
  n_cores : int -> number of cores for parallel processing

  Returns
  -------
  numpy array (int32) with the selected positions, shifted by -1 on the R
  side so that they can be used as Python (0-based) indices

  Raises
  ------
  ValueError if the R global `ls_Xk_norm` does not exist, is not a list, or
  holds fewer than M elements
  """
  #Transformation to a pandas data.frame
  X = pd.DataFrame(X)
  y = pd.DataFrame(y)

  #Conversion of the pandas dataframes to R dataframes
  with localconverter(robjects.default_converter + pandas2ri.converter):
    r_X = robjects.conversion.py2rpy(X)
    r_y = robjects.conversion.py2rpy(y)

  robjects.globalenv["X"] = r_X
  robjects.globalenv["y"] = r_y

  #Object conversion
  robjects.globalenv["M"] = M
  robjects.globalenv["M_lasso"] = M_lasso
  robjects.globalenv["alpha"] = alpha
  robjects.globalenv["lasso_family"] = lasso_family
  robjects.globalenv["n_cores"] = n_cores

  #Fail early, with a readable message, when the pre-sampled knockoffs are
  #missing; otherwise R stops mid-run with a "subscript out of bounds" error
  has_knockoffs = robjects.r(
    'exists("ls_Xk_norm") && is.list(ls_Xk_norm) && length(ls_Xk_norm) >= M'
  )[0]
  if not has_knockoffs:
    raise ValueError(
      "The R global list 'ls_Xk_norm' must contain at least M Gaussian "
      "knockoff copies before calling gaussian_knockoff_filter()"
    )

  #Fitting
  robjects.r(''' 
  
           y <- unlist(y) #This transformation is required because the object Python conversion
 
           #Aplication of the derandomized procedure using e-values
           res <- ekn_gaussian(X, y, ls_Xk_norm, M, M_lasso, alpha, gamma=alpha/2, 
           lasso_family, n_cores)
           
           #Vector of integers that indicates the selected non-nulls position 
           rej <- res$rej
           rej <- rej-1 #Subtraction is done for python index
              
        ''')

  r_rej = robjects.globalenv['rej']
  #Transformation to a numpy array
  np_rej = np.array(r_rej, dtype=np.int32)

  return np_rej

In [None]:
def second_order_knockoff_filter(X, y, M, M_lasso, alpha, lasso_family, n_cores):
  """
  Run the derandomized (e-value) knockoff filter with second-order knockoffs.

  The inputs are copied into R's global environment, where the R function
  ekn_second_order() is called. The R objects created there (`res`, `rej`,
  and the matrix version of `X`) are left in the R global environment.

  Parameters
  ----------
  X : numpy array --> Matrix of predictors
  y : numpy array --> Response variable
  M : int -> number of derandomized knockoff runs
  M_lasso : int -> forwarded to ekn_second_order() (number of runs used to stabilize the lasso against CV)
  alpha : float -> FDR target level for the derandomized knockoffs using e-values
                   (the knockoff threshold level gamma is set to alpha/2)
  lasso_family : str -> linear or logistic regression ("gaussian" or "binomial")
  n_cores : int -> number of cores for parallel processing

  Returns
  -------
  numpy array (int32) with the selected positions, shifted by -1 on the R
  side so that they can be used as Python (0-based) indices
  """
  #pandas data frames converted to R data frames
  with localconverter(robjects.default_converter + pandas2ri.converter):
    r_X = robjects.conversion.py2rpy(pd.DataFrame(X))
    r_y = robjects.conversion.py2rpy(pd.DataFrame(y))

  #Everything the R snippet below reads is published in R's global environment
  r_inputs = {
    "X": r_X,
    "y": r_y,
    "M": M,
    "M_lasso": M_lasso,
    "alpha": alpha,
    "lasso_family": lasso_family,
    "n_cores": n_cores,
  }
  for r_name, r_value in r_inputs.items():
    robjects.globalenv[r_name] = r_value

  #Fitting
  robjects.r('''           
           
           y <- unlist(y) #This transformation is required because the object Python conversion
           X <- as.matrix(X) #Transformation needed to applied the second order knockoffs
           
           #Aplication of the derandomized procedure using e-values
           res <- ekn_second_order(X, y, M, M_lasso, alpha, gamma=alpha/2, 
           lasso_family, n_cores)
           
           #Vector of integers that indicates the selected non-nulls position 
           rej <- res$rej
           rej <- rej-1 #Subtraction is done for python index
              
        ''')

  #The R vector `rej` is read back and returned as an integer numpy array
  return np.array(robjects.globalenv['rej'], dtype=np.int32)

### R functions related to dvines

In [None]:
%%R

#get_dvine_order() -->   This function runs an heuristic procedure to determine the 
#order for the first tree in a D-vine structure using the TSP R package to solve 
#the traveling salesman problem. To solve it, we need to identify the shortest 
#Hamiltonian path by assigning weights based on the pairwise Kendall’s τ

#Arguments:
#X: matrix of predictors

#Value: an integer vector with the new indices

get_dvine_order <- function(X){

    if (is.null(X)) stop("Argument X is null")

    #Pairwise distances 1 - |tau_ij| computed on the matrix version of X
    tau_distance <- 1 - abs(TauMatrix(as.matrix(X)))

    #A dummy city labelled "cut" is inserted before solving the TSP;
    #cutting the tour at that city yields a path (functions of package TSP)
    tour <- solve_TSP(insert_dummy(TSP(tau_distance), label="cut"), method="repetitive_nn")
    path_order <- cut_tour(tour, "cut")
    names(path_order) <- NULL

    #The indices are shifted by -1 so they can be used as Python indices
    return(path_order - 1)

}

#X_Xk_dvine_distributions() --> Function to fit the dvine distribution for X and X_X matrices
#Arguments:
#X: matrix of predictors
#vinecop_family : String related to the family of pair copulas used in the dvine fitting. 
#                 Common options are "parametric", "nonparametric", "onepar". More details can be found
#                 in the documentation of R package rvinecopulib
#                  https://cran.r-project.org/web/packages/rvinecopulib/rvinecopulib.pdf
# n_cores: int -> number of cores for parallel processing

#Value: This function returns a list that contains objects of class vinecop_dist for X and X_X
#Note: more information about objects of class vinecop_dist can be found in 
#https://cran.r-project.org/web/packages/rvinecopulib/rvinecopulib.pdf

X_Xk_dvine_distributions <- function(X, vinecop_family="parametric", n_cores=1){

    if (is.null(X)) stop("Argument X is null")

    #Number of variables
    p <- ncol(X)

    #D-vine structures for the stacked data (2p columns) and for X alone (p columns)
    stacked_structure <- dvine_structure((2*p):1)
    single_structure <- dvine_structure(p:1)

    #Pseudo-observations of X bound column-wise to itself
    u_stacked <- pseudo_obs(cbind(X, X))

    #Fitting the D-vine on the stacked data, truncated after tree p-1
    fit_time <- system.time(
        stacked_fit <- vinecop(u_stacked, family_set=c(vinecop_family), structure=stacked_structure,
                               presel=TRUE, selcrit='mbicv', par_method='mle', psi0=0.95,
                               show_trace=FALSE, cores=n_cores, trunc_lvl=p-1)
    )

    #Printing the fitting time
    print("dvine fitting time in seconds:")
    print(fit_time)

    #Pair copulas and distribution object of the stacked model
    stacked_pair_copulas <- stacked_fit$pair_copulas
    stacked_dist <- vinecop_dist(stacked_pair_copulas, stacked_structure)

    #Pair copulas for X alone: tree i keeps the first p-i pair copulas
    #of tree i of the stacked model
    single_pair_copulas <- lapply(1:(p-1), function(i) stacked_pair_copulas[[i]][1:(p-i)])

    #Distribution object for X alone
    single_dist <- vinecop_dist(single_pair_copulas, single_structure)

    #Both distributions are returned in a named list
    return(list(X_dvine_dist=single_dist, X_X_dvine_dist=stacked_dist))
}

#create_dvine_Knockoffs() --> Function to sample dvine knockoffs
#Arguments:
#X: matrix of predictors
#X_dvine_dist: Object of class vinecop_dist for X, containing a list specifying the pair-copulas,
#              structure, and variable types.
#X_X_dvine_dist: Object of class vinecop_dist for X_X, 
#               containing a list specifying the pair-copulas, structure, and variable types.
# n_cores: int -> number of cores for parallel processing
#Note: more information about objects of class vinecop_dist can be found in 
#https://cran.r-project.org/web/packages/rvinecopulib/rvinecopulib.pdf

#Value: This function returns a matrix Xk of knockoffs

create_dvine_Knockoffs <- function(X, X_dvine_dist , X_X_dvine_dist, n_cores=1){

    if (is.null(X)) stop("Argument X is null")
    if (is.null(X_dvine_dist)) stop("Argument X_dvine_dist is null")
    if (is.null(X_X_dvine_dist)) stop("Argument X_X_dvine_dist is null")

    #Sample size and number of variables
    n_obs <- nrow(X)
    n_vars <- ncol(X)

    #Rosenblatt transform of the pseudo-observations of X under the model for X
    w_observed <- rosenblatt(x=pseudo_obs(X), model=X_dvine_dist, cores = n_cores)

    #Fresh independent uniforms for the knockoff half
    w_fresh <- matrix(runif(n=n_vars*n_obs, min=0, max=1), nrow=n_obs, ncol=n_vars)

    #Inverse Rosenblatt transform under the stacked model; the last p columns
    #are the knockoffs on the copula scale
    u_joint <- inverse_rosenblatt(u=cbind(w_observed, w_fresh), model=X_X_dvine_dist, cores = n_cores)
    u_knockoff <- u_joint[, (n_vars+1):(2*n_vars)]

    #Back to the original scale through the empirical quantiles (type 8) of each column of X
    Xk <- X
    for (col in 1:n_vars) {
        probs_col <- punif(u_knockoff[, col], min=0, max=1)
        Xk[, col] <- as.vector(quantile(X[, col], probs=probs_col, type=8))
    }

    return(Xk)
}

# stable_lasso_glmnet()--> Function to fit a lasso-regularized regression model using 
#some functions of the R package glmnet.
#It implements the stabilizing procedure of Roberts and Nowak (2014) to diminish sensitivity
#to the fold assignment used in cross-validation to select the hyperparameter lambda

#Arguments:
#X: matrix of predictors
#y: vector or matrix of response
#lasso_family: a string to select linear regression "gaussian" or logistic regression "binomial"
#M_lasso: integer, number of repeated cross-validation runs used for the stabilization against CV
#n_folds: integer indicating the number of cross-validation folds
#Note: more information about the R package glmnet can be found in 
#https://cran.r-project.org/web/packages/glmnet/glmnet.pdf
#Note 2: the repeated cross-validation runs are executed in parallel (foreach with %dopar%)

#Value: This function returns a vector of the estimated coefficients (without the intercept)


stable_lasso_glmnet <- function(X, y, lasso_family, M_lasso = 10, n_folds = 5){

    if (is.null(X)) stop("Argument X is missing")
    if (is.null(y)) stop("Argument y is missing")
    if (is.null(lasso_family)) stop("Argument lasso_family is missing")

    response <- as.vector(y)
    design <- as.matrix(X)

    #Stabilizing the lasso against CV (Roberts and Nowak, 2014):
    #one cross-validated lambda.min per repetition, each repetition with its own seed
    cv_lambdas <- foreach(i = 1:M_lasso, .combine=c, .packages=c("glmnet")) %dopar% {
        set.seed(i)
        cv_run <- cv.glmnet(design, response, alpha=1, family = lasso_family, nfolds = n_folds, standardize = TRUE)
        cv_run$lambda.min
    }

    #The median of the cross-validated lambdas is used for the final fit
    lambda_median <- as.numeric(quantile(cv_lambdas, probs=0.5))
    final_fit <- glmnet(design, response, alpha = 1, lambda = lambda_median, family = lasso_family, standardize = TRUE)

    #Coefficients as a plain vector, with the first entry (intercept) dropped
    return(as.vector(coef(final_fit))[-1])
}

# ekn_dvines()--> Function to derandomize knockoffs using e-values for FDR control. This function
# considers the dvine knockoff procedure.
#The code to implement this function is adapted from 
#https://github.com/zhimeir/derandomized_knockoffs_fdr

#Arguments:
#X: matrix of predictors
#y: vector or matrix of response
#dvine_distributions: list that contains objects of class vinecop_dist for X and X_X
#M: integer denoting the number of generated copies of the knockoff matrix Xk.
#M_lasso: integer related to the number of runs for the stabilization against CV
#alpha: numeric value indicating the FDR target level
#gamma: numeric value denoting the target level for the knockoff threshold. According to Ren & Barber (2023),
#       experimentally, gamma=alpha/2 works well.           
#lasso_family: a string to select linear regression "gaussian" or logistic regression "binomial" 
#n_cores: int -> number of cores for parallel processing

#Note: the knockoff rejection threshold is computed with the stop_early() function defined in this
#notebook (with offset = 1), not with knockoff.threshold() from the R knockoff package

#Value: This function returns a list with the selected variables (rej) and the averaged e-values (E)

ekn_dvines <- function(X, y, dvine_distributions, M=50, M_lasso=10, alpha=0.2, gamma=0.1, lasso_family, n_cores=1){

    if (is.null(X)) stop("Argument X is missing")
    if (is.null(y)) stop("Argument y is missing")
    if (is.null(dvine_distributions)) stop("Argument dvine_distributions is missing")
    if (is.null(lasso_family)) stop("Argument lasso_family is missing")

    #Number of variables
    p <- ncol(X)

    #One row of e-values per knockoff run
    evalues <- matrix(0, M, p)

    for (run in 1:M) {

        #The seed is set per run for reproducibility of the simulations
        set.seed(run)

        #dvine knockoffs for this run
        knockoffs <- create_dvine_Knockoffs(X, X_dvine_dist = dvine_distributions$X_dvine_dist,
                                            X_X_dvine_dist = dvine_distributions$X_X_dvine_dist, n_cores)

        #Lasso coefficients for the original variables followed by the knockoffs
        coefs <- stable_lasso_glmnet(cbind(X, knockoffs), y, lasso_family, M_lasso)

        #Importance statistic: |original coefficient| - |knockoff coefficient|
        importance <- abs(coefs[1:p]) - abs(coefs[(p+1):length(coefs)])

        #Knockoff rejection threshold with offset = 1
        threshold <- stop_early(importance, gamma, offset=1)

        #E-values of all variables for this run
        n_below <- sum(importance <= -threshold)
        evalues[run,] <- (importance >= threshold) / (1 + n_below)
    }

    #Averaged e-values (scaled by p) and e-BH selection
    averaged <- p*colMeans(evalues)
    selected <- ebh(averaged, alpha)$rej

    return(list(rej = selected, E = averaged))

}

### R functions to derandomize knockoffs using e-values for FDR control (Gaussian and second-order knockoffs)

In [None]:
%%R

# ekn_gaussian() --> Derandomized knockoffs using e-values for FDR control, for
# knockoff copies that were sampled beforehand.
#
#Arguments:
#X: matrix of predictors
#y: vector or matrix of response
#ls_Xk_norm: list with at least M knockoff matrices; element m is used in run m
#M: integer, number of knockoff runs
#M_lasso: integer passed to stable_lasso_glmnet() (runs for the stabilization against CV)
#alpha: numeric FDR target level passed to ebh()
#gamma: numeric target level passed to stop_early() for the knockoff threshold
#lasso_family: "gaussian" or "binomial", passed to stable_lasso_glmnet()
#n_cores: accepted for a signature parallel to ekn_dvines(); not used in this function
#
#Value: list with rej (selected variables) and E (averaged e-values)
ekn_gaussian <- function(X, y, ls_Xk_norm ,M, M_lasso, alpha, gamma, lasso_family, n_cores){
  
    if (is.null(X)) {
        stop("Argument X is missing")  
    }
    if (is.null(y)) {
        stop("Argument y is missing")  
    }
    if (is.null( lasso_family )) {
        stop("Argument lasso_family is missing")  
    }
    #Without this check a short list only fails inside the loop with
    #"subscript out of bounds", after part of the work has already been done
    if (is.null(ls_Xk_norm) || length(ls_Xk_norm) < M) {
        stop("Argument ls_Xk_norm must contain at least M knockoff copies")
    }
    
    
    #Number of variables p
    p <- dim(X)[2]

    #Matrix of E-values (one row per run)
    E <- matrix(0, M, p)

    for(m in 1:M){
        
        set.seed(m) #The seed is adjusted for reproducibility issues in the simulations. 
        
        #Knockoff copy for run m, taken from the list
        Xk <- ls_Xk_norm[[m]]
        
        #X and Xk column binding
        X_Xk <- cbind(X, Xk)
        
        #Estimated lasso coefficients for X_Xk
        Z <- stable_lasso_glmnet(X_Xk, y, lasso_family, M_lasso)
        
        #Importance statistics: |original coefficient| - |knockoff coefficient|
        W <- abs(Z[1:p]) - abs(Z[(p+1):length(Z)])
        
        #Knockoff rejection threshold with offset = 1
        tau <- stop_early(W, gamma, offset=1) 
        
        #E-values for all the variables (columns) for run m
        E[m,] <- (W >= tau) / (1 + sum(W <= -tau))
        
   }
    
    #Averaging the e-values to select set of discoveries
    E <- p*colMeans(E)
    rej <- ebh(E, alpha)$rej
    
    return(list(rej = rej, E = E)) 

}


# ekn_second_order() --> Derandomized knockoffs using e-values for FDR control,
# with a fresh second-order knockoff copy sampled in every run.
# n_cores is accepted but not used in this function.
ekn_second_order <- function(X, y, M, M_lasso, alpha, gamma, lasso_family, n_cores){

    if (is.null(X)) stop("Argument X is missing")
    if (is.null(y)) stop("Argument y is missing")
    if (is.null(lasso_family)) stop("Argument lasso_family is missing")

    #Number of variables
    p <- ncol(X)

    #One row of e-values per knockoff run
    evalues <- matrix(0, M, p)

    for (run in 1:M) {

        #The seed is set per run for reproducibility of the simulations
        set.seed(run)

        #Second-order knockoffs sampled with create.second_order()
        knockoffs <- create.second_order(X)

        #Lasso coefficients for the original variables followed by the knockoffs
        coefs <- stable_lasso_glmnet(cbind(X, knockoffs), y, lasso_family, M_lasso)

        #Importance statistic: |original coefficient| - |knockoff coefficient|
        importance <- abs(coefs[1:p]) - abs(coefs[(p+1):length(coefs)])

        #Knockoff rejection threshold with offset = 1
        threshold <- stop_early(importance, gamma, offset=1)

        #E-values of all variables for this run
        n_below <- sum(importance <= -threshold)
        evalues[run,] <- (importance >= threshold) / (1 + n_below)
    }

    #Averaged e-values (scaled by p) and e-BH selection
    averaged <- p*colMeans(evalues)
    selected <- ebh(averaged, alpha)$rej

    return(list(rej = selected, E = averaged))

}


### Utility functions for the e-values procedure

In [None]:
%%R

#These functions are obtained from
#https://github.com/zhimeir/derandomized_knockoffs_fdr


#####################################
## The eBH procedure
#####################################
### Input: 
###   E: e-values
###   alpha: target FDR level
### Output:
###   Variables selected by the e-BH procedure

ebh <- function(E, alpha){

  # Number of hypotheses and the ranking of the e-values (largest first)
  p <- length(E)
  ranking <- order(E, decreasing = TRUE)

  # Positions k at which the k-th largest e-value reaches p / (alpha * k)
  passing <- which(sort(E, decreasing = TRUE) >= (p / alpha / seq_len(p)))

  # No position passes: nothing is selected
  if (length(passing) == 0) {
    return(list(rej = NULL))
  }

  # Otherwise the variables ranked up to the largest passing position are selected
  list(rej = ranking[1:max(passing)])
}

#######################################
## Computing the early stopping time ##
#######################################
### Input:
###   W: vector of knockoff feature importance statistics 
###   gamma: alpha_kn 
###   offset: value between 0 and 1
### Output: 
###   The modified knockoff stopping time defined in (14)

stop_early <- function(W, gamma, offset){

  # Knockoff threshold at level gamma
  kn_threshold <- alphakn_threshold(W, fdr = gamma, offset = offset)

  # Early-stopping cap: 0 when there are fewer than 1/gamma positive statistics,
  # otherwise the (ceiling(1/gamma) - 1)-th positive statistic in decreasing |W| order
  if (sum(W > 0) < 1 / gamma) {
    cap <- 0
  } else {
    by_magnitude <- W[order(abs(W), decreasing = TRUE)]
    positive_pos <- which(by_magnitude > 0)
    cap <- by_magnitude[positive_pos[ceiling(1/gamma) - 1]]
  }

  # The smaller of the two values is returned
  return(min(kn_threshold, cap))
}


#######################################################
## Compute stopping time w/ diff alpha_kn and offset ##
#######################################################
### Input:
###   W: a length p vector of knockoff feature importance statistics
###   fdr: the target FDR level
###   offset: 0 or 1 
### Output: 
###   the knockoff selection threshold

alphakn_threshold <- function(W, fdr, offset) {
  # Candidate thresholds: 0 and every |W_j|, in increasing order
  candidates <- sort(c(0, abs(W)))

  # Ratio (offset + #{W <= -t}) / max(1, #{W >= t}) at every candidate t
  fdp_hat <- vapply(
    candidates,
    function(t) (offset + sum(W <= -t)) / max(1, sum(W >= t)),
    numeric(1)
  )

  # Smallest candidate whose ratio is at most fdr; Inf when there is none
  admissible <- which(fdp_hat <= fdr)
  if (length(admissible) == 0) {
    return(Inf)
  }
  unname(candidates[admissible[1]])
}
                 
                 

## NonGaussian  variables according to a Dvine distribution

### Simulation setup

In [None]:
#Global configuration of the simulation study. Every cell below reads these
#names, so this cell must be run first.
t_initial = timer()

nsim = 100 #Number of simulations
n = 300  #Number of data points
p = 100  #Number of features
ri = 1 #Reproducibility indicator: offset added to the seeds of the data simulation loops


M = 50 #Number of runs for e-values procedure
M_lasso = 10 #Number of runs for Lasso Stability against CV
lasso_family= 'gaussian'
vinecop_family = 'parametric'
nonparametric_family = 'nonparametric'
alpha = 0.2 #The target value for FDR control of derandomized knockoffs using e-values
#Number of workers of the R cluster created at the end of this cell
#NOTE(review): hard-coded; adjust to the machine running the notebook
n_cores = 23


#Correlation (rho)
rho = 0.8
np_rhos = np.repeat(rho,p-1) 
#Sparsity, passed as `sparsity` to knockpy.dgp.create_sparse_coefficients
#NOTE(review): 0.2 is presumably a proportion (20% non-nulls), not a percentage -- confirm
sp = 0.2 #Sparsity of the non-null vector (%)

#Amplitude and coefficients
amplitude = 8
beta_factor = amplitude / math.sqrt(n)

#From Python to R
%R -i n
%R -i ri 
%R -i n_cores

#Parallel processing: a cluster with n_cores workers is created and registered
#as the foreach backend used by %dopar%. The cluster is not stopped in this cell.
%R registerDoParallel(makeCluster(n_cores))


# Iteration 1 

In [None]:
#Degrees of freedom of the t-distribution (passed as df_t to knockpy.dgp.sample_ar1t)
df_t = 3

### Data simulation

In [None]:
#Simulates nsim data sets (X, beta, y) for the current value of df_t and keeps
#them in lists indexed by the simulation number k.

#Arrays and lists to save information
#(the lists are pre-filled with placeholders that the loop overwrites;
# ls_simulations is only referenced by the commented-out lines at the end of the loop)
ls_simulations = list(range(nsim))
ls_beta = list(range(nsim))
ls_X = list(range(nsim))
ls_y = list(range(nsim))

ti = timer() #Initial time


for k in range(nsim):
  
  #Set seed for replication 
  #(the same seed k + ri + 1000 is set in Python and in R)
  np.random.seed(k + ri + 1000) #Python
  %R -i k
  %R set.seed(k + ri + 1000)  #R

  #X simulation according to a t-tailed Markov Chain
  X = knockpy.dgp.sample_ar1t(rhos=np_rhos, n=n, df_t= df_t)
  ls_X[k] = X  
    
  # Creating random sparse coefficients
  beta = knockpy.dgp.create_sparse_coefficients(p=p, sparsity=sp, sign_prob=0.5, coeff_size=beta_factor,coeff_dist='uniform')
  ls_beta[k] = beta

  # Response variable: linear model with standard normal noise
  y = np.dot(X, beta) + np.random.randn(n)
  ls_y[k] = y
    
  #Data frame with simulated data (Y,X)
  #df_X = pd.DataFrame(X)
  #df_y = pd.DataFrame(y)
  #ls_simulations[k] = pd.concat([df_y,df_X], axis=1)

#Elapsed time of the data simulation
time_simulations_1 = timer() - ti      

### Knockoff filters 

In [None]:
%%R

# Global R list that holds the Gaussian knockoff copies: it is filled from Python
# (ls_Xk_norm[[m+1]] <- Xk_norm) and passed to ekn_gaussian() by gaussian_knockoff_filter()
ls_Xk_norm <- list()

In [None]:
#Runs the four knockoff filters (Gaussian, second order, parametric dvine and
#nonparametric dvine) on every simulated data set and stores, per simulation,
#the power and the false discovery proportion (FDP) of each filter.
#Power is stored as a fraction; FDP is stored as a percentage rounded with np.around.
np_Gaussian_Power = np.zeros(nsim)
np_Gaussian_FDP = np.zeros(nsim)

np_second_order_Power = np.zeros(nsim)
np_second_order_FDP = np.zeros(nsim)

np_dvine_Power = np.zeros(nsim)
np_dvine_FDP = np.zeros(nsim)

np_nonpar_dvine_Power = np.zeros(nsim)
np_nonpar_dvine_FDP = np.zeros(nsim)

ti = timer() #Initial time

for k in range(nsim):
  
  %R set.seed(NULL) #The R seed is set inside each knockoff filter function
  
  #Loading simulations  
  X = ls_X[k]  
  y = ls_y[k]
  beta = ls_beta[k]
    
  #1)  
  #Gaussian Knockoffs
  
  #Defining the Gaussian sampler object
  Gaussian_sampler_hat = knockpy.knockoffs.GaussianSampler(X, mu=None,
                                                           Sigma=None,
                                                           method='mvr', verbose=False)

  #Sampling the Gaussian Knockoffs
  #(M copies, each with its own numpy seed, stored in the R list ls_Xk_norm at
  # position m+1 because R lists are 1-based)
  for m in range(M):
      np.random.seed(m)
      Xk_norm = Gaussian_sampler_hat.sample_knockoffs()
      %R -i Xk_norm
      %R -i m
      %R ls_Xk_norm[[m+1]] <- Xk_norm
      %R rm(Xk_norm)  
    
  #Array of integers that indicates the non-nulls position  
  just_rejections_gaussian = gaussian_knockoff_filter(X, y, M, M_lasso, alpha, lasso_family, n_cores)
  
  print("Gaussian selection:")  
  print(np.sort(just_rejections_gaussian))
 
  #Array that indicates the rejections considering all the variables (0 null, 1 non-null)
  rejections_gaussian = np.zeros(p)
  rejections_gaussian[just_rejections_gaussian]=1
   

  #Power and FDP
  #(power: selected non-nulls / number of non-nulls;
  # FDP: selected nulls / max(1, number of selections), in percent and rounded)
  np_Gaussian_Power[k] = np.dot(rejections_gaussian, beta != 0) / (beta != 0).sum()
  np_Gaussian_FDP[k] = np.around(100*np.dot(rejections_gaussian, beta == 0) / max(1,rejections_gaussian.sum() ) )
  print(f"The knockoff GAUSSIAN filter POWER {100*np_Gaussian_Power[k]}% with a FDP of {np_Gaussian_FDP[k]}%")
  

  #2)
  #Second order knockoff filter    
  just_rejections_second_order = second_order_knockoff_filter(X, y, M, M_lasso, alpha, lasso_family, n_cores)
  
  #Array of integers that indicates the non-nulls position
  print("Second Order selection:")  
  print(np.sort(just_rejections_second_order))
  
  #Array that indicates the rejections considering all the variables (0 null, 1 non-null)
  rejections_second_order = np.zeros(p)
  rejections_second_order[just_rejections_second_order]=1
    
  #Power and FDP
  np_second_order_Power[k] = np.dot(rejections_second_order, beta != 0) / (beta != 0).sum()
  np_second_order_FDP[k] = np.around(100*np.dot(rejections_second_order, beta == 0) / max(1,rejections_second_order.sum()))
  print(f"The knockoff SECOND ORDER filter POWER {100*np_second_order_Power[k]}% with a FDP of {np_second_order_FDP[k]}%")
    
  #3) dvine_order
  #Heuristic procedure to determine the order for the first tree in a D-vine structure
  dvine_order = get_dvine_order(X)

  #New columns orders for X and beta
  #(both are permuted with the same order, so the selections of the dvine filters
  # below refer to positions in the reordered columns)
  X_dvine_order = X[:,dvine_order]
  beta_dvine_order  = beta[dvine_order]

  #4) Parametric dvine
    
  just_rejections_dvines = dvine_knockoff_filter(X_dvine_order, y, M, M_lasso, alpha, lasso_family, vinecop_family, n_cores)
    
  #Array of integers that indicates the non-nulls position
  print("DVINE selection (in dvine_order):")
  print(np.sort(just_rejections_dvines))
 
  #Array that indicates the rejections considering all the variables (0 null, 1 non-null)
  rejections_dvines = np.zeros(p)
  rejections_dvines[just_rejections_dvines]=1
    
  #Power and FDP
  np_dvine_Power[k] = np.dot(rejections_dvines, beta_dvine_order != 0) / (beta_dvine_order != 0).sum()
  np_dvine_FDP[k] = np.around(100*np.dot(rejections_dvines, beta_dvine_order == 0) / max(1,rejections_dvines.sum()))
  print(f"The DVINE knockoff filter POWER {100*np_dvine_Power[k]}% with a FDP of {np_dvine_FDP[k]}% (DVINES)")

  #Deleting some objects
  #(dvine_distributions is the R global created inside dvine_knockoff_filter)
  %R rm(dvine_distributions)  

  #5) Nonparametric dvine
  #dvine knockoff filter  
  just_rejections_nonpar_dvines = dvine_knockoff_filter(X_dvine_order, y, M, M_lasso, alpha, lasso_family, nonparametric_family, n_cores)
    
  #Array of integers that indicates the non-nulls position
  print("NONPARAMETRIC DVINE selection (in dvine_order):")
  print(np.sort(just_rejections_nonpar_dvines))
 
  #Array that indicates the rejections considering all the variables (0 null, 1 non-null)
  rejections_nonpar_dvines = np.zeros(p)
  rejections_nonpar_dvines[just_rejections_nonpar_dvines]=1
    
  #Power and FDP
  np_nonpar_dvine_Power[k] = np.dot(rejections_nonpar_dvines, beta_dvine_order != 0) / (beta_dvine_order != 0).sum()
  np_nonpar_dvine_FDP[k] = np.around(100*np.dot(rejections_nonpar_dvines, beta_dvine_order == 0) / max(1,rejections_nonpar_dvines.sum()))
  print(f"The NONPARAMETRIC DVINE knockoff filter POWER {100*np_nonpar_dvine_Power[k]}% with a FDP of {np_nonpar_dvine_FDP[k]}% (NONPAR DVINES)")

  #Deleting some objects
  %R rm(dvine_distributions)  
  
  del X, y, beta, dvine_order
  del rejections_gaussian,rejections_second_order,rejections_dvines, rejections_nonpar_dvines
        
  print(f"LOOP ITERATION: {k}")
   
#Elapsed time of the knockoff filters
time_knockoffs_1 = timer() - ti  

In [None]:
# Value of the varying feature (df_t), repeated once per simulation
np_varying_feature = np.full(nsim, df_t)

# One row per simulation: the power columns are converted from fractions to
# percentages here, while the FDP columns are stored as they were computed
results_columns_1 = {
    'Varying feature': np_varying_feature,
    'Gaussian Power(%)': 100 * np_Gaussian_Power,
    '2do Order Power(%)': 100 * np_second_order_Power,
    'Dvine Power(%)': 100 * np_dvine_Power,
    'Nonpar DvinePower(%)': 100 * np_nonpar_dvine_Power,
    'Gaussian FDP(%)': np_Gaussian_FDP,
    '2do Order FDP(%)': np_second_order_FDP,
    'Dvine FDP(%)': np_dvine_FDP,
    'Nonpar Dvine FDP(%)': np_nonpar_dvine_FDP,
}

#Dataframe with the simulation results
df_simulations_results_1 = pd.DataFrame(results_columns_1)

In [None]:
#The results are written to a CSV file in the working directory and then displayed
df_simulations_results_1.to_csv('t-Markov_results_1.csv')
df_simulations_results_1

In [None]:
#Column-wise averages over the simulations
df_simulations_results_1.mean()

# Iteration 2 

In [None]:
#Degrees of freedom of the t-distribution (passed as df_t to knockpy.dgp.sample_ar1t)
df_t = 6

### Data simulation

In [None]:
#Arrays and lists to save information
ls_simulations = list(range(nsim))
ls_beta = list(range(nsim))
ls_X = list(range(nsim))
ls_y = list(range(nsim))

ti = timer() #Initial time

for k in range(nsim):
  
  #Set seed for replication 
  np.random.seed(k + ri + 2000) #Python
  %R -i k
  %R set.seed(k + ri + 2000)  #R

  #X simulation according to a t-tailed Markov Chain
  X = knockpy.dgp.sample_ar1t(rhos=np_rhos, n=n, df_t= df_t)
  ls_X[k] = X  
    
  # Creating random sparse coefficients
  beta = knockpy.dgp.create_sparse_coefficients(p=p, sparsity=sp, sign_prob=0.5, coeff_size=beta_factor,coeff_dist='uniform')
  ls_beta[k] = beta

  # Response variable
  y = np.dot(X, beta) + np.random.randn(n)
  ls_y[k] = y
   
  #Data frame with simulated data (Y,X)
  #df_X = pd.DataFrame(X)
  #df_y = pd.DataFrame(y)
  #ls_simulations[k] = pd.concat([df_y,df_X], axis=1)

time_simulations_2 = timer() - ti      

### Knockoff filters 

In [None]:
%%R

# Empty R list that will hold the Gaussian knockoff copies. The Python knockoff
# loop fills it with ls_Xk_norm[[m+1]] <- Xk_norm, one entry per draw.
ls_Xk_norm <- list()

In [None]:
# Run the four knockoff filters (Gaussian, second-order, parametric D-vine and
# nonparametric D-vine) on each simulated data set of this scenario.
# For every simulation k the power is stored as a fraction and the FDP as a
# percentage rounded to the nearest integer.
np_Gaussian_Power = np.zeros(nsim)
np_Gaussian_FDP = np.zeros(nsim)

np_second_order_Power = np.zeros(nsim)
np_second_order_FDP = np.zeros(nsim)

np_dvine_Power = np.zeros(nsim)
np_dvine_FDP = np.zeros(nsim)

np_nonpar_dvine_Power = np.zeros(nsim)
np_nonpar_dvine_FDP = np.zeros(nsim)

ti = timer() #Initial time

for k in range(nsim):
  
  %R set.seed(NULL) #The R seed is set inside each knockoff filter function
  
  # Load the k-th simulated data set (predictors, response, true coefficients)
  X = ls_X[k]  
  y = ls_y[k]
  beta = ls_beta[k]
    
  # 1) Gaussian knockoffs
  
  # Gaussian knockoff sampler built from X with no mean or covariance supplied
  # (mu=None, Sigma=None) and the 'mvr' method.
  # NOTE(review): presumably knockpy estimates mu and Sigma from X in that case - confirm.
  Gaussian_sampler_hat = knockpy.knockoffs.GaussianSampler(X, mu=None,
                                                           Sigma=None,
                                                           method='mvr', verbose=False)

  # Draw M knockoff copies. Each copy is pushed to R, stored in the R list
  # ls_Xk_norm (R lists are 1-based, hence m+1) and the temporary R variable is removed.
  # The NumPy seed depends only on m, so the same M seeds are reused for every k.
  for m in range(M):
      np.random.seed(m)
      Xk_norm = Gaussian_sampler_hat.sample_knockoffs()
      %R -i Xk_norm
      %R -i m
      %R ls_Xk_norm[[m+1]] <- Xk_norm
      %R rm(Xk_norm)  
    
  # Indices of the variables selected by the Gaussian knockoff filter.
  # NOTE(review): the knockoff copies are not passed as arguments; presumably the
  # function reads them from the R list ls_Xk_norm filled above - confirm.
  just_rejections_gaussian = gaussian_knockoff_filter(X, y, M, M_lasso, alpha, lasso_family, n_cores)
  
  print("Gaussian selection:")  
  print(np.sort(just_rejections_gaussian))
 
  # 0/1 indicator over all p variables (1 = selected)
  rejections_gaussian = np.zeros(p)
  rejections_gaussian[just_rejections_gaussian]=1
   

  # Power: share of the true non-nulls (beta != 0) that were selected (a fraction).
  # FDP: percentage of the selections that are true nulls (beta == 0), rounded to an
  # integer; max(1, .) avoids dividing by zero when nothing is selected.
  np_Gaussian_Power[k] = np.dot(rejections_gaussian, beta != 0) / (beta != 0).sum()
  np_Gaussian_FDP[k] = np.around(100*np.dot(rejections_gaussian, beta == 0) / max(1,rejections_gaussian.sum() ) )
  print(f"The knockoff GAUSSIAN filter POWER {100*np_Gaussian_Power[k]}% with a FDP of {np_Gaussian_FDP[k]}%")
  

  # 2) Second-order knockoffs
  # Indices of the variables selected by the second-order knockoff filter
  just_rejections_second_order = second_order_knockoff_filter(X, y, M, M_lasso, alpha, lasso_family, n_cores)
  
  print("Second Order selection:")  
  print(np.sort(just_rejections_second_order))
  
  # 0/1 indicator over all p variables (1 = selected)
  rejections_second_order = np.zeros(p)
  rejections_second_order[just_rejections_second_order]=1
    
  # Power (fraction) and FDP (rounded percentage), computed as for the Gaussian filter
  np_second_order_Power[k] = np.dot(rejections_second_order, beta != 0) / (beta != 0).sum()
  np_second_order_FDP[k] = np.around(100*np.dot(rejections_second_order, beta == 0) / max(1,rejections_second_order.sum()))
  print(f"The knockoff SECOND ORDER filter POWER {100*np_second_order_Power[k]}% with a FDP of {np_second_order_FDP[k]}%")
    
  # 3) D-vine variable order
  # Heuristic procedure to determine the order for the first tree in a D-vine structure
  dvine_order = get_dvine_order(X)

  # Reorder the columns of X and the entries of beta in the same way, so that the
  # indices selected on X_dvine_order can be compared with beta_dvine_order
  X_dvine_order = X[:,dvine_order]
  beta_dvine_order  = beta[dvine_order]

  # 4) Parametric D-vine knockoffs
    
  # Indices (in D-vine order) of the variables selected with the vinecop_family copulas
  just_rejections_dvines = dvine_knockoff_filter(X_dvine_order, y, M, M_lasso, alpha, lasso_family, vinecop_family, n_cores)
    
  print("DVINE selection (in dvine_order):")
  print(np.sort(just_rejections_dvines))
 
  # 0/1 indicator over all p variables in D-vine order (1 = selected)
  rejections_dvines = np.zeros(p)
  rejections_dvines[just_rejections_dvines]=1
    
  # Power (fraction) and FDP (rounded percentage) against the reordered coefficients
  np_dvine_Power[k] = np.dot(rejections_dvines, beta_dvine_order != 0) / (beta_dvine_order != 0).sum()
  np_dvine_FDP[k] = np.around(100*np.dot(rejections_dvines, beta_dvine_order == 0) / max(1,rejections_dvines.sum()))
  print(f"The DVINE knockoff filter POWER {100*np_dvine_Power[k]}% with a FDP of {np_dvine_FDP[k]}% (DVINES)")

  # Remove the R object dvine_distributions before the next D-vine fit.
  # NOTE(review): presumably created in the R session by dvine_knockoff_filter - confirm.
  %R rm(dvine_distributions)  

  # 5) Nonparametric D-vine knockoffs
  # Same filter and data as in 4), but with the nonparametric_family copulas
  just_rejections_nonpar_dvines = dvine_knockoff_filter(X_dvine_order, y, M, M_lasso, alpha, lasso_family, nonparametric_family, n_cores)
    
  print("NONPARAMETRIC DVINE selection (in dvine_order):")
  print(np.sort(just_rejections_nonpar_dvines))
 
  # 0/1 indicator over all p variables in D-vine order (1 = selected)
  rejections_nonpar_dvines = np.zeros(p)
  rejections_nonpar_dvines[just_rejections_nonpar_dvines]=1
    
  # Power (fraction) and FDP (rounded percentage) against the reordered coefficients
  np_nonpar_dvine_Power[k] = np.dot(rejections_nonpar_dvines, beta_dvine_order != 0) / (beta_dvine_order != 0).sum()
  np_nonpar_dvine_FDP[k] = np.around(100*np.dot(rejections_nonpar_dvines, beta_dvine_order == 0) / max(1,rejections_nonpar_dvines.sum()))
  print(f"The NONPARAMETRIC DVINE knockoff filter POWER {100*np_nonpar_dvine_Power[k]}% with a FDP of {np_nonpar_dvine_FDP[k]}% (NONPAR DVINES)")

  # Remove the R object dvine_distributions again (see the note in step 4)
  %R rm(dvine_distributions)  
  
  # Drop the per-iteration Python objects before the next simulation
  del X, y, beta, dvine_order
  del rejections_gaussian,rejections_second_order,rejections_dvines, rejections_nonpar_dvines
        
  print(f"LOOP ITERATION: {k}")    

# Wall-clock seconds spent in the knockoff-filter loop of this scenario
time_knockoffs_2 = timer() - ti  

In [None]:
# The varying feature of a scenario is df_t; repeat it once per simulation so
# that it can be stored as a column next to the per-simulation results.
np_varying_feature = np.repeat(df_t, nsim)

# One row per simulation. The power arrays hold fractions and are scaled to
# percentages here; the FDP arrays are already percentages.
df_simulations_results_2 = pd.DataFrame(
    {
        'Varying feature': np_varying_feature,
        'Gaussian Power(%)': 100 * np_Gaussian_Power,
        '2do Order Power(%)': 100 * np_second_order_Power,
        'Dvine Power(%)': 100 * np_dvine_Power,
        'Nonpar DvinePower(%)': 100 * np_nonpar_dvine_Power,
        'Gaussian FDP(%)': np_Gaussian_FDP,
        '2do Order FDP(%)': np_second_order_FDP,
        'Dvine FDP(%)': np_dvine_FDP,
        'Nonpar Dvine FDP(%)': np_nonpar_dvine_FDP,
    }
)

In [None]:
# Save the per-simulation results of this scenario to CSV, then display the table
df_simulations_results_2.to_csv('t-Markov_results_2.csv')
df_simulations_results_2

In [None]:
# Column-wise mean over the rows (one row per simulation) of this scenario
df_simulations_results_2.mean()

# Iteration 3 

In [None]:
# Degrees of freedom of the t-distribution used in this scenario
df_t = 10

### Data simulation

In [None]:
#Arrays and lists to save information
ls_simulations = list(range(nsim))
ls_beta = list(range(nsim))
ls_X = list(range(nsim))
ls_y = list(range(nsim))

ti = timer() #Initial time


for k in range(nsim):
  
  #Set seed for replication 
  np.random.seed(k + ri + 3000) #Python
  %R -i k
  %R set.seed(k + ri + 3000)  #R

  #X simulation according to a t-tailed Markov Chain
  X = knockpy.dgp.sample_ar1t(rhos=np_rhos, n=n, df_t= df_t)
  ls_X[k] = X  
    
  # Creating random sparse coefficients
  beta = knockpy.dgp.create_sparse_coefficients(p=p, sparsity=sp, sign_prob=0.5, coeff_size=beta_factor,coeff_dist='uniform')
  ls_beta[k] = beta

  # Response variable
  y = np.dot(X, beta) + np.random.randn(n)
  ls_y[k] = y
    
  #Data frame with simulated data (Y,X)
  #df_X = pd.DataFrame(X)
  #df_y = pd.DataFrame(y)
  #ls_simulations[k] = pd.concat([df_y,df_X], axis=1)

time_simulations_3 = timer() - ti      

### Knockoff filters 

In [None]:
%%R
# Empty R list that will hold the Gaussian knockoff copies. The Python knockoff
# loop fills it with ls_Xk_norm[[m+1]] <- Xk_norm, one entry per draw.
ls_Xk_norm <- list()

In [None]:
# Run the four knockoff filters (Gaussian, second-order, parametric D-vine and
# nonparametric D-vine) on each simulated data set of this scenario.
# For every simulation k the power is stored as a fraction and the FDP as a
# percentage rounded to the nearest integer.
np_Gaussian_Power = np.zeros(nsim)
np_Gaussian_FDP = np.zeros(nsim)

np_second_order_Power = np.zeros(nsim)
np_second_order_FDP = np.zeros(nsim)

np_dvine_Power = np.zeros(nsim)
np_dvine_FDP = np.zeros(nsim)

np_nonpar_dvine_Power = np.zeros(nsim)
np_nonpar_dvine_FDP = np.zeros(nsim)

ti = timer() #Initial time

for k in range(nsim):
  
  %R set.seed(NULL) #The R seed is set inside each knockoff filter function
  
  # Load the k-th simulated data set (predictors, response, true coefficients)
  X = ls_X[k]  
  y = ls_y[k]
  beta = ls_beta[k]
    
  # 1) Gaussian knockoffs
  
  # Gaussian knockoff sampler built from X with no mean or covariance supplied
  # (mu=None, Sigma=None) and the 'mvr' method.
  # NOTE(review): presumably knockpy estimates mu and Sigma from X in that case - confirm.
  Gaussian_sampler_hat = knockpy.knockoffs.GaussianSampler(X, mu=None,
                                                           Sigma=None,
                                                           method='mvr', verbose=False)

  # Draw M knockoff copies. Each copy is pushed to R, stored in the R list
  # ls_Xk_norm (R lists are 1-based, hence m+1) and the temporary R variable is removed.
  # The NumPy seed depends only on m, so the same M seeds are reused for every k.
  for m in range(M):
      np.random.seed(m)
      Xk_norm = Gaussian_sampler_hat.sample_knockoffs()
      %R -i Xk_norm
      %R -i m
      %R ls_Xk_norm[[m+1]] <- Xk_norm
      %R rm(Xk_norm)  
    
  # Indices of the variables selected by the Gaussian knockoff filter.
  # NOTE(review): the knockoff copies are not passed as arguments; presumably the
  # function reads them from the R list ls_Xk_norm filled above - confirm.
  just_rejections_gaussian = gaussian_knockoff_filter(X, y, M, M_lasso, alpha, lasso_family, n_cores)
  
  print("Gaussian selection:")  
  print(np.sort(just_rejections_gaussian))
 
  # 0/1 indicator over all p variables (1 = selected)
  rejections_gaussian = np.zeros(p)
  rejections_gaussian[just_rejections_gaussian]=1
   

  # Power: share of the true non-nulls (beta != 0) that were selected (a fraction).
  # FDP: percentage of the selections that are true nulls (beta == 0), rounded to an
  # integer; max(1, .) avoids dividing by zero when nothing is selected.
  np_Gaussian_Power[k] = np.dot(rejections_gaussian, beta != 0) / (beta != 0).sum()
  np_Gaussian_FDP[k] = np.around(100*np.dot(rejections_gaussian, beta == 0) / max(1,rejections_gaussian.sum() ) )
  print(f"The knockoff GAUSSIAN filter POWER {100*np_Gaussian_Power[k]}% with a FDP of {np_Gaussian_FDP[k]}%")
  

  # 2) Second-order knockoffs
  # Indices of the variables selected by the second-order knockoff filter
  just_rejections_second_order = second_order_knockoff_filter(X, y, M, M_lasso, alpha, lasso_family, n_cores)
  
  print("Second Order selection:")  
  print(np.sort(just_rejections_second_order))
  
  # 0/1 indicator over all p variables (1 = selected)
  rejections_second_order = np.zeros(p)
  rejections_second_order[just_rejections_second_order]=1
    
  # Power (fraction) and FDP (rounded percentage), computed as for the Gaussian filter
  np_second_order_Power[k] = np.dot(rejections_second_order, beta != 0) / (beta != 0).sum()
  np_second_order_FDP[k] = np.around(100*np.dot(rejections_second_order, beta == 0) / max(1,rejections_second_order.sum()))
  print(f"The knockoff SECOND ORDER filter POWER {100*np_second_order_Power[k]}% with a FDP of {np_second_order_FDP[k]}%")
    
  # 3) D-vine variable order
  # Heuristic procedure to determine the order for the first tree in a D-vine structure
  dvine_order = get_dvine_order(X)

  # Reorder the columns of X and the entries of beta in the same way, so that the
  # indices selected on X_dvine_order can be compared with beta_dvine_order
  X_dvine_order = X[:,dvine_order]
  beta_dvine_order  = beta[dvine_order]

  # 4) Parametric D-vine knockoffs
    
  # Indices (in D-vine order) of the variables selected with the vinecop_family copulas
  just_rejections_dvines = dvine_knockoff_filter(X_dvine_order, y, M, M_lasso, alpha, lasso_family, vinecop_family, n_cores)
    
  print("DVINE selection (in dvine_order):")
  print(np.sort(just_rejections_dvines))
 
  # 0/1 indicator over all p variables in D-vine order (1 = selected)
  rejections_dvines = np.zeros(p)
  rejections_dvines[just_rejections_dvines]=1
    
  # Power (fraction) and FDP (rounded percentage) against the reordered coefficients
  np_dvine_Power[k] = np.dot(rejections_dvines, beta_dvine_order != 0) / (beta_dvine_order != 0).sum()
  np_dvine_FDP[k] = np.around(100*np.dot(rejections_dvines, beta_dvine_order == 0) / max(1,rejections_dvines.sum()))
  print(f"The DVINE knockoff filter POWER {100*np_dvine_Power[k]}% with a FDP of {np_dvine_FDP[k]}% (DVINES)")

  # Remove the R object dvine_distributions before the next D-vine fit.
  # NOTE(review): presumably created in the R session by dvine_knockoff_filter - confirm.
  %R rm(dvine_distributions)  

  # 5) Nonparametric D-vine knockoffs
  # Same filter and data as in 4), but with the nonparametric_family copulas
  just_rejections_nonpar_dvines = dvine_knockoff_filter(X_dvine_order, y, M, M_lasso, alpha, lasso_family, nonparametric_family, n_cores)
    
  print("NONPARAMETRIC DVINE selection (in dvine_order):")
  print(np.sort(just_rejections_nonpar_dvines))
 
  # 0/1 indicator over all p variables in D-vine order (1 = selected)
  rejections_nonpar_dvines = np.zeros(p)
  rejections_nonpar_dvines[just_rejections_nonpar_dvines]=1
    
  # Power (fraction) and FDP (rounded percentage) against the reordered coefficients
  np_nonpar_dvine_Power[k] = np.dot(rejections_nonpar_dvines, beta_dvine_order != 0) / (beta_dvine_order != 0).sum()
  np_nonpar_dvine_FDP[k] = np.around(100*np.dot(rejections_nonpar_dvines, beta_dvine_order == 0) / max(1,rejections_nonpar_dvines.sum()))
  print(f"The NONPARAMETRIC DVINE knockoff filter POWER {100*np_nonpar_dvine_Power[k]}% with a FDP of {np_nonpar_dvine_FDP[k]}% (NONPAR DVINES)")

  # Remove the R object dvine_distributions again (see the note in step 4)
  %R rm(dvine_distributions)  
  
  # Drop the per-iteration Python objects before the next simulation
  del X, y, beta, dvine_order
  del rejections_gaussian,rejections_second_order,rejections_dvines, rejections_nonpar_dvines
        
  print(f"LOOP ITERATION: {k}")    
  
# Wall-clock seconds spent in the knockoff-filter loop of this scenario
time_knockoffs_3 = timer() - ti  

In [None]:
# The varying feature of a scenario is df_t; repeat it once per simulation so
# that it can be stored as a column next to the per-simulation results.
np_varying_feature = np.repeat(df_t, nsim)

# One row per simulation. The power arrays hold fractions and are scaled to
# percentages here; the FDP arrays are already percentages.
df_simulations_results_3 = pd.DataFrame(
    {
        'Varying feature': np_varying_feature,
        'Gaussian Power(%)': 100 * np_Gaussian_Power,
        '2do Order Power(%)': 100 * np_second_order_Power,
        'Dvine Power(%)': 100 * np_dvine_Power,
        'Nonpar DvinePower(%)': 100 * np_nonpar_dvine_Power,
        'Gaussian FDP(%)': np_Gaussian_FDP,
        '2do Order FDP(%)': np_second_order_FDP,
        'Dvine FDP(%)': np_dvine_FDP,
        'Nonpar Dvine FDP(%)': np_nonpar_dvine_FDP,
    }
)

In [None]:
# Save the per-simulation results of this scenario to CSV, then display the table
df_simulations_results_3.to_csv('t-Markov_results_3.csv')
df_simulations_results_3

In [None]:
# Column-wise mean over the rows (one row per simulation) of this scenario
df_simulations_results_3.mean()

# Iteration 4 

In [None]:
# Degrees of freedom of the t-distribution used in this scenario
df_t = 15

### Data simulation

In [None]:
#Arrays and lists to save information
ls_simulations = list(range(nsim))
ls_beta = list(range(nsim))
ls_X = list(range(nsim))
ls_y = list(range(nsim))

ti = timer() #Initial time


for k in range(nsim):
  
  #Set seed for replication 
  np.random.seed(k + ri + 4000) #Python
  %R -i k
  %R set.seed(k + ri + 4000)  #R

  #X simulation according to a t-tailed Markov Chain
  X = knockpy.dgp.sample_ar1t(rhos=np_rhos, n=n, df_t= df_t)
  ls_X[k] = X  
    
  # Creating random sparse coefficients
  beta = knockpy.dgp.create_sparse_coefficients(p=p, sparsity=sp, sign_prob=0.5, coeff_size=beta_factor,coeff_dist='uniform')
  ls_beta[k] = beta

  # Response variable
  y = np.dot(X, beta) + np.random.randn(n)
  ls_y[k] = y
    
  #Data frame with simulated data (Y,X)
  #df_X = pd.DataFrame(X)
  #df_y = pd.DataFrame(y)
  #ls_simulations[k] = pd.concat([df_y,df_X], axis=1)

time_simulations_4 = timer() - ti      

### Knockoff filters 

In [None]:
%%R
# Empty R list that will hold the Gaussian knockoff copies. The Python knockoff
# loop fills it with ls_Xk_norm[[m+1]] <- Xk_norm, one entry per draw.
ls_Xk_norm <- list()

In [None]:
# Run the four knockoff filters (Gaussian, second-order, parametric D-vine and
# nonparametric D-vine) on each simulated data set of this scenario.
# For every simulation k the power is stored as a fraction and the FDP as a
# percentage rounded to the nearest integer.
np_Gaussian_Power = np.zeros(nsim)
np_Gaussian_FDP = np.zeros(nsim)

np_second_order_Power = np.zeros(nsim)
np_second_order_FDP = np.zeros(nsim)

np_dvine_Power = np.zeros(nsim)
np_dvine_FDP = np.zeros(nsim)

np_nonpar_dvine_Power = np.zeros(nsim)
np_nonpar_dvine_FDP = np.zeros(nsim)

ti = timer() #Initial time

for k in range(nsim):
  
  %R set.seed(NULL) #The R seed is set inside each knockoff filter function
  
  # Load the k-th simulated data set (predictors, response, true coefficients)
  X = ls_X[k]  
  y = ls_y[k]
  beta = ls_beta[k]
    
  # 1) Gaussian knockoffs
  
  # Gaussian knockoff sampler built from X with no mean or covariance supplied
  # (mu=None, Sigma=None) and the 'mvr' method.
  # NOTE(review): presumably knockpy estimates mu and Sigma from X in that case - confirm.
  Gaussian_sampler_hat = knockpy.knockoffs.GaussianSampler(X, mu=None,
                                                           Sigma=None,
                                                           method='mvr', verbose=False)

  # Draw M knockoff copies. Each copy is pushed to R, stored in the R list
  # ls_Xk_norm (R lists are 1-based, hence m+1) and the temporary R variable is removed.
  # The NumPy seed depends only on m, so the same M seeds are reused for every k.
  for m in range(M):
      np.random.seed(m)
      Xk_norm = Gaussian_sampler_hat.sample_knockoffs()
      %R -i Xk_norm
      %R -i m
      %R ls_Xk_norm[[m+1]] <- Xk_norm
      %R rm(Xk_norm)  
    
  # Indices of the variables selected by the Gaussian knockoff filter.
  # NOTE(review): the knockoff copies are not passed as arguments; presumably the
  # function reads them from the R list ls_Xk_norm filled above - confirm.
  just_rejections_gaussian = gaussian_knockoff_filter(X, y, M, M_lasso, alpha, lasso_family, n_cores)
  
  print("Gaussian selection:")  
  print(np.sort(just_rejections_gaussian))
 
  # 0/1 indicator over all p variables (1 = selected)
  rejections_gaussian = np.zeros(p)
  rejections_gaussian[just_rejections_gaussian]=1
   

  # Power: share of the true non-nulls (beta != 0) that were selected (a fraction).
  # FDP: percentage of the selections that are true nulls (beta == 0), rounded to an
  # integer; max(1, .) avoids dividing by zero when nothing is selected.
  np_Gaussian_Power[k] = np.dot(rejections_gaussian, beta != 0) / (beta != 0).sum()
  np_Gaussian_FDP[k] = np.around(100*np.dot(rejections_gaussian, beta == 0) / max(1,rejections_gaussian.sum() ) )
  print(f"The knockoff GAUSSIAN filter POWER {100*np_Gaussian_Power[k]}% with a FDP of {np_Gaussian_FDP[k]}%")
  

  # 2) Second-order knockoffs
  # Indices of the variables selected by the second-order knockoff filter
  just_rejections_second_order = second_order_knockoff_filter(X, y, M, M_lasso, alpha, lasso_family, n_cores)
  
  print("Second Order selection:")  
  print(np.sort(just_rejections_second_order))
  
  # 0/1 indicator over all p variables (1 = selected)
  rejections_second_order = np.zeros(p)
  rejections_second_order[just_rejections_second_order]=1
    
  # Power (fraction) and FDP (rounded percentage), computed as for the Gaussian filter
  np_second_order_Power[k] = np.dot(rejections_second_order, beta != 0) / (beta != 0).sum()
  np_second_order_FDP[k] = np.around(100*np.dot(rejections_second_order, beta == 0) / max(1,rejections_second_order.sum()))
  print(f"The knockoff SECOND ORDER filter POWER {100*np_second_order_Power[k]}% with a FDP of {np_second_order_FDP[k]}%")
    
  # 3) D-vine variable order
  # Heuristic procedure to determine the order for the first tree in a D-vine structure
  dvine_order = get_dvine_order(X)

  # Reorder the columns of X and the entries of beta in the same way, so that the
  # indices selected on X_dvine_order can be compared with beta_dvine_order
  X_dvine_order = X[:,dvine_order]
  beta_dvine_order  = beta[dvine_order]

  # 4) Parametric D-vine knockoffs
    
  # Indices (in D-vine order) of the variables selected with the vinecop_family copulas
  just_rejections_dvines = dvine_knockoff_filter(X_dvine_order, y, M, M_lasso, alpha, lasso_family, vinecop_family, n_cores)
    
  print("DVINE selection (in dvine_order):")
  print(np.sort(just_rejections_dvines))
 
  # 0/1 indicator over all p variables in D-vine order (1 = selected)
  rejections_dvines = np.zeros(p)
  rejections_dvines[just_rejections_dvines]=1
    
  # Power (fraction) and FDP (rounded percentage) against the reordered coefficients
  np_dvine_Power[k] = np.dot(rejections_dvines, beta_dvine_order != 0) / (beta_dvine_order != 0).sum()
  np_dvine_FDP[k] = np.around(100*np.dot(rejections_dvines, beta_dvine_order == 0) / max(1,rejections_dvines.sum()))
  print(f"The DVINE knockoff filter POWER {100*np_dvine_Power[k]}% with a FDP of {np_dvine_FDP[k]}% (DVINES)")

  # Remove the R object dvine_distributions before the next D-vine fit.
  # NOTE(review): presumably created in the R session by dvine_knockoff_filter - confirm.
  %R rm(dvine_distributions)  

  # 5) Nonparametric D-vine knockoffs
  # Same filter and data as in 4), but with the nonparametric_family copulas
  just_rejections_nonpar_dvines = dvine_knockoff_filter(X_dvine_order, y, M, M_lasso, alpha, lasso_family, nonparametric_family, n_cores)
    
  print("NONPARAMETRIC DVINE selection (in dvine_order):")
  print(np.sort(just_rejections_nonpar_dvines))
 
  # 0/1 indicator over all p variables in D-vine order (1 = selected)
  rejections_nonpar_dvines = np.zeros(p)
  rejections_nonpar_dvines[just_rejections_nonpar_dvines]=1
    
  # Power (fraction) and FDP (rounded percentage) against the reordered coefficients
  np_nonpar_dvine_Power[k] = np.dot(rejections_nonpar_dvines, beta_dvine_order != 0) / (beta_dvine_order != 0).sum()
  np_nonpar_dvine_FDP[k] = np.around(100*np.dot(rejections_nonpar_dvines, beta_dvine_order == 0) / max(1,rejections_nonpar_dvines.sum()))
  print(f"The NONPARAMETRIC DVINE knockoff filter POWER {100*np_nonpar_dvine_Power[k]}% with a FDP of {np_nonpar_dvine_FDP[k]}% (NONPAR DVINES)")

  # Remove the R object dvine_distributions again (see the note in step 4)
  %R rm(dvine_distributions)  
  
  # Drop the per-iteration Python objects before the next simulation
  del X, y, beta, dvine_order
  del rejections_gaussian,rejections_second_order,rejections_dvines, rejections_nonpar_dvines
        
  print(f"LOOP ITERATION: {k}")
 
# Wall-clock seconds spent in the knockoff-filter loop of this scenario
time_knockoffs_4 = timer() - ti  

In [None]:
# The varying feature of a scenario is df_t; repeat it once per simulation so
# that it can be stored as a column next to the per-simulation results.
np_varying_feature = np.repeat(df_t, nsim)

# One row per simulation. The power arrays hold fractions and are scaled to
# percentages here; the FDP arrays are already percentages.
df_simulations_results_4 = pd.DataFrame(
    {
        'Varying feature': np_varying_feature,
        'Gaussian Power(%)': 100 * np_Gaussian_Power,
        '2do Order Power(%)': 100 * np_second_order_Power,
        'Dvine Power(%)': 100 * np_dvine_Power,
        'Nonpar DvinePower(%)': 100 * np_nonpar_dvine_Power,
        'Gaussian FDP(%)': np_Gaussian_FDP,
        '2do Order FDP(%)': np_second_order_FDP,
        'Dvine FDP(%)': np_dvine_FDP,
        'Nonpar Dvine FDP(%)': np_nonpar_dvine_FDP,
    }
)

In [None]:
# Save the per-simulation results of this scenario to CSV, then display the table
df_simulations_results_4.to_csv('t-Markov_results_4.csv')
df_simulations_results_4

In [None]:
# Column-wise mean over the rows (one row per simulation) of this scenario
df_simulations_results_4.mean()

# Iteration 5

In [None]:
# Degrees of freedom of the t-distribution used in this scenario
df_t = 20

### Data simulation

In [None]:
#Arrays and lists to save information
ls_simulations = list(range(nsim))
ls_beta = list(range(nsim))
ls_X = list(range(nsim))
ls_y = list(range(nsim))

ti = timer() #Initial time


for k in range(nsim):
  
  #Set seed for replication 
  np.random.seed(k + ri + 5000) #Python
  %R -i k
  %R set.seed(k + ri + 5000)  #R

  #X simulation according to a t-tailed Markov Chain
  X = knockpy.dgp.sample_ar1t(rhos=np_rhos, n=n, df_t= df_t)
  ls_X[k] = X  
    
  # Creating random sparse coefficients
  beta = knockpy.dgp.create_sparse_coefficients(p=p, sparsity=sp, sign_prob=0.5, coeff_size=beta_factor,coeff_dist='uniform')
  ls_beta[k] = beta

  # Response variable
  y = np.dot(X, beta) + np.random.randn(n)
  ls_y[k] = y
    
  #Data frame with simulated data (Y,X)
  #df_X = pd.DataFrame(X)
  #df_y = pd.DataFrame(y)
  #ls_simulations[k] = pd.concat([df_y,df_X], axis=1)

time_simulations_5 = timer() - ti      

### Knockoff filters 

In [None]:
%%R
# Empty R list that will hold the Gaussian knockoff copies. The Python knockoff
# loop fills it with ls_Xk_norm[[m+1]] <- Xk_norm, one entry per draw.
ls_Xk_norm <- list()

In [None]:
# Run the four knockoff filters (Gaussian, second-order, parametric D-vine and
# nonparametric D-vine) on each simulated data set of this scenario.
# For every simulation k the power is stored as a fraction and the FDP as a
# percentage rounded to the nearest integer.
np_Gaussian_Power = np.zeros(nsim)
np_Gaussian_FDP = np.zeros(nsim)

np_second_order_Power = np.zeros(nsim)
np_second_order_FDP = np.zeros(nsim)

np_dvine_Power = np.zeros(nsim)
np_dvine_FDP = np.zeros(nsim)

np_nonpar_dvine_Power = np.zeros(nsim)
np_nonpar_dvine_FDP = np.zeros(nsim)

ti = timer() #Initial time

for k in range(nsim):
  
  %R set.seed(NULL) #The R seed is set inside each knockoff filter function
  
  # Load the k-th simulated data set (predictors, response, true coefficients)
  X = ls_X[k]  
  y = ls_y[k]
  beta = ls_beta[k]
    
  # 1) Gaussian knockoffs
  
  # Gaussian knockoff sampler built from X with no mean or covariance supplied
  # (mu=None, Sigma=None) and the 'mvr' method.
  # NOTE(review): presumably knockpy estimates mu and Sigma from X in that case - confirm.
  Gaussian_sampler_hat = knockpy.knockoffs.GaussianSampler(X, mu=None,
                                                           Sigma=None,
                                                           method='mvr', verbose=False)

  # Draw M knockoff copies. Each copy is pushed to R, stored in the R list
  # ls_Xk_norm (R lists are 1-based, hence m+1) and the temporary R variable is removed.
  # The NumPy seed depends only on m, so the same M seeds are reused for every k.
  for m in range(M):
      np.random.seed(m)
      Xk_norm = Gaussian_sampler_hat.sample_knockoffs()
      %R -i Xk_norm
      %R -i m
      %R ls_Xk_norm[[m+1]] <- Xk_norm
      %R rm(Xk_norm)  
    
  # Indices of the variables selected by the Gaussian knockoff filter.
  # NOTE(review): the knockoff copies are not passed as arguments; presumably the
  # function reads them from the R list ls_Xk_norm filled above - confirm.
  just_rejections_gaussian = gaussian_knockoff_filter(X, y, M, M_lasso, alpha, lasso_family, n_cores)
  
  print("Gaussian selection:")  
  print(np.sort(just_rejections_gaussian))
 
  # 0/1 indicator over all p variables (1 = selected)
  rejections_gaussian = np.zeros(p)
  rejections_gaussian[just_rejections_gaussian]=1
   

  # Power: share of the true non-nulls (beta != 0) that were selected (a fraction).
  # FDP: percentage of the selections that are true nulls (beta == 0), rounded to an
  # integer; max(1, .) avoids dividing by zero when nothing is selected.
  np_Gaussian_Power[k] = np.dot(rejections_gaussian, beta != 0) / (beta != 0).sum()
  np_Gaussian_FDP[k] = np.around(100*np.dot(rejections_gaussian, beta == 0) / max(1,rejections_gaussian.sum() ) )
  print(f"The knockoff GAUSSIAN filter POWER {100*np_Gaussian_Power[k]}% with a FDP of {np_Gaussian_FDP[k]}%")
  

  # 2) Second-order knockoffs
  # Indices of the variables selected by the second-order knockoff filter
  just_rejections_second_order = second_order_knockoff_filter(X, y, M, M_lasso, alpha, lasso_family, n_cores)
  
  print("Second Order selection:")  
  print(np.sort(just_rejections_second_order))
  
  # 0/1 indicator over all p variables (1 = selected)
  rejections_second_order = np.zeros(p)
  rejections_second_order[just_rejections_second_order]=1
    
  # Power (fraction) and FDP (rounded percentage), computed as for the Gaussian filter
  np_second_order_Power[k] = np.dot(rejections_second_order, beta != 0) / (beta != 0).sum()
  np_second_order_FDP[k] = np.around(100*np.dot(rejections_second_order, beta == 0) / max(1,rejections_second_order.sum()))
  print(f"The knockoff SECOND ORDER filter POWER {100*np_second_order_Power[k]}% with a FDP of {np_second_order_FDP[k]}%")
    
  # 3) D-vine variable order
  # Heuristic procedure to determine the order for the first tree in a D-vine structure
  dvine_order = get_dvine_order(X)

  # Reorder the columns of X and the entries of beta in the same way, so that the
  # indices selected on X_dvine_order can be compared with beta_dvine_order
  X_dvine_order = X[:,dvine_order]
  beta_dvine_order  = beta[dvine_order]

  # 4) Parametric D-vine knockoffs
    
  # Indices (in D-vine order) of the variables selected with the vinecop_family copulas
  just_rejections_dvines = dvine_knockoff_filter(X_dvine_order, y, M, M_lasso, alpha, lasso_family, vinecop_family, n_cores)
    
  print("DVINE selection (in dvine_order):")
  print(np.sort(just_rejections_dvines))
 
  # 0/1 indicator over all p variables in D-vine order (1 = selected)
  rejections_dvines = np.zeros(p)
  rejections_dvines[just_rejections_dvines]=1
    
  # Power (fraction) and FDP (rounded percentage) against the reordered coefficients
  np_dvine_Power[k] = np.dot(rejections_dvines, beta_dvine_order != 0) / (beta_dvine_order != 0).sum()
  np_dvine_FDP[k] = np.around(100*np.dot(rejections_dvines, beta_dvine_order == 0) / max(1,rejections_dvines.sum()))
  print(f"The DVINE knockoff filter POWER {100*np_dvine_Power[k]}% with a FDP of {np_dvine_FDP[k]}% (DVINES)")

  # Remove the R object dvine_distributions before the next D-vine fit.
  # NOTE(review): presumably created in the R session by dvine_knockoff_filter - confirm.
  %R rm(dvine_distributions)  

  # 5) Nonparametric D-vine knockoffs
  # Same filter and data as in 4), but with the nonparametric_family copulas
  just_rejections_nonpar_dvines = dvine_knockoff_filter(X_dvine_order, y, M, M_lasso, alpha, lasso_family, nonparametric_family, n_cores)
    
  print("NONPARAMETRIC DVINE selection (in dvine_order):")
  print(np.sort(just_rejections_nonpar_dvines))
 
  # 0/1 indicator over all p variables in D-vine order (1 = selected)
  rejections_nonpar_dvines = np.zeros(p)
  rejections_nonpar_dvines[just_rejections_nonpar_dvines]=1
    
  # Power (fraction) and FDP (rounded percentage) against the reordered coefficients
  np_nonpar_dvine_Power[k] = np.dot(rejections_nonpar_dvines, beta_dvine_order != 0) / (beta_dvine_order != 0).sum()
  np_nonpar_dvine_FDP[k] = np.around(100*np.dot(rejections_nonpar_dvines, beta_dvine_order == 0) / max(1,rejections_nonpar_dvines.sum()))
  print(f"The NONPARAMETRIC DVINE knockoff filter POWER {100*np_nonpar_dvine_Power[k]}% with a FDP of {np_nonpar_dvine_FDP[k]}% (NONPAR DVINES)")

  # Remove the R object dvine_distributions again (see the note in step 4)
  %R rm(dvine_distributions)  
  
  # Drop the per-iteration Python objects before the next simulation
  del X, y, beta, dvine_order
  del rejections_gaussian,rejections_second_order,rejections_dvines, rejections_nonpar_dvines
        
  print(f"LOOP ITERATION: {k}")    

# Wall-clock seconds spent in the knockoff-filter loop of this scenario
time_knockoffs_5 = timer() - ti  

In [None]:
# The varying feature of a scenario is df_t; repeat it once per simulation so
# that it can be stored as a column next to the per-simulation results.
np_varying_feature = np.repeat(df_t, nsim)

# One row per simulation. The power arrays hold fractions and are scaled to
# percentages here; the FDP arrays are already percentages.
df_simulations_results_5 = pd.DataFrame(
    {
        'Varying feature': np_varying_feature,
        'Gaussian Power(%)': 100 * np_Gaussian_Power,
        '2do Order Power(%)': 100 * np_second_order_Power,
        'Dvine Power(%)': 100 * np_dvine_Power,
        'Nonpar DvinePower(%)': 100 * np_nonpar_dvine_Power,
        'Gaussian FDP(%)': np_Gaussian_FDP,
        '2do Order FDP(%)': np_second_order_FDP,
        'Dvine FDP(%)': np_dvine_FDP,
        'Nonpar Dvine FDP(%)': np_nonpar_dvine_FDP,
    }
)

In [None]:
# Save the per-simulation results of this scenario to CSV, then display the table
df_simulations_results_5.to_csv('t-Markov_results_5.csv')
df_simulations_results_5

In [None]:
# Column-wise mean over the rows (one row per simulation) of this scenario
df_simulations_results_5.mean()

### Time to run all the scenarios

In [None]:
# Total wall-clock time of the run, measured from t_initial and reported in hours
t_final = timer()

print(
    'Time (hrs) taken to run all is:',
    round((t_final - t_initial) / 3600, 4),
)

In [None]:
# Run time of the two phases, summed over the five scenarios and reported in hours

# Phase 1: simulating X and y
time_simulations = (
    time_simulations_1
    + time_simulations_2
    + time_simulations_3
    + time_simulations_4
    + time_simulations_5
)
print(
    'Time (hrs) taken to create simulations of X and y:',
    round(time_simulations / 3600, 4),
)

# Phase 2: applying the knockoff filters
time_knockoffs = (
    time_knockoffs_1
    + time_knockoffs_2
    + time_knockoffs_3
    + time_knockoffs_4
    + time_knockoffs_5
)
print(
    'Time (hrs) taken to apply knockoffs filters:',
    round(time_knockoffs / 3600, 4),
)


# Bringing it all together

In [None]:
# Data frame with the per-simulation results of all five scenarios stacked
# row-wise; the index is renumbered 0..N-1 instead of repeating per scenario.
df_simulations_results = pd.concat(
    [
        df_simulations_results_1,
        df_simulations_results_2,
        df_simulations_results_3,
        df_simulations_results_4,
        df_simulations_results_5,
    ],
    axis=0,
    ignore_index=True,
)
df_simulations_results

In [None]:
# Average power and FDP of each method for every value of the varying feature
results = (
    df_simulations_results[
        [
            "Varying feature",
            "Gaussian Power(%)",
            "2do Order Power(%)",
            "Dvine Power(%)",
            "Nonpar DvinePower(%)",
            "Gaussian FDP(%)",
            "2do Order FDP(%)",
            "Dvine FDP(%)",
            "Nonpar Dvine FDP(%)",
        ]
    ]
    .groupby("Varying feature")
    .mean()
)

In [None]:
# Display the per-scenario means (one row per value of the varying feature)
results