In [84]:
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
import time
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
# import torch
# import torch.nn as nn
# import torch.optim as optim
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def relu(x):
    return np.maximum(0, x)

def initialize_parameters(input_size, hidden_size, output_size):
    np.random.seed(42)
    W1 = np.random.normal(loc=0, scale=1, size=(input_size, hidden_size))
    b1 = np.zeros((1, hidden_size))
    W2 = np.random.normal(loc=0, scale=1, size=(hidden_size, output_size))
    b2 = np.zeros((1, output_size))
    return {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}

def forward_propagation(X, parameters):
    Z1 = np.dot(X, parameters['W1']) + parameters['b1']
    A1 = relu(Z1)
    Z2 = np.dot(A1, parameters['W2']) + parameters['b2']
    A2 = sigmoid(Z2)
    return {'Z1': Z1, 'A1': A1, 'Z2': Z2, 'A2': A2}

def predict(X, parameters):
    forward = forward_propagation(X, parameters)
    return (forward['A2'] > 0.5).astype(int)

# Set up neural network parameters
input_size = 100
hidden_size = 264
output_size = 1

# Generate random input data
np.random.seed(42)
n_samples = 1000

# Generate the first 33 variables using the neural network
X_first_33 = np.random.normal(loc=0, scale=1, size=(n_samples, 33))
parameters = initialize_parameters(33, hidden_size, output_size)
forward = forward_propagation(X_first_33, parameters)
y_neural_network = forward['A2']

# Generate random values for the remaining variables
X_rest = np.random.normal(loc=0, scale=1, size=(n_samples, input_size - 33))

# Combine the generated values
X = np.concatenate((X_first_33, X_rest), axis=1)

# Threshold for binary classification
threshold = 0.5
y_binary_neural_network = (y_neural_network > threshold).astype(int)

# Add the generated y values to the existing DataFrame
data = pd.DataFrame(X, columns=[f'var{i}' for i in range(input_size)])
data['y'] = y_binary_neural_network

# Print the DataFrame
print(data.head())

       var0      var1      var2      var3      var4      var5      var6  \
0  0.496714 -0.138264  0.647689  1.523030 -0.234153 -0.234137  1.579213   
1 -1.057711  0.822545 -1.220844  0.208864 -1.959670 -1.328186  0.196861   
2 -0.072010  1.003533  0.361636 -0.645120  0.361396  1.538037 -0.035826   
3 -0.234587 -1.415371 -0.420645 -0.342715 -0.802277 -0.161286  0.404051   
4 -1.062304  0.473592 -0.919424  1.549934 -0.783253 -0.322062  0.813517   

       var7      var8      var9  ...     var91     var92     var93     var94  \
0  0.767435 -0.469474  0.542560  ... -0.029352  0.395307  0.033023  1.346941   
1  0.738467  0.171368 -0.115648  ...  1.021963  0.733179  1.378143 -0.990623   
2  1.564644 -2.619745  0.821903  ...  0.272634  0.342226 -1.098679  0.044570   
3  1.886186  0.174578  0.257550  ... -1.312467  0.536389 -1.671147 -0.838362   
4 -1.230864  0.227460  1.307143  ...  0.534347 -1.768415  0.995168  0.937367   

      var95     var96     var97     var98     var99  y  
0  0.774023

In [85]:
def normal_prediction(X_train,y_train,X_test,y_test,input_s):
    input_dim = input_s
    running_times = []
    acc= []
    for i in range(0,10,1):
        model = keras.Sequential()
        model.add(layers.Dense(264, input_shape=(input_dim,)))

        model.add(layers.Dense(1, activation='sigmoid'))
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        start_time = time.time()
        model.fit(X_train,y_train, epochs=250, batch_size=32, verbose=0)
        predictions = model.predict(X_test)
        predicted = (predictions > 0.5)  
        accuracy = accuracy_score(y_test, predicted)  
     
        end_time = time.time()
        run_time = end_time - start_time
        running_times.append(run_time)
        acc.append(accuracy)
    avg_running_time = np.mean(running_times)
    avg_accuracy = np.mean(acc)

    print("Average running time:", avg_running_time, "s")
    print("Average accuracy:", avg_accuracy * 100, "%")

In [86]:
def drop_out_prediction(X_train,y_train,X_test,y_test,input_s):
    input_dim = input_s
    running_times = []
    acc= []
    for i in range(0,10,1):
        model = keras.Sequential()
        model.add(layers.Dense(264, activation='relu', input_shape=(input_dim,)))
        model.add(layers.Dropout(0.33))  # Adding dropout with rate 0.5

        model.add(layers.Dense(1, activation='sigmoid'))

        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

        start_time = time.time()
        model.fit(X_train, y_train, epochs=250, batch_size=32, verbose=0)
        predictions = model.predict(X_test)
        predicted = (predictions > 0.5)
        accuracy = accuracy_score(y_test, predicted)  
        # 输出准确率
        end_time = time.time()
        run_time = end_time - start_time
        running_times.append(run_time)
        acc.append(accuracy)
    avg_running_time = np.mean(running_times)
    avg_accuracy = np.mean(acc)

    print("Average running time:", avg_running_time, "s")
    print("Average accuracy:", avg_accuracy * 100, "%")  

In [87]:
def All_varaibles_prediction(X_train,y_train,X_test,y_test,input_s):
    input_dim = input_s
    running_times = []
    acc= []
    for i in range(0,10,1):
        model = keras.Sequential()
        model.add(layers.Dense(264, input_shape=(input_dim,)))

        model.add(layers.Dense(1, activation='sigmoid'))
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        start_time = time.time()
        model.fit(X_train,y_train, epochs=250, batch_size=32, verbose=0)
        predictions = model.predict(X_test)
        predicted = (predictions > 0.5)  
        accuracy = accuracy_score(y_test, predicted)  
        end_time = time.time()
        run_time = end_time - start_time
        running_times.append(run_time)
        acc.append(accuracy)
    avg_running_time = np.mean(running_times)
    avg_accuracy = np.mean(acc)

    print("Average running time:", avg_running_time, "s")
    print("Average accuracy:", avg_accuracy * 100, "%")

# Single layer

In [5]:
from sklearn.model_selection import train_test_split
var_t = data.loc[:, ['var1','var6','var7','var10','var16','var20','var23','var24','var26','var27','var30','var31','y']]

X_train, X_test, y_train, y_test = train_test_split(var_t[['var1','var6','var7','var10','var16','var20','var23','var24','var26','var27','var30','var31']], var_t['y'], test_size=0.2, random_state=42)
input_data = []
output_data = []
input_test =  []
output_test = []
for i in X_train.values:
    input_data.append(i)
for i in y_train.values:  
    output_data.append(i)
for i in X_test.values:
    input_test.append(i)
for i in y_test.values:
    output_test.append(i)
normal_prediction(X_train,y_train,X_test,y_test,np.shape(var_t)[1]-1)
drop_out_prediction(X_train,y_train,X_test,y_test,np.shape(var_t)[1]-1)

Average running time: 10.429088878631593 s
Average accuracy: 78.25 %
Average running time: 11.922840666770934 s
Average accuracy: 77.60000000000001 %


# Multiple layers

In [21]:
var_t = data.loc[:, ['var0','var3','var6','var10','var19','var20','var23','var24','var25','var26','var27','var31','y']]

X_train, X_test, y_train, y_test = train_test_split(var_t[['var0','var3','var6','var10','var19','var20','var23','var24','var25','var26','var27','var31']], var_t['y'], test_size=0.2, random_state=42)
input_data = []
output_data = []
input_test =  []
output_test = []
for i in X_train.values:
    input_data.append(i)
for i in y_train.values:  
    output_data.append(i)
for i in X_test.values:
    input_test.append(i)
for i in y_test.values:
    output_test.append(i)

In [22]:
normal_prediction(X_train,y_train,X_test,y_test,np.shape(var_t)[1]-1)
drop_out_prediction(X_train,y_train,X_test,y_test,np.shape(var_t)[1]-1)

Average running time: 6.518034982681274 s
Average accuracy: 77.2 %
Average running time: 9.463358306884766 s
Average accuracy: 71.89999999999999 %


# VWA_OL

In [110]:
from sklearn.model_selection import train_test_split
var_t = data.loc[:, ['var4', 'var5', 'var7', 'var8', 'var11', 'var12', 'var13', 'var16', 'var22', 'var25', 'var26', 'var28', 'var18', 'var31', 'var0', 'var21', 'var14',
                      'var30', 'var1', 'var6', 'var32', 'var19', 'var20', 'var2', 'var3', 'var29', 'y']]
X_train, X_test, y_train, y_test = train_test_split(var_t[['var4', 'var5', 'var7', 'var8', 'var11', 'var12', 'var13', 'var16', 'var22', 'var25', 'var26', 'var28', 'var18', 'var31', 'var0', 'var21', 'var14',
                      'var30', 'var1', 'var6', 'var32', 'var19', 'var20', 'var2', 'var3', 'var29']], var_t['y'], test_size=0.2, random_state=42)
input_data = []
output_data = []
input_test =  []
output_test = []
for i in X_train.values:
    input_data.append(i)
for i in y_train.values:  
    output_data.append(i)
for i in X_test.values:
    input_test.append(i)
for i in y_test.values:
    output_test.append(i)

In [111]:
normal_prediction(X_train,y_train,X_test,y_test,np.shape(var_t)[1]-1)
drop_out_prediction(X_train,y_train,X_test,y_test,np.shape(var_t)[1]-1)

Average running time: 6.825684976577759 s
Average accuracy: 70.64999999999999 %
Average running time: 8.113622045516967 s
Average accuracy: 67.4 %


# VWA_ML

In [112]:
var_t = data[['var13', 'var26', 'var28', 'var11', 'var16', 'var22','var5', 'var6', 'var24', 'var27', 'var31', 'var9', 'var10', 'var18', 'var30', 'var15','y']]
X_train, X_test, y_train, y_test = train_test_split(var_t[['var13', 'var26', 'var28', 'var11', 'var16', 'var22','var5', 'var6', 'var24', 'var27', 'var31', 'var9', 'var10', 'var18', 'var30', 'var15']], var_t['y'], test_size=0.2, random_state=42)
input_data = []
output_data = []
input_test =  []
output_test = []
for i in X_train.values:
    input_data.append(i)
for i in y_train.values:  
    output_data.append(i)
for i in X_test.values:
    input_test.append(i)
for i in y_test.values:
    output_test.append(i)

In [113]:
normal_prediction(X_train,y_train,X_test,y_test,np.shape(var_t)[1]-1)
drop_out_prediction(X_train,y_train,X_test,y_test,np.shape(var_t)[1]-1)

Average running time: 6.25473906993866 s
Average accuracy: 75.2 %
Average running time: 8.157733392715453 s
Average accuracy: 70.35 %


# VWA_OML

In [114]:
var_t = data[['var4', 'var5', 'var7', 'var8', 'var11', 'var12', 'var13', 'var16', 'var22', 'var25', 'var26', 'var28', 'var18', 'var31', 'var0', 'var21', 'var14',
                      'var30', 'var1', 'var6', 'var32', 'var19', 'var20', 'var2', 'var3', 'var29','var9','var17','var24','var15','var10','var27','y']]
X_train, X_test, y_train, y_test = train_test_split(var_t[['var4', 'var5', 'var7', 'var8', 'var11', 'var12', 'var13', 'var16', 'var22', 'var25', 'var26', 'var28', 'var18', 'var31', 'var0', 'var21', 'var14',
                      'var30', 'var1', 'var6', 'var32', 'var19', 'var20', 'var2', 'var3', 'var29','var9','var17','var24','var15','var10','var27']], var_t['y'], test_size=0.2, random_state=42)
input_data = []
output_data = []
input_test =  []
output_test = []
for i in X_train.values:
    input_data.append(i)
for i in y_train.values:  
    output_data.append(i)
for i in X_test.values:
    input_test.append(i)
for i in y_test.values:
    output_test.append(i)

In [115]:
normal_prediction(X_train,y_train,X_test,y_test,np.shape(var_t)[1]-1)
drop_out_prediction(X_train,y_train,X_test,y_test,np.shape(var_t)[1]-1)

Average running time: 6.737831783294678 s
Average accuracy: 80.35 %
Average running time: 8.89448983669281 s
Average accuracy: 80.05000000000001 %


# All varaibles

In [14]:
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:,:-1], data['y'], test_size=0.2, random_state=42)
input_data = []
output_data = []
input_test =  []
output_test = []
for i in X_train.values:
    input_data.append(i)
for i in y_train.values:  
    output_data.append(i)
for i in X_test.values:
    input_test.append(i)
for i in y_test.values:
    output_test.append(i)

normal_prediction(X_train,y_train,X_test,y_test,100)
drop_out_prediction(X_train,y_train,X_test,y_test,100)

Average running time: 11.214705634117127 s
Average accuracy: 75.85 %
Average running time: 16.673228645324706 s
Average accuracy: 76.7 %


## 

In [75]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import time
from sklearn.metrics import accuracy_score

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def relu(x):
    return np.maximum(0, x)

def initialize_parameters(input_size, hidden_size, output_size):
    np.random.seed(42)
    W1 = np.random.normal(loc=0, scale=1, size=(input_size, hidden_size))
    b1 = np.zeros((1, hidden_size))
    W2 = np.random.normal(loc=0, scale=1, size=(hidden_size, output_size))
    b2 = np.zeros((1, output_size))
    return {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}

def forward_propagation(X, parameters):
    Z1 = np.dot(X, parameters['W1']) + parameters['b1']
    A1 = relu(Z1)
    Z2 = np.dot(A1, parameters['W2']) + parameters['b2']
    A2 = sigmoid(Z2)
    return {'Z1': Z1, 'A1': A1, 'Z2': Z2, 'A2': A2}

def predict(X, parameters):
    forward = forward_propagation(X, parameters)
    return (forward['A2'] > 0.5).astype(int)

# Set up neural network parameters
input_size = 100
hidden_size = 264
output_size = 1
n_samples = 1000

# Generate random input data
np.random.seed(42)

# Generate the first 33 variables using the neural network
X_first_33 = np.random.normal(loc=0, scale=1, size=(n_samples, 33))
parameters = initialize_parameters(33, hidden_size, output_size)
forward = forward_propagation(X_first_33, parameters)
y_neural_network = forward['A2']

# Generate random values for the remaining variables
X_rest = np.random.normal(loc=0, scale=1,size=(n_samples, input_size - 33))

# Combine the generated values
X = np.concatenate((X_first_33, X_rest), axis=1)

# Threshold for binary classification
threshold =0.5
y_binary_neural_network = (y_neural_network > threshold).astype(int)

# Add the generated y values to the existing DataFrame
data = pd.DataFrame(X, columns=[f'var{i}' for i in range(input_size)])
data['y'] = y_binary_neural_network

# Print the DataFrame
print(data.head())

def reduce_weight(lambda_array1,lambda_array2,value,variables):
    lambda_array2 = np.where(lambda_array2 == 0, 8, lambda_array2)
#     print(lambda_array2)
    layer1_weight = lambda_array1
    layer2_weight = lambda_array2
    layer1_weight = np.where(lambda_array1[:100, :] < value, 0, 1)
    layer1_weight = layer1_weight[variables, :]
    layer2_weight = np.where(lambda_array2[:264, :] < value, 0, 1)
    
    zero_count = 264*len(selected_vars1)+ 264 - np.count_nonzero(layer1_weight)-np.count_nonzero(layer2_weight)
    rate = zero_count/(264*len(selected_vars1)+ 264)
    return rate,layer1_weight,layer2_weight

def reduce_weight_prediction(X_train,y_train,X_test,y_test,input_s, weight1, weight2):
    # Assuming you have your custom weights for the first layer
    input_dim = input_s
    running_times = []
    acc = []
    
    custom_first_layer_weights = weight1  
    custom_second_layer_weights = weight2  

    for i in range(0, 10, 1):
        model = keras.Sequential()
        model.add(Dense(264, activation='relu', input_shape=(input_dim,), weights=[custom_first_layer_weights, np.zeros(264)]))
        model.add(Dense(1, activation='sigmoid', weights=[custom_second_layer_weights, np.zeros(1)]))
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        def prune(model, threshold=100):
            for layer in model.layers:
                if isinstance(layer, tf.keras.layers.Dense):
                    weights = layer.get_weights()
                    weights[0][np.abs(weights[0]) < threshold] = 0
                    layer.set_weights(weights)
        start_time = time.time()+1
        model.fit(X_train, y_train, epochs=250, batch_size=32, verbose=0)
        predictions = model.predict(X_test)
        predicted = (predictions > 0.5)
        accuracy = accuracy_score(y_test, predicted)
        end_time = time.time()
        run_time = end_time - start_time
        running_times.append(run_time)
        acc.append(accuracy)

    avg_running_time = np.mean(running_times)
    avg_accuracy = np.mean(acc)

    print("Average running time:", avg_running_time, "s")
    print("Average accuracy:", avg_accuracy * 100, "%")

import numpy as np
from scipy.sparse.linalg import eigsh

## some useful functions
def get_arccos(X):
    # X is a 2-d array
    n, p = X.shape
    cos_a = np.zeros([n, n, n])
    
    for r in range(n):
        
        xr = X[r]
        X_r = X - xr
        cross = np.dot(X_r, X_r.T)
        row_norm = np.sqrt(np.sum(X_r**2, axis = 1))
        outer_norm = np.outer(row_norm, row_norm)
        
        zero_idx = (outer_norm == 0.)
        outer_norm[zero_idx] = 1.
        cos_a_kl = cross / outer_norm
        cos_a_kl[zero_idx] = 0.

        cos_a[:,:,r] = cos_a_kl
        
    cos_a[cos_a > 1] = 1.
    cos_a[cos_a < -1] = -1.
    a = np.arccos(cos_a)

    a_bar_12 = np.mean(a, axis = 0, keepdims = True)
    a_bar_02 = np.mean(a, axis = 1, keepdims = True)
    a_bar_2  = np.mean(a, axis = (0,1), keepdims = True)
    A = a - a_bar_12 - a_bar_02 + a_bar_2
        
    return a, A

def get_arccos_1d(X):
    # X is a 1-d array
    
    X = np.squeeze(X)
    Y = X[:,None] - X
    Z = Y.T[:,:,None]*Y.T[:,None]
    n = len(X)
    
    a = np.zeros([n, n, n])
    a[Z == 0.] = np.pi/2.
    a[Z < 0.] = np.pi
    
    a = np.transpose(a, (1,2,0))
    
    #a = Z[Z>0.]*0. + Z[Z==0.]*np.pi/2. + Z[Z<0.]*np.pi

    a_bar_12 = np.mean(a, axis = 0, keepdims = True)
    a_bar_02 = np.mean(a, axis = 1, keepdims = True)
    a_bar_2  = np.mean(a, axis = (0,1), keepdims = True)
    A = a - a_bar_12 - a_bar_02 + a_bar_2
    
    return a, A

def orthonormalize(X):
    # X is a 2-d array
    # output: Gram-Schmidt orthogonalization of X
    
    n, p = X.shape
    Y = np.zeros([n,p])
    Y[:,0] = X[:,0]/np.sqrt(np.sum(X[:,0]**2))
    
    for j in range(1,p):
        
        Yj = Y[:,range(j)]
        xj = X[:,j]
        w = np.dot(xj, Yj)
        xj_p = np.sum(w*Yj, axis = 1)
        yj = xj - xj_p
        yj = yj/np.sqrt(np.sum(yj**2))
        
        Y[:,j] = yj
        
    return Y

# Main functions
def projection_corr(X, Y):
    # X, Y are 2-d array
    
    nx, p = X.shape
    ny, q = Y.shape
    
    if nx == ny:
        n = nx
    else:
        raise ValueError("sample sizes do not match.")
        
    a_x, A_x = get_arccos(X)
    a_y, A_y = get_arccos(Y)
    
    S_xy = np.sum(A_x * A_y) / (n**3)
    S_xx = np.sum(A_x**2) / (n**3)
    S_yy = np.sum(A_y**2) / (n**3)
    
    if S_xx * S_yy == 0.:
        corr = 0.
    else:
        corr = np.sqrt( S_xy / np.sqrt(S_xx * S_yy) )
    
    return corr

def projection_corr_1d(X, Y):
    
    nx, p = X.shape
    ny, q = Y.shape
    
    if nx == ny:
        n = nx
    else:
        raise ValueError("sample sizes do not match.")
        
    a_x, A_x = get_arccos_1d(X)
    a_y, A_y = get_arccos_1d(Y)
    
    S_xy = np.sum(A_x * A_y) / (n**3)
    S_xx = np.sum(A_x**2) / (n**3)
    S_yy = np.sum(A_y**2) / (n**3)
    
    if S_xx * S_yy == 0.:
        corr = 0.
    else:
        corr = np.sqrt( S_xy / np.sqrt(S_xx * S_yy) )
    
    return corr

def projection_corr_1dy(X, Y):
    
    nx, p = X.shape
    ny, q = Y.shape
    
    if nx == ny:
        n = nx
    else:
        raise ValueError("sample sizes do not match.")
        
    a_x, A_x = get_arccos(X)
    a_y, A_y = get_arccos_1d(Y)
    
    S_xy = np.sum(A_x * A_y) / (n**3)
    S_xx = np.sum(A_x**2) / (n**3)
    S_yy = np.sum(A_y**2) / (n**3)
    
    if S_xx * S_yy == 0.:
        corr = 0.
    else:
        corr = np.sqrt( S_xy / np.sqrt(S_xx * S_yy) )
    
    return corr

def get_equi_features(X):
    # X is 2-d array
    
    n, p = X.shape
    scale = np.sqrt(np.sum(X**2, axis=0))
    Xstd = X / scale
    sigma = np.dot(Xstd.T, Xstd)
    sigma_inv = np.linalg.inv(sigma)
    lambd_min = eigsh(sigma, k=1, which='SA')[0].squeeze()
    sj = np.min([1., 2.*lambd_min])
    sj = sj - 0.00001
    
    mat_s = np.diag([sj]*p)
    A = 2*mat_s - sj*sj*sigma_inv
    C = np.linalg.cholesky(A).T
    
    Xn = np.random.randn(n, p)
    XX = np.hstack([Xstd, Xn])
    XXo = orthonormalize(XX)
    U = XXo[:,range(p,2*p)]
    
    Xnew = np.dot(Xstd,  np.eye(p) - sigma_inv*sj) + np.dot(U,C)
    return Xnew

X = data.iloc[:,:-1]
X = (X - X.mean()) / X.std()
Y = data.iloc[:,-1]
X_knockoff = get_equi_features(X)
X_knockoff = (X_knockoff - X_knockoff.mean()) / X_knockoff.std()

X_knockoff_df = pd.DataFrame(X_knockoff)
column_names = ['feature0_k', 'feature1_k', 'feature2_k', 'feature3_k', 'feature4_k','feature5_k', 'feature6_k', 'feature7_k', 'feature8_k', 'feature9_k',
                'feature10_k', 'feature11_k', 'feature12_k', 'feature13_k', 'feature14_k','feature15_k', 'feature16_k', 'feature17_k', 'feature18_k', 'feature19_k',
                'feature20_k', 'feature21_k', 'feature22_k', 'feature23_k', 'feature24_k','feature25_k', 'feature26_k', 'feature27_k', 'feature28_k', 'feature29_k',
                'feature30_k', 'feature31_k', 'feature32_k', 'feature33_k', 'feature34_k','feature35_k', 'feature36_k', 'feature37_k', 'feature38_k', 'feature39_k',
                'feature40_k', 'feature41_k', 'feature42_k', 'feature43_k', 'feature44_k','feature45_k', 'feature46_k', 'feature47_k', 'feature48_k', 'feature49_k',
                'feature50_k', 'feature51_k', 'feature52_k', 'feature53_k', 'feature54_k','feature55_k', 'feature56_k', 'feature57_k', 'feature58_k', 'feature59_k',
                'feature60_k', 'feature61_k', 'feature62_k', 'feature63_k', 'feature64_k','feature65_k', 'feature66_k', 'feature67_k', 'feature68_k', 'feature69_k',
                'feature70_k', 'feature71_k', 'feature72_k', 'feature73_k', 'feature74_k','feature75_k', 'feature76_k', 'feature77_k', 'feature78_k', 'feature79_k',
                'feature80_k', 'feature81_k', 'feature82_k', 'feature83_k', 'feature84_k','feature85_k', 'feature86_k', 'feature87_k', 'feature88_k', 'feature89_k',
                'feature90_k', 'feature91_k', 'feature92_k', 'feature93_k', 'feature94_k','feature95_k', 'feature96_k', 'feature97_k', 'feature98_k', 'feature99_k']
X_knockoff_df.columns = column_names
feature = pd.concat([X,X_knockoff_df],axis = 1)
dataset1 =  pd.concat([feature,data['y']],axis = 1)
dataset1
# dataset1.to_csv('dataset2_for_DeepPINK.csv', index=False)

       var0      var1      var2      var3      var4      var5      var6  \
0  0.496714 -0.138264  0.647689  1.523030 -0.234153 -0.234137  1.579213   
1 -1.057711  0.822545 -1.220844  0.208864 -1.959670 -1.328186  0.196861   
2 -0.072010  1.003533  0.361636 -0.645120  0.361396  1.538037 -0.035826   
3 -0.234587 -1.415371 -0.420645 -0.342715 -0.802277 -0.161286  0.404051   
4 -1.062304  0.473592 -0.919424  1.549934 -0.783253 -0.322062  0.813517   

       var7      var8      var9  ...     var91     var92     var93     var94  \
0  0.767435 -0.469474  0.542560  ... -0.029352  0.395307  0.033023  1.346941   
1  0.738467  0.171368 -0.115648  ...  1.021963  0.733179  1.378143 -0.990623   
2  1.564644 -2.619745  0.821903  ...  0.272634  0.342226 -1.098679  0.044570   
3  1.886186  0.174578  0.257550  ... -1.312467  0.536389 -1.671147 -0.838362   
4 -1.230864  0.227460  1.307143  ...  0.534347 -1.768415  0.995168  0.937367   

      var95     var96     var97     var98     var99  y  
0  0.774023

Unnamed: 0,var0,var1,var2,var3,var4,var5,var6,var7,var8,var9,...,feature91_k,feature92_k,feature93_k,feature94_k,feature95_k,feature96_k,feature97_k,feature98_k,feature99_k,y
0,0.453366,-0.157348,0.663988,1.511999,-0.240969,-0.185356,1.560560,0.743297,-0.458107,0.563718,...,0.999163,0.392953,1.328734,1.097952,-1.648306,-0.165183,-0.295485,1.555953,0.450022,1
1,-1.109920,0.784452,-1.211126,0.209216,-2.000063,-1.263838,0.205948,0.713553,0.175235,-0.106240,...,-1.257662,0.584329,-0.096528,0.088015,0.811433,-0.093714,-0.411926,-1.560464,-1.211231,0
2,-0.118600,0.961859,0.376928,-0.637370,0.366168,1.561602,-0.022070,1.561837,-2.583211,0.848047,...,0.468241,-0.052013,0.457419,-0.172570,0.144992,-0.755030,1.420825,-1.423810,-0.385982,1
3,-0.282104,-1.409187,-0.408109,-0.337584,-0.820148,-0.113541,0.408980,1.891982,0.178407,0.273621,...,-0.331501,0.656186,2.651308,-0.119277,0.542618,-0.168797,-1.109217,-0.220027,0.592296,0
4,-1.114539,0.442404,-0.908645,1.538670,-0.800754,-0.272030,0.810230,-1.308471,0.230670,1.341950,...,0.019434,-0.954020,-0.558327,1.151677,-0.407388,1.156443,0.255484,1.752459,-0.637515,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.857611,0.220515,-1.039238,-0.021502,0.443266,1.020646,-0.727184,-0.618147,-0.681152,-0.645571,...,0.880521,0.561123,-0.187615,0.590897,0.717976,2.216450,0.774594,1.206445,-1.988576,1
996,-0.403236,-0.725955,-1.494617,0.050196,-0.814320,1.347463,0.160171,0.532471,-0.240005,0.216346,...,-0.350321,0.211602,1.331181,1.359493,1.556090,-0.303557,-0.798531,2.443395,0.733447,0
997,-0.083530,-1.601194,0.491777,2.216535,-0.654236,-0.120893,-0.104044,0.968191,0.406698,0.948248,...,-1.277865,-0.544419,-1.085406,-0.664075,0.716933,0.124159,0.569351,0.567875,2.840357,1
998,-1.901593,1.151172,0.209901,0.281905,0.779757,1.291113,-0.009521,1.163360,0.551546,-0.655813,...,0.264692,0.593664,-0.352553,-0.286219,-0.385229,1.095248,-0.840813,-2.369916,-0.260249,1


In [81]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tqdm import tqdm

lambda_array = np.zeros((200, 264))
lambda_array2 = np.zeros((264, 1))
input_dim = 200

# Use tqdm for a one-line progress bar
for i in tqdm(np.arange(0, 7, 0.05)):
    model = keras.Sequential()
    model.add(layers.Dense(264, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(i)))
    #model.add(layers.Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(i)))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(feature, Y, epochs=50, batch_size=32, verbose=0)

    first_layer = model.layers[0]
    weights, biases = first_layer.get_weights()
    
    for j in range(min(len(weights), len(lambda_array))):
        for n in range(min(len(weights[j]), len(lambda_array[j]))):
            if abs(weights[j][n]) < 5e-4 and lambda_array[j][n] == 0:
                lambda_array[j][n] = i
    
    # Calculate the Zi in second layer
    second_layer = model.layers[1]
    weights, biases = second_layer.get_weights()
    #print(f"Layer: {layer.name}, Weights: {weights}")
#     print(weights)
    for j in range(len(weights)):
        for n in range(len(weights[j])):
            if abs(weights[j][n]) < 5e-4 and lambda_array2[j][n] == 0 :
#                 print(j,n)
                lambda_array2[j][n] = i
print(lambda_array)

100%|██████████| 140/140 [10:03<00:00,  4.31s/it]

[[0.05 0.1  0.1  ... 0.05 0.2  0.05]
 [0.05 0.1  0.1  ... 0.05 0.2  0.05]
 [0.05 0.1  0.1  ... 0.05 0.2  0.05]
 ...
 [0.05 0.1  0.1  ... 0.05 0.2  0.05]
 [0.05 0.1  0.1  ... 0.05 0.05 0.05]
 [0.05 0.1  0.05 ... 0.05 0.2  0.05]]





### OL

In [82]:
selected_vars1 = [1,6,7,10,16,20,23,24,26,27,30,31]
r,l1,l2 = reduce_weight(lambda_array,lambda_array2,0.1,selected_vars1)
r

0.32925407925407923

In [78]:
from sklearn.model_selection import train_test_split
var_t = data.loc[:, ['var1','var6','var7','var10','var16','var23','var24','var25','var26','var27','var30','var31','y']]

X_train, X_test, y_train, y_test = train_test_split(var_t[['var1','var6','var7','var10','var16','var23','var24','var25','var26','var27','var30','var31']], var_t['y'], test_size=0.2, random_state=42)
input_data = []
output_data = []
input_test =  []
output_test = []
for i in X_train.values:
    input_data.append(i)
for i in y_train.values:  
    output_data.append(i)
for i in X_test.values:
    input_test.append(i)
for i in y_test.values:
    output_test.append(i)
    

In [18]:
reduce_weight_prediction(X_train,y_train,X_test,y_test,12, l1, l2)

Average running time: 9.491174840927124 s
Average accuracy: 79.35000000000001 %


### ML

In [19]:
var_t = data.loc[:, ['var1','var6','var7','var10','var13','var16','var20','var23','var24','var26','var27','var30','var31','y']]

X_train, X_test, y_train, y_test = train_test_split(var_t[['var1','var6','var7','var10','var13','var16','var20','var23','var24','var26','var27','var30','var31']], var_t['y'], test_size=0.2, random_state=42)
input_data = []
output_data = []
input_test =  []
output_test = []
for i in X_train.values:
    input_data.append(i)
for i in y_train.values:  
    output_data.append(i)
for i in X_test.values:
    input_test.append(i)
for i in y_test.values:
    output_test.append(i)

In [20]:
selected_vars1 = [0, 6, 7, 10, 13, 16, 20, 23, 24, 26, 27, 30,31]
r,l1,l2 = reduce_weight(lambda_array,lambda_array2,0.1,selected_vars1)
r

0.35064935064935066

In [21]:
reduce_weight_prediction(X_train,y_train,X_test,y_test,13, l1, l2)

Average running time: 9.49518554210663 s
Average accuracy: 79.60000000000001 %


### VWA_OL

In [107]:
from sklearn.model_selection import train_test_split
var_t = data.loc[:, ['var4', 'var5', 'var7', 'var8', 'var11', 'var12', 'var13', 'var16', 'var22', 'var25', 'var26', 'var28', 'var18', 'var31', 'var0', 'var21', 'var14',
                      'var30', 'var1', 'var6', 'var32', 'var19', 'var20', 'var2', 'var3', 'var29', 'y']]
X_train, X_test, y_train, y_test = train_test_split(var_t[['var4', 'var5', 'var7', 'var8', 'var11', 'var12', 'var13', 'var16', 'var22', 'var25', 'var26', 'var28', 'var18', 'var31', 'var0', 'var21', 'var14',
                      'var30', 'var1', 'var6', 'var32', 'var19', 'var20', 'var2', 'var3', 'var29']], var_t['y'], test_size=0.2, random_state=42)
input_data = []
output_data = []
input_test =  []
output_test = []
for i in X_train.values:
    input_data.append(i)
for i in y_train.values:  
    output_data.append(i)
for i in X_test.values:
    input_test.append(i)
for i in y_test.values:
    output_test.append(i)

In [108]:
selected_vars1 = [4, 5, 7, 8, 11, 12, 13, 16, 22, 25, 26, 28, 18, 31, 0, 21, 14, 30, 1, 6, 32, 19, 20, 2, 3, 29]
r,l1,l2 = reduce_weight(lambda_array,lambda_array2,0.1,selected_vars1)
r

0.3547979797979798

In [109]:
reduce_weight_prediction(X_train,y_train,X_test,y_test,26, l1, l2)

Average running time: 6.9524836301803585 s
Average accuracy: 71.2 %


### VWA_ML

In [93]:
var_t = data[['var13', 'var26', 'var28', 'var11', 'var16', 'var22','var5', 'var6', 'var24', 'var27', 'var31', 'var9', 'var10', 'var18', 'var30', 'var15','y']]
X_train, X_test, y_train, y_test = train_test_split(var_t[['var13', 'var26', 'var28', 'var11', 'var16', 'var22','var5', 'var6', 'var24', 'var27', 'var31', 'var9', 'var10', 'var18', 'var30', 'var15']], var_t['y'], test_size=0.2, random_state=42)
input_data = []
output_data = []
input_test =  []
output_test = []
for i in X_train.values:
    input_data.append(i)
for i in y_train.values:  
    output_data.append(i)
for i in X_test.values:
    input_test.append(i)
for i in y_test.values:
    output_test.append(i)

In [94]:
selected_vars1 = [13, 26, 28, 11, 16, 22, 5, 6, 24, 27, 31, 9, 10, 18, 30, 15]
r,l1,l2 = reduce_weight(lambda_array,lambda_array2,0.10,selected_vars1)
r

0.34157754010695185

In [96]:
reduce_weight_prediction(X_train,y_train,X_test,y_test,16, l1, l2)

Average running time: 5.657754492759705 s
Average accuracy: 76.89999999999999 %


### VWA_OML

In [100]:
selected_vars1 = [4, 5, 7, 8, 11, 12, 13, 16, 22, 25, 26, 28, 18, 31, 0, 21, 14, 30, 1, 6, 32, 19, 20, 2, 3, 29, 9, 17, 24, 15, 10, 27]
r,l1,l2 = reduce_weight(lambda_array,lambda_array2,0.1,selected_vars1)
r

0.35651974288337923

In [101]:
var_t = data[['var4', 'var5', 'var7', 'var8', 'var11', 'var12', 'var13', 'var16', 'var22', 'var25', 'var26', 'var28', 'var18', 'var31', 'var0', 'var21', 'var14',
                      'var30', 'var1', 'var6', 'var32', 'var19', 'var20', 'var2', 'var3', 'var29','var9','var17','var24','var15','var10','var27','y']]
X_train, X_test, y_train, y_test = train_test_split(var_t[['var4', 'var5', 'var7', 'var8', 'var11', 'var12', 'var13', 'var16', 'var22', 'var25', 'var26', 'var28', 'var18', 'var31', 'var0', 'var21', 'var14',
                      'var30', 'var1', 'var6', 'var32', 'var19', 'var20', 'var2', 'var3', 'var29','var9','var17','var24','var15','var10','var27']], var_t['y'], test_size=0.2, random_state=42)
input_data = []
output_data = []
input_test =  []
output_test = []
for i in X_train.values:
    input_data.append(i)
for i in y_train.values:  
    output_data.append(i)
for i in X_test.values:
    input_test.append(i)
for i in y_test.values:
    output_test.append(i)

In [102]:
reduce_weight_prediction(X_train,y_train,X_test,y_test,32, l1, l2)

Average running time: 5.5468226909637455 s
Average accuracy: 79.89999999999999 %


## All variable

In [41]:
selected_vars1 = []
for i in range(100):
    selected_vars1.append(i)
r,l1,l2 = reduce_weight(lambda_array,lambda_array2,0.1,selected_vars1)
r

0.39723972397239726

In [44]:
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:,:-1], data['y'], test_size=0.2, random_state=42)
input_data = []
output_data = []
input_test =  []
output_test = []
for i in X_train.values:
    input_data.append(i)
for i in y_train.values:  
    output_data.append(i)
for i in X_test.values:
    input_test.append(i)
for i in y_test.values:
    output_test.append(i)

reduce_weight_prediction(X_train,y_train,X_test,y_test,100, l1, l2)

Average running time: 11.00966272354126 s
Average accuracy: 75.35 %


## DeepLINK

In [31]:
var_t = data[['var0', 'var1', 'var5','var6', 'var7', 'var16','var22', 'var23','y']]

X_train, X_test, y_train, y_test = train_test_split(var_t[['var0', 'var1', 'var5','var6', 'var7', 'var16','var22', 'var23'
]], var_t['y'], test_size=0.2, random_state=42)
input_data = []
output_data = []
input_test =  []
output_test = []
for i in X_train.values:
    input_data.append(i)
for i in y_train.values:  
    output_data.append(i)
for i in X_test.values:
    input_test.append(i)
for i in y_test.values:
    output_test.append(i)

In [32]:
normal_prediction(X_train,y_train,X_test,y_test,np.shape(var_t)[1]-1)

Average running time: 10.497719120979308 s
Average accuracy: 67.10000000000001 %


In [33]:
var_t = data[['var9', 'var11', 'var13','var96','y']]

X_train, X_test, y_train, y_test = train_test_split(var_t[['var9', 'var11', 'var13','var96'
]], var_t['y'], test_size=0.2, random_state=42)
input_data = []
output_data = []
input_test =  []
output_test = []
for i in X_train.values:
    input_data.append(i)
for i in y_train.values:  
    output_data.append(i)
for i in X_test.values:
    input_test.append(i)
for i in y_test.values:
    output_test.append(i)
normal_prediction(X_train,y_train,X_test,y_test,np.shape(var_t)[1]-1)

Average running time: 10.160671019554139 s
Average accuracy: 63.05 %


### DeepPINK

In [34]:
var_t = data[['var27', 'var43', 'var55','y']]

X_train, X_test, y_train, y_test = train_test_split(var_t[['var27', 'var43', 'var55']], var_t['y'], test_size=0.2, random_state=42)
input_data = []
output_data = []
input_test =  []
output_test = []
for i in X_train.values:
    input_data.append(i)
for i in y_train.values:  
    output_data.append(i)
for i in X_test.values:
    input_test.append(i)
for i in y_test.values:
    output_test.append(i)
    
normal_prediction(X_train,y_train,X_test,y_test,np.shape(var_t)[1]-1)

Average running time: 10.305346179008485 s
Average accuracy: 67.5 %


### DeepLINK

In [38]:
var_t = data[['var10', 'var45', 'var20', 'var23','y']]

X_train, X_test, y_train, y_test = train_test_split(var_t[['var10', 'var45', 'var20', 'var23']], var_t['y'], test_size=0.2, random_state=42)
input_data = []
output_data = []
input_test =  []
output_test = []
for i in X_train.values:
    input_data.append(i)
for i in y_train.values:  
    output_data.append(i)
for i in X_test.values:
    input_test.append(i)
for i in y_test.values:
    output_test.append(i)
    
normal_prediction(X_train,y_train,X_test,y_test,np.shape(var_t)[1]-1)

Average running time: 10.4948894739151 s
Average accuracy: 65.25000000000001 %
