In [1]:
import torch
import numpy as np
import pandas as pd
from model import NeuralNet, train_model, predict,ProximalSGD,accuracy
from knockoff import create_knockoff_variable
import torch.nn as nn
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run on the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [3]:
def set_seed(seed):
    """Seed the NumPy and PyTorch random number generators.

    Calling this before building a model makes its randomly initialised
    parameters repeatable from run to run.

    Args:
        seed: Integer seed passed to ``np.random.seed``, ``torch.manual_seed``
            and, when CUDA is available, ``torch.cuda.manual_seed``.
    """
    np.random.seed(seed)
    torch.manual_seed(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(seed)

In [4]:
# Total number of simulated feature columns (var0 .. var{input_size-1}).
input_size = 100
# Number of leading columns that are fed to the label-generating network in create_data.
relevant_features = 33
# Hidden-layer width of the label-generating network (default `hidden_size` of create_data).
hidden_size_data = 264
# Hidden-layer width of the networks trained on features + knockoffs in the experiment cells.
hidden_size = 264
# Number of network outputs; the single output is thresholded into a binary label.
output_size = 1

### Create simulated variables

In [5]:
def create_data(input_size=input_size, relevant_features=relevant_features, hidden_size=hidden_size_data,
                output_size=output_size, n_samples=1000, threshold=0.5, seed=42):
    """Simulate a binary-classification dataset with known relevant features.

    The first ``relevant_features`` columns are drawn from N(2, 3^2) and are the
    only inputs of a randomly initialised ``NeuralNet`` whose output is
    thresholded to produce the label. The remaining columns are N(0, 1) noise
    that never influences the label.

    Args:
        input_size: Total number of feature columns in the returned frame.
        relevant_features: Number of leading columns that drive the label.
        hidden_size: Hidden-layer width of the label-generating network.
        output_size: Output width of the label-generating network.
        n_samples: Number of rows to simulate.
        threshold: Network outputs strictly greater than this value are
            labelled 1.0, all others 0.0.
        seed: Seed used both for the NumPy draws and (through ``set_seed``)
            for the network initialisation. The default reproduces the
            previously hard-coded value.

    Returns:
        pandas.DataFrame with columns ``var0 .. var{input_size-1}`` followed by
        the label column ``y``.

    Note:
        This function reseeds the global NumPy and PyTorch RNGs as a side effect.
    """
    np.random.seed(seed)

    # Informative block: mean=2, std=3.
    X_relevant = np.random.normal(loc=2, scale=3, size=(n_samples, relevant_features))
    # Noise block: mean=0, std=1.
    X_non_relevant = np.random.normal(loc=0, scale=1, size=(n_samples, input_size - relevant_features))
    X_all = np.hstack([X_relevant, X_non_relevant])

    # The label-generating network only ever sees the relevant features.
    layer_sizes = [relevant_features, hidden_size, output_size]
    X_relevant_tensor = torch.tensor(X_relevant, dtype=torch.float32)

    # Fix the initial parameters of the label-generating network.
    set_seed(seed)
    model = NeuralNet(layer_sizes)
    # Only the forward values are needed, so skip building the autograd graph.
    with torch.no_grad():
        y_neural_network = model(X_relevant_tensor)

    # Honour the caller's `threshold` (it used to be overwritten with 0.5 here) and
    # hand pandas a flat NumPy array instead of an (n, 1) torch tensor.
    y_binary_neural_network = (y_neural_network > threshold).float().cpu().numpy().reshape(-1)

    data = pd.DataFrame(X_all, columns=[f'var{i}' for i in range(input_size)])
    data['y'] = y_binary_neural_network

    return data


In [6]:
# Build the simulated dataset using create_data's default arguments.
data = create_data()

### Create knockoff variables

In [7]:
# Features: every column of `data` except the last one.
X = data.iloc[:, :-1]
# (Min-max scaling of X is currently disabled.)
# X= (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
 
# Label: the last column of `data`.
Y = data.iloc[:, -1]

# Build the knockoff counterpart of X with the project's knockoff helper.
# NOTE(review): presumably get_equi_features produces equi-correlated knockoffs
# with one knockoff column per original column — confirm in knockoff.py.
knockoff_creator = create_knockoff_variable()
X_knockoff = knockoff_creator.get_equi_features(X)
# (Min-max scaling of X_knockoff is currently disabled.)
# X_knockoff = (X_knockoff - X_knockoff.min(axis=0)) / (X_knockoff.max(axis=0) - X_knockoff.min(axis=0))

In [8]:
# Name the knockoff columns var_k0 .. var_k{input_size-1} so they line up with var0 .. var{input_size-1}.
knockoff_columns = [f'var_k{i}' for i in range(input_size)]
X_knockoff_df = pd.DataFrame(X_knockoff, columns=knockoff_columns)

# Model input: original features followed by their knockoffs, side by side.
feature = pd.concat([X, X_knockoff_df], axis=1)

# Full table (features, knockoffs, label) for inspection.
dataset1 = pd.concat([feature, data['y']], axis=1)
# Optional export: dataset1.to_csv("Simulationdata.csv")
dataset1

Unnamed: 0,var0,var1,var2,var3,var4,var5,var6,var7,var8,var9,...,var_k91,var_k92,var_k93,var_k94,var_k95,var_k96,var_k97,var_k98,var_k99,y
0,3.490142,1.585207,3.943066,6.569090,1.297540,1.297589,6.737638,4.302304,0.591577,3.627680,...,0.034453,-0.033122,0.018492,-0.000316,-0.025200,0.023365,0.009972,-0.021628,-0.011788,0.0
1,-1.173133,4.467635,-1.662531,2.626591,-3.879010,-1.984558,2.590584,4.215400,2.514105,1.653055,...,0.022673,-0.017316,-0.037129,-0.008989,0.017278,-0.044893,-0.012170,-0.014179,0.006181,0.0
2,1.783970,5.010599,3.084908,0.064641,3.084187,6.614110,1.892522,6.693931,-5.859235,4.465708,...,-0.041876,-0.001015,0.016521,0.041242,-0.031441,0.024462,0.024236,-0.008180,0.007144,1.0
3,1.296239,-2.246112,0.738064,0.971856,-0.406832,1.516143,3.212153,7.658558,2.523733,2.772651,...,0.002618,-0.006697,0.046907,0.012209,-0.069256,-0.060775,0.052254,-0.050740,0.000170,0.0
4,-1.186911,3.420777,-0.758273,6.649803,-0.349760,1.033815,4.440552,-1.692593,2.682380,5.921428,...,0.024444,0.002415,-0.019230,0.042395,0.017645,-0.007764,-0.000105,-0.068749,0.015619,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,4.696002,2.741676,-1.148678,1.928387,3.311065,4.967817,-0.266136,0.324411,-0.085480,0.063439,...,-0.012493,0.008497,-0.015299,0.001296,-0.013542,0.010613,-0.001350,0.007365,0.079991,1.0
996,0.934903,-0.155047,-2.510018,2.145360,-0.389681,5.962420,2.450440,3.686309,1.253634,2.603841,...,0.029264,0.012612,-0.030472,-0.018075,0.026242,-0.006903,0.000686,0.012208,0.012369,1.0
997,1.888585,-2.833761,3.428247,8.701166,0.081406,1.493770,1.641565,4.959406,3.216717,4.761038,...,0.010602,-0.023652,0.040607,0.008429,-0.052465,-0.006947,0.028833,-0.063052,-0.089730,1.0
998,-3.534689,5.589999,2.585586,2.846564,4.301271,5.790931,1.930940,5.529655,3.656409,0.033253,...,0.014154,-0.027570,0.006346,-0.000620,-0.049369,0.021905,0.038264,-0.031197,-0.019243,1.0


In [9]:
def compute_W1(lambda_array,num_x):
    """Compute signed-max statistics from a per-weight lambda matrix.

    Each row of ``lambda_array`` is summed. Row ``i`` is then compared with row
    ``i + num_x`` for ``i`` in the first half of the rows: the larger of the two
    sums is kept, with a positive sign when row ``i`` wins (ties included) and a
    negative sign when row ``i + num_x`` wins.

    Args:
        lambda_array: 2-D array-like; one row per input column.
        num_x: Row offset between a row and the row it is compared against.

    Returns:
        Tuple ``(W1, row_sums1)``: the list of signed statistics (one per row in
        the first half) and the array of row sums.
    """
    row_sums1 = np.sum(lambda_array, axis=1)
    n_pairs = int(0.5 * len(row_sums1))
    W1 = [
        row_sums1[idx] if row_sums1[idx] >= row_sums1[idx + num_x] else -row_sums1[idx + num_x]
        for idx in range(n_pairs)
    ]
    return W1, row_sums1

def compute_W2(model, row_sums1, num_x):
    """Compute difference statistics from the product of the model's weight matrices.

    All trainable parameters whose name contains ``'weight'`` are collected in
    order and chained with ``result = result.t() @ next``; the last matrix is
    first reshaped into a single column. Row ``i`` of the product is scaled by
    ``row_sums1[i] * 10`` to give ``g[i]``, and the statistic for row ``i`` in
    the first half is ``|g[i]| - |g[i + num_x]|``.

    NOTE(review): with the usual (out, in) weight layout this chain lines up
    for exactly two weight matrices; verify the shapes before using it on
    deeper networks.

    Args:
        model: Object exposing ``named_parameters()`` (e.g. a ``torch.nn.Module``).
        row_sums1: Per-row scaling factors, indexable by row position.
        num_x: Row offset between a row and the row it is compared against.

    Returns:
        Tuple ``(W2, g)``: the list of difference statistics and the list of
        scaled per-row values.
    """
    weights = [
        param.data
        for name, param in model.named_parameters()
        if param.requires_grad and 'weight' in name
    ]

    last_idx = len(weights) - 1
    result = weights[0]
    for layer_idx in range(1, len(weights)):
        # The final matrix is flattened into one column so the product ends as (rows, 1).
        if layer_idx == last_idx:
            rhs = weights[layer_idx].reshape(-1, 1)
        else:
            rhs = weights[layer_idx]
        result = torch.matmul(result.t(), rhs)

    g = [result[row][0].cpu().numpy() * row_sums1[row]*10 for row in range(len(result))]
    W2 = [abs(g[idx]) - abs(g[idx + num_x]) for idx in range(int(0.5 * len(g)))]
    return W2, g

def select_variables(W,target_q):
    """Pick variables with the knockoff filter's data-dependent threshold.

    The threshold is the smallest ``t`` among the non-zero ``|W_j|`` for which

        #{j : W_j <= -t} / max(#{j : W_j >= t}, 1) <= target_q

    and every variable with ``W_j >= t`` is selected.

    Differences from the previous implementation:
      * statistics exactly equal to ``-t`` are now counted in the numerator
        (``<=`` instead of ``<``), as in the knockoff filter definition;
      * ``t = 0`` is no longer a candidate, so zero statistics cannot be selected;
      * when no candidate satisfies the bound (or ``W`` is empty) nothing is
        selected and the threshold is ``inf`` instead of raising ``ValueError``.

    Args:
        W: Sequence of per-variable statistics (positive favours the original
            variable, negative favours its knockoff).
        target_q: Upper bound on the estimated false-discovery proportion.

    Returns:
        Tuple ``(selected_vars, threshold)`` where ``selected_vars`` holds
        1-based positions into ``W``.
    """
    # Candidate thresholds in ascending order: the first one that passes is the minimum.
    candidates = sorted({float(abs(w)) for w in W if w != 0})
    for t in candidates:
        knockoff_wins = sum(1 for w in W if w <= -t)
        original_wins = sum(1 for w in W if w >= t)
        if knockoff_wins / max(original_wins, 1) <= target_q:
            selected_vars = [pos + 1 for pos, w in enumerate(W) if w >= t]
            return selected_vars, t
    # No threshold keeps the estimate below target_q: select nothing.
    return [], float('inf')

### OL 10

In [10]:
# --- OL experiment: 10 independent repetitions of a lambda-path selection using W1 ---

# Labels and features as float32 tensors on the active device.
Y = Y.apply(pd.to_numeric, errors='coerce')
y_float = (Y.values).astype(float)
Y_tensor = torch.tensor(y_float, dtype=torch.float32).view(-1, 1).to(device)
feature_tensor = torch.tensor(feature.values, dtype=torch.float32).to(device)
c = torch.tensor(feature.values, dtype=torch.float32).to(device)  # same content as feature_tensor; not used in this cell

lambda_all = []         # one lambda matrix per repetition
all_selected_vars = []  # selected 1-based variable positions per repetition

criterion = nn.BCELoss()  # Binary Cross Entropy Loss
# Not used below (the loop builds a fresh network per lambda).
# NOTE(review): kept because constructing it presumably advances the torch RNG, so
# removing it could change the random initialisations that follow — confirm, then drop.
net1 = NeuralNet([input_size*2, hidden_size, output_size]).to(device)

for n in range(10):
    # lambda_array1[j, k] records the first lambda on the grid at which the first-layer
    # weight from input j to hidden unit k is exactly zero after training.
    # NOTE(review): 0 doubles as "not recorded yet", so a weight that is never zeroed on
    # this grid keeps the value 0, the same as one zeroed at lambda = 0 — confirm this
    # is the intended ranking.
    lambda_array1 = np.zeros((input_size*2,hidden_size))
    for i in tqdm(np.arange(0,0.02,0.00009)):
        net1 = NeuralNet([input_size*2, hidden_size, output_size]).to(device)
        optimizer = ProximalSGD(net1.parameters(), lr=0.01, l1_lambda=i)
        train_model(net1, feature_tensor, Y_tensor, criterion, optimizer, epochs=50)
        # First parameter of the network, transposed.
        # NOTE(review): assumed to be the first-layer weight with shape
        # (2*input_size, hidden_size) after the transpose — confirm in model.py.
        weight = next(net1.named_parameters())[1].data.t()
        # Vectorised form of the former element-wise double loop: one device-to-host
        # copy instead of one scalar read per weight.
        zero_weights = (weight == 0).cpu().numpy()
        newly_zeroed = zero_weights & (lambda_array1 == 0)
        lambda_array1[newly_zeroed] = i
    lambda_all.append(lambda_array1)
    W1, _ = compute_W1(lambda_array1, input_size)
    selected_vars, _ = select_variables(W1, 0.1)
    print(f"Model {n + 1} - Selected Variables:", selected_vars)
    all_selected_vars.append(selected_vars)

100%|██████████| 223/223 [19:06<00:00,  5.14s/it]


Model 1 - Selected Variables: [1, 2, 5, 6, 8, 9, 10, 12, 13, 14, 17, 19, 20, 22, 23, 26, 27, 29, 32, 42, 43]


100%|██████████| 223/223 [18:39<00:00,  5.02s/it]


Model 2 - Selected Variables: [2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, 17, 19, 20, 21, 22, 23, 25, 26, 27, 29, 31, 32, 56, 88]


100%|██████████| 223/223 [3:11:21<00:00, 51.48s/it]    


Model 3 - Selected Variables: [1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, 16, 17, 19, 21, 22, 23, 24, 26, 27, 29, 31, 32, 33, 42, 56, 62, 86, 95]


100%|██████████| 223/223 [17:39<00:00,  4.75s/it]


Model 4 - Selected Variables: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 26, 27, 29, 30, 31, 32, 33, 42, 81, 85, 90, 98]


100%|██████████| 223/223 [17:21<00:00,  4.67s/it]


Model 5 - Selected Variables: [1, 5, 6, 7, 8, 9, 12, 13, 14, 16, 17, 19, 20, 22, 23, 26, 27, 28, 29, 30, 31, 32, 33]


100%|██████████| 223/223 [17:31<00:00,  4.71s/it]


Model 6 - Selected Variables: [1, 2, 5, 6, 8, 9, 10, 12, 13, 14, 15, 17, 18, 22, 23, 25, 26, 27, 28, 29, 30, 33, 40]


100%|██████████| 223/223 [17:26<00:00,  4.69s/it]


Model 7 - Selected Variables: [2, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 21, 22, 23, 26, 27, 29, 31, 32, 33, 35, 64, 74]


100%|██████████| 223/223 [17:22<00:00,  4.68s/it]


Model 8 - Selected Variables: [1, 3, 5, 6, 7, 8, 9, 12, 13, 14, 15, 17, 19, 20, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33, 39, 69, 80, 90]


100%|██████████| 223/223 [17:32<00:00,  4.72s/it]


Model 9 - Selected Variables: [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17, 19, 20, 21, 22, 23, 26, 27, 29, 31, 32, 37, 39, 79, 81, 91, 94]


100%|██████████| 223/223 [17:42<00:00,  4.76s/it]

Model 10 - Selected Variables: [1, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, 17, 18, 19, 21, 23, 26, 27, 29, 30, 31, 32, 33, 47, 80, 91]





In [12]:
from collections import Counter

def aggregate_selected_vars(all_vars):
    """Count how often each variable appears across several selection runs.

    Args:
        all_vars: Iterable of per-run lists of selected variables.

    Returns:
        List of ``(variable, count)`` pairs ordered by decreasing count;
        variables with equal counts keep their first-seen order.
    """
    tally = Counter()
    for run_selection in all_vars:
        tally.update(run_selection)
    return tally.most_common()
# Tally the selections of all repetitions, most frequently selected variable first.
final_selected_vars = aggregate_selected_vars(all_selected_vars)


# One line per variable: its 1-based position and the number of repetitions that selected it.
print("Final selected variables and their counts:")
for var, count in final_selected_vars:
    print(f"Variable: {var}, Count: {count}")

Final selected variables and their counts:
Variable: 5, Count: 10
Variable: 6, Count: 10
Variable: 8, Count: 10
Variable: 9, Count: 10
Variable: 12, Count: 10
Variable: 13, Count: 10
Variable: 14, Count: 10
Variable: 17, Count: 10
Variable: 23, Count: 10
Variable: 26, Count: 10
Variable: 27, Count: 10
Variable: 29, Count: 10
Variable: 19, Count: 9
Variable: 32, Count: 9
Variable: 1, Count: 8
Variable: 22, Count: 8
Variable: 15, Count: 8
Variable: 31, Count: 8
Variable: 2, Count: 7
Variable: 7, Count: 7
Variable: 33, Count: 7
Variable: 20, Count: 6
Variable: 21, Count: 6
Variable: 3, Count: 5
Variable: 4, Count: 5
Variable: 30, Count: 5
Variable: 10, Count: 4
Variable: 18, Count: 4
Variable: 42, Count: 3
Variable: 25, Count: 3
Variable: 16, Count: 3
Variable: 11, Count: 3
Variable: 28, Count: 3
Variable: 56, Count: 2
Variable: 24, Count: 2
Variable: 81, Count: 2
Variable: 90, Count: 2
Variable: 39, Count: 2
Variable: 80, Count: 2
Variable: 91, Count: 2
Variable: 43, Count: 1
Variable: 8

### ML 10

In [51]:
# --- ML experiment: 10 independent repetitions of a lambda-path selection using W2 ---

# Labels and features as float32 tensors on the active device.
Y = Y.apply(pd.to_numeric, errors='coerce')
y_float = (Y.values).astype(float)
Y_tensor = torch.tensor(y_float, dtype=torch.float32).view(-1, 1).to(device)
feature_tensor = torch.tensor(feature.values, dtype=torch.float32).to(device)
c = torch.tensor(feature.values, dtype=torch.float32).to(device)  # same content as feature_tensor; not used in this cell

lambda_all = []         # one lambda matrix per repetition
all_selected_vars = []  # selected 1-based variable positions per repetition

criterion = nn.BCELoss()  # Binary Cross Entropy Loss
# Not used below (the loop builds a fresh network per lambda).
# NOTE(review): kept because constructing it presumably advances the torch RNG, so
# removing it could change the random initialisations that follow — confirm, then drop.
net1 = NeuralNet([input_size*2, hidden_size, output_size]).to(device)

for n in range(10):
    # lambda_array1[j, k] records the first lambda on the grid at which the first-layer
    # weight from input j to hidden unit k is exactly zero after training.
    # NOTE(review): 0 doubles as "not recorded yet", so a weight that is never zeroed on
    # this grid keeps the value 0, the same as one zeroed at lambda = 0 — confirm this
    # is the intended ranking.
    lambda_array1 = np.zeros((input_size*2,hidden_size))
    # Coarse grid; a finer one, (0, 0.09, 0.0009), was tried previously.
    for i in tqdm(np.arange(0, 0.1, 0.01)):
        net1 = NeuralNet([input_size*2, hidden_size, output_size]).to(device)
        optimizer = ProximalSGD(net1.parameters(), lr=0.01, l1_lambda=i)
        train_model(net1, feature_tensor, Y_tensor, criterion, optimizer, epochs=50)
        # First parameter of the network, transposed.
        # NOTE(review): assumed to be the first-layer weight with shape
        # (2*input_size, hidden_size) after the transpose — confirm in model.py.
        weight = next(net1.named_parameters())[1].data.t()
        # Vectorised form of the former element-wise double loop: one device-to-host
        # copy instead of one scalar read per weight.
        zero_weights = (weight == 0).cpu().numpy()
        newly_zeroed = zero_weights & (lambda_array1 == 0)
        lambda_array1[newly_zeroed] = i
    lambda_all.append(lambda_array1)
    W1, row_sums1 = compute_W1(lambda_all[n],input_size)
    # NOTE(review): net1 here is the network trained at the last (largest) lambda of the
    # grid only — confirm that this is the model W2 is meant to be computed from.
    W2, _ = compute_W2(net1,row_sums1,input_size)
    selected_vars, _ = select_variables(W2, 0.1)
    print(f"Model {n + 1} - Selected Variables:", selected_vars)
    all_selected_vars.append(selected_vars)

100%|██████████| 10/10 [01:18<00:00,  7.88s/it]


Model 1 - Selected Variables: [4, 7, 12, 14, 17, 20, 23, 24, 25, 27, 28, 29, 31, 32]


100%|██████████| 10/10 [01:18<00:00,  7.82s/it]


Model 2 - Selected Variables: [6, 7, 10, 11, 12, 14, 15, 16, 18, 19, 23, 25, 27, 28, 29, 30, 31, 32, 33, 68, 72, 77, 82, 93]


100%|██████████| 10/10 [01:24<00:00,  8.43s/it]


Model 3 - Selected Variables: [3, 4, 6, 7, 11, 12, 14, 17, 18, 19, 23, 25, 26, 27, 28, 29, 45, 59, 61, 85, 99]


100%|██████████| 10/10 [01:24<00:00,  8.49s/it]


Model 4 - Selected Variables: [6, 7, 10, 11, 14, 17, 22, 23, 27, 29, 30, 31, 33]


100%|██████████| 10/10 [01:21<00:00,  8.12s/it]


Model 5 - Selected Variables: [6, 10, 12, 14, 16, 17, 19, 20, 24, 28, 29, 32]


100%|██████████| 10/10 [00:57<00:00,  5.80s/it]


Model 6 - Selected Variables: [6, 11, 14, 17, 23, 27, 29, 66, 88]


100%|██████████| 10/10 [01:23<00:00,  8.34s/it]


Model 7 - Selected Variables: [2, 10, 11, 12, 14, 16, 17, 19, 27, 31]


100%|██████████| 10/10 [01:17<00:00,  7.73s/it]


Model 8 - Selected Variables: [3, 6, 7, 10, 11, 12, 14, 16, 17, 18, 20, 22, 23, 25, 27, 28, 29, 30, 31, 32, 33, 52, 65, 66, 69, 80, 98]


100%|██████████| 10/10 [01:18<00:00,  7.84s/it]


Model 9 - Selected Variables: [2, 4, 6, 7, 10, 12, 14, 16, 18, 19, 21, 23, 25, 27, 28, 29, 30, 32, 69, 77]


100%|██████████| 10/10 [01:09<00:00,  6.92s/it]

Model 10 - Selected Variables: [2, 3, 4, 6, 12, 14, 17, 19, 23, 25, 27, 29, 32, 67, 73]





In [52]:
from collections import Counter

def aggregate_selected_vars(all_vars):
    """Count how often each variable appears across several selection runs.

    Args:
        all_vars: Iterable of per-run lists of selected variables.

    Returns:
        List of ``(variable, count)`` pairs ordered by decreasing count;
        variables with equal counts keep their first-seen order.
    """
    tally = Counter()
    for run_selection in all_vars:
        tally.update(run_selection)
    return tally.most_common()
# Tally the selections of all repetitions, most frequently selected variable first.
final_selected_vars = aggregate_selected_vars(all_selected_vars)


# One line per variable: its 1-based position and the number of repetitions that selected it.
print("Final selected variables and their counts:")
for var, count in final_selected_vars:
    print(f"Variable: {var}, Count: {count}")

Final selected variables and their counts:
Variable: 14, Count: 10
Variable: 27, Count: 9
Variable: 29, Count: 9
Variable: 12, Count: 8
Variable: 17, Count: 8
Variable: 23, Count: 8
Variable: 6, Count: 8
Variable: 7, Count: 6
Variable: 25, Count: 6
Variable: 28, Count: 6
Variable: 32, Count: 6
Variable: 10, Count: 6
Variable: 11, Count: 6
Variable: 19, Count: 6
Variable: 31, Count: 5
Variable: 16, Count: 5
Variable: 4, Count: 4
Variable: 18, Count: 4
Variable: 30, Count: 4
Variable: 20, Count: 3
Variable: 33, Count: 3
Variable: 3, Count: 3
Variable: 2, Count: 3
Variable: 24, Count: 2
Variable: 77, Count: 2
Variable: 22, Count: 2
Variable: 66, Count: 2
Variable: 69, Count: 2
Variable: 15, Count: 1
Variable: 68, Count: 1
Variable: 72, Count: 1
Variable: 82, Count: 1
Variable: 93, Count: 1
Variable: 26, Count: 1
Variable: 45, Count: 1
Variable: 59, Count: 1
Variable: 61, Count: 1
Variable: 85, Count: 1
Variable: 99, Count: 1
Variable: 88, Count: 1
Variable: 52, Count: 1
Variable: 65, Coun

### OML

Variable: 5, Count: 10
Variable: 6, Count: 18
Variable: 8, Count: 10
Variable: 9, Count: 10
Variable: 12, Count: 18
Variable: 13, Count: 10
Variable: 14, Count: 20
Variable: 17, Count: 18
Variable: 23, Count: 18
Variable: 26, Count: 11
Variable: 27, Count: 19
Variable: 29, Count: 19
Variable: 19, Count: 15
Variable: 32, Count: 15
Variable: 1, Count: 8
Variable: 22, Count: 10
Variable: 15, Count: 9
Variable: 31, Count: 13
Variable: 2, Count: 10
Variable: 7, Count: 13
Variable: 33, Count: 10
Variable: 20, Count: 9
Variable: 21, Count: 7
Variable: 3, Count: 8
Variable: 4, Count: 9
Variable: 30, Count: 9
Variable: 10, Count: 10
Variable: 18, Count: 8
Variable: 25, Count: 9
Variable: 16, Count: 8
Variable: 11, Count: 9
Variable: 28, Count: 9
Variable: 42, Count: 3
Variable: 56, Count: 2
Variable: 24, Count: 4
Variable: 81, Count: 2
Variable: 90, Count: 2
Variable: 39, Count: 2
Variable: 80, Count: 2
Variable: 91, Count: 2
Variable: 43, Count: 1
Variable: 88, Count: 1
Variable: 62, Count: 1
Variable: 86, Count: 1
Variable: 95, Count: 1
Variable: 85, Count: 1
Variable: 98, Count: 1
Variable: 40, Count: 1
Variable: 35, Count: 1
Variable: 64, Count: 1
Variable: 74, Count: 1
Variable: 69, Count: 1
Variable: 37, Count: 1
Variable: 79, Count: 1
Variable: 94, Count: 1
Variable: 47, Count: 1
Variable: 77, Count: 2
Variable: 66, Count: 2
Variable: 69, Count: 2
Variable: 68, Count: 1
Variable: 72, Count: 1
Variable: 82, Count: 1
Variable: 93, Count: 1
Variable: 45, Count: 1
Variable: 59, Count: 1
Variable: 61, Count: 1
Variable: 85, Count: 1
Variable: 99, Count: 1
Variable: 88, Count: 1
Variable: 52, Count: 1
Variable: 65, Count: 1
Variable: 80, Count: 1
Variable: 98, Count: 1
Variable: 67, Count: 1
Variable: 73, Count: 1

OL: 0 0.79
ML: 0 0.48
OML: 0 0.97