In [1]:
import torch
import numpy as np
import pandas as pd
from model import NeuralNet, train_model, predict,ProximalSGD,accuracy
from knockoff import create_knockoff_variable
import torch.nn as nn
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [3]:
def set_seed(seed):
    #Set the parameters are consistent when the model is initialized
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    np.random.seed(seed)

In [55]:
input_size = 30
hidden_size_data = 60
hidden_size = 60
output_size = 1

### Create simulated variables

In [56]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

df = pd.read_csv('cancerdata_feature.csv')
data = df.rename(columns={
    'Column1': 'radius1', 'Column2': 'texture1', 'Column3': 'perimeter1', 
    'Column4': 'area1', 'Column5': 'smoothness1', 'Column6': 'compactness', 
    'Column7': 'concavity1', 'Column8': 'concave_points1', 'Column9': 'symmetry1', 
    'Column10': 'fractal_dimension1', 'Column11': 'radius2', 'Column12': 'texture2', 
    'Column13': 'perimeter2', 'Column14': 'area2', 'Column15': 'smoothness2', 
    'Column16': 'compactness2', 'Column17': 'concavity2', 'Column18': 'concave_points2',
    'Column19': 'symmetry2', 'Column20': 'fractal_dimension2', 'Column21': 'radius3', 
    'Column22': 'texture3', 'Column23': 'perimeter3', 'Column24': 'area3', 
    'Column25': 'smoothness3', 'Column26': 'compactness3', 'Column27': 'concavity3', 
    'Column28': 'concave_points3', 'Column29': 'symmetry3', 'Column30': 'fractal_dimension3'
})

scaler = MinMaxScaler()
data_normalized = scaler.fit_transform(data)
data_normalized = pd.DataFrame(data_normalized, columns=data.columns)
y_df = pd.read_csv('cancerdata_Y.csv')

y_df['Y'] = y_df['Y'].replace({'B': 0, 'M': 1})

data_normalized['Y'] = y_df['Y']
data = data_normalized

### Create knockoff variables

In [57]:
X = data.iloc[:, :-1]
# X= (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
 
Y = data.iloc[:, -1]

knockoff_creator = create_knockoff_variable()
X_knockoff = knockoff_creator.get_equi_features(X)
# X_knockoff = (X_knockoff - X_knockoff.min(axis=0)) / (X_knockoff.max(axis=0) - X_knockoff.min(axis=0))

In [58]:
X_knockoff_df = pd.DataFrame(X_knockoff,columns= [f'var_k{i}' for i in range(input_size)])
feature = pd.concat([X,X_knockoff_df],axis = 1)
dataset1 =  pd.concat([feature,data['Y']],axis = 1)
dataset1
# dataset1.to_csv("Simulationdata.csv")

Unnamed: 0,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,fractal_dimension1,...,var_k21,var_k22,var_k23,var_k24,var_k25,var_k26,var_k27,var_k28,var_k29,Y
0,0.521037,0.022658,0.545989,0.363733,0.593753,0.792037,0.703140,0.731113,0.686364,0.605518,...,0.015118,0.085178,0.085438,0.058326,0.096793,0.086753,0.084118,0.086150,0.078790,1
1,0.643144,0.272574,0.615783,0.501591,0.289880,0.181768,0.203608,0.348757,0.379798,0.141323,...,0.031644,0.069187,0.082327,0.033843,0.024583,0.029432,0.058920,0.034146,0.041303,1
2,0.601496,0.390260,0.595743,0.449417,0.514309,0.431017,0.462512,0.635686,0.509596,0.211247,...,0.037910,0.065067,0.070829,0.046311,0.060284,0.055310,0.076889,0.058908,0.039343,1
3,0.210090,0.360839,0.233501,0.102906,0.811321,0.811361,0.565604,0.522863,0.776263,1.000000,...,0.040393,0.031330,0.017648,0.088762,0.127436,0.084528,0.081533,0.145202,0.145017,1
4,0.629893,0.156578,0.630986,0.489290,0.430351,0.347893,0.463918,0.518390,0.378283,0.186816,...,0.012823,0.064862,0.065068,0.043022,0.026394,0.048932,0.052169,0.023266,0.026131,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,0.690000,0.428813,0.678668,0.566490,0.526948,0.296055,0.571462,0.690358,0.336364,0.132056,...,0.040149,0.074205,0.085466,0.044444,0.028495,0.049894,0.070077,0.013654,0.020282,1
565,0.622320,0.626987,0.604036,0.474019,0.407782,0.257714,0.337395,0.486630,0.349495,0.113100,...,0.073162,0.066011,0.072558,0.029649,0.025348,0.039445,0.051687,0.029235,0.014601,1
566,0.455251,0.621238,0.445788,0.303118,0.288165,0.254340,0.216753,0.263519,0.267677,0.137321,...,0.062124,0.049200,0.043427,0.027179,0.042703,0.042206,0.045354,0.019231,0.028252,1
567,0.644564,0.663510,0.665538,0.475716,0.588336,0.790197,0.823336,0.755467,0.675253,0.425442,...,0.076045,0.085183,0.076356,0.060413,0.127395,0.114868,0.084465,0.071434,0.084922,1


In [59]:
def compute_W1(lambda_array,num_x):
    row_sums1 = np.sum(lambda_array, axis=1)
    W1 = []
    for i in range(0,int(0.5*len(row_sums1)),1):
        if row_sums1[i]>=row_sums1[i+num_x]:
            W1.append(row_sums1[i])
        else:
            W1.append(-row_sums1[i+num_x])
    return W1,row_sums1

def compute_W2(model, row_sums1, num_x):
    para = []
    for name, param in model.named_parameters():
        if param.requires_grad and 'weight' in name:
            para.append(param.data)


    result = para[0]
    for i in range(1, len(para)):
        if i == (len(para) - 1):
            result = torch.matmul(result.t(), para[i].reshape(-1, 1))
        else:
            result = torch.matmul(result.t(), para[i])
    
    g =[]
    for i in range(len(result)):
        g.append(result[i][0].cpu().numpy() * row_sums1[i]*10)
  
    W2= []
 
    for i in range(0,int(0.5*len(g)),1):
        W2.append(abs(g[i])-abs(g[i+num_x]))
    return W2,g

def select_variables(W,target_q):
    T = []
    for i in W:
        count1 = 0
        count2 = 0
        t = abs(i)
        for j in W:
            if j < -t:
                count1 = count1+1
    
            if j >= t:
                count2 = count2 +1
        q = (count1)/(max(count2,1))

        if q <= target_q:
            T.append(t)
    threshold = min(T)
    selected_vars = [i+1 for i in range(len(W)) if W[i] >= threshold]
    return selected_vars, threshold

### OL 10

In [66]:
Y = Y.apply(pd.to_numeric, errors='coerce')
y_float = (Y.values).astype(float) 
Y_tensor = torch.tensor(y_float, dtype=torch.float32).view(-1, 1).to(device)
feature_tensor = torch.tensor(feature.values, dtype=torch.float32).to(device)
c = torch.tensor(feature.values, dtype=torch.float32).to(device)

# Y_tensor = torch.tensor(Y.values, dtype=torch.float32).to(device)
# Y_tensor = Y_tensor.view(-1, 1) 
lambda_all = []
all_selected_vars = []

criterion = nn.BCELoss()  # Binary Cross Entropy Loss
net1 = NeuralNet([input_size*2, hidden_size, output_size]).to(device)
lambda_array1 = np.zeros((input_size*2,hidden_size))

for n in range(10):
    lambda_array1 = np.zeros((input_size*2,hidden_size))
    for i in tqdm(np.arange(0,0.01,0.0001)):
        net1 = NeuralNet([input_size*2, hidden_size, output_size]).to(device)
        optimizer = ProximalSGD(net1.parameters(), lr=0.01, l1_lambda=i)
        train_model(net1, feature_tensor, Y_tensor, criterion, optimizer, epochs=50)
        weight = next(net1.named_parameters())[1].data.t()
        for j in range(len(lambda_array1)):
            for k in range(len(lambda_array1[j])):
                if abs(weight[j][k])==0 and lambda_array1[j][k] == 0:
                    lambda_array1[j][k] = i
    lambda_all.append(lambda_array1)
    W1, _ = compute_W1(lambda_array1, input_size)
    selected_vars, _ = select_variables(W1, 0.1)
    print(f"Model {n + 1} - Selected Variables:", selected_vars)
    all_selected_vars.append(selected_vars)

100%|██████████| 100/100 [00:37<00:00,  2.65it/s]


Model 1 - Selected Variables: [1, 2, 3, 7, 8, 9, 10, 15, 18, 19, 20, 21, 23, 24, 25, 28, 29, 30]


100%|██████████| 100/100 [00:38<00:00,  2.62it/s]


Model 2 - Selected Variables: [3, 5, 7, 9, 10, 13, 14, 16, 18, 19, 21, 22, 23, 24, 27, 28]


100%|██████████| 100/100 [00:38<00:00,  2.58it/s]


Model 3 - Selected Variables: [6, 15]


100%|██████████| 100/100 [00:39<00:00,  2.50it/s]


Model 4 - Selected Variables: [7, 11]


100%|██████████| 100/100 [00:37<00:00,  2.63it/s]


Model 5 - Selected Variables: [21, 29]


100%|██████████| 100/100 [00:38<00:00,  2.61it/s]


Model 6 - Selected Variables: [3, 15]


100%|██████████| 100/100 [00:39<00:00,  2.52it/s]


Model 7 - Selected Variables: [1, 2, 4, 5, 6, 8, 9, 10, 11, 12, 15, 16, 17, 18, 19, 20, 23, 25, 26, 27, 28, 29, 30]


100%|██████████| 100/100 [00:39<00:00,  2.53it/s]


Model 8 - Selected Variables: [2, 4, 12, 14, 15, 19, 23, 25, 26]


100%|██████████| 100/100 [00:38<00:00,  2.62it/s]


Model 9 - Selected Variables: [2, 3, 7, 18, 19, 20, 21, 23, 26, 27, 28, 29, 30]


100%|██████████| 100/100 [00:38<00:00,  2.59it/s]

Model 10 - Selected Variables: [2, 8, 10, 11, 18, 21, 22, 24, 25, 27, 28, 30]





In [67]:
from collections import Counter

def aggregate_selected_vars(all_vars):
    flat_vars = [item for sublist in all_vars for item in sublist]
    var_counts = Counter(flat_vars)
    sorted_vars = sorted(var_counts.items(), key=lambda x: x[1], reverse=True)
    return sorted_vars
final_selected_vars = aggregate_selected_vars(all_selected_vars)


print("Final selected variables and their counts:")
for var, count in final_selected_vars:
    print(f"Variable: {var}, Count: {count}")

Final selected variables and their counts:
Variable: 2, Count: 5
Variable: 15, Count: 5
Variable: 18, Count: 5
Variable: 19, Count: 5
Variable: 21, Count: 5
Variable: 23, Count: 5
Variable: 28, Count: 5
Variable: 3, Count: 4
Variable: 7, Count: 4
Variable: 10, Count: 4
Variable: 25, Count: 4
Variable: 29, Count: 4
Variable: 30, Count: 4
Variable: 27, Count: 4
Variable: 8, Count: 3
Variable: 9, Count: 3
Variable: 20, Count: 3
Variable: 24, Count: 3
Variable: 11, Count: 3
Variable: 26, Count: 3
Variable: 1, Count: 2
Variable: 5, Count: 2
Variable: 14, Count: 2
Variable: 16, Count: 2
Variable: 22, Count: 2
Variable: 6, Count: 2
Variable: 4, Count: 2
Variable: 12, Count: 2
Variable: 13, Count: 1
Variable: 17, Count: 1


### ML 10

In [76]:
Y = Y.apply(pd.to_numeric, errors='coerce')
y_float = (Y.values).astype(float) 
Y_tensor = torch.tensor(y_float, dtype=torch.float32).view(-1, 1).to(device)
feature_tensor = torch.tensor(feature.values, dtype=torch.float32).to(device)
c = torch.tensor(feature.values, dtype=torch.float32).to(device)

# Y_tensor = torch.tensor(Y.values, dtype=torch.float32).to(device)
# Y_tensor = Y_tensor.view(-1, 1) 
lambda_all = []
all_selected_vars = []

criterion = nn.BCELoss()  # Binary Cross Entropy Loss
net1 = NeuralNet([input_size*2, hidden_size, output_size]).to(device)
lambda_array1 = np.zeros((input_size*2,hidden_size))

for n in range(10):
    lambda_array1 = np.zeros((input_size*2,hidden_size))
    for i in tqdm(np.arange(0,0.1,0.001)):
        #(0,0.09,0.0009
        net1 = NeuralNet([input_size*2, hidden_size, output_size]).to(device)
        optimizer = ProximalSGD(net1.parameters(), lr=0.01, l1_lambda=i)
        train_model(net1, feature_tensor, Y_tensor, criterion, optimizer, epochs=50)
        weight = next(net1.named_parameters())[1].data.t()
        for j in range(len(lambda_array1)):
            for k in range(len(lambda_array1[j])):
                if abs(weight[j][k])==0 and lambda_array1[j][k] == 0:
                    lambda_array1[j][k] = i
    lambda_all.append(lambda_array1)
    W1, row_sums1 = compute_W1(lambda_all[n],input_size)
    W2, _ = compute_W2(net1,row_sums1,input_size)
    selected_vars, _ = select_variables(W2, 0.1)
    print(f"Model {n + 1} - Selected Variables:", selected_vars)
    all_selected_vars.append(selected_vars)

100%|██████████| 100/100 [00:39<00:00,  2.50it/s]


Model 1 - Selected Variables: [2, 3, 4, 5, 8, 9, 15, 19, 20, 22, 24, 25, 27, 28, 30]


100%|██████████| 100/100 [00:38<00:00,  2.58it/s]


Model 2 - Selected Variables: []


100%|██████████| 100/100 [00:39<00:00,  2.56it/s]


Model 3 - Selected Variables: [7, 14, 15, 23]


100%|██████████| 100/100 [00:39<00:00,  2.52it/s]


Model 4 - Selected Variables: [3, 5, 6, 10, 20, 26]


100%|██████████| 100/100 [00:38<00:00,  2.60it/s]


Model 5 - Selected Variables: [9, 23]


100%|██████████| 100/100 [00:39<00:00,  2.55it/s]


Model 6 - Selected Variables: []


100%|██████████| 100/100 [00:42<00:00,  2.34it/s]


Model 7 - Selected Variables: [7, 12, 15, 18, 27]


100%|██████████| 100/100 [00:39<00:00,  2.55it/s]


Model 8 - Selected Variables: [3, 4, 5, 8, 13, 14, 15, 16, 17, 18, 21, 22, 23, 24, 25, 26, 27, 28, 29]


100%|██████████| 100/100 [00:38<00:00,  2.59it/s]


Model 9 - Selected Variables: [1, 2, 3, 4, 5, 6, 12, 15, 17, 18, 19, 20, 21, 26, 30]


100%|██████████| 100/100 [00:39<00:00,  2.54it/s]

Model 10 - Selected Variables: [3, 15, 19, 21, 24]





In [77]:
from collections import Counter

def aggregate_selected_vars(all_vars):
    flat_vars = [item for sublist in all_vars for item in sublist]
    var_counts = Counter(flat_vars)
    sorted_vars = sorted(var_counts.items(), key=lambda x: x[1], reverse=True)
    return sorted_vars
final_selected_vars = aggregate_selected_vars(all_selected_vars)


print("Final selected variables and their counts:")
for var, count in final_selected_vars:
    print(f"Variable: {var}, Count: {count}")

Final selected variables and their counts:
Variable: 15, Count: 6
Variable: 3, Count: 5
Variable: 5, Count: 4
Variable: 4, Count: 3
Variable: 19, Count: 3
Variable: 20, Count: 3
Variable: 24, Count: 3
Variable: 27, Count: 3
Variable: 23, Count: 3
Variable: 26, Count: 3
Variable: 18, Count: 3
Variable: 21, Count: 3
Variable: 2, Count: 2
Variable: 8, Count: 2
Variable: 9, Count: 2
Variable: 22, Count: 2
Variable: 25, Count: 2
Variable: 28, Count: 2
Variable: 30, Count: 2
Variable: 7, Count: 2
Variable: 14, Count: 2
Variable: 6, Count: 2
Variable: 12, Count: 2
Variable: 17, Count: 2
Variable: 10, Count: 1
Variable: 13, Count: 1
Variable: 16, Count: 1
Variable: 29, Count: 1
Variable: 1, Count: 1


### OML

Variable: 2, Count: 7
Variable: 15, Count: 11
Variable: 18, Count: 11
Variable: 19, Count: 8
Variable: 21, Count: 11
Variable: 23, Count: 6
Variable: 28, Count: 7
Variable: 3, Count: 9
Variable: 7, Count: 6
Variable: 10, Count: 5
Variable: 25, Count: 6
Variable: 29, Count: 5
Variable: 30, Count: 6
Variable: 27, Count: 7
Variable: 8, Count: 5
Variable: 9, Count: 5
Variable: 20, Count: 6
Variable: 24, Count: 6
Variable: 26, Count: 6
Variable: 5, Count: 6
Variable: 4, Count: 5

