In [None]:
!pip install concrete_autoencoder

In [None]:
import numpy as np
import pandas as pd
import os
import keras
from concrete_autoencoder import ConcreteAutoencoderFeatureSelector
from keras.layers import Dense, LeakyReLU
from sklearn.metrics import accuracy_score, cohen_kappa_score,roc_auc_score,average_precision_score
from sklearn.model_selection import KFold

In [None]:
def set_data(path,data_name,k_feat):
    """Load one dataset CSV and separate the features from the target.

    The file read is ``path + data_name + str(k_feat) + 'feat.csv'`` and is
    parsed as comma-separated numeric values.

    Parameters
    ----------
    path : str
        Prefix prepended verbatim to the file name (no separator is added).
    data_name : str
        Middle part of the file name.
    k_feat : int
        Value inserted into the file name right before ``'feat.csv'``.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` holds every column but the last and ``y`` is
        the last column.
    """
    csv_file = path + data_name + str(k_feat) + 'feat.csv'
    table = np.genfromtxt(csv_file, delimiter=',')

    return table[:, :-1], table[:, -1]

def train_val_test_split(X,y,tr_idx,te_idx):
    """Split X and y into train, validation and test parts for one CV fold.

    The test part is ``te_idx``. The validation part has the same number of
    samples as the test part and is taken from the training indices that come
    right after the test block; when the test block is the last one in the
    data, the validation part is taken from the start of ``tr_idx`` instead.
    Everything that is neither validation nor test is the training part
    (roughly 4/6 train, 1/6 validation, 1/6 test for a 6-fold split).

    Parameters
    ----------
    X : np.ndarray
        Sample matrix, indexed by row.
    y : np.ndarray
        Targets, same first dimension as ``X``.
    tr_idx : array-like of int
        Sorted training indices of the fold.
    te_idx : array-like of int
        Test indices of the fold. Assumed to be one contiguous ascending
        block (as produced by ``KFold`` without shuffling); the position
        arithmetic below relies on that.

    Returns
    -------
    tuple
        ``(X_tr, X_val, X_te, y_tr, y_val, y_te)``.
    """
    tr_idx = np.asarray(tr_idx)
    te_idx = np.asarray(te_idx)
    n_val = len(te_idx)

    X_te, y_te = X[te_idx], y[te_idx]

    # Compare against the real last row index rather than a fixed 999, so the
    # wrap-around also triggers for datasets that do not have 1000 samples.
    if te_idx[-1] == len(X) - 1:
        val_id = tr_idx[:n_val]
    else:
        # For a contiguous test block starting at `st`, exactly `st` training
        # indices precede it, so tr_idx[st] is the first sample after the
        # test block. Take n_val samples from there (not n_val - 1).
        st = te_idx[-1] + 1 - n_val
        val_id = tr_idx[st:st + n_val]

    X_val = X[val_id]
    y_val = y[val_id]

    # Remove both held-out parts to obtain the actual training set.
    no_train = np.concatenate((val_id, te_idx))
    X_tr = np.delete(X, no_train, 0)
    y_tr = np.delete(y, no_train, 0)

    return X_tr, X_val, X_te, y_tr, y_val, y_te

def cv_cae(X,y,num_epochs,n,k,g,model_name='cae'):
    """Cross-validate a concrete-autoencoder feature selector.

    For each of the ``n`` KFold splits the data is divided with
    ``train_val_test_split``, the selector is fitted on the training part
    (the validation part is passed as ``val_X``/``val_Y``), ROC-AUC and
    average precision are computed on the test part, and the selected feature
    indices are written as CSV to a file named
    ``'cv_' + model_name + '_fold' + <fold number>`` inside a hard-coded
    output directory.

    Parameters
    ----------
    X, y : np.ndarray
        Samples and targets.
    num_epochs : int
        Passed to the selector as ``num_epochs``.
    n : int
        Number of folds for the KFold cross-validation.
    k : int
        Number of features to select.
    g : callable
        Architecture of the network placed after the selector layer
        (passed as ``output_function``).
    model_name : str
        Tag used in the per-fold output file names.

    Returns
    -------
    tuple
        ``(mean AUC, variance of AUC, mean AUPRC, variance of AUPRC)`` over
        the folds, in that order.
    """
    # NOTE(review): absolute, machine-specific output location.
    dataDir = '/Users/utente/Documents/università/tesi - confronto FS/analisi 2-512 features'

    selector = ConcreteAutoencoderFeatureSelector(
        K=k,
        output_function=g,
        start_temp=10,
        min_temp=0.01,
        num_epochs=num_epochs,
        learning_rate=0.001,
        tryout_limit=1,
    )

    fold_auc = []
    fold_auprc = []

    for fold, (tr_idx, te_idx) in enumerate(KFold(n_splits=n).split(X)):

        X_tr, X_val, X_te, y_tr, y_val, y_te = train_val_test_split(X, y, tr_idx, te_idx)

        selector.fit(X_tr, y_tr, val_X=X_val, val_Y=y_val)
        # get_params() presumably hands back the fitted Keras model — confirm
        # against the concrete_autoencoder package.
        ypred = selector.get_params().predict(X_te)

        fold_auc.append(roc_auc_score(y_te, ypred))
        fold_auprc.append(average_precision_score(y_te, ypred))

        # Persist the indices of the features selected on this fold.
        out_file = os.path.join(dataDir, 'cv_' + model_name + '_fold' + str(fold))
        selected = pd.DataFrame(selector.get_support(indices=True).flatten())
        selected.to_csv(out_file)

    return np.mean(fold_auc), np.var(fold_auc), np.mean(fold_auprc), np.var(fold_auprc)

def matches(best_feat,non_zero):
    """Return the distinct values of ``best_feat`` that also occur in ``non_zero``.

    Parameters
    ----------
    best_feat : sequence
        Candidate values (e.g. selected feature indices).
    non_zero : sequence
        Reference values to compare against.

    Returns
    -------
    list
        The common values in ascending order, each listed once.
    """
    # One entry per (best_feat, non_zero) pair that compares equal, so a value
    # can show up several times at this stage.
    hits = [feat for feat in best_feat for ref in non_zero if feat == ref]

    ordered = np.sort(hits)
    last = len(ordered) - 1

    # Within a sorted run of equal values keep only the final one.
    distinct = []
    for pos, value in enumerate(ordered):
        if pos == last or value < ordered[pos + 1]:
            distinct.append(value)

    return distinct

def get_features(model_names,n_split,k):
    """Read the per-fold selected-feature files into one array.

    For every fold ``f`` in ``range(n_split)`` the file
    ``'./' + model_names[0] + '/cv_' + model_names[1] + '_fold' + str(f)``
    is parsed as comma-separated values; its first row is skipped and its
    second column is stored as row ``f`` of the result.

    Parameters
    ----------
    model_names : sequence of str
        ``model_names[0]`` is the directory, ``model_names[1]`` the tag used
        in the file names.
    n_split : int
        Number of fold files to read.
    k : int
        Number of values expected per fold.

    Returns
    -------
    np.ndarray
        Array of shape ``(n_split, k)``.
    """
    best_feat = np.zeros((n_split, k))

    for fold in range(n_split):
        fold_file = ''.join(['./', model_names[0], '/cv_', model_names[1], '_fold', str(fold)])
        table = np.genfromtxt(fold_file, delimiter=',')
        # Row 0 and column 0 are skipped; presumably the header and index
        # written by DataFrame.to_csv — confirm against the writer.
        best_feat[fold] = table[1:, 1]

    return best_feat

In [None]:
# architecture of NN after concrete selector layer

def G(x):
    """Build the network applied to the output of the concrete selector layer.

    Three ``Dense(64)`` layers, each followed by ``LeakyReLU(0.2)``, and a
    final single-unit ``Dense`` layer with sigmoid activation.

    Parameters
    ----------
    x : tensor
        Output of the selector layer.

    Returns
    -------
    tensor
        Output of the final sigmoid layer.
    """
    for _ in range(3):
        x = Dense(64)(x)
        x = LeakyReLU(0.2)(x)

    return Dense(1,activation='sigmoid')(x)

In [None]:
dataDir = '/Users/utente/Documents/università/tesi - confronto FS/analisi 2-512 features'

path='./data/'
data_name='ring-xor-sum_1000samples-'
num_epochs = 1000
splits = 6
K_feat = 6
dataset = 'ring-xor-sum'

# Per-dataset cross-validation summaries, one entry per value in tot_feats.
cv_auc = []
var_auc = []
cv_auprc = []
var_auprc = []
tot_feats = [6,8,16,32,64,128,256,512]


for i in tot_feats:
    
    X,y = set_data(path,data_name,i)

    # cv_cae returns (mean AUC, var AUC, mean AUPRC, var AUPRC) in that order;
    # unpack in the same order so means and variances are not swapped.
    auccc,vauccc,auprc,vauprc = cv_cae(X,y,num_epochs,splits,K_feat,G,dataset+'_'+str(i)+'_K_cae')
    cv_auc.append(auccc)
    var_auc.append(vauccc)
    cv_auprc.append(auprc)
    var_auprc.append(vauprc)

In [None]:
splits = 6
K_feat = 6
name0 = 'CAE'

feat_res = []
feat_res_2k = []

# Load, for every dataset size, the features selected on each fold.
for i in tot_feats:
    fold_tag = dataset+"_"+str(i)+'_K_cae'
    feat_res.append(get_features([name0,fold_tag],splits,K_feat))

# Indices 0..K_feat-1 are the reference set the selections are compared with
# (presumably the truly relevant columns — confirm against the data generator).
reference_feats = np.arange(K_feat)

# m[row, col]: number of reference features recovered;
# rows follow tot_feats, columns are the folds.
m = np.zeros((len(tot_feats),splits))
for j, per_fold in enumerate(feat_res):
    for k in range(splits):
        m[j,k] = len(matches(per_fold[k],reference_feats))

# Mean and variance over the folds of the number of recovered features.
cv_feat = m.mean(axis=1)
var_feat = m.var(axis=1)

In [None]:
# Save the results
# One spreadsheet row per entry of tot_feats; the columns are, in order:
# cv_auc, var_auc, cv_auprc, var_auprc, cv_feat, var_feat.

summary_rows = [cv_auc,var_auc,cv_auprc,var_auprc,cv_feat,var_feat]
df = pd.DataFrame(summary_rows)
df1 = df.transpose()
df1.to_excel("Acc-Cohen-Auc-Auprc_CAE_K_RING-XOR-SUM.xlsx")

In [None]:
# Run the model selecting 2k features

K_feat = 12
tot_feats = [16,32,64,128,256,512]
dataset = 'ring-xor-sum_2K_'

# Per-dataset cross-validation summaries, one entry per value in tot_feats.
cv_auc = []
var_auc = []
cv_auprc = []
var_auprc = []

for i in tot_feats:
    
    X,y = set_data(path,data_name,i)

    # cv_cae returns (mean AUC, var AUC, mean AUPRC, var AUPRC) in that order;
    # unpack in the same order so means and variances are not swapped.
    auccc,vauccc,auprc,vauprc = cv_cae(X,y,num_epochs,splits,K_feat,G,dataset+'_'+str(i)+'_2K_cae')

    cv_auc.append(auccc)
    var_auc.append(vauccc)
    cv_auprc.append(auprc)
    var_auprc.append(vauprc)

In [None]:
# Best 2K features

# Start from an empty list so re-running this cell does not keep stale entries.
feat_res_2k = []

for i in tot_feats:
    # The 2K run writes its per-fold files with the '_2K_cae' suffix, so the
    # same suffix has to be used when reading them back.
    feat_res_2k.append(get_features([name0,dataset+"_"+str(i)+'_2K_cae'],splits,K_feat))

# m[row, col]: number of reference features recovered;
# rows follow tot_feats, columns are the folds.
m = np.zeros((len(tot_feats),splits))
for j in range(len(tot_feats)):
    for k in range(splits):
        
        # K_feat is 2K here; the selection is compared with indices 0..K-1.
        m[j,k] =len(matches(feat_res_2k[j][k],np.arange(int(K_feat/2))))
        
# best 2K feat averaged on the 6 folds
cv_feat_2k = np.mean(m,axis=1)
var_feat_2k = np.var(m,axis=1)

In [None]:
# Save the results
# One spreadsheet row per entry of tot_feats; the columns are, in order:
# cv_auc, var_auc, cv_auprc, var_auprc, cv_feat_2k, var_feat_2k.

summary_rows = [cv_auc,var_auc,cv_auprc,var_auprc,cv_feat_2k,var_feat_2k]
df = pd.DataFrame(summary_rows)
df1 = df.transpose()
df1.to_excel("Acc-Cohen-Auc-Auprc_CAE_2K_RING-XOR-SUM.xlsx")