In [None]:
import numpy as np
import pandas as pd
import sklearn
import os
from scipy.spatial import distance_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score,roc_auc_score,cohen_kappa_score,average_precision_score

In [None]:
def set_data(path,data_name,k_feat):
    """Load a comma-separated dataset and split it into features and target.

    The file read is ``path + data_name + str(k_feat) + 'feat.csv'`` (plain
    string concatenation, so ``path`` must already end with a separator).

    Returns
    -------
    X : every column except the last one.
    y : the last column.
    """
    csv_file = '{}{}{}feat.csv'.format(path, data_name, k_feat)
    table = np.genfromtxt(csv_file, delimiter=',')

    # The last column holds the target, everything before it the features.
    return table[:, :-1], table[:, -1]

def matches(best_feat,non_zero):
    """Return the entries of ``best_feat`` that also occur in ``non_zero``.

    The result keeps the order of ``best_feat``. An entry is emitted once for
    every equal element found in ``non_zero``, so duplicates in either input
    are repeated in the output.
    """
    return [
        candidate
        for candidate in best_feat
        for reference in non_zero
        if candidate == reference
    ]

def get_features(model_names,n_split,k,
                 base_dir='/Users/utente/Documents/università/tesi - confronto FS/analisi 2-512 features'):
    """Return the indices of the ``k`` highest-scoring features of every fold.

    Parameters
    ----------
    model_names : sequence of two str
        ``model_names[0]`` is the sub-directory of ``base_dir`` holding the
        importance files, ``model_names[1]`` the file-name prefix.
    n_split : int
        Number of folds; files ``<prefix>_fold_<i>_importance.csv`` are read
        for ``i`` in ``range(n_split)``.
    k : int
        Number of top features to keep per fold.
    base_dir : str, optional
        Directory that contains the per-model sub-directories. Defaults to the
        location previously hard-coded in this function.

    Returns
    -------
    numpy.ndarray of shape (n_split, k)
        Row ``i`` lists the feature indices of fold ``i`` ordered from the
        highest to the lowest importance (stored as floats).
    """
    best_feat = np.zeros((n_split, k))

    for i in range(n_split):
        file = os.path.join(base_dir, model_names[0],
                            model_names[1] + '_fold_' + str(i) + '_importance.csv')

        imp = np.atleast_1d(np.genfromtxt(file, delimiter=','))
        if imp.ndim > 1:
            # The writers open these files in append mode, so a re-run leaves
            # one row per run; rank on the most recently written row.
            imp = imp[-1]

        # Reverse first, then slice: unlike [-k:], this is also correct for k == 0.
        best_feat[i] = imp.argsort()[::-1][:k]

    return best_feat



def relief(X, y, m=None):
    """Score features with the Relief algorithm (nearest hit / nearest miss).

    For each sampled instance the nearest instance with the same label (hit)
    and the nearest instance with a different label (miss) are located using
    the Euclidean distance over all features. Every feature score is decreased
    by the range-normalised distance to the hit and increased by the
    range-normalised distance to the miss, averaged over ``m`` instances.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Objects exposing ``.values`` (e.g. pandas) are unwrapped first.
    y : array-like of shape (n_samples,)
        Labels; anything not equal to the label of the sampled instance counts
        as a miss.
    m : int, optional
        Number of instances sampled without replacement through the global
        ``np.random`` state. Defaults to all instances.

    Returns
    -------
    numpy.ndarray of shape (n_features,)
        Relief score of every feature. Constant features score exactly 0.

    Raises
    ------
    ValueError
        If a sampled instance has no other instance of its own class or no
        instance of a different class.
    """
    if hasattr(X, 'values'):
        X = X.values
    if hasattr(y, 'values'):
        y = y.values
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples, n_features = X.shape

    D = distance_matrix(X, X)
    ranges = np.max(X, axis=0) - np.min(X, axis=0)
    # A constant feature has range 0; dividing by it would give NaN scores,
    # which argsort places last, i.e. among the "best" features. Its
    # differences are all 0 anyway, so any non-zero divisor yields score 0.
    ranges = np.where(ranges == 0, 1, ranges)

    if m is None:
        m = n_samples

    scores = np.zeros(n_features)
    for i in np.random.choice(n_samples, m, replace=False):
        same_class = (y == y[i])
        other_class = ~same_class
        # The instance itself is never its own neighbour.
        same_class[i] = False
        other_class[i] = False

        hits = np.flatnonzero(same_class)
        misses = np.flatnonzero(other_class)
        if hits.size == 0 or misses.size == 0:
            raise ValueError(
                'relief needs at least one other sample of the same class and '
                'one sample of a different class for sample index %d' % i)

        # argmin returns the first minimum, i.e. the lowest index among ties,
        # which is what the sequential strict "<" scan selects as well.
        nhit = hits[np.argmin(D[i, hits])]
        nmiss = misses[np.argmin(D[i, misses])]

        scores += (-np.abs(X[i] - X[nhit]) / ranges
                   + np.abs(X[i] - X[nmiss]) / ranges) / m
    return scores


def cv_relief(X,y,n,dataDir,m,model_name = 'relief'):
    """Run Relief on the training part of every K-fold split and save the scores.

    Parameters
    ----------
    X, y : numpy arrays
        Features and labels; they are indexed with the fold index arrays.
    n : int
        Number of folds of the (unshuffled) ``KFold`` split.
    dataDir : str
        Base output directory; a ``Relief`` sub-directory is created in it.
    m : int or None
        Number of Relief iterations, forwarded to ``relief``.
    model_name : str
        Prefix of the output files
        ``<model_name>_fold_<i>_importance.csv``.

    Returns
    -------
    None
        Results are only written to disk, one comma-separated row per call.
        Files are opened in append mode, so calling this again with the same
        ``model_name`` adds a further row to each file.
    """
    kf = KFold(n_splits=n)

    # makedirs(exist_ok=True) creates missing parents and does not hide
    # unrelated errors the way a bare "except" around os.stat would.
    outputDir = os.path.join(dataDir, 'Relief')
    os.makedirs(outputDir, exist_ok=True)

    rel_importances = np.zeros((n, X.shape[1]))

    # Only the training indices are needed: Relief is fitted, nothing is scored.
    for i, (tr_idx, _) in enumerate(kf.split(X)):
        rel_importances[i] = relief(X[tr_idx], y[tr_idx], m)

        out_file = os.path.join(outputDir, model_name + '_fold_' + str(i) + '_importance.csv')
        with open(out_file, "a+") as myfile:
            myfile.write(','.join(str(x) for x in rel_importances[i]) + '\n')

    return None

def cv_RF_training(X,y,n,dataDir,model_name = 'rf'):
    """Cross-validate a random forest and save importances and metrics.

    For every fold of an unshuffled ``KFold`` split a 500-tree
    ``RandomForestClassifier`` (``random_state=42``) is fitted on the training
    part; its feature importances are appended to
    ``<dataDir>/RandomForest/<model_name>_fold_<i>_importance.csv`` and
    accuracy, Cohen's kappa, ROC AUC and average precision are computed on the
    held-out part. The per-fold metric lists are appended to the
    ``cv_RandomForest_<model_name>_{accuracy,cohen,auc,auprc}.csv`` files.

    Parameters
    ----------
    X, y : numpy arrays
        Features and labels; column 1 of ``predict_proba`` is used as the
        positive-class score, so a binary problem is assumed.
    n : int
        Number of folds.
    dataDir : str
        Base output directory; a ``RandomForest`` sub-directory is created.
    model_name : str
        Prefix used in all output file names.

    Returns
    -------
    tuple of float
        ``(mean AUC, mean AUPRC, variance of AUC, variance of AUPRC)`` over
        the folds.
    """
    kf = KFold(n_splits=n)

    outputDir = os.path.join(dataDir, 'RandomForest')
    os.makedirs(outputDir, exist_ok=True)

    rf_importances = np.zeros((n, X.shape[1]))

    # Per-fold metrics. "scores" (accuracy) and "cohen" were written to disk
    # below without ever being computed, which raised NameError after training.
    scores = []
    cohen = []
    aucc = []
    auprc = []

    for i, (tr_idx, te_idx) in enumerate(kf.split(X)):
        X_tr, X_te = X[tr_idx], X[te_idx]
        y_tr, y_te = y[tr_idx], y[te_idx]

        clf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)
        clf.fit(X_tr, y_tr)

        rf_importances[i] = clf.feature_importances_

        y_pred = clf.predict(X_te)
        y_predprob = clf.predict_proba(X_te)[:, 1]
        scores.append(accuracy_score(y_te, y_pred))
        cohen.append(cohen_kappa_score(y_te, y_pred))
        aucc.append(roc_auc_score(y_te, y_predprob))
        auprc.append(average_precision_score(y_te, y_predprob))

        out_file = os.path.join(outputDir, model_name + '_fold_' + str(i) + '_importance.csv')
        with open(out_file, "a+") as myfile:
            myfile.write(','.join(str(x) for x in rf_importances[i]) + '\n')

    for suffix, values in (('accuracy', scores), ('cohen', cohen),
                           ('auc', aucc), ('auprc', auprc)):
        metric_file = os.path.join(
            outputDir, 'cv_RandomForest_' + model_name + '_' + suffix + '.csv')
        with open(metric_file, "a+") as myfile:
            myfile.write(str(values))

    return np.mean(aucc), np.mean(auprc), np.var(aucc), np.var(auprc)

In [None]:
# RING RELIEF
# Runs cross-validated Relief on every ring dataset, then counts, fold by fold,
# how many of the feature indices 0..K_feat-1 appear among the top-K and the
# top-2K ranked features (presumably those are the informative columns - TODO confirm).

dataDir = '/Users/utente/Documents/università/tesi - confronto FS/analisi 2-512 features'
path = './data/'
data_name = 'ring_1000samples-'
m = 5  # iterations of the Relief algorithm
tot_feats = [2, 4, 8, 16, 32, 64, 128, 256, 512]
splits = 6
K_feat = 2
name0 = 'Relief'
name1 = '_relief'
dataset = 'ring_'

# One importance file per fold and dataset size (splits == 6 folds).
for i in tot_feats:
    X, y = set_data(path, data_name, i)
    cv_relief(X, y, splits, dataDir, m, dataset + str(i) + name1)

# Best K features
feat_res = [
    get_features([name0, dataset + str(i) + name1], splits, K_feat)
    for i in tot_feats
]

# From here on m is rebound to the match-count matrix:
# rows = dataset sizes in tot_feats, columns = folds.
m = np.zeros((len(tot_feats), splits))
for j, k in np.ndindex(m.shape):
    m[j, k] = len(matches(feat_res[j][k], np.arange(K_feat)))

# Top-K hits: mean and variance over the folds.
cv_feat = m.mean(axis=1)
var_feat = m.var(axis=1)

# Best 2K features, capped at the number of features the dataset has.
feat_res_2k = [
    get_features([name0, dataset + str(i) + name1], splits, min(i, int(2 * K_feat)))
    for i in tot_feats
]

m = np.zeros((len(tot_feats), splits))
for j, k in np.ndindex(m.shape):
    m[j, k] = len(matches(feat_res_2k[j][k], np.arange(int(K_feat))))

cv_feat_2k = m.mean(axis=1)
var_feat_2k = m.var(axis=1)

In [None]:
# Save results
# One row per dataset size; columns, in order: mean and variance of the top-K
# match counts, then mean and variance of the top-2K match counts.
df = pd.DataFrame([cv_feat, var_feat, cv_feat_2k, var_feat_2k])
df1 = df.transpose()
df1.to_excel("BESTK_RELIEF_RING.xlsx")

In [None]:
# XOR RELIEF
# Runs cross-validated Relief on every xor dataset, then counts, fold by fold,
# how many of the feature indices 0..K_feat-1 appear among the top-K and the
# top-2K ranked features (presumably those are the informative columns - TODO confirm).

dataDir = '/Users/utente/Documents/università/tesi - confronto FS/analisi 2-512 features'
path = './data/'
data_name = 'xor_1000samples-'
m = 5  # iterations of the Relief algorithm
tot_feats = [2, 4, 8, 16, 32, 64, 128, 256, 512]
splits = 6
K_feat = 2
name0 = 'Relief'
name1 = '_relief'
dataset = 'xor_'

# One importance file per fold and dataset size (splits == 6 folds).
for i in tot_feats:
    X, y = set_data(path, data_name, i)
    cv_relief(X, y, splits, dataDir, m, dataset + str(i) + name1)

# Best K features
feat_res = [
    get_features([name0, dataset + str(i) + name1], splits, K_feat)
    for i in tot_feats
]

# From here on m is rebound to the match-count matrix:
# rows = dataset sizes in tot_feats, columns = folds.
m = np.zeros((len(tot_feats), splits))
for j, k in np.ndindex(m.shape):
    m[j, k] = len(matches(feat_res[j][k], np.arange(K_feat)))

# Top-K hits: mean and variance over the folds.
cv_feat = m.mean(axis=1)
var_feat = m.var(axis=1)

# Best 2K features, capped at the number of features the dataset has.
feat_res_2k = [
    get_features([name0, dataset + str(i) + name1], splits, min(i, int(2 * K_feat)))
    for i in tot_feats
]

m = np.zeros((len(tot_feats), splits))
for j, k in np.ndindex(m.shape):
    m[j, k] = len(matches(feat_res_2k[j][k], np.arange(int(K_feat))))

cv_feat_2k = m.mean(axis=1)
var_feat_2k = m.var(axis=1)

In [None]:
# Save results
# One row per dataset size; columns, in order: mean and variance of the top-K
# match counts, then mean and variance of the top-2K match counts.
df = pd.DataFrame([cv_feat, var_feat, cv_feat_2k, var_feat_2k])
df1 = df.transpose()
df1.to_excel("BESTK_RELIEF_XOR.xlsx")

In [None]:
# RING+XOR RELIEF
# Runs cross-validated Relief on every ring+xor dataset, then counts, fold by fold,
# how many of the feature indices 0..K_feat-1 appear among the top-K and the
# top-2K ranked features (presumably those are the informative columns - TODO confirm).

dataDir = '/Users/utente/Documents/università/tesi - confronto FS/analisi 2-512 features'
path = './data/'
data_name = 'ring+xor_1000samples-'
m = 5  # iterations of the Relief algorithm
tot_feats = [4, 8, 16, 32, 64, 128, 256, 512]
splits = 6
K_feat = 4
name0 = 'Relief'
name1 = '_relief'
dataset = 'ring+xor_'

# One importance file per fold and dataset size (splits == 6 folds).
for i in tot_feats:
    X, y = set_data(path, data_name, i)
    cv_relief(X, y, splits, dataDir, m, dataset + str(i) + name1)

# Best K features
feat_res = [
    get_features([name0, dataset + str(i) + name1], splits, K_feat)
    for i in tot_feats
]

# From here on m is rebound to the match-count matrix:
# rows = dataset sizes in tot_feats, columns = folds.
m = np.zeros((len(tot_feats), splits))
for j, k in np.ndindex(m.shape):
    m[j, k] = len(matches(feat_res[j][k], np.arange(K_feat)))

# Top-K hits: mean and variance over the folds.
cv_feat = m.mean(axis=1)
var_feat = m.var(axis=1)

# Best 2K features, capped at the number of features the dataset has.
feat_res_2k = [
    get_features([name0, dataset + str(i) + name1], splits, min(i, int(2 * K_feat)))
    for i in tot_feats
]

m = np.zeros((len(tot_feats), splits))
for j, k in np.ndindex(m.shape):
    m[j, k] = len(matches(feat_res_2k[j][k], np.arange(int(K_feat))))

cv_feat_2k = m.mean(axis=1)
var_feat_2k = m.var(axis=1)

In [None]:
# Save results
# One row per dataset size; columns, in order: mean and variance of the top-K
# match counts, then mean and variance of the top-2K match counts.
df = pd.DataFrame([cv_feat, var_feat, cv_feat_2k, var_feat_2k])
df1 = df.transpose()
df1.to_excel("BESTK_RELIEF_RING+XOR.xlsx")

In [None]:
# RING+XOR+SUM RELIEF
# Runs cross-validated Relief on every ring-xor-sum dataset, then counts, fold by fold,
# how many of the feature indices 0..K_feat-1 appear among the top-K and the
# top-2K ranked features (presumably those are the informative columns - TODO confirm).

dataDir = '/Users/utente/Documents/università/tesi - confronto FS/analisi 2-512 features'
path = './data/'
data_name = 'ring-xor-sum_1000samples-'
m = 5  # iterations of the Relief algorithm
tot_feats = [6, 8, 16, 32, 64, 128, 256, 512]
splits = 6
K_feat = 6
name0 = 'Relief'
name1 = '_relief'
dataset = 'ring-xor-sum_'

# One importance file per fold and dataset size (splits == 6 folds).
for i in tot_feats:
    X, y = set_data(path, data_name, i)
    cv_relief(X, y, splits, dataDir, m, dataset + str(i) + name1)

# Best K features
feat_res = [
    get_features([name0, dataset + str(i) + name1], splits, K_feat)
    for i in tot_feats
]

# From here on m is rebound to the match-count matrix:
# rows = dataset sizes in tot_feats, columns = folds.
m = np.zeros((len(tot_feats), splits))
for j, k in np.ndindex(m.shape):
    m[j, k] = len(matches(feat_res[j][k], np.arange(K_feat)))

# Top-K hits: mean and variance over the folds.
cv_feat = m.mean(axis=1)
var_feat = m.var(axis=1)

# Best 2K features, capped at the number of features the dataset has
# (6 for the 6-feature set, 8 for the 8-feature set, 12 otherwise).
feat_res_2k = [
    get_features([name0, dataset + str(i) + name1], splits, min(i, int(2 * K_feat)))
    for i in tot_feats
]

m = np.zeros((len(tot_feats), splits))
for j, k in np.ndindex(m.shape):
    m[j, k] = len(matches(feat_res_2k[j][k], np.arange(int(K_feat))))

cv_feat_2k = m.mean(axis=1)
var_feat_2k = m.var(axis=1)

In [None]:
# Save results
# One row per dataset size; columns, in order: mean and variance of the top-K
# match counts, then mean and variance of the top-2K match counts.
df = pd.DataFrame([cv_feat, var_feat, cv_feat_2k, var_feat_2k])
df1 = df.transpose()
df1.to_excel("BESTK_RELIEF_RING-XOR-SUM.xlsx")

In [None]:
# RING RANDOM FOREST
# Cross-validates a random forest on every ring dataset and collects the mean
# and variance of AUC and AUPRC per dataset size.

# Must be the directory get_features reads from ('.../analisi 2-512 features');
# writing to '.../analisi 2-64 features' left the top-K analysis without the
# importance files produced by this run.
dataDir = '/Users/utente/Documents/università/tesi - confronto FS/analisi 2-512 features'

path = './data/'
data_name = 'ring_1000samples-'
tot_feats = [2, 4, 8, 16, 32, 64, 128, 256, 512]

cv_auc = []
var_auc = []
cv_auprc = []
var_auprc = []
# Placeholders; these four are overwritten by the top-K analysis.
cv_feat = []
var_feat = []
cv_feat_2k = []
var_feat_2k = []

for i in tot_feats:
    X, y = set_data(path, data_name, i)

    # 6-fold CV; returns (mean AUC, mean AUPRC, var AUC, var AUPRC).
    auccc, auprc, vauccc, vauprc = cv_RF_training(X, y, 6, dataDir, 'ring' + str(i) + '_rf')

    var_auc.append(vauccc)
    cv_auc.append(auccc)
    var_auprc.append(vauprc)
    cv_auprc.append(auprc)

In [None]:
# Top-K / top-2K analysis of the ring random-forest importances.
splits = 6
K_feat = 2
name0 = 'RandomForest'
name1 = '_rf'
dataset = 'ring'

# Best K features
# The lists are rebuilt from scratch: appending to feat_res / feat_res_2k left
# over from an earlier section made feat_res[j] point at that section's results.
feat_res = [
    get_features([name0, dataset + str(i) + name1], splits, K_feat)
    for i in tot_feats
]

# rows = dataset sizes in tot_feats, columns = folds
m = np.zeros((len(tot_feats), splits))
for j, k in np.ndindex(m.shape):
    m[j, k] = len(matches(feat_res[j][k], np.arange(K_feat)))

# Top-K hits: mean and variance over the folds.
cv_feat = np.mean(m, axis=1)
var_feat = np.var(m, axis=1)

# Best 2K features, capped at the number of features the dataset has.
feat_res_2k = [
    get_features([name0, dataset + str(i) + name1], splits, min(i, int(2 * K_feat)))
    for i in tot_feats
]

m = np.zeros((len(tot_feats), splits))
for j, k in np.ndindex(m.shape):
    m[j, k] = len(matches(feat_res_2k[j][k], np.arange(int(K_feat))))

cv_feat_2k = np.mean(m, axis=1)
var_feat_2k = np.var(m, axis=1)

In [None]:
# Save results
# One row per dataset size; columns, in order: mean/variance of AUC,
# mean/variance of AUPRC, mean/variance of top-K matches, mean/variance of
# top-2K matches.
df = pd.DataFrame([cv_auc, var_auc, cv_auprc, var_auprc,
                   cv_feat, var_feat, cv_feat_2k, var_feat_2k])
df1 = df.transpose()
df1.to_excel("Auc-Auprc_RANDOMFOREST_RING.xlsx")

In [None]:
# XOR RANDOM FOREST
# Cross-validates a random forest on every xor dataset and collects the mean
# and variance of AUC and AUPRC per dataset size.

# Must be the directory get_features reads from ('.../analisi 2-512 features');
# writing to '.../analisi 2-64 features' left the top-K analysis without the
# importance files produced by this run.
dataDir = '/Users/utente/Documents/università/tesi - confronto FS/analisi 2-512 features'

path = './data/'
data_name = 'xor_1000samples-'
tot_feats = [2, 4, 8, 16, 32, 64, 128, 256, 512]

cv_auc = []
var_auc = []
cv_auprc = []
var_auprc = []
# Placeholders; these four are overwritten by the top-K analysis.
cv_feat = []
var_feat = []
cv_feat_2k = []
var_feat_2k = []

for i in tot_feats:
    X, y = set_data(path, data_name, i)

    # 6-fold CV; returns (mean AUC, mean AUPRC, var AUC, var AUPRC).
    auccc, auprc, vauccc, vauprc = cv_RF_training(X, y, 6, dataDir, 'xor' + str(i) + '_rf')

    var_auc.append(vauccc)
    cv_auc.append(auccc)
    var_auprc.append(vauprc)
    cv_auprc.append(auprc)

In [None]:
# Top-K / top-2K analysis of the xor random-forest importances.
splits = 6
K_feat = 2
name0 = 'RandomForest'
name1 = '_rf'
dataset = 'xor'

# Best K features
# The lists are rebuilt from scratch: appending to feat_res / feat_res_2k left
# over from an earlier section made feat_res[j] point at that section's results.
feat_res = [
    get_features([name0, dataset + str(i) + name1], splits, K_feat)
    for i in tot_feats
]

# rows = dataset sizes in tot_feats, columns = folds
m = np.zeros((len(tot_feats), splits))
for j, k in np.ndindex(m.shape):
    m[j, k] = len(matches(feat_res[j][k], np.arange(K_feat)))

# Top-K hits: mean and variance over the folds.
cv_feat = np.mean(m, axis=1)
var_feat = np.var(m, axis=1)

# Best 2K features, capped at the number of features the dataset has.
feat_res_2k = [
    get_features([name0, dataset + str(i) + name1], splits, min(i, int(2 * K_feat)))
    for i in tot_feats
]

m = np.zeros((len(tot_feats), splits))
for j, k in np.ndindex(m.shape):
    m[j, k] = len(matches(feat_res_2k[j][k], np.arange(int(K_feat))))

cv_feat_2k = np.mean(m, axis=1)
var_feat_2k = np.var(m, axis=1)

In [None]:
# Save results
# One row per dataset size; columns, in order: mean/variance of AUC,
# mean/variance of AUPRC, mean/variance of top-K matches, mean/variance of
# top-2K matches.
df = pd.DataFrame([cv_auc, var_auc, cv_auprc, var_auprc,
                   cv_feat, var_feat, cv_feat_2k, var_feat_2k])
df1 = df.transpose()
df1.to_excel("Auc-Auprc_RANDOMFOREST_XOR.xlsx")

In [None]:
# RING XOR RANDOM FOREST
# Cross-validates a random forest on every ring+xor dataset and collects the
# mean and variance of AUC and AUPRC per dataset size.

# Must be the directory get_features reads from ('.../analisi 2-512 features');
# writing to '.../analisi 2-64 features' left the top-K analysis without the
# importance files produced by this run.
dataDir = '/Users/utente/Documents/università/tesi - confronto FS/analisi 2-512 features'

path = './data/'
data_name = 'ring+xor_1000samples-'
tot_feats = [4, 8, 16, 32, 64, 128, 256, 512]

cv_auc = []
var_auc = []
cv_auprc = []
var_auprc = []
# Placeholders; these four are overwritten by the top-K analysis.
cv_feat = []
var_feat = []
cv_feat_2k = []
var_feat_2k = []

for i in tot_feats:
    X, y = set_data(path, data_name, i)

    # 6-fold CV; returns (mean AUC, mean AUPRC, var AUC, var AUPRC).
    auccc, auprc, vauccc, vauprc = cv_RF_training(X, y, 6, dataDir, 'ring-xor' + str(i) + '_rf')

    var_auc.append(vauccc)
    cv_auc.append(auccc)
    var_auprc.append(vauprc)
    cv_auprc.append(auprc)

In [None]:
# Top-K / top-2K analysis of the ring-xor random-forest importances.
splits = 6
K_feat = 4
name0 = 'RandomForest'
name1 = '_rf'
dataset = 'ring-xor'

# Best K features
# The lists are rebuilt from scratch: appending to feat_res / feat_res_2k left
# over from an earlier section made feat_res[j] point at that section's results.
feat_res = [
    get_features([name0, dataset + str(i) + name1], splits, K_feat)
    for i in tot_feats
]

# rows = dataset sizes in tot_feats, columns = folds
m = np.zeros((len(tot_feats), splits))
for j, k in np.ndindex(m.shape):
    m[j, k] = len(matches(feat_res[j][k], np.arange(K_feat)))

# Top-K hits: mean and variance over the folds.
cv_feat = np.mean(m, axis=1)
var_feat = np.var(m, axis=1)

# Best 2K features, capped at the number of features the dataset has.
feat_res_2k = [
    get_features([name0, dataset + str(i) + name1], splits, min(i, int(2 * K_feat)))
    for i in tot_feats
]

m = np.zeros((len(tot_feats), splits))
for j, k in np.ndindex(m.shape):
    m[j, k] = len(matches(feat_res_2k[j][k], np.arange(int(K_feat))))

cv_feat_2k = np.mean(m, axis=1)
var_feat_2k = np.var(m, axis=1)

In [None]:
# Save results
# One row per dataset size; columns, in order: mean/variance of AUC,
# mean/variance of AUPRC, mean/variance of top-K matches, mean/variance of
# top-2K matches.
df = pd.DataFrame([cv_auc, var_auc, cv_auprc, var_auprc,
                   cv_feat, var_feat, cv_feat_2k, var_feat_2k])
df1 = df.transpose()
df1.to_excel("Auc-Auprc_RANDOMFOREST_RING-XOR.xlsx")

In [None]:
# RING-XOR-SUM RANDOM FOREST
# Cross-validates a random forest on every ring-xor-sum dataset and collects
# the mean and variance of AUC and AUPRC per dataset size.

dataDir = '/Users/utente/Documents/università/tesi - confronto FS/analisi 2-512 features'

path = './data/'
data_name = 'ring-xor-sum_1000samples-'

tot_feats = [6, 8, 16, 32, 64, 128, 256, 512]

# Per-dataset-size summaries of the cross-validated metrics.
cv_auc, var_auc = [], []
cv_auprc, var_auprc = [], []
# Placeholders; these four are overwritten by the top-K analysis.
cv_feat, var_feat = [], []
cv_feat_2k, var_feat_2k = [], []

for i in tot_feats:
    X, y = set_data(path, data_name, i)

    # 6-fold CV; the call returns (mean AUC, mean AUPRC, var AUC, var AUPRC).
    auccc, auprc, vauccc, vauprc = cv_RF_training(
        X, y, 6, dataDir, 'ring-xor-sum' + str(i) + '_rf')

    cv_auc.append(auccc)
    var_auc.append(vauccc)
    cv_auprc.append(auprc)
    var_auprc.append(vauprc)

In [None]:
# Top-K / top-2K analysis of the ring-xor-sum random-forest importances.
splits = 6
K_feat = 6
name0 = 'RandomForest'
name1 = '_rf'
# Has to match the model name used when the importances were written
# ('ring-xor-sum' + str(i) + '_rf'); 'ring-xor-sum_auprc' names files that the
# training loop never creates.
dataset = 'ring-xor-sum'

# Best K features
# The lists are rebuilt from scratch: appending to feat_res / feat_res_2k left
# over from an earlier section made feat_res[j] point at that section's results.
feat_res = [
    get_features([name0, dataset + str(i) + name1], splits, K_feat)
    for i in tot_feats
]

# rows = dataset sizes in tot_feats, columns = folds
m = np.zeros((len(tot_feats), splits))
for j, k in np.ndindex(m.shape):
    m[j, k] = len(matches(feat_res[j][k], np.arange(K_feat)))

# Top-K hits: mean and variance over the folds.
cv_feat = np.mean(m, axis=1)
var_feat = np.var(m, axis=1)

# Best 2K features, capped at the number of features the dataset has
# (6 for the 6-feature set, 8 for the 8-feature set, 12 otherwise).
feat_res_2k = [
    get_features([name0, dataset + str(i) + name1], splits, min(i, int(2 * K_feat)))
    for i in tot_feats
]

m = np.zeros((len(tot_feats), splits))
for j, k in np.ndindex(m.shape):
    m[j, k] = len(matches(feat_res_2k[j][k], np.arange(int(K_feat))))

cv_feat_2k = np.mean(m, axis=1)
var_feat_2k = np.var(m, axis=1)

In [None]:
# Save results
# One row per dataset size; columns, in order: mean/variance of AUC,
# mean/variance of AUPRC, mean/variance of top-K matches, mean/variance of
# top-2K matches.
df = pd.DataFrame([cv_auc, var_auc, cv_auprc, var_auprc,
                   cv_feat, var_feat, cv_feat_2k, var_feat_2k])
df1 = df.transpose()
df1.to_excel("Auc-Auprc_RANDOMFOREST_RING-XOR-SUM.xlsx")