In [1]:
import pandas as pd
import numpy as np
import scipy.stats
import sklearn.linear_model
import sklearn.neighbors

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
sns.set_context('talk')

import warnings
warnings.filterwarnings("ignore")

import mlxtend.feature_selection

In [2]:
full_data = pd.read_csv('full_data_short.csv')

In [3]:
train_data = pd.read_csv('train.csv')
val_data = pd.read_csv('val.csv')
test_data = pd.read_csv('test.csv')

In [4]:
list_full_data_columns = list(full_data.columns)
list_train_data_columns = list(train_data.columns)

In [5]:
list_dropped_features = list(set(list_full_data_columns).difference(set(list_train_data_columns)))

In [6]:
print('train_data shape:',np.shape(np.array(train_data)))
print('val_data shape:',np.shape(np.array(val_data)))
print('test_data shape:',np.shape(np.array(test_data)))
print('full_data shape:',np.shape(np.array(full_data)))

train_data shape: (202, 1139)
val_data shape: (36, 1139)
test_data shape: (42, 1139)
full_data shape: (280, 1347)


In [7]:
train_AD_healthy = train_data.loc[(train_data['group'] == 'Healthy') | (train_data['group'] == 'AD_MCI')]
train_PD_healthy = train_data.loc[(train_data['group'] == 'Healthy') | (train_data['group'] == 'PD')]
train_PDMCI_healthy = train_data.loc[(train_data['group'] == 'Healthy') | (train_data['group'] == 'PD_MCI_LBD')]

In [8]:
val_AD_healthy = val_data.loc[(val_data['group'] == 'Healthy') | (val_data['group'] == 'AD_MCI')]
val_PD_healthy = val_data.loc[(val_data['group'] == 'Healthy') | (val_data['group'] == 'PD')]
val_PDMCI_healthy = train_data.loc[(train_data['group'] == 'Healthy') | (train_data['group'] == 'PD_MCI_LBD')]

In [9]:
test_AD_healthy = test_data.loc[(test_data['group'] == 'Healthy') | (test_data['group'] == 'AD_MCI')]
test_PD_healthy = test_data.loc[(test_data['group'] == 'Healthy') | (test_data['group'] == 'PD')]
test_PDMCI_healthy = test_data.loc[(test_data['group'] == 'Healthy') | (test_data['group'] == 'PD_MCI_LBD')]

In [10]:
X_train_AD_healthy = train_AD_healthy.drop(columns='group')
y_train_AD_healthy = train_AD_healthy['group']
X_val_AD_healthy = val_AD_healthy.drop(columns='group')
y_val_AD_healthy = val_AD_healthy['group']
X_test_AD_healthy = test_AD_healthy.drop(columns='group')
y_test_AD_healthy = test_AD_healthy['group']


X_train_PD_healthy = train_PD_healthy.drop(columns='group')
y_train_PD_healthy = train_PD_healthy['group']
X_val_PD_healthy = val_PD_healthy.drop(columns='group')
y_val_PD_healthy = val_PD_healthy['group']
X_test_PD_healthy = test_PD_healthy.drop(columns='group')
y_test_PD_healthy = test_PD_healthy['group']


X_train_PDMCI_healthy = train_PDMCI_healthy.drop(columns='group')
y_train_PDMCI_healthy = train_PDMCI_healthy['group']
X_val_PDMCI_healthy = val_PDMCI_healthy.drop(columns='group')
y_val_PDMCI_healthy = val_PDMCI_healthy['group']
X_test_PDMCI_healthy = test_PDMCI_healthy.drop(columns='group')
y_test_PDMCI_healthy = test_PDMCI_healthy['group']

In [11]:
train_AD_healthy['group'].value_counts()

Healthy    110
AD_MCI      41
Name: group, dtype: int64

In [76]:
print('AD_healthy')
print('train\n',np.shape(np.array(X_train_AD_healthy)),'\n',np.shape(np.array(y_train_AD_healthy)),'\n',train_AD_healthy['group'].value_counts())
print('val\n',np.shape(np.array(X_val_AD_healthy)),'\n',np.shape(np.array(y_val_AD_healthy)),'\n',val_AD_healthy['group'].value_counts())
print('test\n',np.shape(np.array(X_test_AD_healthy)),'\n',np.shape(np.array(y_test_AD_healthy)),'\n',test_AD_healthy['group'].value_counts())

print('\nPD_healthy')
print('train\n',np.shape(np.array(X_train_PD_healthy)),'\n',np.shape(np.array(y_train_PD_healthy)),'\n',train_PD_healthy['group'].value_counts())
print('val\n',np.shape(np.array(X_val_PD_healthy)),'\n',np.shape(np.array(y_val_PD_healthy)),'\n',val_PD_healthy['group'].value_counts())
print('test\n',np.shape(np.array(X_test_PD_healthy)),'\n',np.shape(np.array(y_test_PD_healthy)),'\n',test_PD_healthy['group'].value_counts())

print('\nPDMCI_healthy')
print('train\n',np.shape(np.array(X_train_PDMCI_healthy)),'\n',np.shape(np.array(y_train_PDMCI_healthy)),'\n',train_PDMCI_healthy['group'].value_counts())
print('val\n',np.shape(np.array(X_val_PDMCI_healthy)),'\n',np.shape(np.array(y_val_PDMCI_healthy)),'\n',val_PDMCI_healthy['group'].value_counts())
print('test\n',np.shape(np.array(X_test_PDMCI_healthy)),'\n',np.shape(np.array(y_test_PDMCI_healthy)),'\n',test_PDMCI_healthy['group'].value_counts())

AD_healthy
train
 (151, 1138) 
 (151,) 
 Healthy    110
AD_MCI      41
Name: group, dtype: int64
val
 (26, 1138) 
 (26,) 
 Healthy    22
AD_MCI      4
Name: group, dtype: int64
test
 (30, 1138) 
 (30,) 
 Healthy    24
AD_MCI      6
Name: group, dtype: int64

PD_healthy
train
 (140, 1138) 
 (140,) 
 Healthy    110
PD          30
Name: group, dtype: int64
val
 (24, 1138) 
 (24,) 
 Healthy    22
PD          2
Name: group, dtype: int64
test
 (28, 1138) 
 (28,) 
 Healthy    24
PD          4
Name: group, dtype: int64

PDMCI_healthy
train
 (131, 1138) 
 (131,) 
 Healthy       110
PD_MCI_LBD     21
Name: group, dtype: int64
val
 (131, 1138) 
 (131,) 
 Healthy       110
PD_MCI_LBD     21
Name: group, dtype: int64
test
 (32, 1138) 
 (32,) 
 Healthy       24
PD_MCI_LBD     8
Name: group, dtype: int64


Define model to feature select with:

In [77]:
model = sklearn.linear_model.RidgeClassifier()
sfs = mlxtend.feature_selection.SequentialFeatureSelector(model, k_features=1118, forward=False,verbose=2,n_jobs=-1)

Start with AD_MCI

In [78]:
sfs_AD = sfs.fit(X_train_AD_healthy,y_train_AD_healthy)
bottom20_AD = sfs_AD.k_feature_names_

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 288 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 1138 out of 1138 | elapsed:    6.7s finished

[2023-02-26 07:52:05] Features: 1137/1118 -- score: 0.8879569892473119[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 432 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 1114 out of 1137 | elapsed:    5.4s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 1137 out of 1137 | elapsed:    5.4s finished

[2023-02-26 07:52:10] Features: 1136/1118 -- score: 0.9070967741935485[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 432 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Don

In [86]:
bottom20_AD = sfs_AD.k_feature_names_
bottom20_AD = list(bottom20_AD)
total = list(X_train_AD_healthy.columns)
bottom20_AD = list(set(total).difference(set(bottom20_AD)))
print(bottom20_AD)

['F13A', 'HV124', 'PLXA4', 'DAF', 'ARMD4', 'KCC2A', 'VMO1', 'PCDGL', 'PAPP2', 'B2L12', 'NDST1', 'FGL2', 'LVX54', 'PTPRR', 'PSA1', 'AGRB1', 'B3GN2', 'PAL4A', 'F174A', 'A0A075B7D0']


In [52]:
sfs_PD = sfs.fit(X_train_PD_healthy,y_train_PD_healthy)
bottom20_PD = sfs_PD.k_feature_names_

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 288 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 1115 out of 1138 | elapsed:    6.6s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 1138 out of 1138 | elapsed:    6.7s finished

[2023-02-26 06:58:19] Features: 1137/1118 -- score: 0.7857142857142858[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 432 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 1137 out of 1137 | elapsed:    5.3s finished

[2023-02-26 06:58:24] Features: 1136/1118 -- score: 0.7928571428571429[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 432 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Don

### This current bottom20_PD is not correct. You would have to re-run backward selection to get the correct list unfortunately. See below for the correct list

In [111]:
bottom20_PD = sfs_PD.k_feature_names_
bottom20_PD = list(bottom20_PD)
total = list(X_train_PD_healthy.columns)
bottom20_PD = list(set(total).difference(set(bottom20_PD)))
print(bottom20_PD)

['IGHD', 'A0A0G2JRQ6', 'DAF', 'A0A2R8Y422', 'A0A0J9YY99', 'RECK', 'PCDGL', 'B2L12', 'C1QT5', 'NDST1', 'S4R460', 'VWA1', 'FGL2', 'MATN3', 'OBSL1', 'A0A087X1L8', 'PSA1', 'B3GN2', 'PAL4A', 'F174A']


In [63]:
sfs_PDMCI = sfs.fit(X_train_PDMCI_healthy,y_train_PDMCI_healthy)
top20_PDMCI = sfs_PDMCI.k_feature_names_

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 288 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 1115 out of 1138 | elapsed:    6.4s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 1138 out of 1138 | elapsed:    6.5s finished

[2023-02-26 07:14:07] Features: 1137/1118 -- score: 0.8555555555555555[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 432 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 1114 out of 1137 | elapsed:    5.2s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 1137 out of 1137 | elapsed:    5.2s finished

[2023-02-26 07:14:13] Features: 1136/1118 -- score: 0.8632478632478632[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.1s
[Para

In [71]:
sfs_AD == sfs_PD

False

In [110]:
bottom20_PDMCI = sfs_PDMCI.k_feature_names_
bottom20_PDMCI = list(bottom20_PDMCI)
total = list(X_train_PDMCI_healthy.columns)
bottom20_PDMCI = list(set(total).difference(set(bottom20_PDMCI)))
print(bottom20_PDMCI)

['IGHD', 'A0A0G2JRQ6', 'DAF', 'A0A2R8Y422', 'A0A0J9YY99', 'RECK', 'PCDGL', 'B2L12', 'C1QT5', 'NDST1', 'S4R460', 'VWA1', 'FGL2', 'MATN3', 'OBSL1', 'A0A087X1L8', 'PSA1', 'B3GN2', 'PAL4A', 'F174A']


In [87]:
bottom20_AD

['F13A',
 'HV124',
 'PLXA4',
 'DAF',
 'ARMD4',
 'KCC2A',
 'VMO1',
 'PCDGL',
 'PAPP2',
 'B2L12',
 'NDST1',
 'FGL2',
 'LVX54',
 'PTPRR',
 'PSA1',
 'AGRB1',
 'B3GN2',
 'PAL4A',
 'F174A',
 'A0A075B7D0']

In [90]:
type(bottom20_AD)

list

In [103]:
all_bottom_20 = []
all_bottom_20.append(bottom20_AD)
all_bottom_20.append(bottom20_PD)
all_bottom_20.append(bottom20_PDMCI)
all_bottom_20 = np.array(all_bottom_20)
all_bottom_20 = list(all_bottom_20)

In [104]:
all_bottom_20

[array(['F13A', 'HV124', 'PLXA4', 'DAF', 'ARMD4', 'KCC2A', 'VMO1', 'PCDGL',
        'PAPP2', 'B2L12', 'NDST1', 'FGL2', 'LVX54', 'PTPRR', 'PSA1',
        'AGRB1', 'B3GN2', 'PAL4A', 'F174A', 'A0A075B7D0'], dtype='<U10'),
 array(['IGHD', 'A0A0G2JRQ6', 'DAF', 'A0A2R8Y422', 'A0A0J9YY99', 'RECK',
        'PCDGL', 'B2L12', 'C1QT5', 'NDST1', 'S4R460', 'VWA1', 'FGL2',
        'MATN3', 'OBSL1', 'A0A087X1L8', 'PSA1', 'B3GN2', 'PAL4A', 'F174A'],
       dtype='<U10'),
 array(['IGHD', 'A0A0G2JRQ6', 'DAF', 'A0A2R8Y422', 'A0A0J9YY99', 'RECK',
        'PCDGL', 'B2L12', 'C1QT5', 'NDST1', 'S4R460', 'VWA1', 'FGL2',
        'MATN3', 'OBSL1', 'A0A087X1L8', 'PSA1', 'B3GN2', 'PAL4A', 'F174A'],
       dtype='<U10')]

I dont know why I'm having an issue with this array that I didn't have with the top20 lists, so I'm just going to copy paste it

In [112]:
all_bottom_20 = ['F13A', 'HV124', 'PLXA4', 'DAF', 'ARMD4', 'KCC2A', 'VMO1', 'PCDGL',
        'PAPP2', 'B2L12', 'NDST1', 'FGL2', 'LVX54', 'PTPRR', 'PSA1',
        'AGRB1', 'B3GN2', 'PAL4A', 'F174A', 'A0A075B7D0','IGHD', 'A0A0G2JRQ6', 'DAF', 'A0A2R8Y422', 'A0A0J9YY99', 'RECK',
        'PCDGL', 'B2L12', 'C1QT5', 'NDST1', 'S4R460', 'VWA1', 'FGL2',
        'MATN3', 'OBSL1', 'A0A087X1L8', 'PSA1', 'B3GN2', 'PAL4A', 'F174A','LV319', 'SRCRL', 'A0A075B7D8', 'HV108', 'DAF', 'A0A2R8Y422', 'ARMD4', 'RECK', 'A0A075B7E8', 'A0A075B7D4', 'PCDGL', 'B2L12', 'S4R460', 'FGL2', 'TREM2', 'VWC2', 'PSA1', 'S4R3C0', 'B3GN2', 'F174A']

In [115]:
all_bottom_20 = list(np.unique(all_bottom_20))
print(all_bottom_20)
len(all_bottom_20)

['A0A075B7D0', 'A0A075B7D4', 'A0A075B7D8', 'A0A075B7E8', 'A0A087X1L8', 'A0A0G2JRQ6', 'A0A0J9YY99', 'A0A2R8Y422', 'AGRB1', 'ARMD4', 'B2L12', 'B3GN2', 'C1QT5', 'DAF', 'F13A', 'F174A', 'FGL2', 'HV108', 'HV124', 'IGHD', 'KCC2A', 'LV319', 'LVX54', 'MATN3', 'NDST1', 'OBSL1', 'PAL4A', 'PAPP2', 'PCDGL', 'PLXA4', 'PSA1', 'PTPRR', 'RECK', 'S4R3C0', 'S4R460', 'SRCRL', 'TREM2', 'VMO1', 'VWA1', 'VWC2']


40