In [1]:
import sys

import numpy as np
import pandas as pd
import sklearn.ensemble
import sklearn.metrics

# bp_preprocessing lives in the project's submodule directory, so that directory
# must be on sys.path before the import below.
sys.path.append('/home/jwerner/BrainPower/brainpower/submodule/')
import bp_preprocessing

#### Import the full data as a pandas DataFrame, then scale the data, handle NaN values, and drop any columns that are neither the target nor features

In [2]:
# Read the full (not yet split) dataset from disk into a DataFrame.
full_data_path = '/home/jwerner/BrainPower/brainpower/data/unsplit_data/full_data_short.csv'
data_full = pd.read_csv(full_data_path)

In [3]:
# Run the project's scale/NaN handling (scale='Standard', nandecision='drop'),
# then remove the 'assay_ID' column from the result.
data_full = (
    bp_preprocessing
    .handle_scale_and_nan(data_full, nandecision='drop', scale='Standard')
    .drop(columns='assay_ID')
)

#### Since this is a small dataset, use split_cats_by_tolerance to ensure that the dev and test data have equal ratios of categories

In [4]:
# Split the full data into development and test sets using the project's
# tolerance-based category split; the fixed randomstate pins the split.
data_dev, data_test = bp_preprocessing.split_cats_by_tolerance(
    data_full,
    tolerance=0.01,
    randomstate=98281,
)

{'Healthy': 132, 'AD_MCI': 43, 'PD_MCI_LBD': 32, 'PD': 31}
{'Healthy': 24, 'AD_MCI': 8, 'PD_MCI_LBD': 5, 'PD': 5}
Randstate: 98281

Percent Healthy in dev, test: [0.5546218487394958, 0.5714285714285714] 
Standard deviation of these values: 0.008403361344537785 


Percent AD_MCI in dev, test: [0.18067226890756302, 0.19047619047619047] 
Standard deviation of these values: 0.004901960784313722 


Percent PD in dev, test: [0.13025210084033614, 0.11904761904761904] 
Standard deviation of these values: 0.005602240896358551 


Percent PD_MCI_LBD in dev, test: [0.13445378151260504, 0.11904761904761904] 
Standard deviation of these values: 0.007703081232492998 



#### The feature selection function works on the full dev set and returns a dataframe with statistics and selected features. The call in the cell below calculates a limited amount of information so that the process does not take too long. Further down are dataframes with more extensive data, gathered with the following parameters:

bp_preprocessing.mrmr_feature_selection(data_dev,split=0.15,min_features=10,max_features=30,step_features=1,folds=10,ci=0.68,
                                          balancer='over_under without replacement', tolerance=0.01, target='group',
                                          model=sklearn.linear_model.RidgeClassifier(), score=sklearn.metrics.balanced_accuracy_score, featureselector=mrmr.mrmr_classif)

In [5]:
import sklearn
import mrmr

In [None]:
# mRMR feature selection on the dev set, scored with a random forest.
# The forest is seeded (same seed as the dev/test split) so the classifier
# itself is deterministic between runs. Any randomness inside
# mrmr_feature_selection (fold splitting, balancing) is not controlled here.
rforest_model = sklearn.ensemble.RandomForestClassifier(random_state=98281)

rforest_stats = bp_preprocessing.mrmr_feature_selection(
    data_dev,
    split=0.15,
    min_features=10,
    max_features=30,
    step_features=1,
    folds=10,
    ci=0.68,
    balancer='over_under without replacement',
    tolerance=0.01,
    target='group',
    model=rforest_model,
    score=sklearn.metrics.balanced_accuracy_score,
    featureselector=mrmr.mrmr_classif,
)

100%|███████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  8.52it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 11/11 [00:01<00:00,  9.44it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 12/12 [00:01<00:00,  9.31it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 13/13 [00:01<00:00,  8.16it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 14/14 [00:01<00:00,  8.77it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 15/15 [00:01<00:00,  8.62it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 16/16 [00:01<00:00,  8.14it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 17/17 [00:02<00:00,  6.59it/s]
100%|███████████████████████████

100%|███████████████████████████████████████████████████████████████████████████████████| 14/14 [00:01<00:00,  9.16it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 15/15 [00:01<00:00,  8.99it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 16/16 [00:01<00:00,  8.74it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 17/17 [00:01<00:00,  9.39it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 18/18 [00:02<00:00,  8.55it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 19/19 [00:02<00:00,  9.04it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 20/20 [00:02<00:00,  8.17it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 21/21 [00:02<00:00,  8.18it/s]
100%|███████████████████████████

In [34]:
# Collect the selected features per feature-space size for the ridge run.
# NOTE: ridge_stats is loaded from CSV in a cell further down this notebook;
# that cell has to be executed before this one.
ridge_dict = bp_preprocessing.get_feature_dict(
    ridge_stats,
    uniqueness=1,
)

In [35]:
# Show the feature dictionary built in the previous cell.
ridge_dict

{10: [58,
  array(['1433G', '1433Z', 'A0A075B7D0', 'AK1C1', 'AMY2B', 'AMYP', 'CADH5',
         'CAH2', 'CALB1', 'CATD', 'CHIT1', 'CHLE', 'CLMP', 'CO4A', 'CO7',
         'CPVL', 'ECM1', 'ERAP2', 'FREM2', 'GNS', 'HBA', 'HDHD2', 'HLAE',
         'HTRA1', 'HYI', 'IDS', 'K1C12', 'KLK11', 'LRP1B', 'LTBP4', 'LV861',
         'MASP1', 'MGAT2', 'MGP', 'MXRA8', 'MYDGF', 'NEU1', 'NLGN3',
         'NPTX2', 'NPY', 'OMD', 'OSCAR', 'PAL4A', 'PLSL', 'POTEI', 'POTEJ',
         'PPT1', 'PTPR2', 'PVR', 'SCG2', 'SRPX', 'TAU', 'TIMP2', 'TRFM',
         'TSP2', 'TSP4', 'VWF', 'ZNF25'], dtype='<U10')],
 11: [65,
  array(['1433B', '1433G', '1433Z', 'A0A075B7D0', 'AK1C1', 'AMY2B', 'AMYP',
         'CADH5', 'CAH2', 'CALB1', 'CATD', 'CHIT1', 'CHLE', 'CLMP', 'CO4A',
         'CO7', 'CPVL', 'ECM1', 'ERAP2', 'FREM2', 'GDIA', 'GNS', 'HBA',
         'HBAZ', 'HDHD2', 'HLAE', 'HTRA1', 'HYI', 'IDS', 'K1C12', 'KLK11',
         'LRP1B', 'LTBP4', 'LV861', 'MASP1', 'MGAT2', 'MGP', 'MMP2',
         'MXRA8', 'MYDGF', 'NEDD8',

#### The following tables were generated this way. Because these were exported to a csv, the tuples turned into strings and must be read as literals

In [6]:
import ast

In [29]:
# NOTE(review): this adds a data directory to sys.path, which pd.read_csv does
# not consult; kept only to preserve the original side effect.
sys.path.append('/home/jwerner/BrainPower/data/split_data')

SPLIT_DATA_DIR = '/home/jwerner/BrainPower/brainpower/data/split_data'


def load_feature_stats(model_name, data_dir=SPLIT_DATA_DIR):
    """Load one exported feature-selection table and restore its literal columns.

    Parameters
    ----------
    model_name : str
        Prefix of the CSV file, e.g. 'ridge' for 'ridge_10_30_features.csv'.
    data_dir : str
        Directory that contains the exported CSV files.

    Returns
    -------
    pandas.DataFrame
        The table with the 'ind_scores' and 'features' columns parsed from
        their string form back into Python literals.
    """
    stats = pd.read_csv(f'{data_dir}/{model_name}_10_30_features.csv')
    # The CSV export stored these columns as strings; parse them back.
    for column in ('ind_scores', 'features'):
        stats[column] = stats[column].apply(ast.literal_eval)
    return stats


ridge_stats = load_feature_stats('ridge')
rforest_stats = load_feature_stats('rforest')
nbayes_stats = load_feature_stats('nbayes')
dectree_stats = load_feature_stats('dectree')
xgboost_stats = load_feature_stats('xgboost')

#### The features found in each run of the feature selector can be put into a dictionary by using get_feature_dict. The uniqueness parameter limits the features returned in the dictionary to only those with a multiplicity equal to or greater than the value passed, or, when 'all' is passed, to only those features which show up in every fold of the feature selection process.

#### For example, the cell below gets the feature dictionary for the 10-fold ridge classifier generated between 10 and 30 features, with only those features in each space that appear at least 3 times. Of the 200 features generated by the 10-fold process, these 19 show up at least 3 times.

In [31]:
# Build the feature dictionary from the nbayes statistics with a uniqueness
# threshold of 5, then display it as the cell output.
nbayes_dict = bp_preprocessing.get_feature_dict(
    nbayes_stats,
    uniqueness=5,
)
nbayes_dict

{10: [3, array(['1433G', 'AK1C1', 'TAU'], dtype='<U5')],
 11: [4, array(['1433G', 'AK1C1', 'PAL4A', 'TAU'], dtype='<U5')],
 12: [4, array(['1433G', 'AK1C1', 'PAL4A', 'TAU'], dtype='<U5')],
 13: [4, array(['1433G', 'AK1C1', 'PAL4A', 'TAU'], dtype='<U5')],
 14: [5, array(['1433G', '1433Z', 'AK1C1', 'PAL4A', 'TAU'], dtype='<U5')],
 15: [5, array(['1433G', '1433Z', 'AK1C1', 'PAL4A', 'TAU'], dtype='<U5')],
 16: [5, array(['1433G', '1433Z', 'AK1C1', 'PAL4A', 'TAU'], dtype='<U5')],
 17: [5, array(['1433G', '1433Z', 'AK1C1', 'PAL4A', 'TAU'], dtype='<U5')],
 18: [6,
  array(['1433G', '1433Z', 'AK1C1', 'PAL4A', 'PGK1', 'TAU'], dtype='<U5')],
 19: [7,
  array(['1433B', '1433G', '1433Z', 'AK1C1', 'PAL4A', 'PGK1', 'TAU'],
        dtype='<U5')],
 20: [8,
  array(['1433B', '1433G', '1433Z', 'AK1C1', 'FREM2', 'PAL4A', 'PGK1',
         'TAU'], dtype='<U5')],
 21: [8,
  array(['1433B', '1433G', '1433Z', 'AK1C1', 'FREM2', 'PAL4A', 'PGK1',
         'TAU'], dtype='<U5')],
 22: [9,
  array(['1433B', '1433G'

In [30]:
# Build the feature dictionary from the rforest statistics with a uniqueness
# threshold of 5, then display it as the cell output.
rforest_dict = bp_preprocessing.get_feature_dict(
    rforest_stats,
    uniqueness=5,
)
rforest_dict

{10: [10,
  array(['1433Z', 'AK1C1', 'AMYP', 'CRIS3', 'HV69D', 'RARR2', 'SERC',
         'SV2A', 'TAU', 'TRH'], dtype='<U5')],
 11: [11,
  array(['1433G', '1433Z', 'AK1C1', 'AMYP', 'CRIS3', 'HV69D', 'RARR2',
         'SERC', 'SV2A', 'TAU', 'TRH'], dtype='<U5')],
 12: [12,
  array(['1433G', '1433Z', 'AK1C1', 'AMYP', 'ATS8', 'CRIS3', 'HV69D',
         'RARR2', 'SERC', 'SV2A', 'TAU', 'TRH'], dtype='<U5')],
 13: [13,
  array(['1433G', '1433Z', 'AK1C1', 'AMYP', 'ATS8', 'CO7', 'CRIS3', 'HV69D',
         'RARR2', 'SERC', 'SV2A', 'TAU', 'TRH'], dtype='<U5')],
 14: [14,
  array(['1433G', '1433Z', 'AK1C1', 'AMYP', 'ATS8', 'CO7', 'CRIS3', 'FMOD',
         'HV69D', 'RARR2', 'SERC', 'SV2A', 'TAU', 'TRH'], dtype='<U5')],
 15: [15,
  array(['1433G', '1433Z', 'AK1C1', 'AMYP', 'ATS8', 'CO7', 'CRIS3', 'FMOD',
         'HV69D', 'MYDGF', 'RARR2', 'SERC', 'SV2A', 'TAU', 'TRH'],
        dtype='<U5')],
 16: [16,
  array(['1433G', '1433Z', 'AK1C1', 'AMYP', 'ATS8', 'CO7', 'CRIS3', 'FMOD',
         'HV69D', 'MY

#### The altair_feature_selection_chart function will generate charts of the feature selection information and similarly has a uniqueness parameter that functions in the same way as the dictionary. Click on the figure to see the features matching the uniqueness threshold for that feature space across the 10-fold selection process.

In [10]:
# Chart the ridge feature-selection statistics with a uniqueness threshold of 3.
bp_preprocessing.altair_feature_selection_chart(
    ridge_stats,
    uniqueness=3,
)

In [11]:
# Chart the rforest feature-selection statistics with a uniqueness threshold of 3.
bp_preprocessing.altair_feature_selection_chart(
    rforest_stats,
    uniqueness=3,
)