In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append('/home/jwerner/BrainPower/brainpower/submodule/')
import bp_preprocessing

#### Import the full data as a pandas DataFrame, then scale the features, handle NaN values, and drop any columns that are neither the target nor features

In [4]:
# Read the unsplit dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; adjust it when
# running the notebook elsewhere.
data_full = pd.read_csv(
    filepath_or_buffer='/home/jwerner/BrainPower/brainpower/data/unsplit_data/full_data_short.csv'
)

In [5]:
# Run the project's scale/NaN handling with the 'drop' NaN policy and
# 'Standard' scaling, then remove the 'assay_ID' column.
# NOTE(review): this runs on the full dataset before the dev/test split; if
# the scaler is fit on every row, test-set statistics leak into training —
# confirm against bp_preprocessing.handle_scale_and_nan.
data_full = bp_preprocessing.handle_scale_and_nan(
    data_full,
    nandecision='drop',
    scale='Standard',
)
data_full = data_full.drop('assay_ID', axis=1)

#### Since this is a small dataset, use split_cats_by_tolerance to ensure that the dev and test data have nearly equal ratios of categories (within the given tolerance)

In [6]:
# Split the full data into dev and test sets, passing a fixed random state so
# the split is the same on every run.
data_dev, data_test = bp_preprocessing.split_cats_by_tolerance(
    data_full,
    tolerance=0.01,
    randomstate=98281,
)

{'Healthy': 132, 'AD_MCI': 43, 'PD_MCI_LBD': 32, 'PD': 31}
{'Healthy': 24, 'AD_MCI': 8, 'PD_MCI_LBD': 5, 'PD': 5}
Randstate: 98281

Percent Healthy in dev, test: [0.5546218487394958, 0.5714285714285714] 
Standard deviation of these values: 0.008403361344537785 


Percent AD_MCI in dev, test: [0.18067226890756302, 0.19047619047619047] 
Standard deviation of these values: 0.004901960784313722 


Percent PD in dev, test: [0.13025210084033614, 0.11904761904761904] 
Standard deviation of these values: 0.005602240896358551 


Percent PD_MCI_LBD in dev, test: [0.13445378151260504, 0.11904761904761904] 
Standard deviation of these values: 0.007703081232492998 



#### Separate train and val from the dev data using the same function

In [7]:
# Split the dev data into train and validation sets with the same tolerance
# used for the dev/test split.
# The random state is pinned so that Restart & Run All yields the same split
# (and therefore the same validation score). 996455695 is the value the
# function reported ("Randstate: ...") for the run recorded in this notebook.
# NOTE(review): the helper's printed labels say "dev, test" even though this
# call produces train/val — presumably hard-coded in bp_preprocessing.
data_train, data_val = bp_preprocessing.split_cats_by_tolerance(
    data_dev,
    tolerance=0.01,
    randomstate=996455695,
)

{'Healthy': 112, 'AD_MCI': 37, 'PD_MCI_LBD': 27, 'PD': 26}
{'Healthy': 20, 'AD_MCI': 6, 'PD_MCI_LBD': 5, 'PD': 5}
Randstate: 996455695

Percent Healthy in dev, test: [0.5544554455445545, 0.5555555555555556] 
Standard deviation of these values: 0.0005500550055005382 


Percent AD_MCI in dev, test: [0.18316831683168316, 0.16666666666666666] 
Standard deviation of these values: 0.008250825082508254 


Percent PD in dev, test: [0.12871287128712872, 0.1388888888888889] 
Standard deviation of these values: 0.0050880088008800894 


Percent PD_MCI_LBD in dev, test: [0.13366336633663367, 0.1388888888888889] 
Standard deviation of these values: 0.002612761276127612 



#### Since this data is imbalanced, with one category in vast excess, balance the training data with over_under

In [8]:
# Rebalance the training split with over_under, naming 'Healthy' as the
# category in excess and 'group' as the target column; silent=True is passed
# to suppress the helper's printed report (presumably — confirm in
# bp_preprocessing).
data_traineq = bp_preprocessing.over_under(
    data_train,
    cat_in_excess='Healthy',
    target='group',
    silent=True,
)

#### Now you're ready to train an ML model. Here we are using the scikit-learn `RidgeClassifier`

In [9]:
# Features are every column except the 'group' target; the target is the
# 'group' column itself. Training uses the balanced frame, validation uses
# the untouched validation split.
X_train, y_train = data_traineq.drop('group', axis=1), data_traineq.loc[:, 'group']
X_val, y_val = data_val.drop('group', axis=1), data_val.loc[:, 'group']

In [10]:
# Import the submodules used below explicitly. A bare `import sklearn` does
# not guarantee that `sklearn.linear_model` and `sklearn.metrics` are loaded
# as attributes of the package; relying on another module having imported
# them first is hidden state that can break on a fresh kernel.
import sklearn
import sklearn.linear_model
import sklearn.metrics

In [11]:
# Fit a ridge classifier with default hyperparameters on the balanced
# training data. The fit call is the last expression so the fitted estimator
# is displayed as the cell output.
model = sklearn.linear_model.RidgeClassifier()
model.fit(X=X_train, y=y_train)

#### Because the validation data remains imbalanced, it's important to use a scoring metric that takes that into account, such as sklearn's balanced_accuracy_score

In [12]:
# Balanced accuracy of the model's predictions on the validation split.
sklearn.metrics.balanced_accuracy_score(
    y_true=y_val,
    y_pred=model.predict(X_val),
)

0.8166666666666667

#### Once you've set your hyperparameters, you may want to train the model on the full dev set before applying it to the test set. Be sure to run over_under on the whole dev set if your data is imbalanced and you're using the whole dev set as a training set

In [13]:
# Rebalance the whole dev set the same way the training split was rebalanced,
# so it can serve as the final training set.
data_deveq = bp_preprocessing.over_under(
    data_dev,
    cat_in_excess='Healthy',
    target='group',
    silent=True,
)

In [14]:
# Features are every column except the 'group' target. The final training
# data comes from the balanced dev frame; the test data comes from the
# held-out test split.
X_dev, y_dev = data_deveq.drop('group', axis=1), data_deveq.loc[:, 'group']
X_test, y_test = data_test.drop('group', axis=1), data_test.loc[:, 'group']

In [15]:
# Refit a fresh ridge classifier on the balanced full dev set. This rebinds
# `model`, replacing the estimator that was fit on the training split only.
model = sklearn.linear_model.RidgeClassifier()
model.fit(X=X_dev, y=y_dev)

In [17]:
# Balanced accuracy of the dev-trained model's predictions on the held-out
# test split.
sklearn.metrics.balanced_accuracy_score(
    y_true=y_test,
    y_pred=model.predict(X_test),
)

0.5520833333333335