In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import scipy
import statistics

import sklearn.linear_model
import sklearn.metrics
import sklearn.naive_bayes
import sklearn.preprocessing
import xgboost

import imblearn
import mrmr
from sklearn.model_selection import cross_val_score

def handle_scale_and_nan(df):
    features = list(df.select_dtypes(include='float64'))
    cat = list(df.select_dtypes(include='object'))
    scaler = sklearn.preprocessing.StandardScaler().fit(df[features])
    df_cont = pd.DataFrame(data=scaler.transform(df[features]), columns=features)
    df_cat = pd.DataFrame(data=df[cat], columns=cat)
    df = pd.concat([df_cat,df_cont],axis=1)
    df = df.fillna(value=0.01)
    
    return df

def split_cats_by_tolerance(
    frame,
    tolerance,
    silent=False,
    randomstate=None,
    split=0.15,
    step=1,
    target="group",
    categories=["Healthy", "AD_MCI", "PD", "PD_MCI_LBD"],
):
    tolerable_list = []
    if randomstate == None:
        randomstate = np.random.randint(0, 2**31)
    elif type(randomstate) == int:
        pass
    while sum(tolerable_list) != 4:
        df_dev, df_test = sklearn.model_selection.train_test_split(
            frame, test_size=split, random_state=randomstate
        )

        dev_dict = dict(df_dev[target].value_counts())
        test_dict = dict(df_test[target].value_counts())

        tolerable_list = []
        stats_dict = {}
        for i in range(0, len(categories)):
            try:
                percents = [
                    (dev_dict[categories[i]] / len(df_dev)),
                    (test_dict[categories[i]] / len(df_test)),
                ]
            except:
                break
            standdev = np.std(percents)
            if standdev <= tolerance:
                tolerable_list.append(1)
                stats_dict[str(categories[i])] = [[*percents], standdev]
            else:
                tolerable_list.append(0)

        randomstate += step

    if sum(tolerable_list) == 4:
        if silent == False:
            print(dev_dict)
            print(test_dict)
            print("Randstate:", randomstate - 1)
            for i in range(0, len(categories)):
                print(
                    "\nPercent",
                    categories[i],
                    "in dev, test:",
                    stats_dict[categories[i]][0],
                    "\nStandard deviation of these values:",
                    stats_dict[categories[i]][1],
                    "\n",
                )
        elif silent == True:
            pass

    return df_dev, df_test

def over_under(df_train,cat_in_excess='Healthy',target='group',randomstate=np.random.randint(0,4294967295)):
    """
    Takes dataframe(s) with only the target value and float64 features
    This function is to balance the samples in an imbalanced training dataset that has one category in excess, with additional categories more near each other
    The categories below the category in excess will be oversampled to equality, then the category in excess will be undersampled to equality
    ---Parameters---
    df_train: the training dataframe
    cat_in_excess: the category which is present in excess, far above the other categories
    target: target column in the dataframe
    randomstate: if chosen, this will the random state for the sampling. Default: None, numpy random integer method between 0 and 4294967295, the range of the sampling module used
    randomstate_sampler: the number of loops to run to compare random states starting from 
    """
    # Drop the excessive category and oversample minority to the intermediate category
    df_train_no_excess = df_train[df_train.group != cat_in_excess]
    over_sampler = imblearn.over_sampling.RandomOverSampler(random_state=randomstate)
    X_train = df_train_no_excess.drop(columns=target)
    y_train = df_train_no_excess[target]
    X_train_over, y_train_over = over_sampler.fit_resample(X_train,y_train)
    df_train_over = pd.concat([y_train_over,X_train_over],axis=1)

    # Re-introduce the excessive category and undersample the majority to the minority
    df_train_excess = pd.concat([df_train_over,df_train[df_train[target] == cat_in_excess]])
    under_sampler = imblearn.under_sampling.RandomUnderSampler(random_state=randomstate)
    X_train = df_train_excess.drop(columns=target)
    y_train = df_train_excess[target]
    X_train_under, y_train_under = under_sampler.fit_resample(X_train,y_train)
    df_train_eq = pd.concat([y_train_under,X_train_under],axis=1)
    
    return df_train_eq

def select_features(data_dev, n):
    X_dev = data_dev.iloc[:, 1:-1]
    y_dev = data_dev.iloc[:, 0]
    
    from sklearn.datasets import make_classification
    X, y = make_classification(n_samples = 100, n_features = 20, n_informative = 2, n_redundant = 2)
    X = pd.DataFrame(X_dev)
    y = pd.Series(y_dev)
    
    from mrmr import mrmr_classif
    selected_features = mrmr_classif(X=X, y=y, K=n)
    print(selected_features)
    
    import json
    aList = selected_features
    jsonStr = json.dumps(aList)
    
    jsonFile = open("mRMR_"+ str(n)+ "_features.json", "w")
    jsonFile.write(jsonStr)
    jsonFile.close()
    
    return selected_features

def filter_dataframe(data, selected_features):
    cols_to_keep = ['group'] + selected_features
    data = data.filter(items=cols_to_keep)
    
    return data

In [2]:
data_full = pd.read_csv('/Users/katherine/Desktop/BrainPower/brainpower/data/split_data/full_data_short.csv')

In [3]:
data_full = data_full.drop(columns='assay_ID')

In [4]:
df = handle_scale_and_nan(data_full)

In [5]:
data_dev, data_test = split_cats_by_tolerance(df,0.01,randomstate=98281)

{'Healthy': 132, 'AD_MCI': 43, 'PD_MCI_LBD': 32, 'PD': 31}
{'Healthy': 24, 'AD_MCI': 8, 'PD_MCI_LBD': 5, 'PD': 5}
Randstate: 98281

Percent Healthy in dev, test: [0.5546218487394958, 0.5714285714285714] 
Standard deviation of these values: 0.008403361344537785 


Percent AD_MCI in dev, test: [0.18067226890756302, 0.19047619047619047] 
Standard deviation of these values: 0.004901960784313722 


Percent PD in dev, test: [0.13025210084033614, 0.11904761904761904] 
Standard deviation of these values: 0.005602240896358551 


Percent PD_MCI_LBD in dev, test: [0.13445378151260504, 0.11904761904761904] 
Standard deviation of these values: 0.007703081232492998 



In [6]:
data_dev = over_under(data_dev,cat_in_excess='Healthy',target='group',randomstate=np.random.randint(0,4294967295))

In [7]:
selected_features = select_features(data_dev, 15)

100%|███████████████████████████████████████████| 15/15 [00:00<00:00, 19.32it/s]

['1433Z', 'CO7', 'LV214', 'AK1C1', 'AMY2B', 'SMS', '1433G', 'FREM2', 'CRIS2', 'CADH8', 'TAU', 'OMD', '1433B', 'PILRA', 'SPRN']





In [8]:
selected_features

['1433Z',
 'CO7',
 'LV214',
 'AK1C1',
 'AMY2B',
 'SMS',
 '1433G',
 'FREM2',
 'CRIS2',
 'CADH8',
 'TAU',
 'OMD',
 '1433B',
 'PILRA',
 'SPRN']

In [9]:
dev = filter_dataframe(data_dev, selected_features)
val = filter_dataframe(data_test, selected_features)

In [17]:
def apply_ml_model_PD(dev, classifier):
    """
    finds the R2 score for different ML models
    """
    folds = sklearn.model_selection.LeaveOneOut().split(dev)

    fold_scores = []
    for train_indexes, val_indexes in folds: # the KFold splitter returns the indexes of the data, not the data itself

        scaler = sklearn.preprocessing.StandardScaler() # normalizing the numerical features
        train_X = data_dev.iloc[:,1:]
        val_X = data_test.iloc[:,1:]
        
        train_y = data_dev['group'].values.reshape(-1,1)
        val_y= data_test['group'].values.reshape(-1,1)
        
        if classifier == "random_forest": 
            model = sklearn.ensemble.RandomForestClassifier()
        elif classifier == "naive_bayes":  
            model = sklearn.naive_bayes.GaussianNB()
        elif classifier == "decision_tree": 
            model = sklearn.tree.DecisionTreeClassifier()
        else: 
            print("wrong classifier named entered")
            
        model.fit(train_X, train_y)
        fold_scores.append(
            sklearn.metrics.balanced_accuracy_score(val_y, model.predict(val_X)))
    return print(f"Leave-one-out cross validated balanced accuracy scores for {classifier} model (mean, std): ({np.mean(fold_scores)}, {np.std(fold_scores)})")

In [18]:
apply_ml_model_PD(dev, classifier = 'random_forest')

Leave-one-out cross validated balanced accuracy scores for random_forest model (mean, std): (0.5042756782945736, 0.053220276954013344)
