## Library Imports

In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import sys
sys.path.append("..")  
import metaprivBIDS_stat as ST
from metapriv_stat import plot_calc
from metapriv_stat import rst_outlier_case2

## Importing Files from OpenNeuro
MULTI is the raw dataset "Multivariate Assessment of Inhibitory Control in Youth: Links with Psychopathology and Brain Function Dataset" taken from OpenNeuro. 

In [None]:
# Phenotype tables for OpenNeuro ds004935, each pinned to a fixed S3 object
# version so that re-running the notebook fetches the same bytes.
WASI = pd.read_csv(
    'https://s3.amazonaws.com/openneuro.org/ds004935/phenotype/WASI.tsv?versionId=FwdhWCxaUUiXmwoh6ELNocg3k84qoAjP',
    sep='\t',
)
KSAD = pd.read_csv(
    'https://s3.amazonaws.com/openneuro.org/ds004935/phenotype/KSADS.tsv?versionId=k4eJOEZkzDF2zpwsVFJgmS60AL6ObXVi',
    sep='\t',
)

# Participant table, left-joined on participant_id with the WASI score column
# and with every KSADS column, so all participants are kept even when a
# phenotype row is missing.
MULTI = (
    pd.read_csv(
        'https://s3.amazonaws.com/openneuro.org/ds004935/participants.tsv?versionId=pKNEG7Cn89OmcUDIq5B13qzn9x5Uizlf',
        sep='\t',
    )
    .merge(WASI[['participant_id', 'WASI_score']], on='participant_id', how='left')
    .merge(KSAD, on='participant_id', how='left')
    .assign(
        # 99 and 'UNKNOWN' are recoded to NaN (presumably the dataset's
        # missing-data codes -- confirm against the data dictionary).
        income=lambda frame: frame['income'].replace(99, np.nan),
        race=lambda frame: frame['race'].replace('UNKNOWN', np.nan),
    )
)

# Persist the merged table as a comma-separated file without the index column.
MULTI.to_csv('data_ds004935/MULTI_dataset_ds004935.csv', sep=',', index=False)

## Loading the results calculated with the MetaprivBIDS app

The selected quasi-identifiers are:
- age
- highest_education
- income
- race
- ethnicity
- sex

The selected sensitive variables are:
- race 

The dis-score fraction is the default 0.3 & missing values are indicated as NaN.

In [None]:
# Row-level results: SUDA scores and PIF values. The PIF file carries its row
# labels in the 'Unnamed: 0' column; they become the index and are sorted.
suda = pd.read_csv(
    'data_ds004935/suda_ds004935.csv',
    sep=',',
)
pif = (
    pd.read_csv('data_ds004935/pif_ds004935.csv', sep=',', index_col='Unnamed: 0')
    .sort_index()
)

# Field-level SUDA scores; a leftover 'Unnamed: 0' column is dropped if present.
suda_field_score = (
    pd.read_csv('data_ds004935/att_suda.csv', sep=',')
    .drop(columns=['Unnamed: 0'], errors='ignore')
)

# Field-level k-combined results for the risk variables: the two
# unique-row count columns are dropped and 'Column' is renamed to 'variable'.
k_combined_field = (
    pd.read_csv('data_ds004935/normalized_difference_results.csv', sep=',')
    .drop(columns=['Difference in Unique Rows', 'Unique Rows After Removal'])
    .rename(columns={'Column': 'variable'})
)

# k-combined results over variable combinations, loaded unchanged.
k_combined_all = pd.read_csv(
    'data_ds004935/k-combined-3-6.csv',
    sep=',',
)

# Same shaping as k_combined_field, applied to the non-risk variables file.
k_combined_field_non_risk = (
    pd.read_csv('data_ds004935/normalized_difference_results_non.csv', sep=',')
    .drop(columns=['Difference in Unique Rows', 'Unique Rows After Removal'])
    .rename(columns={'Column': 'variable'})
)

## Using MetaprivBIDS stats import to calculate Pearson and Spearman correlation on field and row level.

In [None]:
# Pass the row-level frames (suda, pif) and the field-level frames
# (suda_field_score, k_combined_field) to the project's stats helper.
# Per the section heading this reports Pearson and Spearman correlations;
# the exact output format is defined in metaprivBIDS_stat -- confirm there.
ST.stats(suda, pif, suda_field_score, k_combined_field)

## Iterating through all possible combinations of variables & calculating the correlation between SUDA, K-Global and PIF

In [None]:
# Compute summed dis-scores from the k-combined table and the merged dataset.
# sample_fraction=0.3 and missing_value=NaN match the settings stated in the
# "Loading results" section (default dis-score fraction 0.3, NaN for missing).
# NOTE(review): the structure of the returned frame is defined by
# ST.calculate_summed_dis_scores -- confirm there.
sum_score_df  = ST.calculate_summed_dis_scores(k_combined_all, MULTI, sample_fraction=0.3, missing_value= np.nan)

In [None]:
# Hand the summed-score frame to the project's plot_calc helper
# (presumably plots and computes the SUDA / K-Global / PIF correlations
# announced in the section heading -- confirm in metapriv_stat).
plot_calc(sum_score_df)

# T-test between risky and non-risky k-global values for variables. 


### Non risk variables

In [None]:
# Display the field-level k-combined table for the non-risk variables
# (loaded from normalized_difference_results_non.csv).
k_combined_field_non_risk

### Risk variables

In [None]:
# Display the field-level k-combined table for the risk variables
# (loaded from normalized_difference_results.csv).
k_combined_field

## Result of the independent two-sample t-test

In [None]:
# Mean 'Normalized Difference' of the risk variables.
mean_risk_values = k_combined_field['Normalized Difference'].mean()
print('Mean, risk identifiers:',mean_risk_values)

# Mean 'Normalized Difference' of the non-risk variables. Stored under its own
# name so that mean_risk_values keeps holding the risk-group mean after this
# cell has run (it used to be overwritten with the non-risk mean).
mean_non_risk_values = k_combined_field_non_risk['Normalized Difference'].mean()
print('Mean, non-risk identifiers:',mean_non_risk_values)

# Independent two-sample t-test between the two groups of variables.
# stats.ttest_ind treats the samples as unpaired and, with its default
# equal_var=True, assumes equal population variances. This is not a paired
# test: the risk and non-risk groups are different sets of variables.
t, p = stats.ttest_ind(
    k_combined_field['Normalized Difference'],
    k_combined_field_non_risk['Normalized Difference'],
)
print('t:',t,'p-value:', p)

# Outlier detection for PIF & SUDA

In [None]:
# SUDA: run the project's outlier routine on the 'dis-score' column, then
# report how many indices landed in above_outlier_indices and which ones.
# NOTE(review): the meaning of the five returned values is defined by
# rst_outlier_case2 -- confirm there.
class_outliers, madn, mad, outlier_indices, above_outlier_indices = ST.rst_outlier_case2(
    suda, 'dis-score'
)
print(len(above_outlier_indices), above_outlier_indices, sep='\n')

# PIF: same routine on the 'RIG' column. The result names are reused, so after
# this cell they hold the PIF results only.
class_outliers, madn, mad, outlier_indices, above_outlier_indices = rst_outlier_case2(
    pif, 'RIG'
)
print(len(above_outlier_indices), above_outlier_indices, sep='\n')


## Outlier Participant  

In [None]:
# Show every column of the merged dataset for the row labelled 36
# (presumably a participant flagged by both outlier checks -- confirm
# against the printed index lists).
MULTI.loc[36]

## Outliers after improvements

In [77]:
# Re-load the SUDA and PIF results produced after the improvements.
# NOTE: this rebinds `suda` and `pif`; cells above that are re-run afterwards
# will see the redone results instead of the original ones.
suda = pd.read_csv(
    'data_ds004935/suda_redone_ds004935.csv',
    sep=',',
)
pif = (
    pd.read_csv('data_ds004935/pif_redone_ds004935.csv', sep=',', index_col='Unnamed: 0')
    .sort_index()
)

In [79]:
# SUDA (redone results): run the project's outlier routine on 'dis-score' and
# print the count followed by the list held in above_outlier_indices.
# NOTE(review): the meaning of the five returned values is defined by
# rst_outlier_case2 -- confirm there.
class_outliers, madn, mad, outlier_indices, above_outlier_indices = ST.rst_outlier_case2(
    suda, 'dis-score'
)
print(len(above_outlier_indices), above_outlier_indices, sep='\n')

# PIF (redone results): same routine on 'RIG'. The result names are reused, so
# after this cell they hold the PIF results only.
class_outliers, madn, mad, outlier_indices, above_outlier_indices = rst_outlier_case2(
    pif, 'RIG'
)
print(len(above_outlier_indices), above_outlier_indices, sep='\n')


45
[3, 8, 10, 11, 12, 16, 17, 19, 24, 25, 36, 37, 52, 60, 63, 64, 93, 103, 106, 107, 108, 116, 117, 123, 126, 127, 128, 130, 133, 134, 139, 145, 146, 150, 152, 155, 165, 166, 171, 177, 181, 191, 207, 211, 218]
28
[3, 14, 15, 16, 17, 31, 32, 36, 52, 63, 64, 91, 93, 101, 106, 107, 116, 117, 123, 130, 139, 145, 150, 155, 171, 177, 181, 218]
