# Multiple Hypothesis Testing Correction

In [1]:
import pandas as pd
import numpy as np

%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
# Notebook-wide plot defaults: paper context on a white grid, with large
# square figures rendered at high resolution.
_figure_rc = {'figure.figsize': (10, 10), 'figure.dpi': 300}
sns.set_theme(context="paper", style="whitegrid", font_scale=1.2, rc=_figure_rc)

from common.paths import PLS_RESULTS, RIDGE_RESULTS, CROSS_PRED_RESULTS
from common.results import load_results

from multipy.fwer import bonferroni
from multipy.fdr import lsu

## Get data

### Manual Input

In [2]:
# Alternative input, kept for reference: p-values grouped per age bin.
# bin_all = [0.0003, 0.0003, 0.0003, 0.0003, 0.0100, 0.0643]
# bin_1 = [0.0953, 0.0313, 0.0593, 0.6485, 0.6048, 0.8287]
# bin_2 = [0.0003, 0.0247, 0.0003, 0.0020, 0.0107, 0.4159]
# bin_3 = [0.1789, 0.0653, 0.3196, 0.4172, 0.2779, 0.3162]
# pvals = np.array(bin_all + bin_1 + bin_2 + bin_3)

# Hand-entered p-values, one per feature.
pvals = np.array([0.3214, 0.0140, 0.0200, 0.5309, 0.1497, 0.4950])

# Number the features from 1 so every row of the table has a readable label.
feature_labels = list(map('Feature {}'.format, range(1, len(pvals) + 1)))
df = pd.DataFrame({'Features': feature_labels, 'P-value': pvals})
display(df)

Unnamed: 0,Features,P-value
0,Feature 1,0.3214
1,Feature 2,0.014
2,Feature 3,0.02
3,Feature 4,0.5309
4,Feature 5,0.1497
5,Feature 6,0.495


### Within-Prediction

In [13]:
# Pick the within-prediction results table whose p-values will be corrected.
model = 'ridge'
population = 'healthy'

# Ridge results have their own folder; every other model value uses the PLS folder.
if model == 'ridge':
    results_folder = RIDGE_RESULTS
else:
    results_folder = PLS_RESULTS

fn = f'{model}_pts_{population}'
df, results_path = load_results(fn, results_folder)

# The correction cells read the p-values from `pvals`.
pvals = df['P-value']
display(df)

Unnamed: 0,Model,Population,Target,Bin,Alpha,Score,P-value
0,ridge,healthy,WISC_FSIQ,All,1,0.039798,0.417166
1,ridge,healthy,WISC_VSI,All,1,0.161814,0.107784
2,ridge,healthy,WISC_VCI,All,9901,0.196705,0.051896
3,ridge,healthy,WISC_FRI,All,1,-0.072839,0.734531
4,ridge,healthy,WISC_WMI,All,9901,0.116987,0.213573
5,ridge,healthy,WISC_PSI,All,1,-0.05608,0.698603


### Cross-Prediction

In [5]:
# Cross-prediction results table; swap in the commented name for the
# two-equal-samples variant.
fn = 'ridge_pts_age_cross_prediction (consistent)'
# fn = 'ridge_pts_age_cross_prediction_bin_two_equal_samples'
loaded_df, results_path = load_results(fn, CROSS_PRED_RESULTS)

# Renumber the rows 0..n-1, discarding the index returned by load_results.
df = loaded_df.reset_index(drop=True)
pvals = df['P-value']
display(df)

Unnamed: 0,Model,Target,Train,Test,Score,P-value,Population,Num Permutations,FDR,Uncorrected,BFR
0,ridge,WISC_FSIQ,Bin 1,Bin 1,0.265258,0.01996,adhd,500,False,True,False
1,ridge,WISC_FSIQ,Bin 1,Bin 2,0.328246,0.001996,adhd,500,True,True,False
2,ridge,WISC_FSIQ,Bin 1,Bin 3,0.202752,0.015968,adhd,500,True,True,False
3,ridge,WISC_FSIQ,Bin 2,Bin 2,0.351665,0.001996,adhd,500,True,True,False
4,ridge,WISC_FSIQ,Bin 2,Bin 3,0.250487,0.003992,adhd,500,True,True,False
5,ridge,WISC_FSIQ,Bin 2,Bin 1,0.358564,0.001996,adhd,500,True,True,False
6,ridge,WISC_FSIQ,Bin 3,Bin 3,0.105539,0.177645,adhd,500,False,False,False
7,ridge,WISC_FSIQ,Bin 3,Bin 1,0.216069,0.015968,adhd,500,True,True,False
8,ridge,WISC_FSIQ,Bin 3,Bin 2,0.290556,0.001996,adhd,500,True,True,False
9,ridge,WISC_VSI,Bin 1,Bin 1,0.24493,0.017964,adhd,500,True,True,False


## Bonferroni (BFR) / FDR Correction

In [6]:
# Significance level shared by the three decision rules below.
a_threshold = 0.05

# `pvals` is a pandas Series when it comes from a results table. Convert it to
# a plain ndarray so that any sorting/indexing done on it downstream is purely
# positional and cannot depend on the DataFrame's index labels.
# NOTE(review): presumably multipy indexes the p-values by argsort position -
# confirm against the multipy source.
pvals_arr = np.asarray(pvals)

# One boolean column per rule: True means "significant at a_threshold".
df['Uncorrected'] = pvals_arr < a_threshold
df['BFR'] = bonferroni(pvals_arr, alpha=a_threshold)
df['FDR'] = lsu(pvals_arr, q=a_threshold)
display(df.round(4))

Unnamed: 0,Model,Target,Train,Test,Score,P-value,Population,Num Permutations,FDR,Uncorrected,BFR
0,ridge,WISC_FSIQ,Bin 1,Bin 1,0.2653,0.02,adhd,500,True,True,False
1,ridge,WISC_FSIQ,Bin 1,Bin 2,0.3282,0.002,adhd,500,True,True,False
2,ridge,WISC_FSIQ,Bin 1,Bin 3,0.2028,0.016,adhd,500,True,True,False
3,ridge,WISC_FSIQ,Bin 2,Bin 2,0.3517,0.002,adhd,500,True,True,False
4,ridge,WISC_FSIQ,Bin 2,Bin 3,0.2505,0.004,adhd,500,True,True,False
5,ridge,WISC_FSIQ,Bin 2,Bin 1,0.3586,0.002,adhd,500,True,True,False
6,ridge,WISC_FSIQ,Bin 3,Bin 3,0.1055,0.1776,adhd,500,False,False,False
7,ridge,WISC_FSIQ,Bin 3,Bin 1,0.2161,0.016,adhd,500,True,True,False
8,ridge,WISC_FSIQ,Bin 3,Bin 2,0.2906,0.002,adhd,500,True,True,False
9,ridge,WISC_VSI,Bin 1,Bin 1,0.2449,0.018,adhd,500,True,True,False


### Save and update results file

In [7]:
# Write the table, including the new correction columns, back to the path
# returned by the load_results call that produced `df`.
# NOTE(review): the "Manual Input" cell builds `df` without setting
# `results_path`; if that cell ran last, the path is undefined or left over
# from an earlier load - confirm before saving.
df.to_csv(path_or_buf=results_path)

### Get FDR q (Benjamini–Hochberg critical value for each ranked p-value)

In [18]:
# Tabulate each p-value (ascending) next to its step-up critical value
# rank * a_threshold / m, where rank is 1-based and m is the number of tests.
# `a_threshold` is the level set in the correction cell and passed to `lsu`,
# so this table uses the same level instead of a separately hard-coded 0.05.
m = len(pvals)

# Work on a plain ndarray so the argsort result indexes by position, not by
# whatever labels the Series index happens to carry.
pvals_arr = np.asarray(pvals)
sort_ind = np.argsort(pvals_arr)

qs = np.arange(1, m + 1) * a_threshold / m
a = pd.DataFrame({'P-value': pvals_arr[sort_ind], 'BH threshold': qs})
display(a)

Unnamed: 0,0,1
0,0.001996,0.000926
1,0.001996,0.001852
2,0.001996,0.002778
3,0.001996,0.003704
4,0.001996,0.00463
5,0.001996,0.005556
6,0.001996,0.006481
7,0.001996,0.007407
8,0.001996,0.008333
9,0.001996,0.009259


## Max Statistic Method

In [35]:
from common.binning import BIN_LABELS, ONLY_BIN_LABELS, EQUAL_BIN_LABELS
from common.paths import RIDGE_RESULTS, RIDGE_PSCORES, PLS_RESULTS, PLS_PSCORES
from common.results import save_perm_score, load_perm_score, load_results
from common.wisc import WISC_LEVEL

### Merge permutation scores per age bin

In [21]:
# Run once per model/population: builds and saves one merged permutation-score
# array per age bin from the per-target arrays.
# NOTE(review): the per-target scores are pooled by concatenation, not reduced
# to a per-permutation maximum across targets - confirm this is the intended
# null distribution for the max-statistic correction.
model = 'pls'
population = 'adhd'
if model == 'ridge':
    pscores = RIDGE_PSCORES
else:
    pscores = PLS_PSCORES

for bin_label in BIN_LABELS:
    # One permutation-score array per target, in WISC_LEVEL[5] order.
    per_target_scores = [
        load_perm_score(f'{model}_{population}_{target}_{bin_label}_perm_scores', pscores)
        for target in WISC_LEVEL[5]
    ]

    bin_perm_scores = np.concatenate(per_target_scores)
    print(bin_label, bin_perm_scores.shape)

    # Saved under the bin-level name (no target component) in the same folder.
    save_perm_score(bin_perm_scores, f'{model}_{population}_{bin_label}_perm_scores', pscores)

All (3000,)
Bin 1 (3000,)
Bin 2 (3000,)
Bin 3 (3000,)


### Load results (true scores)

In [25]:
# Choose the model/population whose results table is thresholded below.
model = 'pls'
population = 'adhd'

# Ridge results have their own folder; every other model value uses the PLS folder.
if model == 'ridge':
    results = RIDGE_RESULTS
else:
    results = PLS_RESULTS

results_name = f'{model}_pts_{population}'
df, results_path = load_results(results_name, results)
# display(df)

In [None]:
# For each age bin, take a cut-off from the merged permutation scores and flag
# every result in that bin whose score lies strictly above it.
a_threshold = 0.05

# NOTE(review): the keys are seeded from EQUAL_BIN_LABELS but the loop fills
# entries for BIN_LABELS; any key not in BIN_LABELS stays None - confirm which
# label set is intended.
bin_threshold_scores = {k: None for k in EQUAL_BIN_LABELS}
pscores = RIDGE_PSCORES if model == 'ridge' else PLS_PSCORES

# Collect the per-bin boolean Series and concatenate once after the loop,
# rather than re-copying a growing Series on every iteration.
max_stat_parts = []

for bin_label in BIN_LABELS:
    bin_perm_scores = load_perm_score(f'{model}_{population}_{bin_label}_perm_scores', pscores)

    # Element at position floor(n * (1 - a_threshold)) of the ascending-sorted scores.
    threshold_index = int(bin_perm_scores.shape[0] * (1 - a_threshold))
    threshold_score = np.sort(bin_perm_scores)[threshold_index]
    bin_threshold_scores[bin_label] = threshold_score

    bin_results = df[df['Bin'] == bin_label]
    max_stat_parts.append(threshold_score < bin_results['Score'])

# Each piece keeps the row labels of `df`, so the assignment below aligns on
# the index; rows whose bin was not processed get NaN.
max_stat_temp = pd.concat(max_stat_parts) if max_stat_parts else pd.Series(dtype='bool')

df['Max-stat'] = max_stat_temp
# df.sort_values('Bin')
display(df)

### Save and update results file

In [40]:
# Write the table, now carrying the 'Max-stat' column, back to the path
# returned by the load_results call that produced `df`.
df.to_csv(path_or_buf=results_path)

### Visualize

In [None]:
# One figure per age bin: histogram of that bin's permutation scores, the
# cut-off score as a solid black line, and each target's score from `df` as a
# dashed coloured line.
palette = sns.color_palette()

for bin_label in BIN_LABELS:
    threshold_score = bin_threshold_scores[bin_label]

    # Load this bin's own permutation scores. Reusing the variable left over
    # from the thresholding cell would plot the last bin's distribution (and
    # report its permutation count) in every figure.
    bin_perm_scores = load_perm_score(f'{model}_{population}_{bin_label}_perm_scores', pscores)

    g = sns.displot(bin_perm_scores, color='lightgrey', element="step")
    g.fig.subplots_adjust(top=0.9)
    g.fig.suptitle(f'Model: {model}, Population: {population}, {bin_label}, Num Perm: {bin_perm_scores.shape[0]}')
    g.ax.axvline(threshold_score, label='95th', color='black')

    for target_idx, target in enumerate(WISC_LEVEL[5]):
        is_target_row = (df['Bin'] == bin_label) & (df['Target'] == target)
        # First matching row; raises IndexError if the table has no such row.
        target_score = df.loc[is_target_row, 'Score'].iloc[0]

        # Deterministic dash gap in 1..5, so overlapping lines stay
        # distinguishable and the figure is identical on every run.
        dash_gap = 1 + target_idx % 5
        g.ax.axvline(target_score, label=target, ls=(0, (3, dash_gap)), color=palette[target_idx])

    g.fig.legend(loc='center left', bbox_to_anchor=(1, 0.5))