In [1]:
import os

import numpy as np
import pandas as pd
from scipy.optimize import linear_sum_assignment

# Dataset root; a hard link to the SRPBS_OPEN data lives at this relative path.
dataset_location = '../SRPBS_OPEN/'

# Fix NumPy's global RNG state so the random sampling below is reproducible.
np.random.seed(seed=1)

In [2]:
# Read the tab-separated participant listing from the dataset root and show it.
participants = pd.read_csv(dataset_location + 'participants.tsv', sep='\t')
participants

Unnamed: 0,participant_id,site,protocol,diag,age,sex,hand,sup
0,sub-0001,SWA,1,0,23,1,1.0,1
1,sub-0002,SWA,1,0,23,2,1.0,1
2,sub-0003,SWA,1,0,26,1,1.0,1
3,sub-0004,SWA,1,0,23,1,1.0,1
4,sub-0005,SWA,1,0,24,1,1.0,1
...,...,...,...,...,...,...,...,...
1405,sub-1406,CIN,14,5,36,1,1.0,8
1406,sub-1407,CIN,14,5,56,2,1.0,8
1407,sub-1408,CIN,14,5,34,2,1.0,8
1408,sub-1409,CIN,14,5,56,1,1.0,8


In [3]:
# Split participants by diagnosis code. Going by the variable names, code 0 is
# the healthy-control group and code 2 the MDD group -- TODO confirm against the
# dataset's own documentation.
diagnosis = participants['diag']
healthy_controls = participants.loc[diagnosis.eq(0)]
mdd_patients = participants.loc[diagnosis.eq(2)]

In [4]:
# How many MDD participants does each acquisition site contribute?
mdd_patients.loc[:, 'site'].value_counts()

COI    71
UTO    62
HUH    57
HKH    33
KUT    16
HRC    16
Name: site, dtype: int64

In [5]:
# How many healthy controls does each acquisition site contribute?
healthy_controls.loc[:, 'site'].value_counts()

KUT    159
COI    124
SWA    101
UTO     96
KTT     75
HUH     67
HRC     49
CIN     39
ATV     39
HKH     29
ATT     13
Name: site, dtype: int64

COI seems like a good site to take a subset from, and it is the first site used in the discovery dataset in the paper. The paper uses 124 HC and 70 MDD participants from this site, which closely matches the counts above (124 HC and 71 MDD); presumably one MDD patient was filtered out in the paper.

In [6]:
sample_size = 20

# Draw both subsamples from a dedicated generator instead of NumPy's global
# RNG state, so re-running this cell on its own always selects the same
# participants. Seeded with 1 to mirror the np.random.seed(1) call in the
# setup cell.
sampling_rng = np.random.RandomState(1)

# Restrict both groups to the COI site before sampling.
healthy_controls_coi = healthy_controls[healthy_controls['site'] == 'COI']
mdd_patients_coi = mdd_patients[mdd_patients['site'] == 'COI']

# Order matters: controls are drawn first, then patients, from the same generator.
healthy_controls_sample = healthy_controls_coi.sample(n=sample_size, random_state=sampling_rng)
mdd_patients_sample = mdd_patients_coi.sample(n=sample_size, random_state=sampling_rng)

In [7]:
# Show the healthy controls sampled from the COI site.
healthy_controls_sample

Unnamed: 0,participant_id,site,protocol,diag,age,sex,hand,sup
581,sub-0582,COI,5,0,65,1,1.0,2
659,sub-0660,COI,5,0,49,2,1.0,2
618,sub-0619,COI,5,0,26,1,1.0,2
651,sub-0652,COI,5,0,35,2,1.0,2
575,sub-0576,COI,5,0,60,1,1.0,2
547,sub-0548,COI,5,0,67,1,1.0,2
612,sub-0613,COI,5,0,56,1,1.0,2
577,sub-0578,COI,5,0,68,1,1.0,2
643,sub-0644,COI,5,0,48,2,1.0,2
628,sub-0629,COI,5,0,69,2,1.0,2


In [8]:
# Show the MDD patients sampled from the COI site.
mdd_patients_sample

Unnamed: 0,participant_id,site,protocol,diag,age,sex,hand,sup
520,sub-0521,COI,5,2,43,2,1.0,2
1013,sub-1014,COI,5,2,46,2,1.0,2
493,sub-0494,COI,5,2,42,2,1.0,2
1012,sub-1013,COI,5,2,31,1,1.0,2
580,sub-0581,COI,5,2,41,2,1.0,2
540,sub-0541,COI,5,2,71,2,1.0,2
592,sub-0593,COI,5,2,75,2,1.0,2
560,sub-0561,COI,5,2,47,1,1.0,2
495,sub-0496,COI,5,2,33,1,1.0,2
584,sub-0585,COI,5,2,56,1,1.0,2


In [9]:
def _participant_number(participant_id):
    """Return the integer after the first '-' of an ID such as 'sub-0582'."""
    return int(participant_id.split('-')[1])


# Numeric participant IDs of each sample, in ascending order.
healthy_controls_sample_ids = sorted(map(_participant_number, healthy_controls_sample['participant_id']))
mdd_patients_sample_ids = sorted(map(_participant_number, mdd_patients_sample['participant_id']))

# Persist each ID list as its Python list repr, one file per group.
for sample_file, sample_ids in (
    ('hc_sample.txt', healthy_controls_sample_ids),
    ('mdd_sample.txt', mdd_patients_sample_ids),
):
    with open(sample_file, 'w') as f:
        f.write(str(sample_ids))

In [115]:
def match_sample_by_age(sample1, sample2, threshold=None):
    """Pair rows of two participant tables so the total age gap is minimal.

    Builds the matrix of absolute age differences between every row of
    ``sample1`` and every row of ``sample2`` and solves the resulting
    assignment problem with ``scipy.optimize.linear_sum_assignment``. Each row
    takes part in at most one pair.

    Parameters
    ----------
    sample1, sample2 : pandas.DataFrame
        Tables with an ``'age'`` column.
    threshold : number, optional
        When given, pairs whose absolute age difference is greater than
        ``threshold`` are left out of the returned list.

    Returns
    -------
    pairs : list of [pandas.Series, pandas.Series]
        ``[row_of_sample1, row_of_sample2]`` for each kept pair, ordered by
        increasing age difference. Pairs with equal differences keep the
        order in which the solver returned them.
    total_cost : numpy scalar
        Sum of the age differences over the *full* assignment. This includes
        pairs removed by ``threshold``; it is not the cost of ``pairs`` alone.
    """
    ages1 = sample1['age'].to_numpy()
    ages2 = sample2['age'].to_numpy()
    # Column vector minus row vector broadcasts to a len(sample1) x len(sample2) matrix.
    cost_matrix = np.abs(ages1.reshape(-1, 1) - ages2)

    row_ind, col_ind = linear_sum_assignment(cost_matrix)
    costs = cost_matrix[row_ind, col_ind]

    # Stable sort keeps the ordering of equal-cost pairs deterministic.
    order = np.argsort(costs, kind='stable')
    if threshold is not None:
        order = order[costs[order] <= threshold]

    pairs = [
        [sample1.iloc[row], sample2.iloc[col]]
        for row, col in zip(row_ind[order], col_ind[order])
    ]
    return pairs, costs.sum()

In [116]:
# Age-match the COI MDD patients (first argument) against the COI healthy
# controls (second argument).
mdd_patients_coi = mdd_patients.loc[mdd_patients['site'].eq('COI')]
healthy_controls_coi = healthy_controls.loc[healthy_controls['site'].eq('COI')]
age_match, cost = match_sample_by_age(mdd_patients_coi, healthy_controls_coi)

In [117]:
# Inspect the matched pairs returned by match_sample_by_age; each entry is a
# two-element list holding one row from each input table.
age_match

col                 5
  diag                     0
  age                     45
  sex                      2
  hand                   1.0
  sup                      2
  Name: 503, dtype: object],
 [participant_id    sub-0505
  site                   COI
  protocol                 5
  diag                     2
  age                     43
  sex                      1
  hand                   1.0
  sup                      2
  Name: 504, dtype: object,
  participant_id    sub-0635
  site                   COI
  protocol                 5
  diag                     0
  age                     43
  sex                      2
  hand                   1.0
  sup                      2
  Name: 634, dtype: object],
 [participant_id    sub-0507
  site                   COI
  protocol                 5
  diag                     2
  age                     46
  sex                      2
  hand                   1.0
  sup                      2
  Name: 506, dtype: object,
  participant_id    sub