In [2]:
import os
import pandas as pd
import numpy as np
import scipy.optimize
import matplotlib.pyplot as plt

# Root directory of the SRPBS_OPEN dataset, relative to this notebook
# (the author notes that a hard link to the dataset was made at this location).
dataset_location = '../SRPBS_OPEN/'

In [5]:
# Load the participant table (tab-separated) from the dataset root.
participants = pd.read_csv(os.path.join(dataset_location, 'participants.tsv'), sep='\t')
participants

Unnamed: 0,participant_id,site,protocol,diag,age,sex,hand,sup
0,sub-0001,SWA,1,0,23,1,1.0,1
1,sub-0002,SWA,1,0,23,2,1.0,1
2,sub-0003,SWA,1,0,26,1,1.0,1
3,sub-0004,SWA,1,0,23,1,1.0,1
4,sub-0005,SWA,1,0,24,1,1.0,1
...,...,...,...,...,...,...,...,...
1405,sub-1406,CIN,14,5,36,1,1.0,8
1406,sub-1407,CIN,14,5,56,2,1.0,8
1407,sub-1408,CIN,14,5,34,2,1.0,8
1408,sub-1409,CIN,14,5,56,1,1.0,8


In [6]:
# Keep only rows with no missing value in any column.
# NOTE(review): this drops a participant if *any* field is NaN, not just the
# fields used below - confirm that is intended.
participants = participants.dropna(axis=0, how='any')
participants

Unnamed: 0,participant_id,site,protocol,diag,age,sex,hand,sup
0,sub-0001,SWA,1,0,23,1,1.0,1
1,sub-0002,SWA,1,0,23,2,1.0,1
2,sub-0003,SWA,1,0,26,1,1.0,1
3,sub-0004,SWA,1,0,23,1,1.0,1
4,sub-0005,SWA,1,0,24,1,1.0,1
...,...,...,...,...,...,...,...,...
1405,sub-1406,CIN,14,5,36,1,1.0,8
1406,sub-1407,CIN,14,5,56,2,1.0,8
1407,sub-1408,CIN,14,5,34,2,1.0,8
1408,sub-1409,CIN,14,5,56,1,1.0,8


In [7]:
# Split the cohort by the `diag` column.
# NOTE(review): code 0 is treated as healthy control and code 2 as MDD here -
# confirm against the dataset's documentation.
healthy_controls = participants.loc[participants['diag'].eq(0)]
mdd_patients = participants.loc[participants['diag'].eq(2)]

In [8]:
# Sorted array of the distinct `site` values among the retained participants.
np.unique(participants['site'].to_numpy())

array(['ATT', 'ATV', 'CIN', 'COI', 'HKH', 'HRC', 'HUH', 'KTT', 'KUT',
       'SWA', 'UTO'], dtype=object)

In [9]:
# Number of MDD patients per site, most frequent first.
mdd_patients.site.value_counts()

COI    71
UTO    59
HUH    57
HKH    33
HRC    16
KUT    16
Name: site, dtype: int64

In [10]:
# Number of healthy controls per site, most frequent first.
healthy_controls.site.value_counts()

KUT    159
COI    124
SWA    101
UTO     96
KTT     75
HUH     67
HRC     49
CIN     39
ATV     39
HKH     29
ATT     13
Name: site, dtype: int64

In [7]:
# Sites that contributed at least one MDD patient AND at least one healthy control.
sites_in_common = set(mdd_patients['site']) & set(healthy_controls['site'])

In [8]:
def write_subjects(subjects, site:str):
    """Write the numeric IDs of ``subjects`` to ``<site>_all.txt``.

    Parameters
    ----------
    subjects : iterable of row-like sequences
        Element 0 of every row must be a string such as ``'sub-0001'``; the
        piece after the first ``'-'`` is converted to ``int``.
    site : str
        Prefix of the output file name. The file is created (or overwritten)
        in the current working directory.

    Notes
    -----
    The IDs are written as the ``str()`` of a Python list, e.g. ``[1, 23]``.
    """
    numeric_ids = []
    for row in subjects:
        id_text = row[0].split('-')[1]
        numeric_ids.append(int(id_text))

    out_path = f'{site}_all.txt'
    with open(out_path, 'w') as handle:
        handle.write(str(numeric_ids))

In [9]:
# One ID list per diagnostic group, pooled over every site:
# produces hc_all.txt and mdd_all.txt.
write_subjects(healthy_controls.to_numpy(), 'hc')
write_subjects(mdd_patients.to_numpy(), 'mdd')

# One ID list per shared site (<site>_all.txt): that site's healthy controls
# first, followed by its MDD patients.
for site in sites_in_common:
    healthy_controls_at_site = healthy_controls.loc[healthy_controls['site'] == site]
    mdd_patients_at_site = mdd_patients.loc[mdd_patients['site'] == site]
    subjects_at_site = pd.concat([healthy_controls_at_site, mdd_patients_at_site])
    write_subjects(subjects_at_site.to_numpy(), site)

## Outliers

In [14]:
import json

# Read the motion-outlier list from disk. The context manager guarantees the
# file handle is closed (the previous `open(...).read()` left it to the GC).
# NOTE(review): the file is presumably a JSON list of integer subject numbers,
# since it is compared against int-parsed participant IDs below - confirm.
with open('all_motion_outliers.txt', 'r') as outliers_file:
    outlier_ids = json.load(outliers_file)

In [16]:
# Select the participants whose numeric ID appears in `outlier_ids`.
# The number is parsed the same way `write_subjects` does it (the piece after
# the first '-'), rather than by slicing the last four characters, so IDs with
# a different digit count are still parsed correctly. `isin` performs the
# membership test in one vectorised call and keeps the mask aligned with the
# frame's index.
participant_numbers = participants['participant_id'].str.split('-').str[1].astype(int)
outliers_data = participants[participant_numbers.isin(outlier_ids)]
outliers_data

Unnamed: 0,participant_id,site,protocol,diag,age,sex,hand,sup
24,sub-0025,SWA,1,0,49,1,1.0,1
165,sub-0166,SWA,1,0,27,1,1.0,1
185,sub-0186,SWA,1,0,24,1,1.0,1
308,sub-0309,HUH,2,2,44,1,1.0,2
313,sub-0314,HUH,2,0,45,2,1.0,2
336,sub-0337,HUH,2,2,55,2,1.0,2
343,sub-0344,HUH,2,0,28,1,1.0,2
378,sub-0379,HRC,3,2,44,1,1.0,2
381,sub-0382,HRC,3,0,43,2,2.0,2
384,sub-0385,HRC,3,2,42,1,1.0,2


In [18]:
# Number of motion outliers per site, most frequent first.
outliers_data['site'].value_counts()

COI    26
UTO     4
KUT     4
HUH     4
HRC     4
SWA     3
ATT     2
KTT     2
Name: site, dtype: int64

In [19]:
# Number of motion outliers per `diag` code, most frequent first.
outliers_data['diag'].value_counts()

0    34
2    15
Name: diag, dtype: int64