# 2. Dataset XIST-positive and -negative split

Load in datasets. Extract cells annotated as neurons or glia. Split by presence or absence of XIST in each cell. Construct pseudobulks.

In [1]:

import matplotlib.pyplot as plt
import scanpy as sc
import numpy as np
import time


In [2]:
import decoupler as dc
import pandas as pd
import sys
import anndata
import pickle


In [3]:
ads = {}

In [4]:
rt = '/home/ec2-user/curation/'
filenames = {'amrute':'ad.h5ad','brener':'ad.h5ad','chaffin':'human_dcm_hcm_scportal_03.17.2022.h5ad','eraslan':'ad.h5ad','hill':'ad.h5ad',
            'litvinukova':'ad.h5ad','kanemaru':'ad.h5ad','kuppe':'ad.h5ad','reichart':'ad.h5ad','selewa':'ad.h5ad',
             'knight_schrijver':'ad.h5ad',
            'sim':'ad.h5ad','simonson':'ICM_scportal_05.24.2022.h5ad','tucker':'healthy_human_4chamber_map_unnormalized_V4.h5ad',
            'linna_kuosmanen':['carebank.h5ad','periheart.h5ad']}

neuron_cts = ['Glia','glia','neuron','Neuron','neural','Neural','Schwann']


## Amrute

In [5]:
dataset = 'amrute'
ad = sc.read_h5ad(rt+dataset+'/'+filenames[dataset])

In [7]:
assert all(ad.raw.X.indices==ad.X.indices)
assert ad.raw.X.nnz==ad.X.nnz

In [9]:
ad.obs.rename(columns={'orig.ident':'donor_id','cell.type':'cell_type'},inplace=True)

In [10]:
ad = ad[ad.obs['condition']!='Donor'] #donors are from Koenig

Sexes and ages are not marked in the Seurat object.

In [11]:
ad.obs['sex']=None
female_donors = ['TWCM-'+x for x in ['13-1','13-104','14-173','296','370','397']]
filt = ad.obs['donor_id'].str.contains( "|".join(female_donors),regex=True)
ad.obs.loc[filt,'sex']='female'
ad.obs.loc[~filt,'sex']='male'

  ad.obs['sex']=None


In [12]:
age_map = {'190':18,'229':60,'239':44,'296':60,
           '359':46,'363':64,'370':67,'371':61,
           '373':69,'376':37,'397':47,'410':58,
           '463':29,
           '388':1000,'378':1001} #placeholders

In [13]:
ad.obs['age'] = ad.obs['donor_id'].str.split('-').str[1].map(age_map)

In [15]:
dids = ad.obs['donor_id'].str.split('-').str[1].unique()
metakeys = np.array(list(age_map.keys()))

In [16]:
ad.var.set_index('features',inplace=True)

In [17]:
filt_neur=ad.obs['cell_type'].str.contains( "|".join(neuron_cts),regex=True)
ad = ad[filt_neur]
ad.obs['xist'] = None
cf=ad[:,'XIST'].X.toarray().squeeze()>0
ad.obs.loc[cf,'xist']= 'present'
ad.obs.loc[~cf,'xist']= 'absent'

  ad.obs['xist'] = None


In [18]:
ad = dc.get_pseudobulk(ad,sample_col='donor_id',groups_col='xist',
    mode='sum',
    min_cells=0,
    min_counts=0,use_raw=True)

In [19]:
ads[dataset] = ad.copy()

## Brener

The annotated data got removed from Broad.

In [21]:
dataset = 'brener'
ad = sc.read_h5ad(rt+dataset+'/'+filenames[dataset])
ad_geo = sc.read_h5ad(rt+dataset+'/ad_geo.h5ad')

In [22]:
ad = ad[ad.obs['Virus']=='Covid']

In [23]:
assert all(ad_geo.var_names == ad.var_names)

In [24]:
assert all(ad_geo.obs_names == ad.obs_names)

In [26]:
assert ad.raw.X.nnz==ad.X.nnz
assert ad_geo.X.nnz==ad.X.nnz
assert all(ad.raw.X.indices==ad.X.indices)

In [27]:
ad_raw = ad.raw.to_adata()
ad_raw.X = ad_geo.X
ad.raw=ad_raw

In [28]:
ad.obs.rename(columns={'batches':'donor_id','leiden':'cell_type'},inplace=True)

In [29]:
ad.obs['sex']=None
filt = ad.obs['donor_id']=='15_144548-RV' #the one female sample
ad.obs.loc[filt,'sex']='female'
ad.obs.loc[~filt,'sex']='male'

In [30]:
study_id_map = {'11_144422':'19','15_144548':'66','8_144391':'05',
                '13_144471':'45','10_144421':'51','16_144608':'61',
                '10_144396':'39'}

age_map = {'05':83,'39':71,'61':58,'19':68,'45':65,'51':63,'66':69}
ad.obs['age'] = ad.obs['donor_id'].str.split('-').str[0].map(study_id_map).map(age_map)

In [31]:
filt_neur=ad.obs['cell_type'].str.contains( "|".join(neuron_cts),regex=True)
ad = ad[filt_neur]
ad.obs['xist'] = None
cf=ad[:,'XIST'].X.toarray().squeeze()>0
ad.obs.loc[cf,'xist']= 'present'
ad.obs.loc[~cf,'xist']= 'absent'

  ad.obs['xist'] = None


In [32]:
ad = dc.get_pseudobulk(ad,sample_col='donor_id',groups_col='xist',
    mode='sum',
    min_cells=0,
    min_counts=0,use_raw=True)


In [33]:
ads[dataset] = ad.copy()

## Eraslan

In [34]:
dataset = 'eraslan'
ad = sc.read_h5ad(rt+dataset+'/'+filenames[dataset])

In [35]:
ad=ad[ad.obs['tissue']=='anterior wall of left ventricle']

In [81]:
assert ad.raw.X.nnz==ad.X.nnz
assert all(ad.raw.X.indices==ad.X.indices)

AssertionError: 

In [39]:
ad.var.set_index('feature_name',inplace=True)
ad.raw.var.set_index('feature_name',inplace=True)

In [40]:
# ad.var

In [41]:
filt_neur=ad.obs['cell_type'].str.contains( "|".join(neuron_cts),regex=True)
ad = ad[filt_neur]
ad.obs['xist'] = None
cf=ad.raw[:,'XIST'].X.toarray().squeeze()>0 #note that raw has to be used here because the corrected data are imputed
ad.obs.loc[cf,'xist']= 'present'
ad.obs.loc[~cf,'xist']= 'absent'

  ad.obs['xist'] = None


In [42]:
ad = dc.get_pseudobulk(ad,sample_col='donor_id',groups_col='xist',
    mode='sum',
    min_cells=0,
    min_counts=0,use_raw=True)

In [44]:
age_map = {'GTEX-1ICG6':'70-79','GTEX-13N11':'50-59','GTEX-15RIE':'60-69'}
ad.obs['Age_bin'] = ad.obs['donor_id'].map(age_map)

In [46]:
ads[dataset] = ad.copy()

## Hill

In [47]:
dataset = 'hill'
ad = sc.read_h5ad(rt+dataset+'/'+filenames[dataset])

In [48]:
ad.obs.rename(columns={'MainCellType':'cell_type'},inplace=True)

Evidently 13_198 is duplicated (LV and RV), and UK1/2 somehow map to 3B62D and FC3CB.

In [49]:
# assert ad.raw.X.nnz==ad.X.nnz #no raw

In [50]:
# ad.var.set_index('feature_name',inplace=True)
filt_neur=ad.obs['cell_type'].str.contains( "|".join(neuron_cts),regex=True)
ad = ad[filt_neur]
ad.obs['xist'] = None
cf=ad[:,'XIST'].X.toarray().squeeze()>0
ad.obs.loc[cf,'xist']= 'present'
ad.obs.loc[~cf,'xist']= 'absent'

  ad.obs['xist'] = None


In [51]:
ad = dc.get_pseudobulk(ad,sample_col='donor_id',groups_col='xist',
    mode='sum',
    min_cells=0,
    min_counts=0,use_raw=False)

In [53]:
ad.obs['age'] = ad.obs['age'].str.split('y').str[0].astype(float)

In [54]:
ads[dataset] = ad.copy()

## Kanemaru

In [55]:
dataset = 'kanemaru'
ad = sc.read_h5ad(rt+dataset+'/'+filenames[dataset])

In [56]:
ad=ad[ad.obs['is_primary_data']]

Some donors got resequenced, and D11 was erroneously added into primary data. Per Fig. 1a, there should be 211,060 newly-generated RNA data points.

In [57]:
ad = ad[ad.obs['donor_id']!='D11']

In [58]:
assert ad.raw.X.nnz==ad.X.nnz
assert all(ad.raw.X.indices==ad.X.indices)

In [59]:
# ad.obs['Age_bin'] = ad.obs['age']

In [60]:
ad.obs.rename(columns={'age':'age_decade'},inplace=True)

AH2 is reported as 45-50 in Supp. Table but 40-45 in AnnData.  Numbers match otherwise.

In [61]:
age_map = {'A61':'70-75','AH1':'45-50','AH2':'45-50',
           'AV10':'20-25','AV13':'70-75','AV14':'45-50',
           'AV3':'60-65','D3':'55-60','D7':'60-65',
           'D8':'45-50'}
           
ad.obs['Age_bin'] = ad.obs['donor_id'].map(age_map)

  ad.obs['Age_bin'] = ad.obs['donor_id'].map(age_map)


In [62]:
ad.var.set_index('feature_name',inplace=True)


In [63]:
ad.raw.var.set_index('feature_name',inplace=True)


In [64]:
filt_neur=ad.obs['cell_type'].str.contains( "|".join(neuron_cts),regex=True)
ad = ad[filt_neur]
ad.obs['xist'] = None
cf=ad[:,'XIST'].X.toarray().squeeze()>0
ad.obs.loc[cf,'xist']= 'present'
ad.obs.loc[~cf,'xist']= 'absent'

  ad.obs['xist'] = None


In [65]:
ad = dc.get_pseudobulk(ad,sample_col='donor_id',groups_col='xist',
    mode='sum',
    min_cells=0,
    min_counts=0,use_raw=True)

In [66]:
ad.obs[['donor_id','Age_bin']].drop_duplicates()

Unnamed: 0,donor_id,Age_bin
A61_absent,A61,70-75
AH1_absent,AH1,45-50
AH2_absent,AH2,45-50
AV10_absent,AV10,20-25
AV13_absent,AV13,70-75
AV14_absent,AV14,45-50
AV3_absent,AV3,60-65
D3_absent,D3,55-60
D7_absent,D7,60-65
D8_absent,D8,45-50


In [67]:
ads[dataset] = ad.copy()

## Knight-Schrijver

In [68]:
dataset = 'knight_schrijver'
ad = sc.read_h5ad(rt+dataset+'/'+filenames[dataset])

In [69]:
ad=ad[ad.obs['is_primary_data']]

In [70]:
assert ad.raw.X.nnz==ad.X.nnz
assert all(ad.raw.X.indices==ad.X.indices)

In [71]:
ad.obs['age'] = 0 #all fetal

  ad.obs['age'] = 0 #all fetal


In [72]:
ad.var.set_index('feature_name',inplace=True)
ad.raw.var.set_index('feature_name',inplace=True)

In [73]:
filt_neur=ad.obs['cell_type'].str.contains( "|".join(neuron_cts),regex=True)
ad = ad[filt_neur]
ad.obs['xist'] = None
cf=ad[:,'XIST'].X.toarray().squeeze()>0
ad.obs.loc[cf,'xist']= 'present'
ad.obs.loc[~cf,'xist']= 'absent'

  ad.obs['xist'] = None


In [74]:
ad = dc.get_pseudobulk(ad,sample_col='donor_id',groups_col='xist',
    mode='sum',
    min_cells=0,
    min_counts=0,use_raw=True)

In [75]:
ads[dataset] = ad.copy()

## Kuppe

In [76]:
dataset = 'kuppe'
ad = sc.read_h5ad(rt+dataset+'/'+filenames[dataset])

In [77]:
assert ad.raw.X.nnz==ad.X.nnz
assert all(ad.raw.X.indices==ad.X.indices)

In [78]:
ad.obs['age'] = ad.obs['development_stage'].str.split('-').str[0].astype(float)

In [79]:
ad.var.set_index('feature_name',inplace=True)

In [80]:
filt_neur=ad.obs['cell_type'].str.contains( "|".join(neuron_cts),regex=True)
ad = ad[filt_neur]
ad.obs['xist'] = None
cf=ad[:,'XIST'].X.toarray().squeeze()>0
ad.obs.loc[cf,'xist']= 'present'
ad.obs.loc[~cf,'xist']= 'absent'

  ad.obs['xist'] = None


In [81]:
ad = dc.get_pseudobulk(ad,sample_col='donor_id',groups_col='xist',
    mode='sum',
    min_cells=0,
    min_counts=0,use_raw=True)

In [82]:
ads[dataset] = ad.copy()

## Linna-Kuosmanen

In [83]:
dataset = 'linna_kuosmanen'
ad1 = sc.read_h5ad(rt+dataset+'/'+filenames[dataset][0])
ad2 = sc.read_h5ad(rt+dataset+'/'+filenames[dataset][1])

In [84]:
assert ad1.obs['is_primary_data'].mean()==1
assert ad2.obs['is_primary_data'].mean()==1

In [85]:
ad1.var.set_index('feature_name',inplace=True)
ad2.var.set_index('feature_name',inplace=True)

In [86]:
ad = anndata.concat((ad1,ad2),axis=0)

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


In [87]:
ad.obs_names_make_unique()

In [88]:
assert all(ad.raw.X.indices==ad.X.indices)
assert ad.raw.X.nnz==ad.X.nnz

In [89]:
age_map = {'pediatric stage':'0-9','postnatal stage':'10-19','third decade stage':'20-29',
           'fourth decade stage':'30-39','fifth decade stage':'40-49','sixth decade stage':'50-59',
           'seventh decade stage':'60-69','eighth decade stage':'70-79','ninth decade stage':'80-89'}

In [90]:
ad.obs['development_stage'].value_counts(dropna=False)

development_stage
seventh decade stage    246548
eighth decade stage     166929
sixth decade stage      106508
fifth decade stage       46551
ninth decade stage       46526
Name: count, dtype: int64

In [91]:
ad.obs['Age_bin']  = ad.obs['development_stage'].map(age_map)

In [92]:
filt_neur=ad.obs['cell_type'].str.contains( "|".join(neuron_cts),regex=True)
ad = ad[filt_neur]
ad.obs['xist'] = None
cf=ad[:,'XIST'].X.toarray().squeeze()>0
ad.obs.loc[cf,'xist']= 'present'
ad.obs.loc[~cf,'xist']= 'absent'

  ad.obs['xist'] = None


In [93]:
ad = dc.get_pseudobulk(ad,sample_col='donor_id',groups_col='xist',
    mode='sum',
    min_cells=0,
    min_counts=0,use_raw=True)

In [94]:
ads[dataset] = ad.copy()

## Litvinukova

In [95]:
dataset = 'litvinukova'
ad = sc.read_h5ad(rt+dataset+'/'+filenames[dataset])

In [96]:
assert all(ad.raw.X.indices==ad.X.indices)
assert ad.raw.X.nnz==ad.X.nnz 

Bins in the AnnData are coarser than in Supplementary Table 1.

In [97]:
age_map = {'D1':'50-55','D2':'60-65','D3':'55-60',
           'D4':'70-75','D5':'65-70','D6':'70-75',
           'D7':'60-65','D11':'60-65','H2':'50-55',
           'H3':'50-55','H4':'55-60','H5':'50-55',
           'H6':'40-45','H7':'45-50'}
           
ad.obs['Age_bin'] = ad.obs['donor_id'].map(age_map)

In [98]:
ad.var.set_index('feature_name',inplace=True)

In [99]:
filt_neur=ad.obs['cell_type'].str.contains( "|".join(neuron_cts),regex=True)
ad = ad[filt_neur]
ad.obs['xist'] = None
cf=ad[:,'XIST'].X.toarray().squeeze()>0
ad.obs.loc[cf,'xist']= 'present'
ad.obs.loc[~cf,'xist']= 'absent'

  ad.obs['xist'] = None


In [100]:
ad = dc.get_pseudobulk(ad,sample_col='donor_id',groups_col='xist',
    mode='sum',
    min_cells=0,
    min_counts=0,use_raw=True)

In [101]:
ads[dataset] = ad.copy()

## Reichart

In [102]:
dataset = 'reichart'
ad = sc.read_h5ad(rt+dataset+'/'+filenames[dataset])

In [103]:
ad=ad[ad.obs['is_primary_data']]

In [104]:
assert ad.raw.X.nnz==ad.X.nnz
assert all(ad.raw.X.indices==ad.X.indices)

In [105]:
age_map = {'pediatric stage':'0-9','postnatal stage':'10-19','third decade stage':'20-29',
           'fourth decade stage':'30-39','fifth decade stage':'40-49','sixth decade stage':'50-59',
           'seventh decade stage':'60-69','eighth decade stage':'70-79'}

In [106]:
ad.obs['Age_bin']  = ad.obs['development_stage'].map(age_map)

  ad.obs['Age_bin']  = ad.obs['development_stage'].map(age_map)


In [107]:
ad.var.set_index('feature_name',inplace=True)

In [108]:
filt_neur=ad.obs['cell_type'].str.contains( "|".join(neuron_cts),regex=True)
ad = ad[filt_neur]
ad.obs['xist'] = None
cf=ad[:,'XIST'].X.toarray().squeeze()>0
ad.obs.loc[cf,'xist']= 'present'
ad.obs.loc[~cf,'xist']= 'absent'

  ad.obs['xist'] = None


In [109]:
ad = dc.get_pseudobulk(ad,sample_col='donor_id',groups_col='xist',
    mode='sum',
    min_cells=0,
    min_counts=0,use_raw=True)

In [110]:
ads[dataset] = ad.copy()

## Selewa

In [111]:
dataset = 'selewa'
ad = sc.read_h5ad(rt+dataset+'/'+filenames[dataset])

In [112]:
ad=ad[ad.obs['is_primary_data']]

In [113]:
assert ad.raw.X.nnz==ad.X.nnz
assert all(ad.raw.X.indices==ad.X.indices)

AssertionError: 

In [114]:
ad.obs['age'] = ad.obs['development_stage'].str.split('-').str[0].astype(float)

  ad.obs['age'] = ad.obs['development_stage'].str.split('-').str[0].astype(float)


In [115]:
ad.var.set_index('feature_name',inplace=True)
ad.raw.var.set_index('feature_name',inplace=True)

In [116]:
filt_neur=ad.obs['cell_type'].str.contains( "|".join(neuron_cts),regex=True)
ad = ad[filt_neur]
ad.obs['xist'] = None
cf=ad.raw[:,'XIST'].X.toarray().squeeze()>0
ad.obs.loc[cf,'xist']= 'present'
ad.obs.loc[~cf,'xist']= 'absent'

  ad.obs['xist'] = None


In [117]:
ad = dc.get_pseudobulk(ad,sample_col='donor_id',groups_col='xist',
    mode='sum',
    min_cells=0,
    min_counts=0,use_raw=True)

In [118]:
ads[dataset] = ad.copy()

## Sim

In [119]:
dataset = 'sim'
ad = sc.read_h5ad(rt+dataset+'/'+filenames[dataset])

In [120]:
ad=ad[ad.obs['is_primary_data']]

In [121]:
assert ad.raw.X.nnz==ad.X.nnz
assert all(ad.raw.X.indices==ad.X.indices)

AssertionError: 

In [122]:
filt = ~ad.obs['sample_id'].str.startswith('fetal')
ad.obs['age'] = None
ad.obs.loc[filt,'age'] = ad.obs.loc[filt,'development_stage'].str.split('-').str[0].astype(float)
ad.obs.loc[~filt,'age'] = 0

  ad.obs['age'] = None


In [123]:
ad.var.set_index('feature_name',inplace=True)
ad.raw.var.set_index('feature_name',inplace=True)

In [124]:
filt_neur=ad.obs['cell_type'].str.contains( "|".join(neuron_cts),regex=True)
ad = ad[filt_neur]
ad.obs['xist'] = None
cf=ad.raw[:,'XIST'].X.toarray().squeeze()>0 #imputed
ad.obs.loc[cf,'xist']= 'present'
ad.obs.loc[~cf,'xist']= 'absent'

  ad.obs['xist'] = None


In [125]:
ad = dc.get_pseudobulk(ad,sample_col='donor_id',groups_col='xist',
    mode='sum',
    min_cells=0,
    min_counts=0,use_raw=True)

In [126]:
ads[dataset] = ad.copy()

## Simonson

In [127]:
dataset = 'simonson'
ad = sc.read_h5ad(rt+dataset+'/'+filenames[dataset])

In [128]:
ad.obs.rename(columns={'cell_type_leiden0.5':'cell_type'},inplace=True)

In [129]:
# assert ad.raw.X.nnz==ad.X.nnz #no raw

In [130]:
filt_neur=ad.obs['cell_type'].str.contains( "|".join(neuron_cts),regex=True)
ad = ad[filt_neur]
ad.obs['xist'] = None
cf=ad[:,'XIST'].X.toarray().squeeze()>0
ad.obs.loc[cf,'xist']= 'present'
ad.obs.loc[~cf,'xist']= 'absent'

  ad.obs['xist'] = None


In [131]:
ad = dc.get_pseudobulk(ad,sample_col='donor_id',groups_col='xist',
    mode='sum',
    min_cells=0,
    min_counts=0,use_raw=False)

In [132]:
ads[dataset] = ad.copy()

## Tucker

In [133]:
dataset = 'tucker'
ad = sc.read_h5ad(rt+dataset+'/'+filenames[dataset])

In [134]:
ad.obs.rename(columns={'Cluster':'cell_type','biological.individual':'donor_id'},inplace=True)

In [135]:
# assert ad.raw.X.nnz==ad.X.nnz # no raw

In [136]:
for x in ad.obs['donor_id'].unique(): print(x)

1600
1666
1681
1702
1708
1723
1221


Sexes and ages need to be read off Table 1 manually.

In [137]:
ad.obs['sex']=None
female_donors = ['1221','1600','1708','1723']
filt = ad.obs['donor_id'].str.contains( "|".join(female_donors),regex=True)
ad.obs.loc[filt,'sex']='female'
ad.obs.loc[~filt,'sex']='male'

In [138]:
age_map = {'1221':52,'1600':51,'1666':54,'1681':39,'1702':59,'1708':60,'1723':47}
ad.obs['age'] = ad.obs['donor_id'].map(age_map)

In [139]:
filt_neur=ad.obs['cell_type'].str.contains( "|".join(neuron_cts),regex=True)
ad = ad[filt_neur]
ad.obs['xist'] = None
cf=ad[:,'XIST'].X.toarray().squeeze()>0
ad.obs.loc[cf,'xist']= 'present'
ad.obs.loc[~cf,'xist']= 'absent'

  ad.obs['xist'] = None


In [140]:
ad = dc.get_pseudobulk(ad,sample_col='donor_id',groups_col='xist',
    mode='sum',
    min_cells=0,
    min_counts=0,use_raw=False)

In [141]:
ads[dataset] = ad.copy()

# Export

In [142]:
for x in ads.keys():
    print(x)
    assert ((ads[x].X%1)==0).all()
for x in ads.keys():
    print(x)
    print(ads[x].obs['xist'].value_counts())

amrute
brener
eraslan
hill
kanemaru
knight_schrijver
kuppe
linna_kuosmanen
litvinukova
reichart
selewa
sim
simonson
tucker
amrute
xist
absent     25
present    25
Name: count, dtype: int64
brener
xist
absent     7
present    7
Name: count, dtype: int64
eraslan
xist
absent     3
present    3
Name: count, dtype: int64
hill
xist
present    13
absent      9
Name: count, dtype: int64
kanemaru
xist
absent     10
present     9
Name: count, dtype: int64
knight_schrijver
xist
absent     6
present    4
Name: count, dtype: int64
kuppe
xist
absent     20
present    16
Name: count, dtype: int64
linna_kuosmanen
xist
absent     97
present    90
Name: count, dtype: int64
litvinukova
xist
absent     14
present    14
Name: count, dtype: int64
reichart
xist
absent     67
present    67
Name: count, dtype: int64
selewa
xist
absent     3
present    3
Name: count, dtype: int64
sim
xist
absent     8
present    8
Name: count, dtype: int64
simonson
xist
present    15
absent     14
Name: count, dtype: int64
tuck

In [143]:
with open('/home/ec2-user/curation/heart_datasets/gg_250327_heart_data_split_by_xist.p', 'wb') as fp:
    pickle.dump(ads, fp)


In [145]:
!md5sum /home/ec2-user/curation/heart_datasets/gg_250327_heart_data_split_by_xist.p

f2b7d1752509686c721e2bca40b7a658  /home/ec2-user/curation/heart_datasets/gg_250327_heart_data_split_by_xist.p
