## Notebook to subset ATAC or SCAT-PD peak features that may contain a GWAS risk variant

In [1]:
!date

Fri Apr 14 16:29:41 UTC 2023


#### import libraries

In [2]:
from pandas import read_csv
from pybedtools import BedTool

#### set notebook variables

In [3]:
# naming
day = 'da65' # daALL
modality = 'SCAT-DA' # ATAC
cohort = 'foundin'
disease = 'PD'
set_name = f'{cohort}_{day}_{modality}_{disease}'

# directories
wrk_dir = '/home/jupyter/foundin_qtl'
quants_dir = f'{wrk_dir}/quants'
public_dir = f'{wrk_dir}/public'

# in files
locus_ld_info_file = f'{public_dir}/ld_info/{cohort}_{disease}.ld_prime.csv'
if modality == 'ATAC':
    peaks_annot_file = f'{quants_dir}/{cohort}_consensus_peaks.saf'
else:
    peaks_annot_file = f'{quants_dir}/{cohort}_{day}_{modality}.means.bedgraph'    
gwas_sum_stats_file = f'{public_dir}/nalls_pd_gwas/pd_sumstats_23andme_buildGRCh38.tsv.gz'

# out files
risk_peaks_bed = f'{quants_dir}/{set_name}_risk_peaks.bed'

# variables
DEBUG = False
SIG_THRESHOLD = 5.00e-08
SUG_THRESHOLD = 1.00e-05

### load input data

#### load feature annotations

In [4]:
%%time
if modality == 'ATAC':
    features_df = read_csv(peaks_annot_file, sep='\t')
    features_df = features_df.rename(columns={'GeneID': 'feature_id'})
else:
    features_df = read_csv(peaks_annot_file, sep='\t', header=None)
    features_df.columns = ['Chr', 'Start', 'End', 'mean']
    features_df['feature_id'] = features_df.Chr + '_' + features_df.Start.astype('str') + '_' + features_df.End.astype('str')
print(features_df.shape)
if DEBUG:
    display(features_df.head())

(459495, 5)
CPU times: user 667 ms, sys: 119 ms, total: 786 ms
Wall time: 786 ms


#### load summary stats

In [5]:
%%time
gwas_df = read_csv(gwas_sum_stats_file, sep='\t')
print(gwas_df.shape)
if DEBUG:
    display(gwas_df.head())

(7769022, 12)
CPU times: user 9.84 s, sys: 880 ms, total: 10.7 s
Wall time: 10.7 s


#### load the LD variants for the loci

In [6]:
ld_df = read_csv(locus_ld_info_file)
print(ld_df.shape)
if DEBUG:
    display(ld_df.head())

(24598, 10)


### subset summary stats to only those that are signficant or suggestive and in LD with index variants
may want to have a lower significant threshold to be more fine mappy

In [7]:
risk_df = gwas_df.loc[(gwas_df.p_value <= SIG_THRESHOLD) | 
                      (gwas_df.variant_id.isin(ld_df.SNP_B) & 
                       (gwas_df.p_value <= SUG_THRESHOLD))]
print(risk_df.shape)
if DEBUG:
    display(risk_df.head())    

(10595, 12)


### find ATAC/SCAT peak features that contain a risk variant

#### convert ATAC/SCAT features dataframe to bed

In [8]:
feature_bed = BedTool.from_dataframe(features_df[['Chr', 'Start', 'End', 'feature_id']])
print(feature_bed.count())
print(feature_bed.field_count())
if DEBUG:
    display(feature_bed.to_dataframe().head())

459495
4


#### convert summary stats to bed

In [9]:
risk_bed_df = risk_df[['chromosome', 'base_pair_location', 'variant_id', 'p_value']]
risk_bed_df.chromosome = 'chr' + risk_bed_df.chromosome.astype('str')
risk_bed_df.insert(1, 'start', value=risk_bed_df.base_pair_location-1)
risk_bed = BedTool.from_dataframe(risk_bed_df)

print(risk_bed.count())
print(risk_bed.field_count())
if DEBUG:
    display(risk_bed.to_dataframe().head())

10595
5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


#### intersect the beds

In [10]:
feature_intersect = feature_bed.intersect(risk_bed)

print(feature_intersect.count())
print(feature_intersect.field_count())
if DEBUG:
    display(feature_intersect.to_dataframe().head())

1846
4


GL000195.1	22478	23222	GL000195.1_22478_23222

GL000195.1	22478	23222	GL000195.1_22478_23222



### save the bed for the ATAC features containing risk variants

In [11]:
risk_features_df = features_df.loc[features_df.feature_id.isin(feature_intersect.to_dataframe().name)]
print(f'{risk_features_df.feature_id.nunique()} unique features')
risk_features_df.to_csv(risk_peaks_bed, index=False)

1006 unique features


In [12]:
if DEBUG:
    display(risk_features_df.head())

In [13]:
risk_features_df.Chr.value_counts()

chr4     130
chr7      92
chr17     84
chr16     81
chr2      76
chr6      74
chr3      66
chr5      60
chr12     53
chr14     52
chr1      51
chr18     40
chr8      38
chr10     27
chr9      20
chr11     17
chr21     16
chr13     12
chr19      6
chr20      6
chr15      5
Name: Chr, dtype: int64

In [14]:
!date

Fri Apr 14 16:29:57 UTC 2023
