## Notebook to subset ATAC peak features that contain a risk variant

In [1]:
!date

Mon Oct  3 20:25:49 UTC 2022


#### import libraries

In [2]:
from pandas import read_csv, read_parquet, read_hdf
from pybedtools import BedTool

#### set notebook variables

In [3]:
# naming
day = 'daALL'
modality = 'ATAC'
cohort = 'foundin'
disease = 'PD'
set_name = f'{cohort}_{day}'

# directories
wrk_dir = '/home/jupyter/epigenetics'
quants_dir = f'{wrk_dir}/quants'
public_dir = f'{wrk_dir}/public'

# in files
locus_ld_info_file = f'{public_dir}/ld_info/{cohort}_{disease}.ld_prime.csv'
quants_in_file = f'{quants_dir}/{set_name}_{modality}.scaled.adj.hdf5'
peaks_annot_file = f'{quants_dir}/{cohort}_consensus_peaks.saf'
gwas_file = f'{public_dir}/nalls_pd_gwas/pdmeta_sumstats_hg38_no23andme.parquet'

# out files
risk_peaks_bed = f'{quants_dir}/{cohort}_risk_peaks.bed'
quants_out_file = f'{quants_dir}/{set_name}.risk.csv'

# variables
DEBUG = False
SIG_THRESHOLD = 5.00e-08
SUG_THRESHOLD = 1.00e-05

### load input data

#### load feature annotations

In [4]:
%%time
features_df = read_csv(peaks_annot_file, sep='\t')
print(features_df.shape)
if DEBUG:
    display(features_df.head())

(201820, 5)
CPU times: user 133 ms, sys: 12.3 ms, total: 146 ms
Wall time: 145 ms


#### load summary stats

In [5]:
%%time
gwas_df = read_parquet(gwas_file)
print(gwas_df.shape)
if DEBUG:
    display(gwas_df.head())

(16293588, 14)
CPU times: user 7.39 s, sys: 1.89 s, total: 9.28 s
Wall time: 5.99 s


#### load the feature quantifications

In [6]:
%%time
quants_df = read_hdf(quants_in_file)
print(quants_df.shape)
if DEBUG:
    display(quants_df.head())

(290, 201627)
CPU times: user 565 ms, sys: 249 ms, total: 814 ms
Wall time: 812 ms


#### load the LD variants for the loci

In [7]:
ld_df = read_csv(locus_ld_info_file)
print(ld_df.shape)
if DEBUG:
    display(ld_df.head())

(24598, 10)


### subset summary stats to only those that are signficant or suggestive and in LD with index variants
may want to have a lower significant threshold to be more fine mappy

In [8]:
risk_df = gwas_df.loc[(gwas_df.p_value <= SIG_THRESHOLD) | 
                      (gwas_df.variant_id.isin(ld_df.SNP_B) & 
                       (gwas_df.p_value <= SUG_THRESHOLD))]
print(risk_df.shape)
if DEBUG:
    display(risk_df.head())    

(4238, 14)


### find ATAC peak features that contain a risk variant

#### convert ATAC features dataframe to bed

In [9]:
feature_bed = BedTool.from_dataframe(features_df[['Chr', 'Start', 'End', 'GeneID']])
print(feature_bed.count())
print(feature_bed.field_count())
if DEBUG:
    display(feature_bed.to_dataframe().head())

201820
4


#### convert summary stats to bed

In [10]:
risk_bed_df = risk_df[['chromosome', 'base_pair_location', 'variant_id', 'p_value']]
risk_bed_df.chromosome = 'chr' + risk_bed_df.chromosome.astype('str')
risk_bed_df.insert(1, 'start', value=risk_bed_df.base_pair_location-1)
risk_bed = BedTool.from_dataframe(risk_bed_df)

print(risk_bed.count())
print(risk_bed.field_count())
if DEBUG:
    display(risk_bed.to_dataframe().head())

4238
5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


#### intersect the beds

In [11]:
feature_intersect = feature_bed.intersect(risk_bed)

print(feature_intersect.count())
print(feature_intersect.field_count())
if DEBUG:
    display(feature_intersect.to_dataframe().head())

445
4


### save the bed for the ATAC features containing risk variants

In [12]:
risk_features_df = feature_intersect.to_dataframe()
print(f'{risk_features_df.name.nunique()} unique features')
risk_features_df.to_csv(risk_peaks_bed, index=False)

183 unique features


In [13]:
if DEBUG:
    display(risk_features_df.head())

In [14]:
risk_quants_df = quants_df[risk_features_df.name.unique()]
print(risk_quants_df.shape)
if DEBUG:
    display(risk_quants_df.head())

(290, 183)


#### save the ATAC peak feature quantifications for peaks containing risk variants

In [15]:
risk_quants_df.to_csv(quants_out_file)

In [16]:
risk_features_df.chrom.value_counts()

chr17    214
chr4      80
chr16     56
chr7      21
chr1      13
chr11     13
chr5      12
chr12      6
chr8       6
chr2       5
chr9       5
chr3       4
chr6       4
chr15      3
chr21      2
chr10      1
Name: chrom, dtype: int64

In [17]:
!date

Mon Oct  3 20:26:00 UTC 2022
