#### Notebook to compare the significant eQTS signals for a cohort by visit

In [1]:
# print the current date/time via the shell so the run is timestamped in the output
!date

Mon May 10 21:32:50 UTC 2021


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import concurrent.futures

In [3]:
# parameters
# cohort to process and the visit months whose per-visit result files are combined;
# the commented-out pair below is the alternative ppmi configuration
# cohort = 'ppmi'
# months = [0, 6, 12, 24, 36]
cohort = 'pdbp'
months = [0, 6, 12, 18, 24]
# tissue code used when building the per-visit and combined file names
# (presumably 'wb' = whole blood — TODO confirm)
tissue = 'wb'
# NOTE(review): the recorded output of the load cell lists months 0, 6, 12, 24, 36,
# which matches the commented ppmi months rather than the active list above — the
# saved outputs may be stale; confirm by re-running from a fresh kernel

In [4]:
# naming

# directories
home_dir = '/home/jupyter'
wrk_dir = f'{home_dir}/{cohort}'
eqts_dir = f'{wrk_dir}/eqts'

# input files

# out files
results_file = f'{eqts_dir}/{cohort}.{tissue}.eqts.csv'

# constants
autosomes = [str(x) for x in list(range(1,23))]
capture_out = !(nproc)
max_threads = int(capture_out[0])

#### load the results

In [5]:
# Load the per-visit eQTS result files and stack them into one frame, tagging
# each row with the month it came from.
# The per-visit frames are collected in a list and concatenated once after the
# loop; re-concatenating the growing frame on every pass copies all earlier rows
# each time and depended on pd.concat silently dropping the initial None.
monthly_frames = []
total_rows = 0
for month in months:
    cohort_build = f'{cohort}.{tissue}{month}'
    eqts_file = f'{eqts_dir}/{cohort_build}.eqts.csv'
    this_df = pd.read_csv(eqts_file)
    # count distinct Name values (NaN, if present, counts as one value)
    gene_cnt = this_df['Name'].nunique(dropna=False)
    print(f'month {month} shape is {this_df.shape} for {gene_cnt} genes')
    this_df['month'] = month
    monthly_frames.append(this_df)
    total_rows += len(this_df)
    print(f'after month {month} total rows {total_rows}')

# single concatenation; row order and per-file index labels are kept as read
eqts_df = pd.concat(monthly_frames)

month 0 shape is (16235, 7) for 16235 genes
after month 0 total shape (16235, 8)
month 6 shape is (16766, 7) for 16766 genes
after month 6 total shape (33001, 8)
month 12 shape is (16386, 7) for 16386 genes
after month 12 total shape (49387, 8)
month 24 shape is (16274, 7) for 16274 genes
after month 24 total shape (65661, 8)
month 36 shape is (15707, 7) for 15707 genes
after month 36 total shape (81368, 8)


In [6]:
# dimensions of the combined frame, followed by five randomly chosen rows
print(eqts_df.shape)
display(eqts_df.sample(n=5))

(81368, 8)


Unnamed: 0,Name,coef,stderr,r2_adj,term_cnt,p-value,bh_fdr,month
8426,ENSG00000116922.14,0.409776,0.243466,0.080827,7.0,0.092776,0.257317,6
14731,ENSG00000107438.8,0.765187,0.182569,0.085644,7.0,3e-05,0.000383,0
14059,ENSG00000266712.3,-0.321905,0.311251,0.001216,7.0,0.301402,0.585608,24
1208,ENSG00000198995.3,0.429811,0.271622,0.053768,7.0,0.113982,0.269705,12
1926,ENSG00000254231.2,0.390948,0.291604,0.102977,7.0,0.180478,0.450268,24


In [7]:
# first count how often each Name occurs in the combined frame,
# then tally how many Names share each occurrence count
name_counts = eqts_df['Name'].value_counts()
name_counts.value_counts()

5    15680
4      461
1      355
3      171
2      128
Name: Name, dtype: int64

#### subset to significant based on FDR

In [8]:
# keep only the rows whose bh_fdr value is below 0.05
sig_df = eqts_df.loc[eqts_df['bh_fdr'] < 0.05]
print(sig_df.shape)
# DataFrame.sample raises ValueError when asked for more rows than exist
# (without replacement), so cap the preview at the number of significant rows;
# otherwise a small result set would stop the notebook before the save step
display(sig_df.sample(n=min(5, len(sig_df))))

(11062, 8)


Unnamed: 0,Name,coef,stderr,r2_adj,term_cnt,p-value,bh_fdr,month
253,ENSG00000138668.18,1.116282,0.278371,0.020456,7.0,6.676491e-05,0.001779,12
7866,ENSG00000225101.6,0.551378,0.161391,0.258961,7.0,0.0006560429,0.004557,0
10338,ENSG00000164897.12,-1.243751,0.249367,0.042443,7.0,7.613772e-07,7.4e-05,6
13349,ENSG00000279529.1,-1.102299,0.248901,0.061669,7.0,1.090481e-05,0.000451,6
10084,ENSG00000145740.18,0.547633,0.190043,0.019319,7.0,0.004027638,0.018677,0


In [9]:
# number of significant rows contributed by each month
sig_df.loc[:, 'month'].value_counts()

0     4678
12    2819
6     2732
24     833
Name: month, dtype: int64

In [10]:
# count how often each Name is significant, then tally how many Names
# share each occurrence count
sig_name_counts = sig_df['Name'].value_counts()
sig_name_counts.value_counts()

1    4058
2    2095
3     774
4     123
Name: Name, dtype: int64

In [11]:
# the five Names with the highest occurrence count among the significant rows
sig_df['Name'].value_counts().head(n=5)

ENSG00000167685.14    4
ENSG00000207181.1     4
ENSG00000276890.1     4
ENSG00000161618.9     4
ENSG00000276645.1     4
Name: Name, dtype: int64

In [12]:
# per-month tally of the significant rows, shown once more ahead of the save step
month_col = sig_df['month']
month_col.value_counts()

0     4678
12    2819
6     2732
24     833
Name: month, dtype: int64

#### save the combined significant eQTS results

In [13]:
# write the significant rows to the combined results file, omitting the index
sig_df.to_csv(path_or_buf=results_file, index=False)