Skip to content

Commit

Permalink
p-value probe detection work in progress
Browse files Browse the repository at this point in the history
  • Loading branch information
marcmaxson committed Nov 13, 2019
1 parent 7688609 commit 8f09418
Show file tree
Hide file tree
Showing 3 changed files with 625 additions and 1 deletion.
2 changes: 1 addition & 1 deletion index.rst
Expand Up @@ -14,8 +14,8 @@ methylcheck documentation
docs/methylprep_methylcheck_example.ipynb
docs/another-methylcheck-qc-example.ipynb
docs/demo_using_matched_meta_data.ipynb
docs/source/methylcheck
methylprep package <https://life-epigenetics-methylprep.readthedocs-hosted.com/en/latest/>
docs/source/methylcheck
methylize (analysis) package <https://life-epigenetics-methylize.readthedocs-hosted.com/en/latest/>


Expand Down
162 changes: 162 additions & 0 deletions methylcheck/p_value_probe_detection.py
@@ -0,0 +1,162 @@
from methylprep import run_pipeline
from methylcheck import mean_beta_compare
from scipy import stats
from statsmodels.distributions.empirical_distribution import ECDF
import pandas as pd
import numpy as np


def pval_minfi(data_containers):
# Pull M and U values
meth = pd.DataFrame(data_containers[0]._SampleDataContainer__data_frame.index)
unmeth = pd.DataFrame(data_containers[0]._SampleDataContainer__data_frame.index)

for i,c in enumerate(data_containers):
sample = data_containers[i].sample
m = c._SampleDataContainer__data_frame.rename(columns={'meth':sample})
u = c._SampleDataContainer__data_frame.rename(columns={'unmeth':sample})
meth = pd.merge(left=meth,right=m[sample],left_on='IlmnID',right_on='IlmnID',)
unmeth = pd.merge(left=unmeth,right=u[sample],left_on='IlmnID',right_on='IlmnID')

# Create empty dataframes for red and green negative controls
negctlsR = pd.DataFrame(data_containers[0].ctrl_red['Extended_Type'])
negctlsG = pd.DataFrame(data_containers[0].ctrl_green['Extended_Type'])

# Fill red and green dataframes
for i,c in enumerate(data_containers):
sample = str(data_containers[i].sample)
dfR = c.ctrl_red
dfR = dfR[dfR['Control_Type']=='NEGATIVE']
dfR = dfR[['Extended_Type','mean_value']].rename(columns={'mean_value':sample})
dfG = c.ctrl_green
dfG = dfG[dfG['Control_Type']=='NEGATIVE']
dfG = dfG[['Extended_Type','mean_value']].rename(columns={'mean_value':sample})
negctlsR = pd.merge(left=negctlsR,right=dfR,on='Extended_Type')
negctlsG = pd.merge(left=negctlsG,right=dfG,on='Extended_Type')

# Reset index on dataframes
negctlsG = negctlsG.set_index('Extended_Type')
negctlsR = negctlsR.set_index('Extended_Type')

# Get M and U values for IG, IR and II

# first pull out sections of manifest (will be used to identify which probes belong to each IG, IR, II)
manifest = data_containers[0].manifest.data_frame[['Infinium_Design_Type','Color_Channel']]
IG = manifest[(manifest['Color_Channel']=='Grn') & (manifest['Infinium_Design_Type']=='I')]
IR = manifest[(manifest['Color_Channel']=='Red') & (manifest['Infinium_Design_Type']=='I')]
II = manifest[manifest['Infinium_Design_Type']=='II']

# second merge with meth and unmeth dataframes
IG_meth = pd.merge(left=IG,right=meth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID')
IG_unmeth = pd.merge(left=IG,right=unmeth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID')
IR_meth = pd.merge(left=IR,right=meth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID')
IR_unmeth = pd.merge(left=IR,right=unmeth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID')
II_meth = pd.merge(left=II,right=meth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID')
II_unmeth = pd.merge(left=II,right=unmeth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID')

# Calcuate parameters
sdG = stats.median_absolute_deviation(negctlsG)
muG = np.median(negctlsG,axis=0)
sdR = stats.median_absolute_deviation(negctlsR)
muR = np.median(negctlsR,axis=0)

# calculate p values for type 1 Red
pIR = pd.DataFrame(index=IR_meth.index,
data=1 - stats.norm.cdf(IR_meth+IR_unmeth,2*muR,2*sdR),
columns=IR_meth.columns)

# calculate p values for type 1 Green
pIG = pd.DataFrame(index=IG_meth.index,
data=1 - stats.norm.cdf(IG_meth+IG_unmeth,2*muG,2*sdG),
columns=IG_meth.columns)

# calculat4e p values for type II
pII = pd.DataFrame(index=II_meth.index,
data=1-stats.norm.cdf(II_meth+II_unmeth,muR+muG,sdR+sdG),
columns=II_meth.columns)
# concat and sort
pval = pd.concat([pIR, pIG, pII])
pval = pval.sort_values(by='IlmnID')
return pval

def pval_sesame(data_containers):
# Pull M and U values
meth = pd.DataFrame(data_containers[0]._SampleDataContainer__data_frame.index)
unmeth = pd.DataFrame(data_containers[0]._SampleDataContainer__data_frame.index)

for i,c in enumerate(data_containers):
sample = data_containers[i].sample
m = c._SampleDataContainer__data_frame.rename(columns={'meth':sample})
u = c._SampleDataContainer__data_frame.rename(columns={'unmeth':sample})
meth = pd.merge(left=meth,right=m[sample],left_on='IlmnID',right_on='IlmnID',)
unmeth = pd.merge(left=unmeth,right=u[sample],left_on='IlmnID',right_on='IlmnID')

# Separate M and U values for IG, IR and II
# first pull out sections of manifest (will be used to identify which probes belong to each IG, IR, II)
manifest = data_containers[0].manifest.data_frame[['Infinium_Design_Type','Color_Channel']]
IG = manifest[(manifest['Color_Channel']=='Grn') & (manifest['Infinium_Design_Type']=='I')]
IR = manifest[(manifest['Color_Channel']=='Red') & (manifest['Infinium_Design_Type']=='I')]
II = manifest[manifest['Infinium_Design_Type']=='II']

# second merge with meth and unmeth dataframes
IG_meth = pd.merge(left=IG,right=meth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID')
IG_unmeth = pd.merge(left=IG,right=unmeth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID')
IR_meth = pd.merge(left=IR,right=meth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID')
IR_unmeth = pd.merge(left=IR,right=unmeth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID')
II_meth = pd.merge(left=II,right=meth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID')
II_unmeth = pd.merge(left=II,right=unmeth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID')

pval = pd.DataFrame(data=manifest.index,columns=['IlmnID'])
for i,c in enumerate(data_containers):
funcG = ECDF(data_containers[i].oob_green['mean_value'].values)
funcR = ECDF(data_containers[i].oob_red['mean_value'].values)
sample = data_containers[i].sample
pIR = pd.DataFrame(index=IR_meth.index,data=1-np.maximum(funcR(IR_meth[sample]), funcR(IR_unmeth[sample])),columns=[sample])
pIG = pd.DataFrame(index=IG_meth.index,data=1-np.maximum(funcG(IG_meth[sample]), funcG(IG_unmeth[sample])),columns=[sample])
pII = pd.DataFrame(index=II_meth.index,data=1-np.maximum(funcG(II_meth[sample]), funcR(II_unmeth[sample])),columns=[sample])
p = pd.concat([pIR,pIG,pII]).reset_index()
pval = pd.merge(pval,p)
return pval

def detect_probes(data_containers, method='sesame', save=False, silent=True):
"""
About:
a wrapper for the p-value probe detection methods. Tries to check inputs and rationalize them
with methyl-suite's standard data objects.
Inputs:
a list of sample data_containers. The dataframe must have a 'IlMnID' for the index of probe names.
And probe names should be `cgxxxxxx` format to work with other methylcheck functions
To create this, use:
data_containers = methylprep.run_pipeline(data_dir,
save_uncorrected=True,
betas=True)
(if there are more than 200 samples, you'll need to load the data from disk instead, as nothing will be returned)
method:
sesame -- use the sesame ported algorithm
minfi -- use the minfi ported algorithm
Checks:
data_containers must have 'meth' and 'unmeth' columns (uncorrected data)
the values for these columns should be between 0 and 10000s
(beta: 0 to 1; m_value: -neg to pos+ range)
Returns:
dataframe of p-value filtered probes
"""
if not ('unmeth' in data_containers[0]._SampleDataContainer__data_frame.columns and
'meth' in data_containers[0]._SampleDataContainer__data_frame.columns):
raise ValueError("Provide a list of data_containers that includes uncorrected data (with 'meth' and 'unmeth' columns, using the 'save_uncorrected' option in run_pipeline)")
if method == 'minfi':
pval = pval_minfi(data_containers)
else:
pval = pval_sesame(data_containers)

if silent == False:
# plot it
# df1 and df2 are probe X sample_id matrices
mean_beta_compare(df1, df2, save=save, verbose=False, silent=silent)
return pval

0 comments on commit 8f09418

Please sign in to comment.