Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
p-value probe detection work in progress
- Loading branch information
1 parent
7688609
commit 8f09418
Showing
3 changed files
with
625 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,162 @@ | ||
from methylprep import run_pipeline | ||
from methylcheck import mean_beta_compare | ||
from scipy import stats | ||
from statsmodels.distributions.empirical_distribution import ECDF | ||
import pandas as pd | ||
import numpy as np | ||
|
||
|
||
def pval_minfi(data_containers): | ||
# Pull M and U values | ||
meth = pd.DataFrame(data_containers[0]._SampleDataContainer__data_frame.index) | ||
unmeth = pd.DataFrame(data_containers[0]._SampleDataContainer__data_frame.index) | ||
|
||
for i,c in enumerate(data_containers): | ||
sample = data_containers[i].sample | ||
m = c._SampleDataContainer__data_frame.rename(columns={'meth':sample}) | ||
u = c._SampleDataContainer__data_frame.rename(columns={'unmeth':sample}) | ||
meth = pd.merge(left=meth,right=m[sample],left_on='IlmnID',right_on='IlmnID',) | ||
unmeth = pd.merge(left=unmeth,right=u[sample],left_on='IlmnID',right_on='IlmnID') | ||
|
||
# Create empty dataframes for red and green negative controls | ||
negctlsR = pd.DataFrame(data_containers[0].ctrl_red['Extended_Type']) | ||
negctlsG = pd.DataFrame(data_containers[0].ctrl_green['Extended_Type']) | ||
|
||
# Fill red and green dataframes | ||
for i,c in enumerate(data_containers): | ||
sample = str(data_containers[i].sample) | ||
dfR = c.ctrl_red | ||
dfR = dfR[dfR['Control_Type']=='NEGATIVE'] | ||
dfR = dfR[['Extended_Type','mean_value']].rename(columns={'mean_value':sample}) | ||
dfG = c.ctrl_green | ||
dfG = dfG[dfG['Control_Type']=='NEGATIVE'] | ||
dfG = dfG[['Extended_Type','mean_value']].rename(columns={'mean_value':sample}) | ||
negctlsR = pd.merge(left=negctlsR,right=dfR,on='Extended_Type') | ||
negctlsG = pd.merge(left=negctlsG,right=dfG,on='Extended_Type') | ||
|
||
# Reset index on dataframes | ||
negctlsG = negctlsG.set_index('Extended_Type') | ||
negctlsR = negctlsR.set_index('Extended_Type') | ||
|
||
# Get M and U values for IG, IR and II | ||
|
||
# first pull out sections of manifest (will be used to identify which probes belong to each IG, IR, II) | ||
manifest = data_containers[0].manifest.data_frame[['Infinium_Design_Type','Color_Channel']] | ||
IG = manifest[(manifest['Color_Channel']=='Grn') & (manifest['Infinium_Design_Type']=='I')] | ||
IR = manifest[(manifest['Color_Channel']=='Red') & (manifest['Infinium_Design_Type']=='I')] | ||
II = manifest[manifest['Infinium_Design_Type']=='II'] | ||
|
||
# second merge with meth and unmeth dataframes | ||
IG_meth = pd.merge(left=IG,right=meth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID') | ||
IG_unmeth = pd.merge(left=IG,right=unmeth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID') | ||
IR_meth = pd.merge(left=IR,right=meth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID') | ||
IR_unmeth = pd.merge(left=IR,right=unmeth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID') | ||
II_meth = pd.merge(left=II,right=meth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID') | ||
II_unmeth = pd.merge(left=II,right=unmeth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID') | ||
|
||
# Calcuate parameters | ||
sdG = stats.median_absolute_deviation(negctlsG) | ||
muG = np.median(negctlsG,axis=0) | ||
sdR = stats.median_absolute_deviation(negctlsR) | ||
muR = np.median(negctlsR,axis=0) | ||
|
||
# calculate p values for type 1 Red | ||
pIR = pd.DataFrame(index=IR_meth.index, | ||
data=1 - stats.norm.cdf(IR_meth+IR_unmeth,2*muR,2*sdR), | ||
columns=IR_meth.columns) | ||
|
||
# calculate p values for type 1 Green | ||
pIG = pd.DataFrame(index=IG_meth.index, | ||
data=1 - stats.norm.cdf(IG_meth+IG_unmeth,2*muG,2*sdG), | ||
columns=IG_meth.columns) | ||
|
||
# calculat4e p values for type II | ||
pII = pd.DataFrame(index=II_meth.index, | ||
data=1-stats.norm.cdf(II_meth+II_unmeth,muR+muG,sdR+sdG), | ||
columns=II_meth.columns) | ||
# concat and sort | ||
pval = pd.concat([pIR, pIG, pII]) | ||
pval = pval.sort_values(by='IlmnID') | ||
return pval | ||
|
||
def pval_sesame(data_containers): | ||
# Pull M and U values | ||
meth = pd.DataFrame(data_containers[0]._SampleDataContainer__data_frame.index) | ||
unmeth = pd.DataFrame(data_containers[0]._SampleDataContainer__data_frame.index) | ||
|
||
for i,c in enumerate(data_containers): | ||
sample = data_containers[i].sample | ||
m = c._SampleDataContainer__data_frame.rename(columns={'meth':sample}) | ||
u = c._SampleDataContainer__data_frame.rename(columns={'unmeth':sample}) | ||
meth = pd.merge(left=meth,right=m[sample],left_on='IlmnID',right_on='IlmnID',) | ||
unmeth = pd.merge(left=unmeth,right=u[sample],left_on='IlmnID',right_on='IlmnID') | ||
|
||
# Separate M and U values for IG, IR and II | ||
# first pull out sections of manifest (will be used to identify which probes belong to each IG, IR, II) | ||
manifest = data_containers[0].manifest.data_frame[['Infinium_Design_Type','Color_Channel']] | ||
IG = manifest[(manifest['Color_Channel']=='Grn') & (manifest['Infinium_Design_Type']=='I')] | ||
IR = manifest[(manifest['Color_Channel']=='Red') & (manifest['Infinium_Design_Type']=='I')] | ||
II = manifest[manifest['Infinium_Design_Type']=='II'] | ||
|
||
# second merge with meth and unmeth dataframes | ||
IG_meth = pd.merge(left=IG,right=meth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID') | ||
IG_unmeth = pd.merge(left=IG,right=unmeth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID') | ||
IR_meth = pd.merge(left=IR,right=meth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID') | ||
IR_unmeth = pd.merge(left=IR,right=unmeth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID') | ||
II_meth = pd.merge(left=II,right=meth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID') | ||
II_unmeth = pd.merge(left=II,right=unmeth,on='IlmnID').drop(columns=['Infinium_Design_Type','Color_Channel']).set_index('IlmnID') | ||
|
||
pval = pd.DataFrame(data=manifest.index,columns=['IlmnID']) | ||
for i,c in enumerate(data_containers): | ||
funcG = ECDF(data_containers[i].oob_green['mean_value'].values) | ||
funcR = ECDF(data_containers[i].oob_red['mean_value'].values) | ||
sample = data_containers[i].sample | ||
pIR = pd.DataFrame(index=IR_meth.index,data=1-np.maximum(funcR(IR_meth[sample]), funcR(IR_unmeth[sample])),columns=[sample]) | ||
pIG = pd.DataFrame(index=IG_meth.index,data=1-np.maximum(funcG(IG_meth[sample]), funcG(IG_unmeth[sample])),columns=[sample]) | ||
pII = pd.DataFrame(index=II_meth.index,data=1-np.maximum(funcG(II_meth[sample]), funcR(II_unmeth[sample])),columns=[sample]) | ||
p = pd.concat([pIR,pIG,pII]).reset_index() | ||
pval = pd.merge(pval,p) | ||
return pval | ||
|
||
def detect_probes(data_containers, method='sesame', save=False, silent=True): | ||
""" | ||
About: | ||
a wrapper for the p-value probe detection methods. Tries to check inputs and rationalize them | ||
with methyl-suite's standard data objects. | ||
Inputs: | ||
a list of sample data_containers. The dataframe must have a 'IlMnID' for the index of probe names. | ||
And probe names should be `cgxxxxxx` format to work with other methylcheck functions | ||
To create this, use: | ||
data_containers = methylprep.run_pipeline(data_dir, | ||
save_uncorrected=True, | ||
betas=True) | ||
(if there are more than 200 samples, you'll need to load the data from disk instead, as nothing will be returned) | ||
method: | ||
sesame -- use the sesame ported algorithm | ||
minfi -- use the minfi ported algorithm | ||
Checks: | ||
data_containers must have 'meth' and 'unmeth' columns (uncorrected data) | ||
the values for these columns should be between 0 and 10000s | ||
(beta: 0 to 1; m_value: -neg to pos+ range) | ||
Returns: | ||
dataframe of p-value filtered probes | ||
""" | ||
if not ('unmeth' in data_containers[0]._SampleDataContainer__data_frame.columns and | ||
'meth' in data_containers[0]._SampleDataContainer__data_frame.columns): | ||
raise ValueError("Provide a list of data_containers that includes uncorrected data (with 'meth' and 'unmeth' columns, using the 'save_uncorrected' option in run_pipeline)") | ||
if method == 'minfi': | ||
pval = pval_minfi(data_containers) | ||
else: | ||
pval = pval_sesame(data_containers) | ||
|
||
if silent == False: | ||
# plot it | ||
# df1 and df2 are probe X sample_id matrices | ||
mean_beta_compare(df1, df2, save=save, verbose=False, silent=silent) | ||
return pval |
Oops, something went wrong.