# Demonstrating methylize regression and plotting

In [1]:
import numpy as np
import pandas as pd
import math
import time
import methylprep
import methylcheck

In [3]:
%load_ext autoreload
%autoreload 2
from methylize import diff_meth_pos, volcano_plot, manhattan_plot

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


ModuleNotFoundError: No module named 'diff_meth_pos'

In [None]:
# Install the joblib module (used for parallelization) into the environment at sys.prefix
import sys
!conda install --yes --prefix {sys.prefix} joblib

In [None]:
# Load the processed beta-value and M-value tables from their pickle files.
# These files were made on the command line with:
#   `python -m methylprep process -d GSE69852_copy --betas --m_value`
# NOTE: no phenotype data is read here; the per-sample phenotype lists are
# written out by hand in later cells.
betas = pd.read_pickle('GSE69852_beta_values.pkl')
m_values = pd.read_pickle('GSE69852_m_values.pkl')
betas.head()
# To preview the M-values instead, use: m_values.head()

## Testing logistic regression

In [None]:
# Phenotype labels for the logistic regression: two classes ("fetal" and
# "adult") given as strings that will later be converted to zeros and ones.
# NOTE(review): presumably one label per sample, in the same order as the rows
# of the transposed M-value table -- confirm.
test_pheno_data = ["fetal","fetal","fetal","adult","adult","adult"]

In [None]:
# Swap the rows and columns of the M-value table; the transposed table is what
# gets passed to diff_meth_pos in the regression cells.
test_M_values_T = m_values.T
# Display the transposed table as the cell output.
test_M_values_T

In [None]:
# Run a logistic regression on the methylation data, using a random sample of
# 30000 probes (columns) rather than the whole table.
# (A fixed slice such as .iloc[:, 64000:67000] could be used in place of the
# random sample.)
test_results = diff_meth_pos(
    test_M_values_T.sample(30000, axis=1),
    test_pheno_data,
    regression_method="logistic",
    export=True,
)

## Testing Manhattan plot visualizations

In [None]:
# Manhattan plot of the logistic-regression results.
# The cutoff of 0.13 is the same value used as the p-value threshold when
# selecting "interesting" probes from these results.
# NOTE(review): save=True presumably writes the figure to a file -- confirm
# against the methylize documentation.
manhattan_plot(test_results, cutoff=0.13, palette='default', save=True)

In [None]:
""" pvalues: diff bw the two phenotype labels (A vs B difference sig)
most p-values in the demo data are not near or below 0.05.
That means those probes / locations are not useful in separating the two phenotypes
"""
interesting_probes = test_results[test_results['PValue'] <= 0.13]
interesting_probes

## Testing linear regression

In [None]:
# Usually you would pull this data from the methylprep meta_data dataframe.
# Here the ages for the linear regression are written out by hand as strings
# that will be coerced to numbers (adult ages are estimates, not exact).
test_pheno_data2 = ["0","0","0","52","54","57"]

In [None]:
# Run a linear regression on the methylation data versus the age of each
# sample, using only the first 10000 columns of the transposed M-value table.
test_results2 = diff_meth_pos(test_M_values_T.iloc[:,:10000],
                              test_pheno_data2,
                              regression_method="linear")

In [None]:
# Keep only the linear-regression rows whose p-value is at or below 0.05.
# Note that this rebinds interesting_probes, replacing the earlier selection
# made from the logistic-regression results.
interesting_probes = test_results2[test_results2['PValue'] <= 0.05]
# Report how many rows passed the threshold, then preview the first few.
print(len(interesting_probes))
interesting_probes.head()

## Testing Volcano plot visualizations

In [None]:
# Volcano plot of the linear-regression results.
# NOTE(review): cutoff=0.15 is presumably the p-value threshold and
# beta_coefficient_cutoff=(-0.09, 0.09) the lower/upper regression-coefficient
# thresholds; save=True presumably writes the figure to a file -- confirm
# against the methylize documentation.
volcano_plot(test_results2, fontsize=16, cutoff=0.15, beta_coefficient_cutoff=(-0.09,0.09), save=True)