In [1]:
import pandas as pd
from math import sqrt

from ProjectTools import OpenBeds

In [2]:
# Dry-run switch: True loads the subsampled files under ./test_data,
# False loads the full files under ./data (see the if/else below).
dry = True

def filter_on_depth(df, min_depth=10, max_sd=3):
    """Keep only rows whose ``readCount`` lies within an accepted depth window.

    The window is ``[min_depth, mean + max_sd * sqrt(mean)]``, where ``mean`` is
    the mean ``readCount`` of the *unfiltered* input frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``readCount`` column.
    min_depth : int or float, default 10
        Minimum ``readCount`` (inclusive) a row needs to be kept.
    max_sd : int or float, default 3
        Multiplier applied to ``sqrt(mean)`` to set the inclusive upper bound.
        NOTE(review): sqrt(mean) is the standard deviation of a Poisson
        distribution, which is presumably the model intended here -- confirm.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` inside the window, with their original index labels.
        ``df`` itself is not modified.
    """
    depth = df["readCount"]
    # The mean is taken before any filtering so that the low-depth rows still
    # contribute to the upper bound, exactly as in the two-step original.
    mean_depth = depth.mean()
    upper_bound = mean_depth + max_sd * sqrt(mean_depth)
    return df[depth.ge(min_depth) & depth.le(upper_bound)]

# Choose the input files: subsampled copies for a dry run, full files otherwise.
if dry:
    wgbs_bed_path = './test_data/ENCSR893RHD_modifications_mm39_sub.bed'
    nano_5mc_bed_path = './test_data/prom_R10.4.1_E8.2_WGS_brain_DH_5mC_nonAggregated_sub.bedMethyl'
    tab_bed_path = './test_data/CRR008807_TAB_merged.bedGraph.gz.bismark.zero.cov_sub.bed'
    nano_5hmC_bed_path = './test_data/prom_R10.4.1_E8.2_WGS_brain_5hmC_sub.bed'
else:
    wgbs_bed_path = './data/ENCSR893RHD_modifications_mm39.bed'
    nano_5mc_bed_path = './data/prom_R10.4.1_E8.2_WGS_brain_DH_5mC_nonAggregated.bedMethyl'
    tab_bed_path = './data/CRR008807_TAB_merged.bedGraph.gz.bismark.zero.cov'
    nano_5hmC_bed_path = './data/prom_R10.4.1_E8.2_WGS_brain_5hmC.bed'

# Load each dataset through its OpenBeds reader and apply the depth filter.
# This is done once, after the paths are chosen, so the dry and full runs
# cannot drift apart.
wgbs_df = filter_on_depth(OpenBeds.get_wgbs(wgbs_bed_path))
nano_mc_df = filter_on_depth(OpenBeds.get_nanopore_5mc(nano_5mc_bed_path))
tab_df = filter_on_depth(OpenBeds.get_tab(tab_bed_path))
nano_hmc_df = filter_on_depth(OpenBeds.get_nanopore_5hmc(nano_5hmC_bed_path))



In [4]:
from pybedtools import BedTool

# Reference feature annotation that the intersect helpers below compare against.
ref_features = BedTool('./feature_references/mm39_fig3_features.bed')

# Stack the paired data frames and convert each stack to a BedTool:
#   nano_mc_df + wgbs_df  -> mc_bed
#   tab_df + nano_hmc_df  -> hmc_bed
mc_bed, hmc_bed = (
    BedTool.from_dataframe(pd.concat(frames))
    for frames in ([nano_mc_df, wgbs_df], [tab_df, nano_hmc_df])
)

In [5]:
def define_intergenic(mod_bed):
    """Return the entries of ``mod_bed`` that overlap no reference feature.

    ``mod_bed`` is intersected against the module-level ``ref_features`` with
    ``v=True`` (report only entries of ``mod_bed`` without any overlap), and
    every resulting row is labelled ``"Intergenic"`` in ``feature_type``.

    Returns a DataFrame with the columns chromosome, chromStart, chromEnd,
    modification_type, readCount, percentMeth, method and feature_type.
    No ``feature_name`` column is produced.
    """
    column_names = [
        "chromosome", "chromStart", "chromEnd", "modification_type",
        "readCount", "percentMeth", "method", "feature_type",
    ]
    unmatched = mod_bed.intersect(ref_features, v=True)
    intergenic_df = unmatched.to_dataframe(names=column_names)
    # Whatever was read into feature_type is overwritten for every row.
    intergenic_df["feature_type"] = "Intergenic"
    return intergenic_df

def find_all_intersects(mod_bed):
    """Annotate every entry of ``mod_bed`` with the reference feature it overlaps.

    Entries overlapping the module-level ``ref_features`` are reported with the
    feature's name and type (``wb=True`` appends the matching feature record);
    any feature type containing ``"CpG"`` is relabelled ``"CGI"``. Entries that
    overlap nothing are appended via ``define_intergenic``.

    Returns a single DataFrame without the helper columns dup1/dup2/dup3.
    The index is not reset after concatenation, so index labels from the two
    parts may repeat.
    """
    helper_cols = ["dup1", "dup2", "dup3"]
    overlap_cols = [
        "chromosome", "chromStart", "chromEnd", "modification_type",
        "readCount", "percentMeth", "method",
        *helper_cols,
        "feature_name", "feature_type",
    ]
    overlaps_df = mod_bed.intersect(ref_features, wb=True).to_dataframe(names=overlap_cols)

    # Collapse all CpG-containing feature types into one "CGI" label.
    is_cpg = overlaps_df["feature_type"].str.contains("CpG")
    overlaps_df.loc[is_cpg, "feature_type"] = "CGI"

    combined_df = pd.concat([overlaps_df, define_intergenic(mod_bed)])
    return combined_df.drop(columns=helper_cols)

In [6]:
# Annotate both BedTools with the reference features (plus Intergenic rows).
mc_intersects_df = find_all_intersects(mc_bed)
hmc_intersects_df = find_all_intersects(hmc_bed)

# Collapse the modification-specific nanopore labels into one shared
# "Nanopore" method label in both tables.
mc_intersects_df["method"] = mc_intersects_df["method"].replace("Nanopore 5mC", "Nanopore")
hmc_intersects_df["method"] = hmc_intersects_df["method"].replace("Nanopore 5hmC", "Nanopore")


In [7]:
# Mean percentMeth for every (feature_name, feature_type, method) combination.
# NOTE: groupby drops rows whose key is NaN by default, so the Intergenic rows
# (define_intergenic produces no feature_name for them) do not appear here.
hmc_averages_df = (
    hmc_intersects_df
    .groupby(["feature_name", "feature_type", "method"])["percentMeth"]
    .mean()
    .reset_index()
)
# Display only the groups with a non-zero mean.
hmc_averages_df[hmc_averages_df["percentMeth"].gt(0)]

Unnamed: 0,feature_name,feature_type,method,percentMeth
0,ENSMUSR00000000054,Enhancer,Nanopore,6.355000
1,ENSMUSR00000000054,Enhancer,TAB,5.048077
2,ENSMUSR00000000095,Promoter,Nanopore,4.075269
3,ENSMUSR00000000095,Promoter,TAB,7.718418
4,ENSMUSR00000000102,Promoter,Nanopore,3.700043
...,...,...,...,...
43758,ENSMUST00020183657.1_exon_0_0_chr1_57507029_f,Exon,Nanopore,5.975000
43759,ENSMUST00020183657.1_exon_0_0_chr1_57507029_f,Exon,TAB,25.000000
43760,ENSMUST00020183682.1_exon_0_0_chr1_59665739_f,Exon,Nanopore,19.230000
43761,ENSMUST00020183707.1_exon_0_0_chr1_14002602_f,Exon,Nanopore,16.345000
