In [1]:
import pandas as pd
import numpy as np
from pybedtools import BedTool
from math import sqrt

from ProjectTools import OpenBeds

In [2]:
# Dry-run switch: when True, the loading code in this cell reads the input
# files under ./test_data; otherwise it reads the files under ./data.
dry = True

def filter_on_depth(df, min_depth=10, max_deviations=3):
    """Keep only the rows whose ``readCount`` lies inside a depth window.

    The window is ``[min_depth, mean + max_deviations * sqrt(mean)]``, where
    ``mean`` is the average ``readCount`` of the *unfiltered* input, i.e. it
    is computed before the low-depth rows are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a numeric ``readCount`` column.
    min_depth : int or float, default 10
        Inclusive lower bound on ``readCount``.
    max_deviations : int or float, default 3
        Number of ``sqrt(mean)`` units above the mean that is still accepted
        (inclusive upper bound).
        NOTE(review): sqrt(mean) is presumably used as a Poisson-style
        standard deviation of the depth -- confirm.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows inside the window; the original index
        labels are kept and the input frame is not modified. An empty input
        yields an empty result.
    """
    depth = df["readCount"]
    average = depth.mean()
    upper_bound = average + max_deviations * sqrt(average)
    # A single combined mask selects the same rows as filtering twice in a row.
    return df[depth.ge(min_depth) & depth.le(upper_bound)]

# Select the input files: the files under ./test_data for a dry run,
# otherwise the files under ./data.
if dry:
    wgbs_bed_path = './test_data/ENCSR893RHD_modifications_mm39_sub.bed'
    nano_5mc_bed_path = './test_data/prom_R10.4.1_E8.2_WGS_brain_DH_5mC_nonAggregated_sub.bedMethyl'
    tab_bed_path = './test_data/CRR008807_TAB_merged.bedGraph.gz.bismark.zero.cov_sub.bed'
    nano_5hmC_bed_path = './test_data/prom_R10.4.1_E8.2_WGS_brain_5hmC_sub.bed'
else:
    wgbs_bed_path = './data/ENCSR893RHD_modifications_mm39.bed'
    nano_5mc_bed_path = './data/prom_R10.4.1_E8.2_WGS_brain_DH_5mC_nonAggregated.bedMethyl'
    tab_bed_path = './data/CRR008807_TAB_merged.bedGraph.gz.bismark.zero.cov'
    nano_5hmC_bed_path = './data/prom_R10.4.1_E8.2_WGS_brain_5hmC.bed'

# The loading steps are the same for both runs (only the paths differ), so
# they are written once: each file is read with its OpenBeds reader and then
# passed through the read-depth filter.
wgbs_df = filter_on_depth(OpenBeds.get_wgbs(wgbs_bed_path))
nano_mc_df = filter_on_depth(OpenBeds.get_nanopore_5mc(nano_5mc_bed_path))
tab_df = filter_on_depth(OpenBeds.get_tab(tab_bed_path))
nano_hmc_df = filter_on_depth(OpenBeds.get_nanopore_5hmc(nano_5hmC_bed_path))



In [10]:
from pybedtools import BedTool

# Reference feature annotation that the modification calls are intersected
# against (read as a module-level global by the functions in this cell).
ref_features = BedTool('./feature_references/mm39_all_features.bed')

# Pool the depth-filtered frames per modification type and convert each pool
# to a BedTool: nanopore 5mC + WGBS, and TAB + nanopore 5hmC.
mc_bed = BedTool.from_dataframe(pd.concat([nano_mc_df, wgbs_df]))
hmc_bed = BedTool.from_dataframe(pd.concat([tab_df, nano_hmc_df]))

def define_intergenic(mod_bed):
    """Return the records of ``mod_bed`` that overlap no reference feature.

    ``mod_bed`` is intersected with the module-level ``ref_features`` using
    ``v=True`` (bedtools ``-v``: report only entries of A without any overlap
    in B). The result is converted to a DataFrame and every row is labelled
    ``"Intergenic"`` in the ``feature`` column.

    Parameters
    ----------
    mod_bed : BedTool
        Modification calls.
        NOTE(review): assumed to carry the seven leading columns named
        below -- confirm against the OpenBeds readers.

    Returns
    -------
    pandas.DataFrame
        The non-overlapping records with ``feature`` set to ``"Intergenic"``.
    """
    column_names = [
        "chromosome", "chromStart", "chromEnd", "modification_type",
        "readCount", "percentMeth", "method", "feature",
    ]
    unmatched = mod_bed.intersect(ref_features, v=True)
    # "feature" is part of the requested names but is overwritten with the
    # constant label for every returned row.
    return unmatched.to_dataframe(names=column_names).assign(feature="Intergenic")

def find_all_intersects(mod_bed):
    """Annotate every record of ``mod_bed`` with a reference feature label.

    Records overlapping an entry of the module-level ``ref_features`` are
    reported via ``intersect(..., wb=True)`` and labelled with that entry's
    feature; records without any overlap come from ``define_intergenic`` and
    are labelled ``"Intergenic"``. A record is reported once per overlapping
    reference entry.

    Parameters
    ----------
    mod_bed : BedTool
        Modification calls.
        NOTE(review): assumed to have seven columns, and ``ref_features``
        four (three coordinates + feature name) -- confirm, since the column
        names below rely on that layout.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the overlapping and the intergenic records, without
        the duplicated reference coordinate columns. Index labels of the two
        parts are kept as they are (not renumbered).
    """
    helper_columns = ["dup1", "dup2", "dup3"]
    features_df = mod_bed.intersect(ref_features, wb=True).to_dataframe(
        names=["chromosome", "chromStart", "chromEnd", "modification_type",
               "readCount", "percentMeth", "method", *helper_columns, "feature"])
    intergenic_df = define_intergenic(mod_bed)

    # pybedtools returns a column-less DataFrame for an empty result, so when
    # nothing overlaps the helper columns do not exist at all; errors="ignore"
    # keeps the drop from raising KeyError in that case.
    return pd.concat([features_df, intergenic_df]).drop(
        columns=helper_columns, errors="ignore")

In [15]:
# Annotate both pooled call sets (mc_bed and hmc_bed) with reference features;
# find_all_intersects labels records without any overlap as "Intergenic".
mc_intersects = find_all_intersects(mc_bed)
hmc_intersects = find_all_intersects(hmc_bed)