In [1]:
import pandas as pd
from ProjectTools import OpenBeds
from ProjectTools.OpenBeds import filterDepth

In [23]:
# Toggle between the small test subsets (True) and the full datasets (False).
dry = True

# Only the input locations differ between the two modes; the loading and
# filtering steps below are shared so the modes cannot drift apart.
if dry:
    top_genes_path = './top_hmc_genes_testSet.xlsx'
    tab_bed_path = './test_data/CRR008807_TAB_merged.bedGraph.gz.bismark.zero.cov_sub.bed'
    nano_3mod_path = './test_data/prom_R10.4.1_E8.2_WGS_brain_0.9.1_mods_sub.bed'
else:
    top_genes_path = './top_hmc_genes.xlsx'
    tab_bed_path = './data/TAB_data/CRR008807_TAB_merged.bedGraph.gz.bismark.zero.cov'
    nano_3mod_path = './data/prom_R10.4.1_E8.2_WGS_brain_0.9.1_mods.bed'

# Column-oriented dict: {column name: {row label: value}}.
top_hmc_genes = pd.read_excel(top_genes_path).to_dict()
# Discard the "Unnamed: 0" column (presumably a saved index column -- TODO confirm);
# the default keeps this from raising if the spreadsheet was written without it.
top_hmc_genes.pop("Unnamed: 0", None)

# get_nanopore_threeMod_wStrand yields two tables, unpacked here as the 5mC and
# 5hmC frames; each one is passed through filterDepth.
nano_mc_df, nano_hmc_df = map(filterDepth, OpenBeds.get_nanopore_threeMod_wStrand(nano_3mod_path))
tab_df = filterDepth(OpenBeds.get_bismark(tab_bed_path, "5hmC"))

In [24]:
import subprocess

# Reference annotation searched for each gene name.
gene_reference_bed = "./feature_references/fig5_features/mm39_RefSeqC_select_merged_modified.bed"

# Maps gene name -> {"chromosome", "chromStart", "chromEnd", "strand"} (all strings).
genes_info = {}

for gene_name in top_hmc_genes["geneName"].values():
    # Run grep with an argument list instead of a shell string so the gene name
    # is never interpreted by a shell; "-e" keeps a name starting with "-" from
    # being parsed as an option. check_output raises CalledProcessError when
    # grep finds no match (non-zero exit status).
    gene_info = subprocess.check_output(
        ["grep", "-e", str(gene_name), gene_reference_bed]
    ).decode("UTF-8").split()

    # split() flattens the whole grep output into whitespace-separated tokens,
    # so these indices pick fields from the start of the output, i.e. the first
    # matching line. Assumes a BED6-style layout
    # (chrom, start, end, name, score, strand) -- TODO confirm.
    # NOTE(review): grep matches the pattern anywhere in a line, so a gene name
    # that is a substring of another entry may pick up the wrong record.
    genes_info[gene_name] = {
        "chromosome": gene_info[0],
        "chromStart": gene_info[1],
        "chromEnd": gene_info[2],
        "strand": gene_info[5],
    }

In [25]:
# One row per gene: transpose the per-gene records so genes become rows, expose
# the gene name (the former index) as a "geneName" column, and order the columns
# as coordinates, strand, then name.
top_gene_bed = (
    pd.DataFrame(genes_info)
    .T
    .reset_index()
    .rename(columns={"index": "geneName"})
    [["chromosome", "chromStart", "chromEnd", "strand", "geneName"]]
)

In [44]:
from pybedtools import BedTool

def addStrandsToBismark(df, strand_bed_path):
    """Attach a strand column to a bismark-style table via a BED intersection.

    Parameters
    ----------
    df : pandas.DataFrame
        Table converted to a BedTool as-is. The intersect output is labelled
        assuming its columns are chromosome, chromStart, chromEnd,
        modification_type, readCount, percentMeth, method (in that order).
    strand_bed_path : str
        Path to the BED file intersected against ``df``; its last reported
        column is labelled "strand".

    Returns
    -------
    pandas.DataFrame
        Columns: chromosome, chromStart, chromEnd, strand, modification_type,
        readCount, percentMeth, method.
    """
    leading_columns = ["chromosome", "chromStart", "chromEnd"]
    value_columns = ["modification_type", "readCount", "percentMeth", "method"]
    # Columns appended by ``wb=True`` ahead of the strand; presumably the
    # coordinates of the matching strand-BED record -- they are discarded.
    placeholder_columns = [".", ".1", ".2"]

    intersected = BedTool.from_dataframe(df).intersect(
        BedTool(strand_bed_path), wb=True)
    labelled = intersected.to_dataframe(
        names=leading_columns + value_columns + placeholder_columns + ["strand"])

    without_placeholders = labelled.drop(columns=placeholder_columns)
    return without_placeholders[leading_columns + ["strand"] + value_columns]

def geneIntersect(df):
    """Label each row of ``df`` with the gene it overlaps on the same strand.

    Reads the notebook-level ``genes_info`` mapping to build the gene intervals.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain chromosome, chromStart, chromEnd, readCount, percentMeth
        and strand columns; other columns are ignored.

    Returns
    -------
    pandas.DataFrame
        Intersect result labelled chromosome, chromStart, chromEnd,
        percentMeth, strand, geneName.
    """
    site_columns = ["chromosome", "chromStart", "chromEnd", "readCount", "percentMeth", "strand"]
    sites = df[site_columns]

    # Gene intervals in a six-column layout: coordinates, name, a "."
    # placeholder column, strand.
    genes_bed = (
        pd.DataFrame(genes_info)
        .T
        .reset_index()
        .rename(columns={"index": "geneName"})
        .assign(**{".": "."})
        [["chromosome", "chromStart", "chromEnd", "geneName", ".", "strand"]]
    )

    # wb=True appends the gene record to each reported row; s=True restricts
    # the overlap to features on the same strand.
    overlaps = BedTool.from_dataframe(sites).intersect(
        BedTool.from_dataframe(genes_bed), wb=True, s=True)

    appended_gene_columns = [".1", ".2", ".3", "geneName", ".4", ".5"]
    labelled = overlaps.to_dataframe(names=site_columns + appended_gene_columns)

    # Keep only the site coordinates, methylation percentage, strand and name.
    unwanted = ["readCount", ".1", ".2", ".3", ".4", ".5"]
    return labelled.drop(columns=unwanted, errors="ignore")

# Nanopore 5hmC calls already carry a strand column, so they can be
# intersected with the target genes directly.
nano_intersect = geneIntersect(nano_hmc_df)

# The TAB table first receives strands from the CpG-report strand BED.
# NOTE(review): this path points into ./data even when `dry` is set -- confirm
# that the full strand file is intended for dry runs.
tab_stranded_df = addStrandsToBismark(
    tab_df,
    './data/TAB_data/CRR008807_TAB_cytosine_report.tsv.CpG_report_strands.bed',
)
tab_intersect = geneIntersect(tab_stranded_df)


In [45]:
# Display the gene-labelled nanopore 5hmC table produced by geneIntersect.
nano_intersect

Unnamed: 0,chromosome,chromStart,chromEnd,percentMeth,strand,geneName
0,chr1,4927938,4927939,0.00,+,Tcea1
1,chr1,4927942,4927943,0.00,+,Tcea1
2,chr1,4927947,4927948,0.00,+,Tcea1
3,chr1,4927963,4927964,0.00,+,Tcea1
4,chr1,4927965,4927966,0.00,+,Tcea1
...,...,...,...,...,...,...
34751,chr1,72339868,72339869,0.00,+,Tmem169
34752,chr1,72340098,72340099,0.00,+,Tmem169
34753,chr1,72340409,72340410,0.00,+,Tmem169
34754,chr1,72341493,72341494,63.64,+,Tmem169


In [46]:
# Display the gene-labelled TAB table produced by geneIntersect.
tab_intersect

Unnamed: 0,chromosome,chromStart,chromEnd,percentMeth,strand,geneName
0,chr1,4928719,4928720,0.000000,+,Tcea1
1,chr1,4928777,4928778,0.000000,+,Tcea1
2,chr1,4928794,4928795,0.000000,+,Tcea1
3,chr1,4928822,4928823,0.000000,+,Tcea1
4,chr1,4928824,4928825,0.000000,+,Tcea1
...,...,...,...,...,...,...
14167,chr1,72338325,72338326,7.692308,+,Tmem169
14168,chr1,72338383,72338384,0.000000,+,Tmem169
14169,chr1,72338549,72338550,10.000000,+,Tmem169
14170,chr1,72340586,72340587,10.000000,+,Tmem169


In [29]:
# Export the bare gene coordinates (chromosome, start, end) as a three-column,
# tab-separated file with no header row and no index column.
target_bed = (
    pd.DataFrame(genes_info)
    .T
    .reset_index()
    [["chromosome", "chromStart", "chromEnd"]]
)
target_bed.to_csv(
    './feature_references/fig7_genes/target_genes.bed',
    sep="\t",
    header=False,
    index=False,
)