# <span style="color:#ff1414"> BEDtools analysis. </span>

This is a script to answer research questions outlined elsewhere. In summary, this script:

1. compares methylation results between different methylation-callers, and between different methylation sequencing methods.

2. compares methylation between genes and non-gene regions

3. compares methylation between transposons and non-repetitive regions

4. compares transposons and genes


Note:
- PB/pb = PacBio
- ONT/ont = Oxford Nanopore Technology
- NP = Nanopolish

In [2]:


import matplotlib.patches as mpatches

 # need for  stats

from scipy.stats import wilcoxon
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
from scipy.stats import spearmanr


In [120]:
# load modules
import os
import glob
import pprint
import pandas as pd
import scipy
import numpy as np
import pybedtools
from pybedtools import BedTool

In [64]:
#First we need to define the base dirs
DIRS = {}
DIRS['BASE2'] = '/home/anjuni/analysis'
DIRS['FEATURES'] = os.path.join(DIRS['BASE2'], 'coverage', 'feature_files')
DIRS['RAND'] = os.path.join(DIRS['BASE2'], 'coverage', 'randomisation')
DIRS['COVERAGE'] = os.path.join(DIRS['BASE2'], 'coverage')
DIRS['GENE'] = os.path.join(DIRS['COVERAGE'], 'gene_level')
DIRS['GENE_BODY'] = os.path.join(DIRS['GENE'], 'gene_body')
DIRS['TE'] = os.path.join(DIRS['GENE'], 'te')
DIRS['GFF_INPUT'] = os.path.join(DIRS['BASE2'], 'gff_output')
DIRS['FIGURES'] = os.path.join(DIRS['BASE2'], 'figures')
DIRS['COMPARE'] = os.path.join(DIRS['BASE2'], 'gene_te_comparison')

In [9]:
#Quick chech if directories exist
for value in DIRS.values():
    if not os.path.exists(value):
        print('%s does not exist' % value)

/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/cutoffs does not exist


In [79]:
#Make filepaths
bed_file_list = [fn for fn in glob.iglob('%s/*.bed' % DIRS['BED_INPUT'], recursive=True)]
gff_file_list = [fn for fn in glob.iglob('%s/*anno.gff3' % DIRS['GFF_INPUT'], recursive=True)]
te_file_list = [fn for fn in glob.iglob('%s/*.gff' % DIRS['GFF_INPUT'], recursive=True)]

In [80]:
#Check that the list works
print(*bed_file_list, sep='\n')
print(*gff_file_list, sep='\n')
print(*te_file_list, sep='\n')

/home/anjuni/analysis/bedtools_output/sequencing_comparison/5mC_plus_tombo_sorted.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/5mC_CpG_tombo_np.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/5mC_tombo_np.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/5mC_plus_CpG_np_tombo.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/5mC_hc_tombo_sorted.CpG.plus.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/5mC_s_nanopolish.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/nanopolish_rerun_subtract.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/5mC_hc_nanopolish_sorted.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/5mC_plus_CpG_tombo_np.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/nano_plus_tombo_overlap.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/5mC_tombo_sorted.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/5m

## <span style='color:#9ac615'> 8. Comparing methylated transposons and genes. <span/>

In [None]:
# make a bed file of the 500 most highly methylated and 500 least methylated genes
# write a function so you can find top 100 as well
# or use a cutoff for methylation fraction by ranking

In [58]:
# TE coverage files in bed6 format were generated in notebook 6 for the next part and then copied over to the input folder
# Gene annotation files in bed6 that were previously generated were copied over to the input folder

!cp /home/anjuni/analysis/coverage/gene_level/gene_body/input/*anno.s* /home/anjuni/analysis/gene_te_comparison/input
!cp /home/anjuni/analysis/coverage/gene_level/te/coverage/*6.b* /home/anjuni/analysis/gene_te_comparison/input

In [177]:
!cp /home/anjuni/analysis/coverage/gene_level/te/coverage/*6.id.b* /home/anjuni/analysis/gene_te_comparison/input

In [141]:
# make a dataframe of transposons and rank by methylation level
te_fn_list = [fn for fn in glob.iglob('%s/*6.id.bed' % os.path.join(DIRS['COMPARE'], 'input'), recursive=True)]

In [178]:
te_fn_list

['/home/anjuni/analysis/gene_te_comparison/input/Pst_104E_v13_h_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.50.overlap6.id.bed',
 '/home/anjuni/analysis/gene_te_comparison/input/Pst_104E_v13_h_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.95.overlap6.id.bed',
 '/home/anjuni/analysis/gene_te_comparison/input/Pst_104E_v13_p_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.50.overlap6.id.bed',
 '/home/anjuni/analysis/gene_te_comparison/input/Pst_104E_v13_h_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.95.overlap6.id.bed',
 '/home/anjuni/analysis/gene_te_comparison/input/Pst_104E_v13_p_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.50.overlap6.id.bed',
 '/home/anjuni/analysis/gene_te_comparison/input/Pst_104E_v13_p_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.95.overlap6.id.bed',
 '/home/anjuni/analysis/gene_te_comparison/input/Pst_104E_v13_p_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.95.overlap6.id.bed',
 '/home/anjuni/analysis/gene_te_comparison/input/Pst_104E_v13_h_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.50.overlap6.id.bed']

In [210]:
headings = ['contig', 'start', 'stop', 'name', 'overlap_fraction', 'strand']

In [179]:
def fn_to_df_dict(fn_list):
    "Returns a dictionary of pandas dataframes for a list of file paths."
    df_dict = {}
    for fn in fn_list:
        df = pd.read_csv(fn, sep='\t', header=None, names = headings)
        df_dict[fn.split('/')[-1]] = df
    return df_dict

In [180]:
te_df_dict = fn_to_df_dict(te_fn_list)

In [181]:
print(*te_df_dict, sep='\n')

Pst_104E_v13_h_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.50.overlap6.id.bed
Pst_104E_v13_h_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.95.overlap6.id.bed
Pst_104E_v13_p_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.50.overlap6.id.bed
Pst_104E_v13_h_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.95.overlap6.id.bed
Pst_104E_v13_p_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.50.overlap6.id.bed
Pst_104E_v13_p_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.95.overlap6.id.bed
Pst_104E_v13_p_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.95.overlap6.id.bed
Pst_104E_v13_h_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.50.overlap6.id.bed


In [182]:
te_df_dict['Pst_104E_v13_h_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.50.overlap6.id.bed'].head()

Unnamed: 0,contig,start,stop,name,overlap_fraction,strand
0,hcontig_000_003,148,221,ms32995_hcontig_000_003_DXX-MITE_MCL744_Pst79_...,0.287671,+
1,hcontig_000_003,243,270,ms206957_hcontig_000_003_TTTTCGAAATTGAA2,0.074074,+
2,hcontig_000_003,674,693,ms206958_hcontig_000_003_CCTCCGTGTT2,0.421053,+
3,hcontig_000_003,860,879,ms206959_hcontig_000_003_CAGGGAGTG2,0.157895,+
4,hcontig_000_003,1048,1059,ms206960_hcontig_000_003_TCTCC2,0.545455,+


In [218]:
# rank by methylation level and subset to a new df
def top_te(df_dict, asc, number):
    """Takes a dictionary of dataframes, sorts by overlap fraction, and returns a dictionary of dataframes sliced by the top number."""
    top_dict = {}
    for key, value in df_dict.items():
        df = value.sort_values(by='overlap_fraction', ascending=asc).loc[:500]
        top_dict[key] = df
        out_fn = os.path.join(DIRS['COMPARE'], 'te_ranked_files', key)
        out_fn = out_fn.replace('bed', '500.bed')
        df.to_csv(out_fn, sep='\t', header=None, index=None)
    return top_dict

In [206]:
top500_te_df_dict['Pst_104E_v13_h_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.50.overlap6.id.bed'].head()

Unnamed: 0,contig,start,stop,name,overlap_fraction,strand
21509,hcontig_002_011,290260,290275,ms360242_hcontig_002_011_C15,1.0,+
28850,hcontig_003_002,546975,546987,ms258802_hcontig_003_002_C12,1.0,+
158472,hcontig_036_008,110484,110494,ms198267_hcontig_036_008_C10,1.0,+
83601,hcontig_012_028,491436,491447,ms267817_hcontig_012_028_C11,1.0,+
102575,hcontig_018_016,154833,154849,ms128345_hcontig_018_016_C16,1.0,+


In [219]:
top500_te_df_dict = top_te(te_df_dict, False, 500)
#low500_te_df_dict = top_te(te_df_dict, True, 500)
#top100_te_df_dict = top_te(te_df_dict, False, 100)
#low100_te_df_dict = top_te(te_df_dict, True, 100)

In [185]:
print(*top500_te_df_dict, sep='\n')
print(*low500_te_df_dict, sep='\n')

Pst_104E_v13_h_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.50.overlap6.id.bed
Pst_104E_v13_h_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.95.overlap6.id.bed
Pst_104E_v13_p_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.50.overlap6.id.bed
Pst_104E_v13_h_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.95.overlap6.id.bed
Pst_104E_v13_p_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.50.overlap6.id.bed
Pst_104E_v13_p_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.95.overlap6.id.bed
Pst_104E_v13_p_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.95.overlap6.id.bed
Pst_104E_v13_h_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.50.overlap6.id.bed
Pst_104E_v13_h_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.50.overlap6.id.bed
Pst_104E_v13_h_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.95.overlap6.id.bed
Pst_104E_v13_p_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.50.overlap6.id.bed
Pst_104E_v13_h_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.95.overlap6.id.bed
Pst_104E_v13_p_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.50.overlap6.id.bed
Pst_104E_v13_p_ctg.REPET.6mA_hc_tombo_sorted.cutoff

In [215]:
top500_fn_list = [fn for fn in glob.iglob('%s/*id*' % os.path.join(DIRS['COMPARE'], 'te_ranked_files'), recursive=True)]
top500_dict = {}
for fn in top500_fn_list:
    top500_dict[fn.split('/')[-1]] = fn

In [216]:
top500_dict

{'Pst_104E_v13_h_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.50.overlap6.id.500.bed': '/home/anjuni/analysis/gene_te_comparison/te_ranked_files/Pst_104E_v13_h_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.50.overlap6.id.500.bed',
 'Pst_104E_v13_h_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.95.overlap6.id.500.bed': '/home/anjuni/analysis/gene_te_comparison/te_ranked_files/Pst_104E_v13_h_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.95.overlap6.id.500.bed',
 'Pst_104E_v13_h_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.50.overlap6.id.500.bed': '/home/anjuni/analysis/gene_te_comparison/te_ranked_files/Pst_104E_v13_h_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.50.overlap6.id.500.bed',
 'Pst_104E_v13_h_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.95.overlap6.id.500.bed': '/home/anjuni/analysis/gene_te_comparison/te_ranked_files/Pst_104E_v13_h_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.95.overlap6.id.500.bed',
 'Pst_104E_v13_p_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.50.overlap6.id.500.bed': '/home/anjuni/analysis/gene_te_comparison/te_ra

In [167]:
DIRS['COMPARE']

'/home/anjuni/analysis/gene_te_comparison'

In [176]:
top500_fn_list

['/home/anjuni/analysis/gene_te_comparison/te_ranked_files/Pst_104E_v13_h_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.50.overlap6.id.500.bed',
 '/home/anjuni/analysis/gene_te_comparison/te_ranked_files/Pst_104E_v13_h_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.95.overlap6.id.500.bed',
 '/home/anjuni/analysis/gene_te_comparison/te_ranked_files/Pst_104E_v13_p_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.50.overlap6.id.500.bed',
 '/home/anjuni/analysis/gene_te_comparison/te_ranked_files/Pst_104E_v13_p_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.95.overlap6.id.500.bed',
 '/home/anjuni/analysis/gene_te_comparison/te_ranked_files/Pst_104E_v13_p_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.50.overlap6.id.500.bed',
 '/home/anjuni/analysis/gene_te_comparison/te_ranked_files/Pst_104E_v13_h_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.95.overlap6.id.500.bed',
 '/home/anjuni/analysis/gene_te_comparison/te_ranked_files/Pst_104E_v13_p_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.95.overlap6.id.500.bed',
 '/home/anjuni/analysis/gene_te_co

In [None]:
bed6_he

In [217]:
%%bash

#Run this on NCI

scp -r anjni@cbbu.anu.edu.au:/home/anjuni/analysis/gene_te_comparison/te_ranked_files/* .

# Use gffsort.pl to sort the catenated files
cd /home/anjuni/analysis/gene_te_comparison/te_ranked_files
for x in *id*
do
len=${#x}
sortBed -i ${x} > ${x::len-4}.sorted.bed
done

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [114]:
gene_list = [fn for fn in glob.iglob('%s/*d.bed' % os.path.join(DIRS['COMPARE'], 'input'), recursive=True)]
gene_dict = {}
for fn in gene_list:
    gene_dict[fn.split('/')[-1]] = fn

In [115]:
gene_dict

{'Pst_104E_v13_h_ctg.anno.sorted.bed': '/home/anjuni/analysis/gene_te_comparison/input/Pst_104E_v13_h_ctg.anno.sorted.bed',
 'Pst_104E_v13_p_ctg.anno.sorted.bed': '/home/anjuni/analysis/gene_te_comparison/input/Pst_104E_v13_p_ctg.anno.sorted.bed'}

In [129]:
def add_score(fn_dict):
    """This function adds an integer score to replace the '.' in the score column in the annotation bed file, so it can be parsed as bed6."""
    out_fn_dict = {}
    for key, value in fn_dict.items():
        in_fn = fn_dict[key]
        df = pd.read_csv(in_fn, sep='\t', header = None)
        for index, row in df.iterrows():
            df.iat[index,4] = 0 # change the gene end site to 500bp downstream of TSS
        out_fn = in_fn.replace('.bed', '.score.bed') # make the outfile name
        df.to_csv(out_fn, header=None, index=None, sep='\t') # save the new tss df to a bed file
        outkey = out_fn.split('/')[-1]
        out_fn_dict[outkey] = out_fn # save the outfile names to a dictionary
    return out_fn_dict

In [130]:
gene_score_dict = add_score(gene_dict)

In [131]:
gene_score_dict

{'Pst_104E_v13_h_ctg.anno.sorted.score.bed': '/home/anjuni/analysis/gene_te_comparison/input/Pst_104E_v13_h_ctg.anno.sorted.score.bed',
 'Pst_104E_v13_p_ctg.anno.sorted.score.bed': '/home/anjuni/analysis/gene_te_comparison/input/Pst_104E_v13_p_ctg.anno.sorted.score.bed'}

In [150]:
#Quick chech if directories exist
for value in gene_score_dict.values():
    if not os.path.exists(value):
        print('%s does not exist' % value)

In [None]:
#overall closest gene

#closest upstream gene

#closest downstream gene

In [None]:
#non-overlapping overall closest gene

#non-overlapping closest upstream gene

#non-overlapping closest downstream gene

In [None]:
def genes_near_te(te_bed_fn, gene_bed_fn, io, iu, di):
    """Takes two bed6 filenames and returns dataframe with 5' and 3' distances."""
    from pybedtools import BedTool
    te = BedTool(te_bed_fn)
    gene = BedTool(gene_bed_fn)
    df = te.closest(gene ,io=io, iu=iu id=di, N=True, d=True).to_dataframe()
    df.rename(columns={12:'3_distance', 3:'gene', 9:'3_target', 0:'contig'}, inplace=True)
    out_fn = te_bed_fn.replace('input', 'output')
    df.to_csv(out_fn, sep='\t', index=None, header=None)
    return df

In [207]:
te_bed_fn = top500_dict['Pst_104E_v13_p_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.50.overlap6.id.500.bed']
gene_bed_fn = gene_score_dict['Pst_104E_v13_p_ctg.anno.sorted.score.bed']
io = False
iu = False
di = False

In [189]:
te = BedTool(te_bed_fn)
gene = BedTool(gene_bed_fn)

In [155]:
te

<BedTool(                 contig   start    stop  \
160586  hcontig_037_011     545     555   
210767  hcontig_106_001   74335   74353   
199925  hcontig_069_004    7125    7136   
209165  hcontig_096_004      13      23   
135653  hcontig_028_003  112714  112724   
88179   hcontig_013_023  282075  282087   
57354   hcontig_007_007    1515    1527   
210083  hcontig_096_004  144813  144827   
210285  hcontig_104_001   21713   21738   
18017   hcontig_001_021  875808  875829   
203194  hcontig_072_009    1456    1474   
18004   hcontig_001_021  873743  873758   
80588   hcontig_012_014  157244  157254   
78055   hcontig_011_019   15346   15383   
42472   hcontig_005_021  390265  390293   
135027  hcontig_028_003   31909   31924   
174261  hcontig_044_023    1220    1239   
47010   hcontig_006_013    1944    1986   
150390  hcontig_033_014    3796    3806   
174266  hcontig_044_023    1997    2007   
53583   hcontig_007_006   34738   34750   
131768  hcontig_026_018  409643  409665   
19

Previous errors in the closest() function were due to '.' in the score column, which needs to be an integer between 0 and 1000 in bed6 file format.
Errors in closest() function are likely due to white space in the name column for transposons.
This is being remedied by running everything again for TE annotation files with only their ID in the name column.
New bed6 annotation files were generated for TEs, and these only have ID, hopefully without white space that cause errors.

In [208]:
df = te.closest(gene, d=True).to_dataframe()
#df.rename(columns={12:'n3_distance', 3:'gene', 9:'3_target', 0:'contig'}, inplace=True)
#out_fn = te_bed_fn.replace('input', 'output')
#df.to_csv(out_fn, sep='\t', index=None, header=None)
#df.head()

BEDToolsError: 
Command was:

	bedtools closest -d -b /home/anjuni/analysis/gene_te_comparison/input/Pst_104E_v13_h_ctg.anno.sorted.score.bed -a /home/anjuni/analysis/gene_te_comparison/te_ranked_files/Pst_104E_v13_h_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.50.overlap6.id.500.bed

Error message was:
ERROR: chromomsome sort ordering for file /home/anjuni/analysis/gene_te_comparison/te_ranked_files/Pst_104E_v13_h_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.50.overlap6.id.500.bed is inconsistent with other files. Record was:
hcontig_012_028	491436	491447	ms267817_hcontig_012_028_C11	1.0	+
