# <span style="color:#ff1414"> BEDtools analysis. </span>

This is a script to answer research questions outlined elsewhere. In summary, this script:

1. compares methylation results between different methylation-callers, and between different methylation sequencing methods.

2. compares methylation between genes and non-gene regions

3. compares methylation between transposons and non-repetitive regions

4. compares transposons and genes


Note:
- PB/pb = PacBio
- ONT/ont = Oxford Nanopore Technology
- NP = Nanopolish

In [216]:
import pybedtools
import scipy

import matplotlib.patches as mpatches

import numpy as np # need for  stats

from scipy.stats import wilcoxon
import matplotlib.pyplot as plt
from matplotlib_venn import venn2

In [217]:
# load modules
import os
import glob
import pprint
from pybedtools import BedTool
import pandas as pd
from scipy.stats import spearmanr

In [223]:


DIRS['BASE1'] = '/home/anjuni/methylation_calling/pacbio'

DIRS['BED_INPUT'] = os.path.join(DIRS['BASE2'], 'bedtools_output', 'sequencing_comparison')

DIRS['FIGURES'] = os.path.join(DIRS['BASE2'], 'figures')

DIRS['I_FROM_C'] = os.path.join(DIRS['BASE2'], 'bedtools_output', 'intersects_from_cutoffs')
DIRS['BED_CUTOFFS'] = os.path.join(DIRS['BASE1'], 'input', 'sorted_bed_files', 'cutoffs')
DIRS['6MA_CUTOFFS'] = os.path.join(DIRS['BASE1'], 'input', 'sorted_bed_files', 'cutoffs_6mA')
DIRS['5MC_CUTOFFS'] = os.path.join(DIRS['BASE1'], 'input', 'sorted_bed_files', 'cutoffs_5mC')

In [222]:
#First we need to define the base dirs
DIRS = {}
DIRS['BASE2'] = '/home/anjuni/analysis'
DIRS['FEATURES'] = os.path.join(DIRS['BASE2'], 'coverage', 'feature_files')
DIRS['RAND'] = os.path.join(DIRS['BASE2'], 'coverage', 'randomisation')
DIRS['WINDOW_OUTPUT'] = os.path.join(DIRS['BASE2'], 'windows')
DIRS['WINDOW_INPUT'] = os.path.join(DIRS['BASE2'], 'input_for_windows')
DIRS['REF'] = '/home/anjuni/Pst_104_v13_assembly/'
DIRS['COVERAGE'] = os.path.join(DIRS['BASE2'], 'coverage')
DIRS['GFF_INPUT'] = os.path.join(DIRS['BASE2'], 'gff_output')
DIRS['TE_SF'] = os.path.join(DIRS['COVERAGE'], 'superfamily_files')

In [225]:
#Quick chech if directories exist
for value in DIRS.values():
    if not os.path.exists(value):
        print('%s does not exist' % value)

/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/cutoffs does not exist


In [224]:
#Make filepaths
bed_file_list = [fn for fn in glob.iglob('%s/*.bed' % DIRS['BED_INPUT'], recursive=True)]
gff_file_list = [fn for fn in glob.iglob('%s/*anno.gff3' % DIRS['GFF_INPUT'], recursive=True)]
te_file_list = [fn for fn in glob.iglob('%s/*.gff' % DIRS['GFF_INPUT'], recursive=True)]

In [80]:
#Check that the list works
print(*bed_file_list, sep='\n')
print(*gff_file_list, sep='\n')
print(*te_file_list, sep='\n')

/home/anjuni/analysis/bedtools_output/sequencing_comparison/5mC_plus_tombo_sorted.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/5mC_CpG_tombo_np.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/5mC_tombo_np.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/5mC_plus_CpG_np_tombo.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/5mC_hc_tombo_sorted.CpG.plus.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/5mC_s_nanopolish.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/nanopolish_rerun_subtract.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/5mC_hc_nanopolish_sorted.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/5mC_plus_CpG_tombo_np.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/nano_plus_tombo_overlap.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/5mC_tombo_sorted.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/5m

## <span style='color:#144fff'> 4. Making windows. <span/>

In [22]:
# Make folder for windows. Each BED file will contain a series of windows
#os.mkdir(DIRS['WINDOW_OUTPUT'])
#os.mkdir()
# Define inputs
gene_fn = '/home/anjuni/analysis/gff_output/Pst_104E_v13_ph_ctg.anno.sorted.gff3'
lt_gene_fn = '/home/anjuni/analysis/gff_output/Pst_104E_v13_ph_ctg.TE.sorted.gff3'
reference_genome = os.path.join(DIRS['REF'], 'Pst_104E_v13_ph_ctg.fa')

In [58]:
# Make the genome size file for windows
!samtools faidx /home/anjuni/Pst_104_v13_assembly/Pst_104E_v13_ph_ctg.fa
!cut -f 1,2 /home/anjuni/Pst_104_v13_assembly/Pst_104E_v13_ph_ctg.fa.fai > /home/anjuni/analysis/gff_output/Pst_104E_v13_ph_ctg.genome_file
# Note: this does put the p contig values before h contig ones, while annotation files put h contig before p contig
# May be a problem in the future but probs not
# Sorted it anyway below, as reference genome fasta had contigs in that order arbitrarily:
!/home/anjuni/myapps/gff3sort/gff3sort.pl /home/anjuni/analysis/gff_output/Pst_104E_v13_ph_ctg.genome_file >  /home/anjuni/analysis/gff_output/Pst_104E_v13_ph_ctg.sorted.genome_file

Smartmatch is experimental at /home/anjuni/myapps/gff3sort/gff3sort.pl line 68.
Use of uninitialized value $pos in hash element at /home/anjuni/myapps/gff3sort/gff3sort.pl line 67, <$_[...]> line 1.
Use of uninitialized value $pos in hash element at /home/anjuni/myapps/gff3sort/gff3sort.pl line 67, <$_[...]> line 2.
Use of uninitialized value $pos in hash element at /home/anjuni/myapps/gff3sort/gff3sort.pl line 67, <$_[...]> line 3.
Use of uninitialized value $pos in hash element at /home/anjuni/myapps/gff3sort/gff3sort.pl line 67, <$_[...]> line 4.
Use of uninitialized value $pos in hash element at /home/anjuni/myapps/gff3sort/gff3sort.pl line 67, <$_[...]> line 5.
Use of uninitialized value $pos in hash element at /home/anjuni/myapps/gff3sort/gff3sort.pl line 67, <$_[...]> line 6.
Use of uninitialized value $pos in hash element at /home/anjuni/myapps/gff3sort/gff3sort.pl line 67, <$_[...]> line 7.
Use of uninitialized value $pos in hash element at /home/anjuni/myapps/gff3sort/gff3sor

In [7]:
# Define all file paths for window BED files
window_fn_dict = {}
window_bed_dict = {}
#window_fn_dict['100kb'] = os.path.join(DIRS['WINDOW_OUTPUT'], 'Pst_104E_v13_ph_ctg_w100kb.bed')
#window_fn_dict['30kb'] = os.path.join(DIRS['WINDOW_OUTPUT'], 'Pst_104E_v13_ph_ctg_w30kb.bed')
window_fn_dict['10kb'] = os.path.join(DIRS['WINDOW_OUTPUT'], 'Pst_104E_v13_ph_ctg_w10kb.bed')
#window_fn_dict['100kb_s20kb'] = os.path.join(DIRS['WINDOW_OUTPUT'], 'Pst_104E_v13_ph_ctg_w100kb_s20kb.bed')
#window_fn_dict['30kb_s6kb'] = os.path.join(DIRS['WINDOW_OUTPUT'], 'Pst_104E_v13_ph_ctg_w30kb_s6kb.bed')
window_fn_dict['10kb_s2kb'] = os.path.join(DIRS['WINDOW_OUTPUT'], 'Pst_104E_v13_ph_ctg_w10kb_s2kb.bed')
genome_size_f_fn = os.path.join(DIRS['WINDOW_INPUT'], 'Pst_104E_v13_ph_ctg.sorted.genome_file')

In [23]:
# Check whether the dictionary looks nice :) (it does!) :D
pprint.pprint(window_fn_dict)

{'10kb': '/home/anjuni/analysis/windows/Pst_104E_v13_ph_ctg_w10kb.bed',
 '10kb_s2kb': '/home/anjuni/analysis/windows/Pst_104E_v13_ph_ctg_w10kb_s2kb.bed'}


In [73]:
# Make the actual windows! :D
#!bedtools makewindows -g {genome_size_f_fn} -w 100000 > {window_fn_dict['100kb']}
#!bedtools makewindows -g {genome_size_f_fn} -w 30000 > {window_fn_dict['30kb']}
!bedtools makewindows -g {genome_size_f_fn} -w 10000 > {window_fn_dict['10kb']}

In [10]:
# Make sliding windows
#!bedtools makewindows -g {genome_size_f_fn} -w 100000 -s 20000 > {window_fn_dict['100kb_s20kb']}
#!bedtools makewindows -g {genome_size_f_fn} -w 30000 -s 6000 > {window_fn_dict['30kb_s6kb']}
!bedtools makewindows -g {genome_size_f_fn} -w 10000 -s 2000 > {window_fn_dict['10kb_s2kb']}

In [1]:
# Function to easily convert the values in the file name dict into bedtools objects
def make_bed_dict(fn_dict):
    """Takes an input filename dictionary and outputs a dictionary of pybedtools objects for the filenames."""
    bed_dict = {}
    for key, value in fn_dict.items():      
        bed_dict[key] = BedTool(value)
    return bed_dict

In [15]:
#new make a bedtools window dataframe
window_bed_dict = make_bed_dict(window_fn_dict)

In [226]:
# Check whether the bed file dictionary looks nice :) (it does!) :D
pprint.pprint(window_bed_dict)

{'10kb': <BedTool(/home/anjuni/analysis/windows/Pst_104E_v13_ph_ctg_w10kb.bed)>,
 '10kb_s2kb': <BedTool(/home/anjuni/analysis/windows/Pst_104E_v13_ph_ctg_w10kb_s2kb.bed)>}


## <span style='color:#148aff'> 5. Coverage analysis of methylation with gene annotation files. <span/>

# <span style='color:red'> Move this to anno file prep notebook. <span/>

### <span style='color:#148aff'> 5.A Make feature files of effector proteins, non-effectors, genes, BUSCO genes and TE superfamilies. <span/>

In [17]:
%%bash
# Downloading the effector file (need raw version with only the file)
cd /home/anjuni/analysis/input_for_windows
wget https://raw.githubusercontent.com/BenjaminSchwessinger/Pst_104_E137_A-_genome/master/supplemental_files/Supplemental_file_9.txt
mv Supplemental_file_9.txt Candidate_effectors.txt

--2018-08-07 15:30:17--  https://raw.githubusercontent.com/BenjaminSchwessinger/Pst_104_E137_A-_genome/master/supplemental_files/Supplemental_file_9.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.80.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.80.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 49295 (48K) [text/plain]
Saving to: 'Supplemental_file_9.txt’

     0K .......... .......... .......... .......... ........  100% 2.80M=0.02s

2018-08-07 15:30:22 (2.80 MB/s) - 'Supplemental_file_9.txt’ saved [49295/49295]



In [24]:
# Make a GFF file of effector proteins

# First extract all lines with genes (not exon or CDS) from the gene annotation file
! grep 'gene' /home/anjuni/analysis/gff_output/Pst_104E_v13_ph_ctg.anno.sorted.gff3 > /home/anjuni/analysis/gff_output/Pst_104E_v13_ph_ctg.anno.sorted.genes_only.gff3

In [17]:
# Then extract the effector lines from the gene file

# First write grep function for python
import fileinput
import re
import glob

def grep(PAT, FILES):
    """Same function as 'grep' in bash."""
    fileinput.close() # close the file in case the iterable was previously open to prevent the "input() already active" error
    for line in fileinput.input(glob.glob(FILES)):
        if re.search(PAT, line):
            #print(fileinput.filename(), fileinput.lineno(), line)
            return line
    fileinput.close()

# Write function for filtering effectors from gene file using grep            
def make_effector_gff(effector_list, gene_gff, out_gff):
    """Get effector protein features out of gene annotation files."""
    with open(out_gff, mode = 'w') as out_file:
        for effector in effector_list:
            print(grep(effector, gene_gff), file = out_file)
        
# And make a list of effectors
def make_effector_list(input_file):
    list_name = []
    with open(input_file) as file:
        for line in file:
            line = line.strip()
            list_name.append(line)
    return list_name

In [18]:
# Make the effector list
effectors = make_effector_list('/home/anjuni/analysis/input_for_windows/Candidate_effectors.txt')

In [19]:
# Check if list works (it does, but is not sorted)
print(effectors)

['evm.TU.pcontig_058.72', 'evm.TU.pcontig_017.202', 'evm.TU.pcontig_010.332', 'evm.TU.pcontig_010.240', 'evm.TU.pcontig_031.123', 'evm.TU.pcontig_008.86', 'evm.TU.pcontig_050.77', 'evm.TU.pcontig_016.165', 'evm.TU.pcontig_037.43', 'evm.TU.pcontig_007.151', 'evm.TU.pcontig_030.240', 'evm.TU.pcontig_078.71', 'evm.TU.pcontig_052.152', 'evm.TU.pcontig_157.8', 'evm.TU.pcontig_037.109', 'evm.TU.pcontig_005.418', 'evm.TU.pcontig_062.104', 'evm.TU.pcontig_010.352', 'evm.TU.pcontig_013.247', 'evm.TU.pcontig_064.41', 'evm.TU.pcontig_062.26', 'evm.TU.pcontig_026.251', 'evm.TU.pcontig_005.10', 'evm.TU.pcontig_013.152', 'evm.TU.pcontig_036.154', 'evm.TU.pcontig_003.543', 'evm.TU.pcontig_004.284', 'evm.TU.pcontig_005.98', 'evm.TU.pcontig_004.435', 'evm.TU.pcontig_002.333', 'evm.TU.pcontig_000.679', 'evm.TU.pcontig_011.263', 'evm.TU.pcontig_051.51', 'evm.TU.pcontig_004.18', 'evm.TU.pcontig_017.200', 'evm.TU.pcontig_011.238', 'evm.TU.pcontig_026.59', 'evm.TU.pcontig_165.8', 'evm.TU.pcontig_010.220', '

In [20]:
# Sort effector list
effectors.sort()
print(effectors) # Check if sorting worked (it did)

['evm.TU.hcontig_000_003.1', 'evm.TU.hcontig_000_003.10', 'evm.TU.hcontig_000_003.120', 'evm.TU.hcontig_000_003.158', 'evm.TU.hcontig_000_003.2', 'evm.TU.hcontig_000_003.20', 'evm.TU.hcontig_000_003.26', 'evm.TU.hcontig_000_003.314', 'evm.TU.hcontig_000_003.340', 'evm.TU.hcontig_000_003.380', 'evm.TU.hcontig_000_003.402', 'evm.TU.hcontig_000_003.419', 'evm.TU.hcontig_000_003.421', 'evm.TU.hcontig_000_003.423', 'evm.TU.hcontig_000_003.444', 'evm.TU.hcontig_000_003.450', 'evm.TU.hcontig_000_003.90', 'evm.TU.hcontig_000_031.4', 'evm.TU.hcontig_000_050.114', 'evm.TU.hcontig_000_050.122', 'evm.TU.hcontig_000_050.141', 'evm.TU.hcontig_000_050.144', 'evm.TU.hcontig_000_050.149', 'evm.TU.hcontig_000_050.34', 'evm.TU.hcontig_000_050.85', 'evm.TU.hcontig_000_050.87', 'evm.TU.hcontig_000_050.9', 'evm.TU.hcontig_000_050.93', 'evm.TU.hcontig_000_054.13', 'evm.TU.hcontig_000_054.43', 'evm.TU.hcontig_000_054.68', 'evm.TU.hcontig_001_001.103', 'evm.TU.hcontig_001_001.128', 'evm.TU.hcontig_001_001.129'

In [276]:
# Run the function to make a file of effectors
genes_only_fn = '/home/anjuni/analysis/gff_output/Pst_104E_v13_ph_ctg.anno.sorted.genes_only.gff3'
effector_fn  = '/home/anjuni/analysis/input_for_windows/Pst_104E_v13_ph_ctg.effectors.gff3'
make_effector_gff(effectors, genes_only_fn, effector_fn)

In [846]:
# Move the effector file and all gene annotation files to the feature file directory
!cp /home/anjuni/analysis/gff_output/Pst_104E_v13_ph_ctg.anno.sorted.genes_only.gff3 /home/anjuni/analysis/coverage/feature_files

In [849]:
%%bash
# Make a file of non-effector genes only
cd /home/anjuni/analysis/coverage/feature_files
subtractBed -a Pst_104E_v13_ph_ctg.anno.sorted.genes_only.gff3 -b Pst_104E_v13_ph_ctg.effectors.gff3 > Pst_104E_v13_ph_ctg.non_effectors.gff3

In [24]:
# Make a GFF file of LT genes

# First extract all lines with genes (not exon or CDS) from the gene annotation file
! grep 'gene' /home/anjuni/analysis/gff_output/Pst_104E_v13_ph_ctg.TE.sorted.gff3 > /home/anjuni/analysis/gff_output/Pst_104E_v13_ph_ctg.TE.sorted.genes_only.gff3

In [25]:
# Move the LT gene_only file to the feature file directory
!cp /home/anjuni/analysis/gff_output/Pst_104E_v13_ph_ctg.TE.sorted.genes_only.gff3 /home/anjuni/analysis/coverage/feature_files

In [34]:
# Make a GFF file of combined genes

# First extract all lines with genes (not exon or CDS) from the gene annotation file
! grep 'gene' /home/anjuni/analysis/gff_output/Pst_104E_v13_ph_ctg_combined_sorted_anno.gff3 > /home/anjuni/analysis/gff_output/Pst_104E_v13_ph_ctg_combined_sorted_anno.genes_only.gff3

In [35]:
# Move the abogene_only file to the feature file directory
!cp /home/anjuni/analysis/gff_output/Pst_104E_v13_ph_ctg_combined_sorted_anno.genes_only.gff3 /home/anjuni/analysis/coverage/feature_files

# <span style='color:red'> Move this to anno file prep notebook. <span/>

### BUSCO files notes

- were made in the annotation prep notebook
- have gene ID instead of gene Name in the 'feature' column, so I will make a new column with gene Name, using the original annotation file, and replace the gene ID with gene Name

In [154]:
#Inputs
all_genes_h = os.path.join(DIRS['GFF_INPUT'], 'Pst_104E_v13_h_ctg_combined_sorted_anno.genes_only.gff3')
all_genes_p = os.path.join(DIRS['GFF_INPUT'], 'Pst_104E_v13_p_ctg_combined_sorted_anno.genes_only.gff3')
busco_h = os.path.join(DIRS['GFF_INPUT'], 'busco', 'Pst_104E_v12_h_busco.gene.bed')
busco_p = os.path.join(DIRS['GFF_INPUT'], 'busco', 'Pst_104E_v12_p_busco.gene.bed')
non_busco_h = os.path.join(DIRS['GFF_INPUT'], 'busco', 'Pst_104E_v12_h_non_busco.gene.bed')
non_busco_p = os.path.join(DIRS['GFF_INPUT'], 'busco', 'Pst_104E_v12_p_non_busco.gene.bed')

In [111]:
def make_busco_list(busco_file):
    busco_gene_list = []
    df = pd.read_csv(busco_file, header=None, sep='\t')
    for index, row in df.iterrows():
        busco_gene_list.append(row[3])
    return busco_gene_list

In [112]:
busco_h_list = make_busco_list(busco_h)
busco_p_list = make_busco_list(busco_p)
non_busco_h_list = make_busco_list(non_busco_h)
non_busco_p_list = make_busco_list(non_busco_p)

In [187]:
gene_anno_fn = os.path.join(DIRS['GFF_INPUT'], 'Pst_104E_v13_ph_ctg_combined_sorted_anno.genes_only.gff3')

In [184]:
# write a function to make a list of gene Names
def get_busco_names(busco_gene_list, anno_fn):
    "This function goes through lines a gene annotation file and checks whether the gene_ID in the input list is in the line, and saves the corresponding gene name to a list."
    gene_name_list = []
    for gene in busco_gene_list:
        with open(anno_fn) as infile:
            for line in infile:
                line = line.rstrip()
                e = line.split('\t')
                if e[2] == 'gene':
                    f = e[-1].split(';')    
                    for i in f:
                        if i.startswith('ID=') and i.endswith(gene):
                            for i in f:
                                if i.startswith('Name='):
                                    name = i[5:]
                                    gene_name_list.append(name)
    return gene_name_list

In [188]:
busco_gene_h = get_busco_names(busco_h_list, gene_anno_fn)

In [189]:
busco_gene_h

['gene_model_hcontig_0000_03.12',
 'gene_model_hcontig_0000_03.19',
 'gene_model_hcontig_0000_03.29',
 'gene_model_hcontig_0000_03.83',
 'gene_model_hcontig_0000_03.87',
 'gene_model_hcontig_0000_03.91',
 'gene_model_hcontig_0000_03.126',
 'gene_model_hcontig_0000_03.174',
 'gene_model_hcontig_0000_03.180',
 'gene_model_hcontig_0000_03.192',
 'gene_model_hcontig_0000_03.215',
 'gene_model_hcontig_0000_03.224',
 'gene_model_hcontig_0000_03.230',
 'gene_model_hcontig_0000_03.232',
 'gene_model_hcontig_0000_03.244',
 'gene_model_hcontig_0000_03.246',
 'gene_model_hcontig_0000_03.254',
 'gene_model_hcontig_0000_03.269',
 'gene_model_hcontig_0000_03.272',
 'gene_model_hcontig_0000_03.275',
 'gene_model_hcontig_0000_03.326',
 'gene_model_hcontig_0000_03.328',
 'gene_model_hcontig_0000_03.336',
 'gene_model_hcontig_0000_03.343',
 'gene_model_hcontig_0000_03.371',
 'gene_model_hcontig_0000_03.375',
 'gene_model_hcontig_0000_03.381',
 'gene_model_hcontig_0000_03.403',
 'gene_model_hcontig_0000_

In [None]:
#busco_gene_h = get_busco_names(busco_h_list, gene_anno_fn)
busco_gene_p = get_busco_names(busco_p_list, gene_anno_fn)
non_busco_gene_h = get_busco_names(non_busco_h_list, gene_anno_fn)
non_busco_gene_p = get_busco_names(non_busco_p_list, gene_anno_fn)

In [206]:
busco_gene_p

['gene_model_pcontig_000.11',
 'gene_model_pcontig_000.18',
 'gene_model_pcontig_000.22',
 'gene_model_pcontig_000.78',
 'gene_model_pcontig_000.82',
 'gene_model_pcontig_000.86',
 'gene_model_pcontig_000.119',
 'gene_model_pcontig_000.160',
 'gene_model_pcontig_000.166',
 'gene_model_pcontig_000.178',
 'gene_model_pcontig_000.204',
 'gene_model_pcontig_000.214',
 'gene_model_pcontig_000.220',
 'gene_model_pcontig_000.222',
 'gene_model_pcontig_000.235',
 'gene_model_pcontig_000.254',
 'gene_model_pcontig_000.269',
 'gene_model_pcontig_000.272',
 'gene_model_pcontig_000.275',
 'gene_model_pcontig_000.317',
 'gene_model_pcontig_000.319',
 'gene_model_pcontig_000.328',
 'gene_model_pcontig_000.336',
 'gene_model_pcontig_000.371',
 'gene_model_pcontig_000.376',
 'gene_model_pcontig_000.382',
 'gene_model_pcontig_000.407',
 'gene_model_pcontig_000.417',
 'gene_model_pcontig_000.424',
 'gene_model_pcontig_000.447',
 'gene_model_pcontig_000.449',
 'gene_model_pcontig_000.452',
 'gene_model_p

In [None]:
non_busco_gene_p = get_busco_names(non_busco_p_list, gene_anno_fn)

In [190]:
print(len(busco_gene_h))

1293


In [136]:
print(len(busco_gene_h))
print(len(busco_gene_p))
print(len(non_busco_gene_h))
print(len(non_busco_gene_p))

1293
1444
13028
13028


In [203]:
# function to edit busco bed file and replace gene ID column with gene Name
def replace_id_with_name(in_fn, gene_list):
    "This function takes a busco bed file with gene ID's and a list of their corresponding gene Names, and makes a new busco bed file with gene Name instead of ID."
    df = pd.read_csv(in_fn, header=None, names=busco_headings, sep='\t')
    df['new_name'] = pd.Series(gene_list)
    df = df [['contig', 'start', 'stop', 'new_name', 'score', 'strand']]
    out_fn = in_fn.replace('gene', 'gene_name')
    df.to_csv(out_fn, header=None, index=None, sep='\t')

In [227]:
replace_id_with_name(busco_h, busco_gene_h)
replace_id_with_name(busco_p, busco_gene_p)
replace_id_with_name(non_busco_h, non_busco_gene_h)
replace_id_with_name(non_busco_p, non_busco_gene_p)

In [204]:
%%bash

#move the new busco files to the annotation directory and remove the original busco files from the annotation directory
cd /home/anjuni/analysis/gff_output/busco
mv *name* ..
cd ..
rm *busco.gene.b*

### TE superfamily files notes

- files present for p, h and combined ph contigs
- sf = superfamily

In [1270]:
# Read in the superfamily file as a pandas dataframe
te_sf_fn_p = os.path.join(DIRS['BASE2'], 'gff_output', 'Pst_104E_v13_p_ctg.REPET.sorted.filtered.superfamily.gff')
te_sf_fn_ph = os.path.join(DIRS['BASE2'], 'gff_output', 'Pst_104E_v13_p_ctg.REPET.sorted.filtered.superfamily.gff')
te_sf_fn_h = os.path.join(DIRS['BASE2'], 'gff_output', 'Pst_104E_v13_p_ctg.REPET.sorted.filtered.superfamily.gff')
te_sf_names = ['Contig', 'Source', 'Type', 'Start', 'Stop', 'Value', 'Strand', 'Space', 'Superfamily']

te_sf_df_p = pd.read_csv(te_sf_fn_p, sep='\t', header = None, names = te_sf_names)
te_sf_df_ph = pd.read_csv(te_sf_fn_ph, sep='\t', header = None, names = te_sf_names)
te_sf_df_h = pd.read_csv(te_sf_fn_h, sep='\t', header = None, names = te_sf_names)

In [None]:
# write a function that loops over the last column in the data frame to make a list

In [1388]:
#Test to make a list of all sf
def make_superfamily_list(df):
    """Goes through dataframe and makes a list of all unique TE superfamily names."""
    superfamilies = []
    for sf in df['Superfamily']:
        if sf not in superfamilies:
            superfamilies.append(sf)
    return superfamilies

In [1391]:
te_sf_list_ph = make_superfamily_list(te_sf_df_ph)

In [1399]:
# write a function to loop over sf list and make a gff file for each superfamily
# these gff files will be used for window coverage analysis later
def filter_superfamilies(df, sf_list):
    """Filter out the TE's by their superfamily into separate files."""
    sf_dict = {}
    for sf in sf_list:
        filter_list = [sf]
        filtered_df = df[df['Superfamily'].isin(filter_list)]
        sf_dict[sf] = filtered_df
    return sf_dict

In [1400]:
te_sf_dict_ph = filter_superfamilies(te_sf_df_ph, te_sf_list_ph)

In [1415]:
# save out the dataframes in the dictionary as gff files
for key, value in te_sf_dict_ph.items():
    fn = 'Pst_104E_v13_ph_ctg.' + key + '.gff3'
    out_fn = os.path.join(DIRS['COVERAGE'], 'superfamily_files', fn)
    te_sf_dict_ph[key].to_csv(out_fn, sep='\t', header=None, index=None)

### <span style='color:#148aff'> 5.B Making dicts of feature file of effector proteins, non-effectors, genes, LT gene, BUSCO and TE superfamilies. <span/>

In [236]:
# Function that gives a dictionary of each input filehandle
def file_name_dict(file_list):
    """Outputs a dictionary of input file paths for a given list of input file paths."""
    file_dict = {}
    for file in file_list:
        file_dict[file.split('/')[-1]] = file
    return file_dict

In [237]:
# Make filepaths for feature files for genes, effectors, TE, methylation and randomised files for methylation.
feature_files = [fn for fn in glob.iglob('%s/*' % DIRS['FEATURES'], recursive=True)]
feature_fn_dict = file_name_dict(feature_files)

In [238]:
# This is the new file with only gene annotations
pprint.pprint(feature_fn_dict)

{'5mC_hc_tombo_sorted.cutoff.0.80.bed': '/home/anjuni/analysis/coverage/feature_files/5mC_hc_tombo_sorted.cutoff.0.80.bed',
 '6mA_hc_tombo_sorted.cutoff.0.80.bed': '/home/anjuni/analysis/coverage/feature_files/6mA_hc_tombo_sorted.cutoff.0.80.bed',
 'Pst_104E_v12_ph_busco.gene_name.bed': '/home/anjuni/analysis/coverage/feature_files/Pst_104E_v12_ph_busco.gene_name.bed',
 'Pst_104E_v12_ph_non_busco.gene_name.bed': '/home/anjuni/analysis/coverage/feature_files/Pst_104E_v12_ph_non_busco.gene_name.bed',
 'Pst_104E_v13_ph_ctg.ClassI:?:?.gff3': '/home/anjuni/analysis/coverage/feature_files/Pst_104E_v13_ph_ctg.ClassI:?:?.gff3',
 'Pst_104E_v13_ph_ctg.ClassI:DIRS:?.gff3': '/home/anjuni/analysis/coverage/feature_files/Pst_104E_v13_ph_ctg.ClassI:DIRS:?.gff3',
 'Pst_104E_v13_ph_ctg.ClassI:DIRS:DIRS.gff3': '/home/anjuni/analysis/coverage/feature_files/Pst_104E_v13_ph_ctg.ClassI:DIRS:DIRS.gff3',
 'Pst_104E_v13_ph_ctg.ClassI:LARD:?.gff3': '/home/anjuni/analysis/coverage/feature_files/Pst_104E_v13_ph_c

In [239]:
# Make dictionary of bedtools objects and check if it worked (it did!)
feature_bed_dict = make_bed_dict(feature_fn_dict)

In [240]:
pprint.pprint(feature_bed_dict)

{'5mC_hc_tombo_sorted.cutoff.0.80.bed': <BedTool(/home/anjuni/analysis/coverage/feature_files/5mC_hc_tombo_sorted.cutoff.0.80.bed)>,
 '6mA_hc_tombo_sorted.cutoff.0.80.bed': <BedTool(/home/anjuni/analysis/coverage/feature_files/6mA_hc_tombo_sorted.cutoff.0.80.bed)>,
 'Pst_104E_v12_ph_busco.gene_name.bed': <BedTool(/home/anjuni/analysis/coverage/feature_files/Pst_104E_v12_ph_busco.gene_name.bed)>,
 'Pst_104E_v12_ph_non_busco.gene_name.bed': <BedTool(/home/anjuni/analysis/coverage/feature_files/Pst_104E_v12_ph_non_busco.gene_name.bed)>,
 'Pst_104E_v13_ph_ctg.ClassI:?:?.gff3': <BedTool(/home/anjuni/analysis/coverage/feature_files/Pst_104E_v13_ph_ctg.ClassI:?:?.gff3)>,
 'Pst_104E_v13_ph_ctg.ClassI:DIRS:?.gff3': <BedTool(/home/anjuni/analysis/coverage/feature_files/Pst_104E_v13_ph_ctg.ClassI:DIRS:?.gff3)>,
 'Pst_104E_v13_ph_ctg.ClassI:DIRS:DIRS.gff3': <BedTool(/home/anjuni/analysis/coverage/feature_files/Pst_104E_v13_ph_ctg.ClassI:DIRS:DIRS.gff3)>,
 'Pst_104E_v13_ph_ctg.ClassI:LARD:?.gff3': 

In [241]:
rand_files = [fn for fn in glob.iglob('%s/*' % DIRS['RAND'], recursive=True)]
rand_fn_dict = file_name_dict(rand_files)

In [242]:
# Check whether the feature file dictionary works (it does). len = 59
pprint.pprint(rand_fn_dict)

{'5mC_hc_tombo_sorted.cutoff.0.80_rand.bed': '/home/anjuni/analysis/coverage/randomisation/5mC_hc_tombo_sorted.cutoff.0.80_rand.bed',
 '6mA_hc_tombo_sorted.cutoff.0.80_rand.bed': '/home/anjuni/analysis/coverage/randomisation/6mA_hc_tombo_sorted.cutoff.0.80_rand.bed'}


In [243]:
# Make dictionary of bedtools objects and check if it worked (it did!)
rand_bed_dict = make_bed_dict(rand_fn_dict)

In [244]:
pprint.pprint(rand_bed_dict)

{'5mC_hc_tombo_sorted.cutoff.0.80_rand.bed': <BedTool(/home/anjuni/analysis/coverage/randomisation/5mC_hc_tombo_sorted.cutoff.0.80_rand.bed)>,
 '6mA_hc_tombo_sorted.cutoff.0.80_rand.bed': <BedTool(/home/anjuni/analysis/coverage/randomisation/6mA_hc_tombo_sorted.cutoff.0.80_rand.bed)>}


In [291]:
# make pacbio bed file
pb_fn_dict = {}
pb_fn_dict['pb'] = os.path.join(DIRS['FEATURES'], '6mA_prob_smrtlink_sorted.cutoff.0.80.bed')
pb_bed_dict = make_bed_dict(pb_fn_dict)

In [303]:
#make pacbio rand file
pb_rand_fn_dict = {}
pb_rand_fn_dict['6mA_prob_smrtlink_sorted.cutoff.0.80_rand.bed'] = os.path.join(DIRS['RAND'], '6mA_prob_smrtlink_sorted.cutoff.0.80_rand.bed')
pb_rand_bed_dict = make_bed_dict(pb_rand_fn_dict)

### <span style='color:#148aff'> 5.C Using Ben's pybedtools coverage function on the feature files. <span/>

In [245]:
# Make a function to do overlaps for feature files
def coverage(windowbed_dict, featurebed_dict, featurefn_dict, old_folder_name, new_folder_name):
    """Create coverage files from a dictionary of window pybedtools objects and a dictionary of feature file pybedtools objects. Also outputs a dictionary of pandas dataframes for all coverage files."""
    feature_overlap_df_dict = {}
    for wkey, wbed in windowbed_dict.items():
        for fkey, fbed in featurebed_dict.items():
            tmp_df = wbed.coverage(fbed).to_dataframe().iloc[:,[0,1,2,3,6]] # make a dataframe to put headings
            tmp_df.rename(columns={'name': 'overlap_count', 'thickStart': 'overlap_fraction'}, inplace=True) # rename headings
            if featurefn_dict[fkey].endswith('.bed'): # for methylation files
                tmp_fn = featurefn_dict[fkey].replace('.bed', '.%s.overlap.bed' % wkey) # change output file path
            if featurefn_dict[fkey].endswith('.gff3'): # for gene/transposon/effector/exon files
                tmp_fn = featurefn_dict[fkey].replace('.gff3', '.%s.overlap.bed' % wkey) # change output file path
            tmp_fn = tmp_fn.replace(old_folder_name, new_folder_name)
            feature_overlap_df_dict[tmp_fn.split('/')[-1]] = tmp_df # file name as key and dataframe as value for overlap dict
            tmp_df.to_csv(tmp_fn, sep='\t', header=None, index=None) # save to a csv(pybedtools outputs more d.p. than BEDTools)
    return feature_overlap_df_dict

In [246]:
# Run overlaps between windows and features
feature_coverage_dict = coverage(window_bed_dict, feature_bed_dict, feature_fn_dict, 'feature_files', 'coverage_feature')
rand_coverage_dict = coverage(window_bed_dict, rand_bed_dict, rand_fn_dict, 'randomisation', 'coverage_random')

In [296]:
window_bed_dict

{'10kb': <BedTool(/home/anjuni/analysis/windows/Pst_104E_v13_ph_ctg_w10kb.bed)>,
 '10kb_s2kb': <BedTool(/home/anjuni/analysis/windows/Pst_104E_v13_ph_ctg_w10kb_s2kb.bed)>}

In [297]:
#run pacbio coverage
pb_coverage_dict = coverage(window_bed_dict, pb_bed_dict, pb_fn_dict, 'feature_files', 'coverage_pb')

In [304]:
pb_rand_coverage_dict = coverage(window_bed_dict, pb_rand_bed_dict, pb_rand_fn_dict, 'feature_files', 'coverage_pb')

In [305]:
pb_rand_coverage_dict

{'6mA_prob_smrtlink_sorted.cutoff.0.80_rand.10kb.overlap.bed':                  chrom   start     end  overlap_count  overlap_fraction
 0      hcontig_000_003       0   10000              4          0.000400
 1      hcontig_000_003   10000   20000              7          0.000600
 2      hcontig_000_003   20000   30000              7          0.000700
 3      hcontig_000_003   30000   40000              7          0.000700
 4      hcontig_000_003   40000   50000              9          0.000900
 5      hcontig_000_003   50000   60000              2          0.000200
 6      hcontig_000_003   60000   70000              6          0.000600
 7      hcontig_000_003   70000   80000              7          0.000700
 8      hcontig_000_003   80000   90000              2          0.000200
 9      hcontig_000_003   90000  100000              8          0.000800
 10     hcontig_000_003  100000  110000              4          0.000400
 11     hcontig_000_003  110000  120000              3        

### <span style='color:#148aff'> 5.D Statistical tests on the coverage files. <span/>

In [277]:
def separate_sliding_windows(input_dict, string):
    """Separates sliding and non-sliding windows into separate dictionaries."""
    sliding_dict = {}
    non_sliding_dict = {}
    for key, value in input_dict.items():
        if key.endswith(string):
            sliding_dict[key] = value
        else:
            non_sliding_dict[key] = value
    return sliding_dict, non_sliding_dict

In [278]:
rand_s_dict, rand_ns_dict = separate_sliding_windows(rand_coverage_dict, '_s2kb.overlap.bed')
feature_s_dict, feature_ns_dict = separate_sliding_windows(feature_coverage_dict, '_s2kb.overlap.bed')

In [307]:
#make the pacbio non-sliding dict
pb_ns_dict = {}
pb_ns_dict['6mA_prob_smrtlink_sorted.cutoff.0.80.10kb.overlap.bed'] = pb_coverage_dict['6mA_prob_smrtlink_sorted.cutoff.0.80.10kb.overlap.bed']
pb_rand_ns_dict = {}
pb_rand_ns_dict['6mA_prob_smrtlink_sorted.cutoff.0.80_rand.10kb.overlap.bed'] = pb_rand_coverage_dict['6mA_prob_smrtlink_sorted.cutoff.0.80_rand.10kb.overlap.bed']

In [279]:
def coverage_wilcoxon(obs_df_dict, exp_df_dict, obs_slice, exp_slice):
    """This function returns a dictionary of chi square statistic and p-value for a test of observed and randomised sites."""
    wilcoxon_dict = {}
    for okey, ovalue in obs_df_dict.items():
        for ekey, evalue in exp_df_dict.items():
            if okey[:obs_slice] == ekey[:exp_slice]:
                o_df = ovalue
                e_df = evalue
                obs = o_df['overlap_fraction']
                exp = e_df['overlap_fraction']
                stat, p = wilcoxon(obs, exp)
                wilcoxon_dict[okey] = stat, p
    return wilcoxon_dict

In [280]:
wilcoxon_dict = coverage_wilcoxon(feature_ns_dict, rand_ns_dict, -17, -22)

In [288]:
wilcoxon_dict

{'5mC_hc_tombo_sorted.cutoff.0.80.10kb.overlap.bed': (45245491.5,
  1.2543957014880141e-206),
 '6mA_hc_tombo_sorted.cutoff.0.80.10kb.overlap.bed': (46297440.5,
  3.308405230559043e-182)}

In [289]:
# save to tsv
df = pd.DataFrame.from_dict(wilcoxon_dict, orient='index')
df.rename(columns={0: "Wilcoxon T statistic", 1: 'p-value'}, inplace=True)
df.to_csv(os.path.join(DIRS['FIGURES'], 'coverage', 'window_wilcoxon_table.tsv'), header=True, index=True, sep='\t')

In [286]:
df

Unnamed: 0,Wilcoxon stat,p-value
5mC_hc_tombo_sorted.cutoff.0.80.10kb.overlap.bed,45245491.5,1.254396e-206
6mA_hc_tombo_sorted.cutoff.0.80.10kb.overlap.bed,46297440.5,3.308405e-182


In [309]:
pb_wilcoxon_dict = coverage_wilcoxon(pb_ns_dict, pb_rand_ns_dict, -17, -22)

In [310]:
pb_wilcoxon_dict

{'6mA_prob_smrtlink_sorted.cutoff.0.80.10kb.overlap.bed': (47600654.5,
  2.991879067358158e-26)}

In [311]:
df = pd.DataFrame.from_dict(pb_wilcoxon_dict, orient='index')
df.rename(columns={0: "Wilcoxon T statistic", 1: 'p-value'}, inplace=True)
df.to_csv(os.path.join(DIRS['FIGURES'], 'coverage', 'window_wilcoxon_table_pb.tsv'), header=True, index=True, sep='\t')

In [312]:
df

Unnamed: 0,Wilcoxon T statistic,p-value
6mA_prob_smrtlink_sorted.cutoff.0.80.10kb.overlap.bed,47600654.5,2.991879e-26


In [1233]:
# keep this to view coverage of all the cutoffs
pprint.pprint(wilcoxon_coverage)

{'5mC_hc_tombo_sorted.cutoff.0.00.10kb.overlap.bed': (87915.5, 0.0),
 '5mC_hc_tombo_sorted.cutoff.0.10.10kb.overlap.bed': (85739.5, 0.0),
 '5mC_hc_tombo_sorted.cutoff.0.20.10kb.overlap.bed': (101277.5, 0.0),
 '5mC_hc_tombo_sorted.cutoff.0.30.10kb.overlap.bed': (234278.0, 0.0),
 '5mC_hc_tombo_sorted.cutoff.0.40.10kb.overlap.bed': (2327553.5, 0.0),
 '5mC_hc_tombo_sorted.cutoff.0.50.10kb.overlap.bed': (16607427.5, 0.0),
 '5mC_hc_tombo_sorted.cutoff.0.60.10kb.overlap.bed': (46236315.0,
                                                      9.9306376916492e-187),
 '5mC_hc_tombo_sorted.cutoff.0.70.10kb.overlap.bed': (60573267.0,
                                                      2.6349063610233442e-05),
 '5mC_hc_tombo_sorted.cutoff.0.80.10kb.overlap.bed': (45245491.5,
                                                      1.2543957014880141e-206),
 '5mC_hc_tombo_sorted.cutoff.0.90.10kb.overlap.bed': (28260071.5, 0.0),
 '5mC_hc_tombo_sorted.cutoff.0.95.10kb.overlap.bed': (22742993.5, 0.0),
 

### <span style='color:#148aff'> 5.E Correlations between the coverage files. <span/>

In [None]:
# comparing methylation with features, so first make a dictionary of only the methylation files
# and another dictionary of only the non-methylation feature files

In [253]:
# methylation dictionary
m_dict = {}
for key, value in feature_coverage_dict.items():
    if key.endswith('10kb.overlap.bed'):
        if key.startswith('6') or key.startswith('5'):
            m_dict[key] = value

In [255]:
# feature dictionary
f_dict = {}
for key, value in feature_coverage_dict.items():
    if key.endswith('10kb.overlap.bed'):
        if key.startswith('6') or key.startswith('5'):
            continue
        else:
            f_dict[key] = value

In [314]:
print(*f_dict, sep='\n')

Pst_104E_v13_ph_ctg_combined_sorted_anno.genes_only.10kb.overlap.bed
Pst_104E_v13_ph_ctg.ClassII:TIR:PiggyBac.10kb.overlap.bed
Pst_104E_v13_ph_ctg.ClassII:TIR:?.10kb.overlap.bed
Pst_104E_v13_ph_ctg.anno.sorted.genes_only.10kb.overlap.bed
Pst_104E_v13_ph_ctg.ClassII:TIR:PIF-Harbinger.10kb.overlap.bed
Pst_104E_v13_ph_ctg.ClassII:Helitron:Helitron.10kb.overlap.bed
Pst_104E_v13_ph_ctg.ClassI:LINE:R2.10kb.overlap.bed
Pst_104E_v13_ph_ctg.ClassI:DIRS:?.10kb.overlap.bed
Pst_104E_v13_ph_ctg.ClassII:TIR:hAT.10kb.overlap.bed
Pst_104E_v13_ph_ctg.ClassII:TIR:CACTA.10kb.overlap.bed
Pst_104E_v13_ph_ctg.TE.sorted.genes_only.10kb.overlap.bed
Pst_104E_v13_ph_ctg.ClassI:PLE:Penelope.10kb.overlap.bed
Pst_104E_v13_ph_ctg.effectors.10kb.overlap.bed
Pst_104E_v13_ph_ctg.ClassI:LINE:RTE.10kb.overlap.bed
Pst_104E_v13_ph_ctg.ClassI:LTR:Gypsy.10kb.overlap.bed
Pst_104E_v13_ph_ctg.ClassII:MITE:?.10kb.overlap.bed
Pst_104E_v13_ph_ctg.ClassII:?:Sola.10kb.overlap.bed
Pst_104E_v13_ph_ctg.ClassI:LTR:Retrovirus.10kb.overl

In [259]:
# make a function to run spearman for each gene type

def spearman_windows(methyl_dict, feature_dict):
    spearman_overlap_df_dict = {}
    for mkey, mvalue in methyl_dict.items():
        for fkey, fvalue in feature_dict.items():
            o_df = mvalue
            e_df = fvalue
            obs = o_df['overlap_fraction']
            exp = e_df['overlap_fraction']
            stat, p = spearmanr(obs, exp)
            skey = mkey[:4] + fkey
            spearman_overlap_df_dict[skey] = stat, p
    return spearman_overlap_df_dict

In [260]:
spearman_dict = spearman_windows(m_dict, f_dict)

In [261]:
spearman_dict

{'5mC_Pst_104E_v12_ph_busco.gene_name.10kb.overlap.bed': (-0.1704994890523781,
  1.1415630166229089e-104),
 '5mC_Pst_104E_v12_ph_non_busco.gene_name.10kb.overlap.bed': (-0.27063526090237444,
  1.1466182209656832e-266),
 '5mC_Pst_104E_v13_ph_ctg.ClassI:?:?.10kb.overlap.bed': (-0.00519367445593043,
  0.5111880840420979),
 '5mC_Pst_104E_v13_ph_ctg.ClassI:DIRS:?.10kb.overlap.bed': (0.019275164321009612,
  0.014749270092848817),
 '5mC_Pst_104E_v13_ph_ctg.ClassI:DIRS:DIRS.10kb.overlap.bed': (0.0016064985685383798,
  0.8389650838948349),
 '5mC_Pst_104E_v13_ph_ctg.ClassI:LARD:?.10kb.overlap.bed': (-0.0458088308309309,
  6.722066527105717e-09),
 '5mC_Pst_104E_v13_ph_ctg.ClassI:LINE:?.10kb.overlap.bed': (0.004025024144122994,
  0.6106431027992951),
 '5mC_Pst_104E_v13_ph_ctg.ClassI:LINE:I.10kb.overlap.bed': (0.007980722642673023,
  0.31270909851877354),
 '5mC_Pst_104E_v13_ph_ctg.ClassI:LINE:Jockey.10kb.overlap.bed': (0.021562490448515197,
  0.006373733431931318),
 '5mC_Pst_104E_v13_ph_ctg.ClassI:

In [270]:
# save out spearman for tombo 5mC and 6mA
df = pd.DataFrame.from_dict(spearman_dict, orient='index')
df.rename(columns={0: "Spearman's rho", 1: 'p-value'}, inplace=True)
df.to_csv(os.path.join(DIRS['FIGURES'], 'coverage', 'window_spearman_table.tsv'), header=True, index=True, sep='\t')

In [271]:
df.head()

Unnamed: 0,Spearman's rho,p-value
5mC_Pst_104E_v13_ph_ctg_combined_sorted_anno.genes_only.10kb.overlap.bed,-0.184339,2.536146e-122
5mC_Pst_104E_v13_ph_ctg.ClassII:TIR:PiggyBac.10kb.overlap.bed,0.00909,0.2501939
5mC_Pst_104E_v13_ph_ctg.ClassII:TIR:?.10kb.overlap.bed,0.000527,0.946805
5mC_Pst_104E_v13_ph_ctg.anno.sorted.genes_only.10kb.overlap.bed,-0.326227,0.0
5mC_Pst_104E_v13_ph_ctg.ClassII:TIR:PIF-Harbinger.10kb.overlap.bed,-0.006031,0.4455436


In [275]:
# save out sorted correlations, to make it easier to analyse
sorted_df = df.sort_values("Spearman's rho", ascending=False)
sorted_df.to_csv(os.path.join(DIRS['FIGURES'], 'coverage', 'window_spearman_table_sorted.tsv'), header=True, index=True, sep='\t')

In [None]:
# run Spearman for PacBio

In [315]:
pb_spearman_dict = spearman_windows(pb_ns_dict, f_dict)

In [316]:
pb_spearman_dict

{'6mA_Pst_104E_v12_ph_busco.gene_name.10kb.overlap.bed': (-0.17851603514970638,
  1.023234932780495e-114),
 '6mA_Pst_104E_v12_ph_non_busco.gene_name.10kb.overlap.bed': (-0.3379357530834295,
  0.0),
 '6mA_Pst_104E_v13_ph_ctg.ClassI:?:?.10kb.overlap.bed': (0.01771702551497019,
  0.025004894274363882),
 '6mA_Pst_104E_v13_ph_ctg.ClassI:DIRS:?.10kb.overlap.bed': (0.019022991654741785,
  0.016103126684138908),
 '6mA_Pst_104E_v13_ph_ctg.ClassI:DIRS:DIRS.10kb.overlap.bed': (0.05072423646486444,
  1.3573948376615245e-10),
 '6mA_Pst_104E_v13_ph_ctg.ClassI:LARD:?.10kb.overlap.bed': (0.0492199262202844,
  4.665822223896692e-10),
 '6mA_Pst_104E_v13_ph_ctg.ClassI:LINE:?.10kb.overlap.bed': (0.022929481004950597,
  0.003721012352614802),
 '6mA_Pst_104E_v13_ph_ctg.ClassI:LINE:I.10kb.overlap.bed': (0.030792559404915987,
  9.775755307576431e-05),
 '6mA_Pst_104E_v13_ph_ctg.ClassI:LINE:Jockey.10kb.overlap.bed': (0.030466746116696107,
  0.00011580162362623678),
 '6mA_Pst_104E_v13_ph_ctg.ClassI:LINE:L1.10kb.

In [323]:
#save out pacbio spearman values
df = pd.DataFrame.from_dict(pb_spearman_dict, orient='index')
df.rename(columns={0: "Spearman's rho", 1: 'p-value'}, inplace=True)
df.to_csv(os.path.join(DIRS['FIGURES'], 'coverage', 'window_spearman_table_pb.tsv'), header=True, index=True, sep='\t')

In [324]:
#check pb df
df

Unnamed: 0,Spearman's rho,p-value
6mA_Pst_104E_v13_ph_ctg_combined_sorted_anno.genes_only.10kb.overlap.bed,-0.193097,2.9762519999999997e-134
6mA_Pst_104E_v13_ph_ctg.ClassII:TIR:PiggyBac.10kb.overlap.bed,0.013725,0.08251944
6mA_Pst_104E_v13_ph_ctg.ClassII:TIR:?.10kb.overlap.bed,0.124379,3.372204e-56
6mA_Pst_104E_v13_ph_ctg.anno.sorted.genes_only.10kb.overlap.bed,-0.398905,0.0
6mA_Pst_104E_v13_ph_ctg.ClassII:TIR:PIF-Harbinger.10kb.overlap.bed,0.092691,7.023578e-32
6mA_Pst_104E_v13_ph_ctg.ClassII:Helitron:Helitron.10kb.overlap.bed,0.07904,1.3223340000000002e-23
6mA_Pst_104E_v13_ph_ctg.ClassI:LINE:R2.10kb.overlap.bed,-0.005964,0.4505699
6mA_Pst_104E_v13_ph_ctg.ClassI:DIRS:?.10kb.overlap.bed,0.019023,0.01610313
6mA_Pst_104E_v13_ph_ctg.ClassII:TIR:hAT.10kb.overlap.bed,0.157632,1.47828e-89
6mA_Pst_104E_v13_ph_ctg.ClassII:TIR:CACTA.10kb.overlap.bed,0.068395,4.641286e-18


In [320]:
# save out sorted correlations
sorted_df = df.sort_values("Spearman's rho", ascending=False)
sorted_df.to_csv(os.path.join(DIRS['FIGURES'], 'coverage', 'window_spearman_table_sorted_pb.tsv'), header=True, index=True, sep='\t')

In [None]:
# comparing correlation between PacBio and Tombo methylation calls

In [325]:
# 5mC correlation with 6mA
spearmanr(feature_coverage_dict['5mC_hc_tombo_sorted.cutoff.0.80.10kb.overlap.bed'].overlap_fraction,\
         feature_coverage_dict['6mA_hc_tombo_sorted.cutoff.0.80.10kb.overlap.bed'].overlap_fraction)

SpearmanrResult(correlation=0.8057204715735399, pvalue=0.0)

In [326]:
# pacbio correlation with tombo 6mA
spearmanr(pb_coverage_dict['6mA_prob_smrtlink_sorted.cutoff.0.80.10kb.overlap.bed'].overlap_fraction,\
         feature_coverage_dict['6mA_hc_tombo_sorted.cutoff.0.80.10kb.overlap.bed'].overlap_fraction)

SpearmanrResult(correlation=0.06436272824615612, pvalue=3.634875467412381e-16)

In [327]:
# pacbio correlation with tombo 5mC
spearmanr(pb_coverage_dict['6mA_prob_smrtlink_sorted.cutoff.0.80.10kb.overlap.bed'].overlap_fraction,\
         feature_coverage_dict['5mC_hc_tombo_sorted.cutoff.0.80.10kb.overlap.bed'].overlap_fraction)

SpearmanrResult(correlation=0.25802376810985295, pvalue=9.741033574630119e-242)

Overall, very weak correlations between methylation and key features in the genome, especially TE superfamilies :)


Tombo had a high correlation between 5mC and 6mA, while Tombo and PacBio had no correlation between 6mA sites detected. It should be noted that Tombo detected a far higher number of sites.