# <span style="color:#ff1414"> BEDtools analysis. </span>

This is a script to answer research questions outlined elsewhere. In summary, this script:

1. compares methylation results between different methylation-callers, and between different methylation sequencing methods.

2. compares methylation between genes and non-gene regions

3. compares methylation between transposons and non-repetitive regions

4. compares transposons and genes


Note:
- PB/pb = PacBio
- ONT/ont = Oxford Nanopore Technology
- NP = Nanopolish

In [208]:
import pybedtools
import scipy

import matplotlib.patches as mpatches
import pandas as pd
import numpy as np # need for  stats

from scipy.stats import wilcoxon
import matplotlib.pyplot as plt
from matplotlib_venn import venn2

In [209]:
# load modules
import os
import glob
import pprint
from pybedtools import BedTool
from scipy.stats import spearmanr

In [344]:
# Central registry of every directory this analysis reads from or writes to.
# Entries are added in stages so that each path is only built from keys that
# already exist in DIRS.
DIRS = {'BASE2': '/home/anjuni/analysis'}

# Folders that hang directly off the analysis root.
DIRS.update({
    'FEATURES': os.path.join(DIRS['BASE2'], 'coverage', 'feature_files'),
    'RAND': os.path.join(DIRS['BASE2'], 'coverage', 'randomisation'),
    'COVERAGE': os.path.join(DIRS['BASE2'], 'coverage'),
    'RNA': os.path.join(DIRS['BASE2'], 'rna_counts'),
})

# RNA-count sub-folders, the figures folder and the gene-level coverage root.
DIRS.update({
    'edgeR': os.path.join(DIRS['RNA'], 'edgeR_output'),
    'TRIALS': os.path.join(DIRS['RNA'], 'trials_tsv'),
    'MEAN_STD': os.path.join(DIRS['RNA'], 'average_and_stdev'),
    'FIGURES': os.path.join(DIRS['BASE2'], 'figures'),
    'GENE': os.path.join(DIRS['COVERAGE'], 'gene_level'),
})

# Per-region folders below the gene-level coverage root, plus the gene/TE
# comparison root.
DIRS.update({
    'GENE_BODY': os.path.join(DIRS['GENE'], 'gene_body'),
    'BOTH_U_D': os.path.join(DIRS['GENE'], 'both_upstream_downstream'),
    'DOWN_STR': os.path.join(DIRS['GENE'], 'downstream'),
    'UP_STR': os.path.join(DIRS['GENE'], 'upstream'),
    'TSS': os.path.join(DIRS['GENE'], 'tss_6mA_only'),
    'TE': os.path.join(DIRS['GENE'], 'te'),
    'COMPARE': os.path.join(DIRS['BASE2'], 'gene_te_comparison'),
})

DIRS['GENE_TE'] = os.path.join(DIRS['COMPARE'], 'output', 'both')

In [211]:
# Quick check that every configured directory exists; report the missing ones.
for value in DIRS.values():
    if os.path.exists(value):
        continue
    print('%s does not exist' % value)

## <span style='color:#15c66e'> 7. Analysing gene expression files. </span>

### <span style='color:#15c66e'> 7.A Only genes. </span>

#### <span style='color:#a347ff'> 7.A.1 Making dataframe with methylation + expression data.</span>

In [433]:
# Paths of the two averaged RPKM tables (h and p) used as RNA-seq inputs.
rnaseq_list = [
    os.path.join(DIRS['MEAN_STD'], tsv_name)
    for tsv_name in (
        'Pst_104E_v13_h_gene_rpkm_average.tsv',
        'Pst_104E_v13_p_gene_rpkm_average.tsv',
    )
]

In [434]:
rnaseq_list

['/home/anjuni/analysis/rna_counts/average_and_stdev/Pst_104E_v13_h_gene_rpkm_average.tsv',
 '/home/anjuni/analysis/rna_counts/average_and_stdev/Pst_104E_v13_p_gene_rpkm_average.tsv']

In [435]:
# Column names applied when a methylation coverage BED file is converted to a
# dataframe.
headings = [
    'contig',
    'start',
    'stop',
    'gene_ID',
    'overlap_count',
    'overlap_fraction',
]

In [436]:
# Convert list to dict of dataframes
def file_name_dict(file_list):
    """Map the last '/'-separated component of each path to the full path.

    When two paths end in the same component, the later one in ``file_list``
    replaces the earlier one.
    """
    return {path.split('/')[-1]: path for path in file_list}

In [437]:
def df_dict_from_fn_dict(file_dict, header_row, header_list):
    """Load every tab-separated file named in ``file_dict`` into a dataframe.

    ``file_dict`` maps a label to something ``pd.read_csv`` can read;
    ``header_row`` and ``header_list`` are forwarded to ``read_csv`` as
    ``header=`` and ``names=``.

    Returns a dict with the same labels. Each dataframe keeps only the
    'gene_ID' and 'overlap_fraction' columns, sorted ascending by 'gene_ID'.
    """
    def _load(source):
        table = pd.read_csv(source, sep='\t', header=header_row, names=header_list)
        return table[['gene_ID', 'overlap_fraction']].sort_values('gene_ID', ascending=True)

    return {label: _load(source) for label, source in file_dict.items()}

In [438]:
def df_dict_from_fn_list(file_list, header_row, header_list):
    """Output a dictionary of dataframes from a list of file paths.

    Each path is read as a tab-separated file, with ``header_row`` and
    ``header_list`` forwarded to ``pd.read_csv`` as ``header=`` and ``names=``.

    Returns a dict keyed by the last '/'-separated component of each path.
    Each dataframe keeps only the 'gene_ID' and 'overlap_fraction' columns,
    in the row order of the file.
    """
    df_dict = {}
    for file in file_list:
        df = pd.read_csv(file, sep='\t', header=header_row, names=header_list)
        # Deliberately not sorted by gene_ID: the original author noted that
        # sorting alphabetically here messes up later steps.
        df_dict[file.split('/')[-1]] = df.loc[:, ['gene_ID', 'overlap_fraction']]
    return df_dict

In [439]:
# Reload the coverage dataframes from part 6 (one TSV per genome).
coverage_fig_dir = os.path.join(DIRS['FIGURES'], 'coverage')
h_contig_df = pd.read_csv(os.path.join(coverage_fig_dir, 'coverage_df_h.tsv'), sep='\t')
p_contig_df = pd.read_csv(os.path.join(coverage_fig_dir, 'coverage_df_p.tsv'), sep='\t')

In [440]:
# Attach the expression data to each coverage table, matching rows on 'name'.
# NOTE(review): rnaseq_df_dict is not defined in the visible part of this
# notebook - confirm it is built (with a 'name' column) before this cell runs.
h_expression = rnaseq_df_dict['Pst_104E_v13_h_gene_rpkm_average.tsv'].set_index('name')
h_contig_df = h_contig_df.join(h_expression, on='name').copy()

p_expression = rnaseq_df_dict['Pst_104E_v13_p_gene_rpkm_average.tsv'].set_index('name')
p_contig_df = p_contig_df.join(p_expression, on='name').copy()

In [441]:
# Write both merged tables to the expression figures folder as tab-separated
# files with a header row and without the index.
expression_fig_dir = os.path.join(DIRS['FIGURES'], 'expression')
h_contig_df.to_csv(
    os.path.join(expression_fig_dir, 'rnaseq_methyl_df_h.tsv'),
    sep='\t', header=True, index=None,
)
p_contig_df.to_csv(
    os.path.join(expression_fig_dir, 'rnaseq_methyl_df_p.tsv'),
    sep='\t', header=True, index=None,
)

#### <span style='color:#a347ff'> 7.A.2 Run Spearman stats on these dataframes.</span>

In [442]:
# Observed methylation columns that are fed to the Spearman test.
column_list = [
    '5mC_gene_body',
    '6mA_gene_body',
    '5mC_upstream',
    '6mA_upstream',
    '5mC_downstream',
    '6mA_downstream',
    '6mA_tss',
    '6mA_tss_pb',
    '5mC_both',
    '6mA_both',
]

In [412]:
#print the headings list
column_list

['5mC_gene_body',
 '6mA_gene_body',
 '5mC_upstream',
 '6mA_upstream',
 '5mC_downstream',
 '6mA_downstream',
 '6mA_tss',
 '6mA_tss_pb',
 '5mC_both',
 '6mA_both']

In [333]:
# Names of the randomised counterpart of every observed column, used to see
# the correlation for randomised methylation (for the supplementary).
rand_col = [observed + '_rand' for observed in column_list]
rand_col

['5mC_gene_body_rand',
 '6mA_gene_body_rand',
 '5mC_upstream_rand',
 '6mA_upstream_rand',
 '5mC_downstream_rand',
 '6mA_downstream_rand',
 '6mA_tss_rand',
 '6mA_tss_pb_rand',
 '5mC_both_rand',
 '6mA_both_rand']

In [335]:
def spearman_r(df, col_list):
    """Correlate each listed column of ``df`` with its 'UG' column.

    Runs ``scipy.stats.spearmanr`` once per name in ``col_list`` and returns a
    dict mapping that name to a ``(statistic, p-value)`` tuple.
    """
    def _rho_and_p(column):
        # 'UG' is looked up per column, so an empty col_list touches nothing.
        rho, p_value = spearmanr(df[column], df['UG'])
        return rho, p_value

    return {column: _rho_and_p(column) for column in col_list}

In [336]:
# Spearman test of every observed methylation column, once per genome table.
h_spearman = spearman_r(df=h_contig_df, col_list=column_list)
p_spearman = spearman_r(df=p_contig_df, col_list=column_list)

In [353]:
h_spearman

{'5mC_both': (-0.3044031532017658, 0.0),
 '5mC_downstream': (-0.2768273977762557, 4.0868316534222915e-300),
 '5mC_gene_body': (-0.3126339037881005, 0.0),
 '5mC_upstream': (-0.2806826592142046, 7.995819268164825e-309),
 '6mA_both': (-0.16789856365981518, 5.983119686579735e-109),
 '6mA_downstream': (-0.16266590546033927, 2.700116232198628e-102),
 '6mA_gene_body': (-0.1803997565972118, 9.484426077668715e-126),
 '6mA_tss': (-0.1841182034577834, 5.403187356825117e-131),
 '6mA_tss_pb': (-0.20913412220047725, 2.86601824581424e-169),
 '6mA_upstream': (-0.13359602347906796, 2.5469653149336237e-69)}

In [337]:
p_spearman

{'5mC_both': (-0.3451099389497695, 0.0),
 '5mC_downstream': (-0.31878090684897764, 0.0),
 '5mC_gene_body': (-0.3607519742299347, 0.0),
 '5mC_upstream': (-0.32422161953772544, 0.0),
 '6mA_both': (-0.17815821951606303, 1.0142759397668388e-137),
 '6mA_downstream': (-0.17429952606580357, 8.168828483487274e-132),
 '6mA_gene_body': (-0.20092134652151095, 2.2283255936526472e-175),
 '6mA_tss': (-0.1955196554493612, 5.295093228556039e-166),
 '6mA_tss_pb': (-0.2473212424030754, 1.6163848249639127e-267),
 '6mA_upstream': (-0.14925411430465801, 8.757339441259174e-97)}

In [338]:
# Null comparison: repeat the Spearman test on the randomised methylation
# columns. These come out with correlations of about 0.
rand_h_spearman = spearman_r(df=h_contig_df, col_list=rand_col)
rand_p_spearman = spearman_r(df=p_contig_df, col_list=rand_col)

In [415]:
rand_h_spearman

{'5mC_both_rand': (-0.028497784235788276, 0.00018561673613514348),
 '5mC_downstream_rand': (0.014313085443598909, 0.06051141928913956),
 '5mC_gene_body_rand': (0.009229122617243892, 0.2261673646798127),
 '5mC_upstream_rand': (-0.045002547182038374, 3.538907507847777e-09),
 '6mA_both_rand': (0.008036952423347062, 0.29190754875629343),
 '6mA_downstream_rand': (0.01746262954028329, 0.02201297189277224),
 '6mA_gene_body_rand': (0.005345763274584303, 0.4832880664626604),
 '6mA_tss_pb_rand': (-0.0036451583538479487, 0.6326423885137442),
 '6mA_tss_rand': (7.589997680255605e-05, 0.9920586458004681),
 '6mA_upstream_rand': (0.006958073255854432, 0.36152611898518705)}

In [416]:
rand_p_spearman

{'5mC_both_rand': (-0.051939657233027896, 4.863209708715647e-13),
 '5mC_downstream_rand': (0.0020955340893188588, 0.7706963367166384),
 '5mC_gene_body_rand': (-0.018587645186478534, 0.009722556627502299),
 '5mC_upstream_rand': (-0.06948048614439731, 3.828289110186641e-22),
 '6mA_both_rand': (-0.02273697339898356, 0.0015623210828710967),
 '6mA_downstream_rand': (-0.024829439116171234, 0.0005523128011186317),
 '6mA_gene_body_rand': (-0.019874352015986224, 0.0056999812430114346),
 '6mA_tss_pb_rand': (0.00042579031147201805, 0.9527749931943764),
 '6mA_tss_rand': (-0.021349531987561864, 0.0029798948629995536),
 '6mA_upstream_rand': (-0.008270096848074211, 0.25002373193223987)}

In [341]:
def save_spearman_dict(wdict, genome):
    """Write a dictionary of Spearman results to a tab-separated table.

    ``wdict`` is turned into a dataframe with one row per key; columns 0 and 1
    are renamed to "Spearman's rho" and 'p-value'. The table is saved, with
    header and index, as ``spearman_r_table_<genome>.tsv`` in the 'expression'
    folder under ``DIRS['FIGURES']``.
    """
    table = pd.DataFrame.from_dict(wdict, orient='index').rename(
        columns={0: "Spearman's rho", 1: 'p-value'}
    )
    file_name = 'spearman_r_table_%s.tsv' % genome
    table.to_csv(
        os.path.join(DIRS['FIGURES'], 'expression', file_name),
        sep='\t', header=True, index=True,
    )

In [342]:
# Save the observed-column Spearman tables, one per genome.
save_spearman_dict(wdict=h_spearman, genome='h')
save_spearman_dict(wdict=p_spearman, genome='p')

In [414]:
# Save the null (randomised-column) Spearman tables as well.
save_spearman_dict(wdict=rand_h_spearman, genome='rand_h')
save_spearman_dict(wdict=rand_p_spearman, genome='rand_p')

### <span style='color:#15c66e'> 7.B Methylated transposons with genes. </span>

#### <span style='color:#a347ff'> 7.B.1 Making dataframe with methylation + expression data + gene TE data.</span>

In [None]:
# add columns for genes close to highly methylated transposons and unmethylated transposons

In [443]:
# First step towards listing the genes close to transposons with a high level
# of each type of methylation: collect the gene/TE BED files, keyed by file
# name.
gt_bed_pattern = '%s/*bed' % os.path.join(DIRS['GENE_TE'])
gt_fn_dict = file_name_dict(list(glob.iglob(gt_bed_pattern, recursive=True)))

In [444]:
gt_fn_dict

{'Pst_104E_v13_h_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.80.overlap.500.low.sorted.bed': '/home/anjuni/analysis/gene_te_comparison/output/both/Pst_104E_v13_h_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.80.overlap.500.low.sorted.bed',
 'Pst_104E_v13_h_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.80.overlap.500.top.sorted.bed': '/home/anjuni/analysis/gene_te_comparison/output/both/Pst_104E_v13_h_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.80.overlap.500.top.sorted.bed',
 'Pst_104E_v13_h_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.80.overlap.500.low.sorted.bed': '/home/anjuni/analysis/gene_te_comparison/output/both/Pst_104E_v13_h_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.80.overlap.500.low.sorted.bed',
 'Pst_104E_v13_h_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.80.overlap.500.top.sorted.bed': '/home/anjuni/analysis/gene_te_comparison/output/both/Pst_104E_v13_h_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.80.overlap.500.top.sorted.bed',
 'Pst_104E_v13_h_ctg.REPET.6mA_prob_smrtlink_sorted.cutoff.0.80.overlap.500.low.sorted.b

In [445]:
# Column names applied when the gene/TE BED files are converted to dataframes.
header_list_gt = [
    'contig',
    'start',
    'stop',
    'name',
    'score',
    'gene',
    'distance',
]

In [446]:
def gene_te_df_dict(file_dict, header_row, header_list):
    """Output a dictionary of gene/distance dataframes from a dictionary of files.

    ``file_dict`` maps a label to something ``pd.read_csv`` can read as a
    tab-separated file. ``header_row`` and ``header_list`` are forwarded to
    ``read_csv`` as ``header=`` and ``names=``; ``header_list`` must include
    'gene' and 'distance'.

    Returns a dict with the same labels. Each dataframe keeps only the 'gene'
    and 'distance' columns, sorted ascending by 'gene'.
    """
    df_dict = {}
    for key, value in file_dict.items():
        # Use the caller's header_list rather than a module-level global, so
        # the function works with whatever column names it is given.
        df = pd.read_csv(value, sep='\t', header=header_row, names=header_list)
        df_dict[key] = df.loc[:, ['gene', 'distance']].sort_values('gene', ascending=True)
    return df_dict

In [447]:
# Load every gene/TE BED file (no header row) into a gene/distance dataframe.
gt_df_dict = gene_te_df_dict(
    file_dict=gt_fn_dict,
    header_row=None,
    header_list=header_list_gt,
)

In [448]:
print(*gt_df_dict, sep='\n')

Pst_104E_v13_p_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.80.overlap.500.low.sorted.bed
Pst_104E_v13_h_ctg.REPET.6mA_prob_smrtlink_sorted.cutoff.0.80.overlap.500.top.sorted.bed
Pst_104E_v13_h_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.80.overlap.500.top.sorted.bed
Pst_104E_v13_p_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.80.overlap.500.top.sorted.bed
Pst_104E_v13_h_ctg.REPET.6mA_prob_smrtlink_sorted.cutoff.0.80.overlap.500.low.sorted.bed
Pst_104E_v13_h_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.80.overlap.500.low.sorted.bed
Pst_104E_v13_p_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.80.overlap.500.top.sorted.bed
Pst_104E_v13_p_ctg.REPET.6mA_prob_smrtlink_sorted.cutoff.0.80.overlap.500.low.sorted.bed
Pst_104E_v13_h_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.80.overlap.500.top.sorted.bed
Pst_104E_v13_p_ctg.REPET.6mA_prob_smrtlink_sorted.cutoff.0.80.overlap.500.top.sorted.bed
Pst_104E_v13_p_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.80.overlap.500.low.sorted.bed
Pst_104E_v13_h_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.80

In [450]:
def list_gene_names(df):
    """Return the gene names found under ``df['gene']``, skipping '.' entries.

    ``df`` may be a dataframe (or any mapping) whose 'gene' entry is a column
    of names, or a single row whose 'gene' entry is one string. Names are
    returned in their original order and duplicates are kept.

    NOTE(review): '.' is presumably the placeholder for "no gene" in the
    gene/TE BED files - confirm against the files.
    """
    genes = df['gene']
    if isinstance(genes, str):
        # A single row yields one name rather than a column of names.
        genes = [genes]
    # Filter element-wise: comparing a whole column inside `if` raises
    # "truth value of a Series is ambiguous".
    return [gene for gene in genes if gene != '.']

In [451]:
# De-duplicated values of column 5 from each h-genome gene/TE BED file, one
# list per methylation type and per top/low file.
m5c_top_h = list(set(
    pd.read_csv(
        gt_fn_dict['Pst_104E_v13_h_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.80.overlap.500.top.sorted.bed'],
        sep='\t', header=None,
    )[5].tolist()
))
m6a_top_h = list(set(
    pd.read_csv(
        gt_fn_dict['Pst_104E_v13_h_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.80.overlap.500.top.sorted.bed'],
        sep='\t', header=None,
    )[5].tolist()
))
m6a_pb_top_h = list(set(
    pd.read_csv(
        gt_fn_dict['Pst_104E_v13_h_ctg.REPET.6mA_prob_smrtlink_sorted.cutoff.0.80.overlap.500.top.sorted.bed'],
        sep='\t', header=None,
    )[5].tolist()
))
m5c_low_h = list(set(
    pd.read_csv(
        gt_fn_dict['Pst_104E_v13_h_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.80.overlap.500.low.sorted.bed'],
        sep='\t', header=None,
    )[5].tolist()
))
m6a_low_h = list(set(
    pd.read_csv(
        gt_fn_dict['Pst_104E_v13_h_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.80.overlap.500.low.sorted.bed'],
        sep='\t', header=None,
    )[5].tolist()
))
m6a_pb_low_h = list(set(
    pd.read_csv(
        gt_fn_dict['Pst_104E_v13_h_ctg.REPET.6mA_prob_smrtlink_sorted.cutoff.0.80.overlap.500.low.sorted.bed'],
        sep='\t', header=None,
    )[5].tolist()
))

In [452]:
# De-duplicated values of column 5 from each p-genome gene/TE BED file, one
# list per methylation type and per top/low file.
m5c_top_p = list(set(
    pd.read_csv(
        gt_fn_dict['Pst_104E_v13_p_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.80.overlap.500.top.sorted.bed'],
        sep='\t', header=None,
    )[5].tolist()
))
m6a_top_p = list(set(
    pd.read_csv(
        gt_fn_dict['Pst_104E_v13_p_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.80.overlap.500.top.sorted.bed'],
        sep='\t', header=None,
    )[5].tolist()
))
m6a_pb_top_p = list(set(
    pd.read_csv(
        gt_fn_dict['Pst_104E_v13_p_ctg.REPET.6mA_prob_smrtlink_sorted.cutoff.0.80.overlap.500.top.sorted.bed'],
        sep='\t', header=None,
    )[5].tolist()
))
m5c_low_p = list(set(
    pd.read_csv(
        gt_fn_dict['Pst_104E_v13_p_ctg.REPET.5mC_hc_tombo_sorted.cutoff.0.80.overlap.500.low.sorted.bed'],
        sep='\t', header=None,
    )[5].tolist()
))
m6a_low_p = list(set(
    pd.read_csv(
        gt_fn_dict['Pst_104E_v13_p_ctg.REPET.6mA_hc_tombo_sorted.cutoff.0.80.overlap.500.low.sorted.bed'],
        sep='\t', header=None,
    )[5].tolist()
))
m6a_pb_low_p = list(set(
    pd.read_csv(
        gt_fn_dict['Pst_104E_v13_p_ctg.REPET.6mA_prob_smrtlink_sorted.cutoff.0.80.overlap.500.low.sorted.bed'],
        sep='\t', header=None,
    )[5].tolist()
))

In [454]:
# For each gene/TE list, add a column to the h table that labels the rows
# whose 'name' is in that list; every other row is labelled 'Other'.
# NOTE(review): the Low_* columns reuse the 'Methylated\nTE adjacent' label -
# confirm that is the intended label for the low-methylation lists.
for te_column, adjacent_genes in (
    ('Top_5mC_Gene_TE', m5c_top_h),
    ('Top_6mA_Gene_TE', m6a_top_h),
    ('Top_6mA_pb_Gene_TE', m6a_pb_top_h),
    ('Low_5mC_Gene_TE', m5c_low_h),
    ('Low_6mA_Gene_TE', m6a_low_h),
    ('Low_6mA_pb_Gene_TE', m6a_pb_low_h),
):
    h_contig_df[te_column] = 'Other'
    h_contig_df.loc[h_contig_df.name.isin(adjacent_genes), te_column] = 'Methylated\nTE adjacent'

In [455]:
# For each gene/TE list, add a column to the p table that labels the rows
# whose 'name' is in that list; every other row is labelled 'Other'.
# The default label is the same 'Other' for all six columns (Low_5mC_Gene_TE
# previously used 'Others'), so the categories match the h table.
# NOTE(review): the Low_* columns reuse the 'Methylated\nTE adjacent' label -
# confirm that is the intended label for the low-methylation lists.
for te_column, adjacent_genes in (
    ('Top_5mC_Gene_TE', m5c_top_p),
    ('Top_6mA_Gene_TE', m6a_top_p),
    ('Top_6mA_pb_Gene_TE', m6a_pb_top_p),
    ('Low_5mC_Gene_TE', m5c_low_p),
    ('Low_6mA_Gene_TE', m6a_low_p),
    ('Low_6mA_pb_Gene_TE', m6a_pb_low_p),
):
    p_contig_df[te_column] = 'Other'
    p_contig_df.loc[p_contig_df.name.isin(adjacent_genes), te_column] = 'Methylated\nTE adjacent'

In [456]:
# Write both tables, now carrying the gene/TE columns, to the expression
# figures folder as tab-separated files with a header row and without the
# index.
expression_fig_dir = os.path.join(DIRS['FIGURES'], 'expression')
h_contig_df.to_csv(
    os.path.join(expression_fig_dir, 'rnaseq_methyl_df_h.tsv'),
    sep='\t', header=True, index=None,
)
p_contig_df.to_csv(
    os.path.join(expression_fig_dir, 'rnaseq_methyl_df_p.tsv'),
    sep='\t', header=True, index=None,
)