# <span style='color:crimson'> Circa plot </span>

This is a notebook to generate inputs for circa plots

1. collect pre-existing file paths
    - If needed, make a genome_size file of the largest 20 and 30 contigs --> not needed.
2. run coverage for methylation
3. check the files worked
4. copy these files to NCI so they can be copied to the home directory

In [1]:
# load modules

import pybedtools
from pybedtools import BedTool
import os
import glob
import pprint
#import numpy # need for p-value stats
import scipy
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from matplotlib_venn import venn2
from scipy.stats import wilcoxon
from scipy.stats import spearmanr

In [11]:
# Directory layout. Circa inputs/outputs hang off BASE; BASE2 is a second root
# used for the 'windows' directory.
DIRS = {'BASE': '/home/anjuni/circa'}
DIRS.update({
    'OLD_COVERAGE': os.path.join(DIRS['BASE'], 'window_analysis'),
    'NEW_COVERAGE': os.path.join(DIRS['BASE'], 'new_windows'),
    'PUBLISHED': os.path.join(DIRS['BASE'], 'publication'),
    'BASE2': '/home/anjuni/analysis',
})
DIRS['WINDOWS'] = os.path.join(DIRS['BASE2'], 'windows')

Files needed:
- circabed for genes, TE, 5mC, 6mA

### <span style='color:deeppink'> 1. Collect pre-existing file paths </span>

In [9]:
# Print every 100 kb window overlap file (.circabed) found directly inside the
# OLD_COVERAGE directory.
# NOTE(review): recursive=True only changes how a '**' pattern is matched; this
# pattern has no '**', so sub-directories are not searched.
for fn in glob.iglob('%s/*.w100kb.overlap.circabed' % DIRS['OLD_COVERAGE'], recursive=True):
    print(fn)

/home/anjuni/circa/window_analysis/Pst_104E_v12_p_noeffector.gene.w100kb.overlap.circabed
/home/anjuni/circa/window_analysis/Pst_104E_v12_p_non_busco_non_effector.gene.w100kb.overlap.circabed
/home/anjuni/circa/window_analysis/Pst_104E_v12_p_busco.gene.w100kb.overlap.circabed
/home/anjuni/circa/window_analysis/Pst_104E_v12_p_ctg.REPET.sorted.g1000_superfamily.w100kb.overlap.circabed
/home/anjuni/circa/window_analysis/Pst_104E_v12_p_ctg.REPET.sorted.superfamily.w100kb.overlap.circabed
/home/anjuni/circa/window_analysis/Pst_104E_v12_p_ctg.REPET.sorted.g400_superfamily.w100kb.overlap.circabed
/home/anjuni/circa/window_analysis/Pst_104E_v12_cluster_8.gene.w100kb.overlap.circabed
/home/anjuni/circa/window_analysis/Pst_104E_v12_p_all.gene.w100kb.overlap.circabed
/home/anjuni/circa/window_analysis/Pst_104E_v12_p_non_busco.gene.w100kb.overlap.circabed
/home/anjuni/circa/window_analysis/Pst_104E_v12_p_effector.gene.w100kb.overlap.circabed


In [13]:
# Print every entry directly inside the PUBLISHED directory.
# NOTE(review): recursive=True only changes how a '**' pattern is matched; this
# pattern has no '**', so sub-directories are not searched.
for fn in glob.iglob('%s/*' % DIRS['PUBLISHED'], recursive=True):
    print(fn)

/home/anjuni/circa/publication/Pst_104E_p_ctg.second_30_largest_contigs.genome
/home/anjuni/circa/publication/Pst_104E_p_ctg.30_largest_contigs.genome
/home/anjuni/circa/publication/First_draft_wo_all_genes.circa
/home/anjuni/circa/publication/Pst_104E_p_ctg.30_largest_contigs.fa
/home/anjuni/circa/publication/First_draft_w_all_genes.circa
/home/anjuni/circa/publication/First_draft_wo_all_genes.svg
/home/anjuni/circa/publication/Pst_104E_p_ctg.second_30_largest_contigs.fa
/home/anjuni/circa/publication/Pst_104E_p_ctg.third_30_largest_contigs.genome
/home/anjuni/circa/publication/First_draft_wo_all_genes_fig.tif
/home/anjuni/circa/publication/First_draft.circa
/home/anjuni/circa/publication/First_draft_wo_all_genes_fig.svg


#### Files to copy to NCI

##### Inputs:
/home/anjuni/circa/window_analysis/Pst_104E_v12_p_all.gene.w100kb.overlap.circabed
/home/anjuni/circa/window_analysis/Pst_104E_v12_p_ctg.REPET.sorted.superfamily.w100kb.overlap.circabed
/home/anjuni/circa/publication/Pst_104E_p_ctg.30_largest_contigs.genome
/home/anjuni/circa/publication/Pst_104E_p_ctg.30_largest_contigs.fa

##### Circa plots:
/home/anjuni/circa/publication/First_draft.circa
/home/anjuni/circa/publication/First_draft_wo_all_genes.circa
/home/anjuni/circa/publication/First_draft_wo_all_genes_fig.tif
/home/anjuni/circa/publication/First_draft_wo_all_genes.svg
/home/anjuni/circa/publication/First_draft_wo_all_genes_fig.svg
/home/anjuni/circa/publication/First_draft_w_all_genes.circa

### <span style='color:deeppink'> 2. Run coverage for methylation. </span>

In [14]:
# Genome file inside the OLD_COVERAGE directory; it is later passed to
# `bedtools makewindows -g` to define the windows.
genome_size_fn = os.path.join(DIRS['OLD_COVERAGE'], 'Pst_104E_v12_p_ctg.genome_file')

In [15]:
# Window bookkeeping: BedTool objects (filled in later) and BED file paths,
# both keyed by window size. Only the 100 kb windows are defined here.
window_bed_dict = {}
window_fn_dict = {
    '100kb': os.path.join(DIRS['NEW_COVERAGE'], 'Pst_104E_v12_ph_ctg_w100kb.bed'),
}
pprint.pprint(window_fn_dict)

{'100kb': '/home/anjuni/circa/new_windows/Pst_104E_v12_ph_ctg_w100kb.bed'}


In [26]:
# Methylation feature files (5mC and 6mA) at cutoff 0.00, keyed by file name.
# The cells for cutoff 0.50 and 0.30 reassign feature_fn_dict, so whichever of
# these cells runs last decides which files the coverage step uses.
feature_fn_dict = {
    '5mC_hc_tombo_sorted.cutoff.0.00.bed': '/home/anjuni/analysis/coverage/feature_files/5mC_hc_tombo_sorted.cutoff.0.00.bed',
    '6mA_hc_tombo_sorted.cutoff.0.00.bed': '/home/anjuni/analysis/coverage/feature_files/6mA_hc_tombo_sorted.cutoff.0.00.bed',
}
# Print the dict defined in this cell; the previous `methyl_fn_dict` name is not
# defined anywhere in the notebook and raised NameError on a fresh kernel.
pprint.pprint(feature_fn_dict)

{'5mC_hc_tombo_sorted.cutoff.0.00.bed': '/home/anjuni/analysis/coverage/feature_files/5mC_hc_tombo_sorted.cutoff.0.00.bed',
 '6mA_hc_tombo_sorted.cutoff.0.00.bed': '/home/anjuni/analysis/coverage/feature_files/6mA_hc_tombo_sorted.cutoff.0.00.bed'}


In [53]:
# Methylation feature files (5mC and 6mA) at cutoff 0.50, keyed by file name.
feature_fn_dict = {
    '5mC_hc_tombo_sorted.cutoff.0.50.bed': '/home/anjuni/analysis/coverage/feature_files/5mC_hc_tombo_sorted.cutoff.0.50.bed',
    '6mA_hc_tombo_sorted.cutoff.0.50.bed': '/home/anjuni/analysis/coverage/feature_files/6mA_hc_tombo_sorted.cutoff.0.50.bed',
}
pprint.pprint(feature_fn_dict)

{'5mC_hc_tombo_sorted.cutoff.0.50.bed': '/home/anjuni/analysis/coverage/feature_files/5mC_hc_tombo_sorted.cutoff.0.50.bed',
 '6mA_hc_tombo_sorted.cutoff.0.50.bed': '/home/anjuni/analysis/coverage/feature_files/6mA_hc_tombo_sorted.cutoff.0.50.bed'}


In [57]:
# Methylation feature files (5mC and 6mA) at cutoff 0.30, keyed by file name.
feature_fn_dict = {
    '5mC_hc_tombo_sorted.cutoff.0.30.bed': '/home/anjuni/analysis/coverage/feature_files/5mC_hc_tombo_sorted.cutoff.0.30.bed',
    '6mA_hc_tombo_sorted.cutoff.0.30.bed': '/home/anjuni/analysis/coverage/feature_files/6mA_hc_tombo_sorted.cutoff.0.30.bed',
}
pprint.pprint(feature_fn_dict)

{'5mC_hc_tombo_sorted.cutoff.0.30.bed': '/home/anjuni/analysis/coverage/feature_files/5mC_hc_tombo_sorted.cutoff.0.30.bed',
 '6mA_hc_tombo_sorted.cutoff.0.30.bed': '/home/anjuni/analysis/coverage/feature_files/6mA_hc_tombo_sorted.cutoff.0.30.bed'}


In [16]:
# Make the actual windows! :D
# Runs `bedtools makewindows` on the genome file in genome_size_fn with a window
# size of 100000 and redirects the output to the '100kb' window BED path
# (the shell `>` overwrites that file if it already exists).
!bedtools makewindows -g {genome_size_fn} -w 100000 > {window_fn_dict['100kb']}

In [17]:
# Wrap each window BED file in a pybedtools BedTool, keyed by window size.
window_bed_dict = {
    window_size: BedTool(window_path)
    for window_size, window_path in window_fn_dict.items()
}

In [18]:
# Display the window BedTool mapping to check which window files were loaded.
window_bed_dict

{'100kb': <BedTool(/home/anjuni/circa/new_windows/Pst_104E_v12_ph_ctg_w100kb.bed)>}

In [58]:
# Wrap each methylation feature file in a pybedtools BedTool, keyed by file name.
feature_bed_dict = {
    feature_name: BedTool(feature_path)
    for feature_name, feature_path in feature_fn_dict.items()
}

In [59]:
# Display the feature BedTool mapping to check which methylation files were loaded.
feature_bed_dict

{'5mC_hc_tombo_sorted.cutoff.0.30.bed': <BedTool(/home/anjuni/analysis/coverage/feature_files/5mC_hc_tombo_sorted.cutoff.0.30.bed)>,
 '6mA_hc_tombo_sorted.cutoff.0.30.bed': <BedTool(/home/anjuni/analysis/coverage/feature_files/6mA_hc_tombo_sorted.cutoff.0.30.bed)>}

In [None]:
def coverage(windowbed_dict, featurebed_dict, featurefn_dict, old_folder_name, new_folder_name):
    """Compute per-window feature coverage, save each table and return them all.

    For every (window set, feature) pair this calls ``window.coverage(feature)``,
    converts the result to a dataframe, keeps columns 0, 1, 2, 3 and 6, and renames
    the ``name`` and ``thickStart`` columns to ``overlap_count`` and
    ``overlap_fraction``. Each table is written as a tab-separated file without
    header or index.

    The output path is the feature file path with its ``.gff3`` or ``.bed``
    extension replaced by ``.<window key>.overlap.bed`` and every occurrence of
    ``old_folder_name`` replaced by ``new_folder_name``.

    Parameters
    ----------
    windowbed_dict : dict
        Window key (e.g. ``'100kb'``) -> object exposing ``coverage(feature)``
        (a pybedtools ``BedTool`` in this notebook).
    featurebed_dict : dict
        Feature key -> feature object passed to ``coverage``.
    featurefn_dict : dict
        Feature key -> path of the feature file; must end in ``.gff3`` or ``.bed``.
    old_folder_name : str
        Substring of the feature path to replace.
    new_folder_name : str
        Replacement for ``old_folder_name``; the resulting directory must exist.

    Returns
    -------
    dict
        Output file name (no directory) -> coverage dataframe.

    Raises
    ------
    ValueError
        If a feature path ends in neither ``.gff3`` nor ``.bed``.
    """
    supported_exts = ('.gff3', '.bed')
    feature_overlap_df_dict = {}
    for wkey, wbed in windowbed_dict.items():
        for fkey, fbed in featurebed_dict.items():
            feature_fn = featurefn_dict[fkey]
            stem, ext = os.path.splitext(feature_fn)
            # Check before the (potentially slow) coverage call. Previously an
            # unsupported extension reused a stale or unbound output path.
            if ext not in supported_exts:
                raise ValueError(
                    'Unsupported feature file extension for %r: expected one of %s'
                    % (feature_fn, ', '.join(supported_exts))
                )
            # Swap only the trailing extension, then redirect the folder. The
            # folder swap is applied to both file types (it used to be lost for .bed).
            out_fn = '%s.%s.overlap.bed' % (stem, wkey)
            out_fn = out_fn.replace(old_folder_name, new_folder_name)
            # rename() returns a new frame, so the .iloc selection is never mutated in place.
            cov_df = (
                wbed.coverage(fbed)
                .to_dataframe()
                .iloc[:, [0, 1, 2, 3, 6]]
                .rename(columns={'name': 'overlap_count', 'thickStart': 'overlap_fraction'})
            )
            feature_overlap_df_dict[os.path.basename(out_fn)] = cov_df
            # save to a csv (pybedtools outputs more d.p. than BEDTools)
            cov_df.to_csv(out_fn, sep='\t', header=False, index=False)
    return feature_overlap_df_dict

In [61]:
# Display the feature BedTool mapping again to confirm which files the coverage loop uses.
feature_bed_dict

{'5mC_hc_tombo_sorted.cutoff.0.30.bed': <BedTool(/home/anjuni/analysis/coverage/feature_files/5mC_hc_tombo_sorted.cutoff.0.30.bed)>,
 '6mA_hc_tombo_sorted.cutoff.0.30.bed': <BedTool(/home/anjuni/analysis/coverage/feature_files/6mA_hc_tombo_sorted.cutoff.0.30.bed)>}

In [60]:
# Coverage of every methylation feature file over every window set. Each table
# is kept in feature_overlap_df_dict (keyed by output file name) and written to
# DIRS['NEW_COVERAGE'] as a tab-separated .circabed file with a header row.
feature_overlap_df_dict = {}
for wkey, wbed in window_bed_dict.items():
    for fkey, fbed in feature_bed_dict.items():
        # Keep columns 0-3 and 6 of the coverage table and give the last two
        # descriptive names. rename() returns a new frame, so the .iloc
        # selection is never mutated in place.
        tmp_df = (
            wbed.coverage(fbed)
            .to_dataframe()
            .iloc[:, [0, 1, 2, 3, 6]]
            .rename(columns={'name': 'overlap_count', 'thickStart': 'overlap_fraction'})
        )
        # Replace only the trailing extension. str.replace('bed', ...) would
        # rewrite every 'bed' substring anywhere in the path.
        feature_stem = os.path.splitext(os.path.basename(feature_fn_dict[fkey]))[0]
        tmp_fn = os.path.join(DIRS['NEW_COVERAGE'], '%s.%s.overlap.circabed' % (feature_stem, wkey))
        feature_overlap_df_dict[os.path.basename(tmp_fn)] = tmp_df
        tmp_df.to_csv(tmp_fn, sep='\t', index=False)

In [33]:
# set functions

# coverage
