# Overview
subset

## Loading data and creating files

In [None]:
# import libraries
import os
import glob
import subprocess
import toytree
import toyplot.pdf
import shutil
from Bio import AlignIO, SeqIO

# create directory for subsetted sets, and the subfolders for BED files and block_Files
def create_dir(dir):
    if not os.path.exists(dir):
        os.mkdir(dir)

In [75]:
# load xlsx pandas
subset_clusters_table = pd.read_excel(io = '../results/plotting_ME_synteny/data/HGT_signature_ORFs_modified_210818.xlsx',
                                      sheet_name = 'for plotting')



In [135]:
# copying files
hgs_to_split = ['CG_2', 'CG_17']
for faa_file in glob.glob('../results/trying_to_catch_homologs/results/connected_groups_filtered/fastas/protein/*faa'):
    hg_name = faa_file.rpartition('/')[2].split('.')[0]
    if hg_name not in hgs_to_split:
        shutil.copy(src = faa_file, dst = '../results/trying_to_catch_homologs/results/connected_groups_filtered_splitted_arCOGs/fastas/protein/')
    if hg_name in hgs_to_split:
        # get full list of record ids in FASTA
        record_ids_fasta = [record.id for record in SeqIO.parse(faa_file, 'fasta')]
        records_fasta = [record for record in SeqIO.parse(faa_file, 'fasta')]
        #print('\n----\n')
        #print('record_ids for', hg_name, 'at the beginning:', record_ids_fasta)
        # grouping by specified column and create the subsetted files
        for name, group in subset_clusters_table.query("`Cluster`.str.contains(@hg_name)").groupby('Cluster'):
            # get sequences for group, from table
            group_seqs = group['Organism_ORF'].to_list() 
            # get record for sequences that enter in group
            group_output = '../results/trying_to_catch_homologs/results/connected_groups_filtered_splitted_arCOGs/fastas/protein/{0}.faa'.format(name) 
            # create output name and save fasta
            if not os.path.exists(group_output):
                group_records = [record for record in records_fasta if record.id in group_seqs]
                #print('group_records prior to exporting', group_records)
                with open(group_output, 'w') as handle_fasta:
                    SeqIO.write(group_records, handle_fasta, 'fasta')
            # remove this ids from <record_ids_fasta>
            group_records = [record for record in SeqIO.parse(faa_file, 'fasta') if record.id in group_seqs]
            for record_id in [record.id for record in group_records]:
                record_ids_fasta.remove(record_id)
            #print('hg_name subsetted:', record_ids_fasta)
        # getting those records not present in the list, and printing to output file
        # getting output file name
        if hg_name == 'CG_2':
            output_fasta_unknowns = 'CG_2i'
        if hg_name == 'CG_17':
            output_fasta_unknowns = 'CG_17b'
        # create output name
        output_unknown_seqs = '../results/trying_to_catch_homologs/results/connected_groups_filtered_splitted_arCOGs/fastas/protein/{0}.faa'.format(output_fasta_unknowns)
        # get those not present
        seqids_unknowns = record_ids_fasta # because, I removed sequences as creating files
        records_unknowns = [record for record in SeqIO.parse(faa_file, 'fasta') if record.id in seqids_unknowns]
        # get to file
        if not os.path.exists(output_unknown_seqs):
            with open(output_unknown_seqs, 'w') as handle_fasta:
                SeqIO.write(records_unknowns, handle_fasta, 'fasta')

## Creating BED records

### Creating GFF3 for features annotated in PATRIC + ORFs in remote homologues

In [3]:
# load libraries
import pandas as pd
from Bio import SeqIO

# creating complete GFF for each of the ME ORFs predictions
# create directory to allocate results
creating_dirs = ['../results/MEs_predicted_orfs_GFFs', 
                 '../results/MEs_predicted_orfs_GFFs/standard_nomenclature/',
                 '../results/MEs_predicted_orfs_GFFs/simplified_nomenclature',
                 '../results/plotting_ME_synteny',
                 '../results/plotting_ME_synteny/data',
                 '../results/plotting_ME_synteny/data/block_files',
                 '../results/plotting_ME_synteny/data/GFFs',
                 '../results/plotting_ME_synteny/data/GFFs_all_records/',
                 '../results/plotting_ME_synteny/data/BED',
                 '../results/plotting_ME_synteny/data/BED_all_records',
                 '../results/plotting_ME_synteny/results']

for dir in creating_dirs:
    if not os.path.exists(dir):
        os.mkdir(dir)

# listing ORF prediction files and creating a list of all CDS that are in the remote homolog groups
ORFs_in_remote_homologs_groups = []
for remote_homologs_group_file in glob.glob('../results/trying_to_catch_homologs/results/connected_groups_filtered_splitted_arCOGs/fastas/protein/*faa'):
    for record in SeqIO.parse(remote_homologs_group_file, 'fasta'):
        ORFs_in_remote_homologs_groups.append(record.id)

In [4]:
# loading table with new annotations
import pandas as pd

orf_annotation_table = pd.read_csv('../results/MEs_predicted_orfs_renamed/data/annotation_table_ORFs_renamed.tsv', sep = '\t')
orf_annotation_table.head()
# perform little change
orf_annotation_table_to_import = orf_annotation_table.drop(['Featured ID in Genome Annotation'], axis = 1)

In [5]:
%load_ext rpy2.ipython

In [6]:
%%R -i ORFs_in_remote_homologs_groups -o ME_ORFs_annot_table_simplecode

library(tidyverse)
library(magrittr)
library(glue)

# loading TSV with annotations for ME ORFs. with some modifications this can be actually converted into a GFF
ME_ORFs_annot_table = readr::read_tsv('../results/MEs_predicted_orfs_renamed/data/annotation_table_ORFs_renamed.tsv')

# saving version with simplified code
ME_ORFs_annot_table_simplecode = ME_ORFs_annot_table %>%
    # selecting features employed in the GFF file
    dplyr::select(`Genome ID`, `ORF ID`, `contig`, `strand`, `START`, `STOP`) %>%
    dplyr::rename(seqid = 'contig',
                  start = 'START',
                  end = 'STOP') %>%
    dplyr::mutate(seqid = seqid %>% str_split(' ') %>% purrr::map_chr(1), source = 'getorf', phase = '1', type = 'CDS', score = 0.000000000000001, attributes = as.character(glue('ID={`ORF ID`};Name={`ORF ID`}'))) %>%
    .[, c('Genome ID', 'seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes')]
 

R[write to console]: ── [1mAttaching packages[22m ──────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

R[write to console]: [32m✔[39m [34mggplot2[39m 3.3.3     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.2     [32m✔[39m [34mdplyr  [39m 1.0.6
[32m✔[39m [34mtidyr  [39m 1.1.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.1

R[write to console]: ── [1mConflicts[22m ─────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

R[write to console]: 
Attaching package: ‘magrittr’


R[write to console]: The following object is masked from ‘package:purrr’:

    set_names


R[write to console]: The following


[36m──[39m [1m[1mColumn specification[1m[22m [36m─────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
cols(
  `Genome ID` = [31mcol_character()[39m,
  `ORF ID` = [31mcol_character()[39m,
  `Featured ID in Genome Annotation` = [31mcol_character()[39m,
  `getorf featured ID` = [31mcol_character()[39m,
  contig = [31mcol_character()[39m,
  strand = [31mcol_character()[39m,
  START = [32mcol_double()[39m,
  STOP = [32mcol_double()[39m
)



In [7]:
%%R -i ORFs_in_remote_homologs_groups,orf_annotation_table_to_import -o ME_ORFs_annot_table_simplecode

# load libraries
library(tidyverse)
library(magrittr)
library(glue)
library(tidytidbits)

# convert table to tibble and create dictionary
orf_annotation_table_to_import %<>% as_tibble()
getorf2new_id.dict = orf_annotation_table_to_import$`ORF ID`
names(getorf2new_id.dict) = orf_annotation_table_to_import$`getorf featured ID` %>% str_split(' ') %>% purrr::map_chr(1)

# loading TSV with annotations for ME ORFs. with some modifications this can be actually converted into a GFF
ME_ORFs_annot_table = readr::read_tsv('../results/MEs_predicted_orfs_renamed/data/annotation_table_ORFs_renamed.tsv')

# saving version with simplified code
ME_ORFs_annot_table_simplecode = ME_ORFs_annot_table %>%
    # selecting features employed in the GFF file
    dplyr::select(`Genome ID`, `ORF ID`, `contig`, `strand`, `START`, `STOP`) %>%
    dplyr::rename(seqid = 'contig',
                  start = 'START',
                  end = 'STOP') %>%
    dplyr::mutate(seqid = seqid %>% str_split(' ') %>% purrr::map_chr(1), source = 'getorf', phase = '1', type = 'CDS', score = 0.000000000000001, attributes = as.character(glue('ID={`ORF ID`};Name={`ORF ID`}'))) %>%
    .[, c('Genome ID', 'seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes')]
 
ME_ORFs_annot_table_simplecode$`Genome ID` %<>% as.character()

ORFs_in_remote_homologs_groups %<>% unlist() %>% tidytidbits::lookup_chr(., dict = getorf2new_id.dict, default = identity)
ORFs_in_remote_homologs_groups

# getting also the ORF ID of the feature IDs annotated by PATRIC
PATRIC_featured_ids = ME_ORFs_annot_table %>% dplyr::filter(!is.na(`Featured ID in Genome Annotation`)) %>% .$`ORF ID` %>% unique()

# saving for each ME in a different file
ME_ORFs_annot_table_simplecode %>%
    group_split(`Genome ID`) %>%
    purrr::map(., ~{
        # getting ME genome tag
        tag = unique(.x$`Genome ID`)
        
        # saving TSV file with extension gff3
        .x %>%
            dplyr::select(-c(`Genome ID`)) %>%
            dplyr::mutate(start2 = case_when(start > end ~ end, start < end ~ start), 
                          end2 = case_when(start > end ~ start, start < end ~ end)) %>%
            dplyr::select(-c(start, end)) %>%
            dplyr::rename(start = 'start2', end = 'end2') %>%
            .[, c('seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes')] %>%
            readr::write_tsv(., glue('../results/MEs_predicted_orfs_GFFs/simplified_nomenclature/ME.{tag}.ORFs.gff3'), col_names = F)
        
        # saving FASTA of remote homologs CDS
        .x %>%
            dplyr::mutate(orfid = attributes %>% str_split(';') %>% purrr::map_chr(1) %>% str_replace_all(., 'ID=', '')) %>%
            dplyr::filter(orfid %in% ORFs_in_remote_homologs_groups) %>%
            dplyr::select(-c(`Genome ID`, orfid)) %>%
            dplyr::mutate(start2 = case_when(start > end ~ end, start < end ~ start), 
                          end2 = case_when(start > end ~ start, start < end ~ end)) %>%
            dplyr::select(-c(start, end)) %>%
            dplyr::rename(start = 'start2', end = 'end2') %>%
            .[, c('seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes')] %>%
            readr::write_tsv(., glue('../results/plotting_ME_synteny/data/GFFs/ME.{tag}.remote_homologues.ORFs.gff3'), col_names = F)
        
        # saving FASTA of features annotated by PATRIC + CDS coded in remote homologs (i.e. complete set of PATRIC + expanded annotation)
        .x %>%
            dplyr::mutate(orfid = attributes %>% str_split(';') %>% purrr::map_chr(1) %>% str_replace_all(., 'ID=', '')) %>%
            dplyr::filter(orfid %in% c(ORFs_in_remote_homologs_groups, PATRIC_featured_ids)) %>%
            dplyr::select(-c(`Genome ID`, orfid)) %>%
            dplyr::mutate(start2 = case_when(start > end ~ end, start < end ~ start), 
                          end2 = case_when(start > end ~ start, start < end ~ end)) %>%
            dplyr::select(-c(start, end)) %>%
            dplyr::rename(start = 'start2', end = 'end2') %>%
            .[, c('seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes')] %>%
            readr::write_tsv(., glue('../results/plotting_ME_synteny/data/GFFs_all_records/ME.{tag}.all_records.ORFs.gff3'), col_names = F)
    })


[36m──[39m [1m[1mColumn specification[1m[22m [36m─────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
cols(
  `Genome ID` = [31mcol_character()[39m,
  `ORF ID` = [31mcol_character()[39m,
  `Featured ID in Genome Annotation` = [31mcol_character()[39m,
  `getorf featured ID` = [31mcol_character()[39m,
  contig = [31mcol_character()[39m,
  strand = [31mcol_character()[39m,
  START = [32mcol_double()[39m,
  STOP = [32mcol_double()[39m
)

[[1]]
[90m# A tibble: 1,011 x 9[39m
   seqid   source type  start   end score strand phase attributes               
   [3m[90m<chr>[39m[23m   [3m[90m<chr>[39m[23m  [3m[90m<chr>[39m[23m [3m[90m<dbl>[39m[23m [3m[90m<dbl>[39m[23m [3m[90m<dbl>[39m[23m [3m[90m<chr>[39m[23m  [3m[90m<chr>[39m[23m [3m[90m<chr>[39m[23m                    
[90m 1[39m CP0104… getorf CDS    [4m3[24m001  [4m4[24m869 1[90me[39m[31m-15[39m +      1   

In [8]:
# converting GFFs into BED format
for ME_GFF_file in glob.glob('../results/plotting_ME_synteny/data/GFFs/*'):
    # create BED file output name and store it in a variable
    bed_output = ME_GFF_file.replace('GFFs', 'BED').replace('.gff3', '.bed')
    # run external tool to create conversion
    bed_conv_cmd = 'python -m jcvi.formats.gff bed --type=CDS --key=Name {0} -o {1}'.format(ME_GFF_file, bed_output).split(' ')
    subprocess.run(bed_conv_cmd)
    #print(ME_GFF_file)
    #print(bed_output)

# same for GFFs with all records
for ME_GFF_file in glob.glob('../results/plotting_ME_synteny/data/GFFs_all_records/*'):
    # create BED file output name and store it in a variable
    bed_output = ME_GFF_file.replace('GFFs', 'BED').replace('.gff3', '.bed')
    # run external tool to create conversion
    bed_conv_cmd = 'python -m jcvi.formats.gff bed --type=CDS --key=Name {0} -o {1}'.format(ME_GFF_file, bed_output).split(' ')
    subprocess.run(bed_conv_cmd)
    #print(ME_GFF_file)
    #print(bed_output)

## Generating miscellaneous files

In [9]:
import itertools 
# create a small dictionary between species code and fasta id
species2id_table = pd.read_csv('../data/genomes_and_taxonomy.csv', sep = ',')
species2id_dict = {}

for index, row in species2id_table.iterrows():
    species2id_dict.update({row['Taxonomy']: row['Genome_id'].replace('GCA_', '')})

# generating species pairs
#[print(a) for a in itertools.combinations(species2id_table['Taxonomy'].to_list(), 2)]

In [10]:
# first creating a dictionary to go from getorf IDs to novel ids
orf_annotation_table_to_import.head()
getorfid2new_id = {}

for index, row in orf_annotation_table_to_import.iterrows():
    getorfid = row['getorf featured ID'].split(' ')[0]
    newid = row['ORF ID']
    getorfid2new_id.update({getorfid: newid})

In [11]:
# loading table with target organisms
mobile_element_lengths_table_filtered = pd.read_csv('../results/target_organisms.tsv', sep ='\t')
mobile_element_lengths_table_filtered = mobile_element_lengths_table_filtered.drop(['Unnamed: 0', 'ME_file', 'Total length'], axis = 1)
target_genomeids = mobile_element_lengths_table_filtered['Group tag'].to_list()

# filtering species2id_table to organisms in this list
species2id_table = species2id_table.query("`Genome_id` in @target_genomeids")

### Modify BEDs to include dummy ORFs
In order to be able to plot mobile elements in its full length, *dummy* ORFs are generated for each mobile element at its sequence start and end (of size 1bp) and included in the pipeline. 

In [13]:
# load function
from Bio import SeqIO

# loading all contigs from the mobile elements and parsing their length
contig_lengths = {}
for fna_file in glob.glob('../data/mobile_elements_sequences/*fna'):
    records = [record for record in SeqIO.parse(fna_file, 'fasta')]
    for record in records:
        contig_lengths.update({record.id: len(record.seq)})
        
# get 
for bed_file in glob.glob('../results/plotting_ME_synteny/data/BED/*.remote_homologues.ORFs.bed'):
    # working on non-empty files
    if not os.stat(bed_file).st_size == 0:
        #print(bed_file)
        # load BED table
        bed_table = pd.read_csv(bed_file, sep = '\t', names = ['contig', 'start', 'end', 'orfid', 'evalue', 'strand'])
        # get rid of dummy rows if already present
        possible_dummies = [orfid for orfid in bed_table['orfid'].to_list() if 'dummy' in orfid]
        bed_table = bed_table.query("`orfid` not in @possible_dummies")
        # get contig name
        contig = bed_table.iloc[0]['contig']
        # get contig length
        contig_length = contig_lengths[contig]
        # get dummy names by modifying name of first feature
        orfid_list_start = bed_table.iloc[0]['orfid'].split('_')
        orfid_list_end = bed_table.iloc[0]['orfid'].split('_')
        orfid_list_start[1] = 'dummystart'
        orfid_list_end[1] = 'dummyend'
        orfid_start = '_'.join(orfid_list_start)
        orfid_end = '_'.join(orfid_list_end)
        # add dummy feature for start and end
        dummy_start = pd.DataFrame.from_dict({'contig': [contig], 
                                              'start': [0], 
                                              'end': [1], 
                                              'orfid': [orfid_start],
                                              'evalue': [1e-15],
                                              'strand': ['+']})
        dummy_end = pd.DataFrame.from_dict({'contig': [contig], 
                                            'start': [contig_length-1], 
                                            'end': [contig_length], 
                                            'orfid': [orfid_end],
                                            'evalue': [1e-15],
                                            'strand': ['+']})
        # concatenate 
        bed_table_final = pd.concat([dummy_start, bed_table, dummy_end])
        bed_table_final = bed_table_final.drop_duplicates()
        # save table
        bed_table_final.to_csv(bed_file, sep = '\t', header = False, index = False)
        
# get 
for bed_file in glob.glob('../results/plotting_ME_synteny/data/BED_all_records/*.bed'):
    # working on non-empty files
    if not os.stat(bed_file).st_size == 0:
        #print(bed_file)
        # load BED table
        bed_table = pd.read_csv(bed_file, sep = '\t', names = ['contig', 'start', 'end', 'orfid', 'evalue', 'strand'])
        # get rid of dummy rows if already present
        possible_dummies = [orfid for orfid in bed_table['orfid'].to_list() if 'dummy' in orfid]
        bed_table = bed_table.query("`orfid` not in @possible_dummies")
        # get contig name
        contig = bed_table.iloc[0]['contig']
        # get contig length
        contig_length = contig_lengths[contig]
        # get dummy names by modifying name of first feature
        orfid_list_start = bed_table.iloc[0]['orfid'].split('_')
        orfid_list_end = bed_table.iloc[0]['orfid'].split('_')
        orfid_list_start[1] = 'dummystart'
        orfid_list_end[1] = 'dummyend'
        orfid_start = '_'.join(orfid_list_start)
        orfid_end = '_'.join(orfid_list_end)
        # add dummy feature for start and end
        dummy_start = pd.DataFrame.from_dict({'contig': [contig], 
                                              'start': [0], 
                                              'end': [1], 
                                              'orfid': [orfid_start],
                                              'evalue': [1e-15],
                                              'strand': ['+']})
        dummy_end = pd.DataFrame.from_dict({'contig': [contig], 
                                            'start': [contig_length-1], 
                                            'end': [contig_length], 
                                            'orfid': [orfid_end],
                                            'evalue': [1e-15],
                                            'strand': ['+']})
        # concatenate 
        bed_table_final = pd.concat([dummy_start, bed_table, dummy_end])
        bed_table_final = bed_table_final.drop_duplicates()
        # save table
        bed_table_final.to_csv(bed_file, sep = '\t', header = False, index = False)

### Concatenate to get all BEDs in one

In [14]:
%%bash

cat ../results/plotting_ME_synteny/data/BED_all_records/ME.017357405.1.all_records.ORFs.bed \
../results/plotting_ME_synteny/data/BED_all_records/ME.60847.21.all_records.ORFs.bed \
../results/plotting_ME_synteny/data/BED_all_records/ME.2743089.3.all_records.ORFs.bed \
../results/plotting_ME_synteny/data/BED_all_records/ME.222984.5.all_records.ORFs.bed \
../results/plotting_ME_synteny/data/BED_all_records/ME.2496101.3.all_records.ORFs.bed \
../results/plotting_ME_synteny/data/BED_all_records/ME.1526048.3.all_records.ORFs.bed \
../results/plotting_ME_synteny/data/BED_all_records/ME.ga0207733_100382.all_records.ORFs.bed \
../results/plotting_ME_synteny/data/BED_all_records/ME.ga0207719_100190.all_records.ORFs.bed \
../results/plotting_ME_synteny/data/BED_all_records/ME.926690.3.all_records.ORFs.bed \
../results/plotting_ME_synteny/data/BED_all_records/ME.jgi12330j12834_1000008.all_records.ORFs.bed \
../results/plotting_ME_synteny/data/BED_all_records/ME.ga0207718_100100.all_records.ORFs.bed > ../results/plotting_ME_synteny/data/BED/all_species_all_records.bed

In [15]:
%%bash

cat ../results/plotting_ME_synteny/data/BED/ME.60847.21.remote_homologues.ORFs.bed \
../results/plotting_ME_synteny/data/BED_all_records/ME.2743089.3.all_records.ORFs.bed \
../results/plotting_ME_synteny/data/BED_all_records/ME.222984.5.all_records.ORFs.bed \
../results/plotting_ME_synteny/data/BED_all_records/ME.2496101.3.all_records.ORFs.bed \
../results/plotting_ME_synteny/data/BED_all_records/ME.1526048.3.all_records.ORFs.bed \
../results/plotting_ME_synteny/data/BED_all_records/ME.ga0207733_100382.all_records.ORFs.bed \
../results/plotting_ME_synteny/data/BED_all_records/ME.ga0207719_100190.all_records.ORFs.bed \
../results/plotting_ME_synteny/data/BED_all_records/ME.926690.3.all_records.ORFs.bed \
../results/plotting_ME_synteny/data/BED_all_records/ME.jgi12330j12834_1000008.all_records.ORFs.bed \
../results/plotting_ME_synteny/data/BED_all_records/ME.ga0207718_100100.all_records.ORFs.bed > ../results/plotting_ME_synteny/data/BED_all_records/all_species_complete_records.bed

## Creating block files

### Create block files from homolog groups

In [18]:
# create a small dictionary between species code and fasta id
species2id_table = pd.read_csv('../data/genomes_and_taxonomy.csv', sep = ',')
species2id_dict = {}

for index, row in species2id_table.iterrows():
    species2id_dict.update({row['Taxonomy']: row['Genome_id'].replace('GCA_', '')})

In [20]:
# import
import glob
from Bio import SeqIO
# loading table with new annotation
import pandas as pd

# create dictionary between species and column name in block file
species2blockcol_dict = {'Halogeometricum_borinquense_strain_wsp4': 'Halogeometricum',
                         'Halobonum_sp_NJ-3-1': 'Halobonum',
                         'Natrinema_altunense_strain_AJ2': 'Natrinema',
                         'Haloterrigena_sp_SYSU_A121-1': 'Haloterrigena',
                         'Haloferax_sp_Q22': 'Haloferax',
                         'ga0207733_100382': 'ga0207733_100382',
                         'Halovivax_sp_KZCA124': 'Halovivax',
                         'ga0207719_100190': 'ga0207719_100190',
                         'Haloplanus_natans_DSM_17983': 'Haloplanus',
                         'jgi12330j12834_1000008': 'jgi12330j12834_1000008',
                         'ga0207718_100100': 'ga0207718_100100'}

# create order of species in my configuration
block_column_order = ['Halovivax', 'Halogeometricum', 'Halobonum', 'ga0207718_100100', 'jgi12330j12834_1000008', 
                                                             'Natrinema', 'Halovivax', 'Haloterrigena', 'Haloplanus', 
                                                             'Haloferax', 'ga0207719_100190', 'ga0207733_100382']

orf_annotation_table = pd.read_csv('../results/MEs_predicted_orfs_renamed/data/annotation_table_ORFs_renamed.tsv', sep = '\t')
orf_annotation_table.head()
# perform little change
orf_annotation_table_to_import = orf_annotation_table.drop(['Featured ID in Genome Annotation'], axis = 1)

# first creating a dictionary to go from getorf IDs to novel ids
orf_annotation_table_to_import.head()
getorfid2new_id = {}

for index, row in orf_annotation_table_to_import.iterrows():
    getorfid = row['getorf featured ID'].split(' ')[0]
    newid = row['ORF ID']
    getorfid2new_id.update({getorfid: newid})

# create list to allocate rows
homolog_groups_composition_rows = []

# looping over groups
for homolog_group_file in glob.glob('../results/trying_to_catch_homologs/results/connected_groups_filtered_splitted_arCOGs/fastas/protein/*faa'):
    # parsing records in them and translating the records to new ids
    records = [getorfid2new_id[record.id] for record in SeqIO.parse(homolog_group_file, 'fasta')]
    # create a dictionary for relevant species, including an empty list for each
    classifing_genes_by_species_dict = {'Halogeometricum_borinquense_strain_wsp4': [],
                                   'Halobonum_sp_NJ-3-1': [],
                                   'Natrinema_altunense_strain_AJ2': [],
                                   'Haloterrigena_sp_SYSU_A121-1': [],
                                   'Haloferax_sp_Q22':[] ,
                                   'ga0207733_100382': [],
                                   'Halovivax_sp_KZCA124': [],
                                   'ga0207719_100190': [],
                                   'Haloplanus_natans_DSM_17983': [],
                                   'jgi12330j12834_1000008': [],
                                   'ga0207718_100100': []}
    for record_id in records:
    # for each record... 
        # identify its species
        species = '_'.join(record_id.split('_')[2:])
        #print(species)
        if species in list(species2blockcol_dict.keys()):
            species_blockname = species2blockcol_dict[species]
            # append to the relevant dictionary
            classifing_genes_by_species_dict[species].append(record_id)
    # create a pandas dataframe from a dictionary that includes entries from the same species collapsed with a ','
    for key in classifing_genes_by_species_dict.keys():
        if len(classifing_genes_by_species_dict[key]) > 0:
            classifing_genes_by_species_dict[key] = [','.join(classifing_genes_by_species_dict[key])]
        elif len(classifing_genes_by_species_dict[key]) == 0:
            classifing_genes_by_species_dict[key] = ['.']
    # add homolog group
    classifing_genes_by_species_dict.update({'homolog_group': [homolog_group_file]})
    gene_row = pd.DataFrame.from_dict(classifing_genes_by_species_dict)
    # append to general list
    homolog_groups_composition_rows.append(gene_row)

# concatenate
homolog_groups_composition_table = pd.concat(homolog_groups_composition_rows)

# save table
homolog_groups_composition_table.to_csv('../results/plotting_ME_synteny/data/homolog_groups_composition_table.tsv',
                                        sep = '\t',
                                        index = False)

# splitting rows in R...

In [21]:
import glob
import pandas as pd
from Bio import SeqIO

# listing ORF prediction files and creating a list of all CDS that are in the remote homolog groups
ORFs_in_remote_homologs_groups = []
for remote_homologs_group_file in glob.glob('../results/trying_to_catch_homologs/results/connected_groups_filtered_splitted_arCOGs/fastas/protein/*faa'):
    for record in SeqIO.parse(remote_homologs_group_file, 'fasta'):
        ORFs_in_remote_homologs_groups.append(record.id)

# first creating a dictionary to go from getorf IDs to novel ids
orf_annotation_table_to_import.head()
getorfid2new_id = {}

for index, row in orf_annotation_table_to_import.iterrows():
    getorfid = row['getorf featured ID'].split(' ')[0]
    newid = row['ORF ID']
    getorfid2new_id.update({getorfid: newid})
    
# load concatenated BED with all features
patric_and_orfs_bed_table = pd.read_csv('../results/plotting_ME_synteny/data/BED_all_records/all_species_complete_records.bed', sep = '\t', names = ['contig', 'start', 'end', 'orfid', 'evalue', 'strand'])
patric_and_orfs_orfids = patric_and_orfs_bed_table['orfid'].to_list()

# remember that got list of features in homolog groups (ORFs_in_remote_homologs_groups)
# translating to new id
orfs_in_homolog_groups = [getorfid2new_id[id] for id in ORFs_in_remote_homologs_groups if 'dummy' not in id]

# determine features that arent present in the list
patric_and_orfs_orfids_nonhomologs = [id for id in patric_and_orfs_orfids if id not in orfs_in_homolog_groups]
# create list to allocate rows of block
dummy_block_entries_rows = []

# create dictionary between species and column name in block file
species2blockcol_dict = {'Halogeometricum_borinquense_strain_wsp4': 'Halogeometricum',
                         'Halobonum_sp_NJ-3-1': 'Halobonum',
                         'Natrinema_altunense_strain_AJ2': 'Natrinema',
                         'Haloterrigena_sp_SYSU_A121-1': 'Haloterrigena',
                         'Haloferax_sp_Q22': 'Haloferax',
                         'ga0207733_100382': 'ga0207733_100382',
                         'ga0207719_100190': 'ga0207719_100190',
                         'Haloplanus_natans_DSM_17983': 'Haloplanus',
                         'jgi12330j12834_1000008': 'jgi12330j12834_1000008',
                         'ga0207718_100100': 'ga0207718_100100'}

# create order of species in my configuration
block_column_order = ['Halogeometricum', 'Halobonum', 'ga0207718_100100', 'jgi12330j12834_1000008', 
                                                             'Natrinema', 'Halovivax', 'Haloterrigena', 'Haloplanus', 
                                                             'Haloferax', 'ga0207719_100190', 'ga0207733_100382']
# loop over
for orfid in patric_and_orfs_orfids_nonhomologs:
    # detect species
    species = '_'.join(orfid.split('_')[2:])
    species_blockname = species2blockcol_dict[species]
    # get the other species
    other_species = [species for species in block_column_order if species != species_blockname]
    # create dictionary to allocate, as columns, the orfid (for the species) or a '.' (other species)
    dummy_dict = {}
    dummy_dict.update({species_blockname: [orfid]})
    # create pandas data frame
    for species in other_species:
        dummy_dict.update({species: ['.']})
    dummy_row = pd.DataFrame.from_dict(dummy_dict)
    dummy_row = dummy_row[block_column_order]
    # order and append to list
    dummy_block_entries_rows.append(dummy_row)
    
# concatenate
dummy_block_entries_table = pd.concat(dummy_block_entries_rows)
# drop duplicates (although shouldn't exist)
dummy_block_entries_table = dummy_block_entries_table.drop_duplicates()

In [22]:
%%R -i homolog_groups_composition_table -o homolog_groups_composition_table_splitted

# load libraries
library(tidyverse)
library(magrittr)
library(glue)

# convert to tibble()
homolog_groups_composition_table %<>% as_tibble()
homolog_groups_composition_table_splitted = homolog_groups_composition_table %>%
    tidyr::separate_rows(., `Halogeometricum_borinquense_strain_wsp4`, sep = ',') %>%
    tidyr::separate_rows(., `Halobonum_sp_NJ-3-1`, sep = ',') %>%
    tidyr::separate_rows(., `Natrinema_altunense_strain_AJ2`, sep = ',') %>%
    tidyr::separate_rows(., `Haloterrigena_sp_SYSU_A121-1`, sep = ',') %>%
    tidyr::separate_rows(., `Haloferax_sp_Q22`, sep = ',') %>%
    tidyr::separate_rows(., `ga0207733_100382`, sep = ',') %>%
    tidyr::separate_rows(., `ga0207719_100190`, sep = ',') %>%
    tidyr::separate_rows(., `Haloplanus_natans_DSM_17983`, sep = ',') %>%
    tidyr::separate_rows(., `jgi12330j12834_1000008`, sep = ',') %>%
    tidyr::separate_rows(., `ga0207718_100100`, sep = ',') %>%
    tidyr::separate_rows(., `Halovivax_sp_KZCA124`, sep = ',') %>%
    tidyr::replace_na(., list('Halogeometricum_borinquense_strain_wsp4' = '.',
                              'Halobonum_sp_NJ-3-1' = '.',
                              'Natrinema_altunense_strain_AJ2' = '.',
                              'Haloterrigena_sp_SYSU_A121-1' = '.',
                              'Haloferax_sp_Q22' = '.',
                              'ga0207733_100382' = '.',
                              'ga0207719_100190' = '.',
                              'Haloplanus_natans_DSM_17983' = '.',
                              'jgi12330j12834_1000008' = '.',
                              'ga0207718_100100' = '.',
                              'Halovivax_sp_KZCA124' = '.'))

In [23]:
# order
homolog_groups_composition_table_splitted = homolog_groups_composition_table_splitted[['homolog_group', 'Halogeometricum_borinquense_strain_wsp4', 
                                           'Halobonum_sp_NJ-3-1', 'ga0207718_100100', 'jgi12330j12834_1000008', 
                                           'Natrinema_altunense_strain_AJ2', 'Halovivax_sp_KZCA124', 
                                           'Haloterrigena_sp_SYSU_A121-1', 'Haloplanus_natans_DSM_17983', 
                                           'Haloferax_sp_Q22', 'ga0207719_100190', 'ga0207733_100382']]
# save table
homolog_groups_composition_table_splitted.to_csv('../results/plotting_ME_synteny/data/homolog_groups_composition_table.tsv', sep = '\t', header = True, index = False)

# get rid of homologous group table and save as block file
homolog_groups_composition_table_splitted.drop('homolog_group', axis = 1).to_csv('../results/plotting_ME_synteny/data/block_files/all_species_noanchor.block', sep = '\t', header = False, index = False)


### Create block files for features that dont present homology signal

In [24]:
# listing ORF prediction files and creating a list of all CDS that are in the remote homolog groups
ORFs_in_remote_homologs_groups = []
for remote_homologs_group_file in glob.glob('../results/trying_to_catch_homologs/results/connected_groups_filtered_splitted_arCOGs/fastas/protein/*faa'):
    for record in SeqIO.parse(remote_homologs_group_file, 'fasta'):
        ORFs_in_remote_homologs_groups.append(record.id)

# load concatenated BED with all features
patric_and_orfs_bed_table = pd.read_csv('../results/plotting_ME_synteny/data/BED_all_records/all_species_complete_records.bed', sep = '\t', names = ['contig', 'start', 'end', 'orfid', 'evalue', 'strand'])
patric_and_orfs_orfids = patric_and_orfs_bed_table['orfid'].to_list()

# remember that got list of features in homolog groups (ORFs_in_remote_homologs_groups)
# translating to new id
orfs_in_homolog_groups = [getorfid2new_id[id] for id in ORFs_in_remote_homologs_groups if 'dummy' not in id]

# determine features that arent present in the list
patric_and_orfs_orfids_nonhomologs = [id for id in patric_and_orfs_orfids if id not in orfs_in_homolog_groups]
# create list to allocate rows of block
dummy_block_entries_rows = []

# create dictionary between species and column name in block file
species2blockcol_dict = {'Halogeometricum_borinquense_strain_wsp4': 'Halogeometricum',
                         'Halobonum_sp_NJ-3-1': 'Halobonum',
                         'Natrinema_altunense_strain_AJ2': 'Natrinema',
                         'Haloterrigena_sp_SYSU_A121-1': 'Haloterrigena',
                         'Haloferax_sp_Q22': 'Haloferax',
                         'ga0207733_100382': 'ga0207733_100382',
                         'ga0207719_100190': 'ga0207719_100190',
                         'Haloplanus_natans_DSM_17983': 'Haloplanus',
                         'jgi12330j12834_1000008': 'jgi12330j12834_1000008',
                         'ga0207718_100100': 'ga0207718_100100'}

# create order of species in my configuration
block_column_order = ['Halogeometricum', 'Halobonum', 'ga0207718_100100', 'jgi12330j12834_1000008', 
                                                             'Natrinema', 'Halovivax', 'Haloterrigena', 'Haloplanus', 
                                                             'Haloferax', 'ga0207719_100190', 'ga0207733_100382']
# loop over
for orfid in patric_and_orfs_orfids_nonhomologs:
    # detect species
    species = '_'.join(orfid.split('_')[2:])
    species_blockname = species2blockcol_dict[species]
    # get the other species
    other_species = [species for species in block_column_order if species != species_blockname]
    # create dictionary to allocate, as columns, the orfid (for the species) or a '.' (other species)
    dummy_dict = {}
    dummy_dict.update({species_blockname: [orfid]})
    # create pandas data frame
    for species in other_species:
        dummy_dict.update({species: ['.']})
    dummy_row = pd.DataFrame.from_dict(dummy_dict)
    dummy_row = dummy_row[block_column_order]
    # order and append to list
    dummy_block_entries_rows.append(dummy_row)
    
# concatenate
dummy_block_entries_table = pd.concat(dummy_block_entries_rows)
# drop duplicates (although shouldn't exist)
dummy_block_entries_table = dummy_block_entries_table.drop_duplicates()

# save into a file
dummy_block_entries_table.to_csv('../results/plotting_ME_synteny/data/block_files/dummy_features_no_homologues.block', sep = '\t', index = False, header = False)


### Concatenating to create final block file

In [25]:
%%bash

cat ../results/plotting_ME_synteny/data/block_files/all_species_noanchor.block ../results/plotting_ME_synteny/data/block_files/dummy_features_no_homologues.block > ../results/plotting_ME_synteny/data/block_files/all_species_with_anchors_full_features.block

### Subsetting

In [185]:
# get table
homolog_groups_composition_table = pd.read_csv('../results/plotting_ME_synteny/data/homolog_groups_composition_table.tsv',
                                        sep = '\t')

# subset for target species
# convert selected species to blocknames column names with <species2blockcol_dict>
selected_block_columns = [species2blockcol_dict[species] for species in selection_of_species]

homolog_groups_composition_table_subset = homolog_groups_composition_table[selection_of_species]
homolog_groups_composition_table_subset = homolog_groups_composition_table_subset.drop_duplicates()

# splitting in R


In [186]:
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [187]:
%%R -i homolog_groups_composition_table_subset -o homolog_groups_composition_table_subset_splitted

# load libraries
library(tidyverse)
library(magrittr)
library(glue)

# convert to tibble()
homolog_groups_composition_table_subset %<>% as_tibble()
homolog_groups_composition_table_subset_splitted = homolog_groups_composition_table_subset %>%
    tidyr::separate_rows(., `Halogeometricum_borinquense_strain_wsp4`, sep = ',') %>%
    tidyr::separate_rows(., `Natrinema_altunense_strain_AJ2`, sep = ',') %>%
    tidyr::separate_rows(., `Haloferax_sp_Q22`, sep = ',') %>%
    tidyr::separate_rows(., `Haloplanus_natans_DSM_17983`, sep = ',') %>%
    tidyr::separate_rows(., `jgi12330j12834_1000008`, sep = ',') %>%
    tidyr::replace_na(., list('Halogeometricum_borinquense_strain_wsp4' = '.',
                              'Natrinema_altunense_strain_AJ2' = '.',
                              'Haloferax_sp_Q22' = '.',
                              'Haloplanus_natans_DSM_17983' = '.',
                              'jgi12330j12834_1000008' = '.'))

In [188]:
homolog_groups_composition_table_subset_splitted

Unnamed: 0,Halogeometricum_borinquense_strain_wsp4,jgi12330j12834_1000008,Natrinema_altunense_strain_AJ2,Haloplanus_natans_DSM_17983,Haloferax_sp_Q22
1,.,ORF_2889_jgi12330j12834_1000008,.,ORF_2192_Haloplanus_natans_DSM_17983,ORF_1207_Haloferax_sp_Q22
2,ORF_1098_Halogeometricum_borinquense_strain_wsp4,.,.,.,.
3,.,.,.,.,.
4,.,ORF_2530_jgi12330j12834_1000008,.,.,.
5,.,ORF_1532_jgi12330j12834_1000008,.,.,.
...,...,...,...,...,...
139,ORF_124_Halogeometricum_borinquense_strain_wsp4,ORF_1496_jgi12330j12834_1000008,ORF_1255_Natrinema_altunense_strain_AJ2,ORF_1736_Haloplanus_natans_DSM_17983,ORF_1454_Haloferax_sp_Q22
140,ORF_124_Halogeometricum_borinquense_strain_wsp4,ORF_1496_jgi12330j12834_1000008,ORF_1255_Natrinema_altunense_strain_AJ2,ORF_2439_Haloplanus_natans_DSM_17983,ORF_1454_Haloferax_sp_Q22
141,ORF_1919_Halogeometricum_borinquense_strain_wsp4,ORF_1496_jgi12330j12834_1000008,ORF_1255_Natrinema_altunense_strain_AJ2,ORF_1736_Haloplanus_natans_DSM_17983,ORF_1454_Haloferax_sp_Q22
142,ORF_1919_Halogeometricum_borinquense_strain_wsp4,ORF_1496_jgi12330j12834_1000008,ORF_1255_Natrinema_altunense_strain_AJ2,ORF_2439_Haloplanus_natans_DSM_17983,ORF_1454_Haloferax_sp_Q22


In [26]:
homolog_groups_composition_table_subset_splitted.to_csv('../results/plotting_ME_synteny/data/block_files/subset_species_no_anchors_only_homologues.block', sep = '\t', index = False, header = False)

NameError: name 'homolog_groups_composition_table_subset_splitted' is not defined

In [27]:
# create rows for each species
dummystart_block_halovivax = pd.DataFrame.from_dict({'Halovivax': ['ORF_dummystart_Halovivax_sp_KZCA124'], 'Halogeometricum': ['.'],'Halobonum': ['.'], 'Natrinema': ['.'], 'Haloterrigena': ['.'],
                                                      'Haloferax': ['.'], 'ga0207733_100382': ['.'], 'ga0207719_100190': ['.'], 'Haloplanus': ['.'],
                                                      'jgi12330j12834_1000008': ['.'], 'ga0207718_100100': ['.']})

dummyend_block_halovivax = pd.DataFrame.from_dict({'Halovivax': ['ORF_dummyend_Halovivax_sp_KZCA124'], 'Halogeometricum': ['.'],'Halobonum': ['.'], 'Natrinema': ['.'], 'Haloterrigena': ['.'],
                                                      'Haloferax': ['.'], 'ga0207733_100382': ['.'], 'ga0207719_100190': ['.'], 'Haloplanus': ['.'],
                                                      'jgi12330j12834_1000008': ['.'], 'ga0207718_100100': ['.']})

dummystart_block_halogeometricum = pd.DataFrame.from_dict({'Halovivax': ['.'], 'Halogeometricum': ['ORF_dummystart_Halogeometricum_borinquense_strain_wsp4'],'Halobonum': ['.'], 'Natrinema': ['.'], 'Haloterrigena': ['.'],
                                                      'Haloferax': ['.'], 'ga0207733_100382': ['.'], 'ga0207719_100190': ['.'], 'Haloplanus': ['.'],
                                                      'jgi12330j12834_1000008': ['.'], 'ga0207718_100100': ['.']})

dummyend_block_halogeometricum = pd.DataFrame.from_dict({'Halovivax': ['.'], 'Halogeometricum': ['ORF_dummyend_Halogeometricum_borinquense_strain_wsp4'],'Halobonum': ['.'], 'Natrinema': ['.'], 'Haloterrigena': ['.'],
                                                      'Haloferax': ['.'], 'ga0207733_100382': ['.'], 'ga0207719_100190': ['.'], 'Haloplanus': ['.'],
                                                      'jgi12330j12834_1000008': ['.'], 'ga0207718_100100': ['.']})

dummystart_block_halobonum = pd.DataFrame.from_dict({'Halovivax': ['.'], 'Halogeometricum': ['.'],'Halobonum': ['ORF_dummystart_Halobonum_sp_NJ-3-1'], 'Natrinema': ['.'], 'Haloterrigena': ['.'],
                                                      'Haloferax': ['.'], 'ga0207733_100382': ['.'], 'ga0207719_100190': ['.'], 'Haloplanus': ['.'],
                                                      'jgi12330j12834_1000008': ['.'], 'ga0207718_100100': ['.']})

dummyend_block_halobonum = pd.DataFrame.from_dict({'Halovivax': ['.'], 'Halogeometricum': ['.'],'Halobonum': ['ORF_dummyend_Halobonum_sp_NJ-3-1'], 'Natrinema': ['.'], 'Haloterrigena': ['.'],
                                                      'Haloferax': ['.'], 'ga0207733_100382': ['.'], 'ga0207719_100190': ['.'], 'Haloplanus': ['.'],
                                                      'jgi12330j12834_1000008': ['.'], 'ga0207718_100100': ['.']})

dummystart_block_natrinema = pd.DataFrame.from_dict({'Halovivax': ['.'], 'Halogeometricum': ['.'],'Halobonum': ['.'], 'Natrinema': ['ORF_dummystart_Natrinema_altunense_strain_AJ2'], 'Haloterrigena': ['.'],
                                                      'Haloferax': ['.'], 'ga0207733_100382': ['.'], 'ga0207719_100190': ['.'], 'Haloplanus': ['.'],
                                                      'jgi12330j12834_1000008': ['.'], 'ga0207718_100100': ['.']})

dummyend_block_natrinema = pd.DataFrame.from_dict({'Halovivax': ['.'], 'Halogeometricum': ['.'],'Halobonum': ['.'], 'Natrinema': ['ORF_dummyend_Natrinema_altunense_strain_AJ2'], 'Haloterrigena': ['.'],
                                                      'Haloferax': ['.'], 'ga0207733_100382': ['.'], 'ga0207719_100190': ['.'], 'Haloplanus': ['.'],
                                                      'jgi12330j12834_1000008': ['.'], 'ga0207718_100100': ['.']})

dummystart_block_haloterrigena = pd.DataFrame.from_dict({'Halovivax': ['.'], 'Halogeometricum': ['.'],'Halobonum': ['.'], 'Natrinema': ['.'], 'Haloterrigena': ['ORF_dummystart_Haloterrigena_sp_SYSU_A121-1'],
                                                      'Haloferax': ['.'], 'ga0207733_100382': ['.'], 'ga0207719_100190': ['.'], 'Haloplanus': ['.'],
                                                      'jgi12330j12834_1000008': ['.'], 'ga0207718_100100': ['.']})

dummyend_block_haloterrigena = pd.DataFrame.from_dict({'Halovivax': ['.'], 'Halogeometricum': ['.'],'Halobonum': ['.'], 'Natrinema': ['.'], 'Haloterrigena': ['ORF_dummyend_Haloterrigena_sp_SYSU_A121-1'],
                                                      'Haloferax': ['.'], 'ga0207733_100382': ['.'], 'ga0207719_100190': ['.'], 'Haloplanus': ['.'],
                                                      'jgi12330j12834_1000008': ['.'], 'ga0207718_100100': ['.']})

dummystart_block_haloferax = pd.DataFrame.from_dict({'Halovivax': ['.'], 'Halogeometricum': ['.'],'Halobonum': ['.'], 'Natrinema': ['.'], 'Haloterrigena': ['.'],
                                                      'Haloferax': ['ORF_dummystart_Haloferax_sp_Q22'], 'ga0207733_100382': ['.'], 'ga0207719_100190': ['.'], 'Haloplanus': ['.'],
                                                      'jgi12330j12834_1000008': ['.'], 'ga0207718_100100': ['.']})

dummyend_block_haloferax = pd.DataFrame.from_dict({'Halovivax': ['.'], 'Halogeometricum': ['.'],'Halobonum': ['.'], 'Natrinema': ['.'], 'Haloterrigena': ['.'],
                                                      'Haloferax': ['ORF_dummyend_Haloferax_sp_Q22'], 'ga0207733_100382': ['.'], 'ga0207719_100190': ['.'], 'Haloplanus': ['.'],
                                                      'jgi12330j12834_1000008': ['.'], 'ga0207718_100100': ['.']})

dummystart_block_ga0207733_100382 = pd.DataFrame.from_dict({'Halovivax': ['.'], 'Halogeometricum': ['.'],'Halobonum': ['.'], 'Natrinema': ['.'], 'Haloterrigena': ['.'],
                                                      'Haloferax': ['.'], 'ga0207733_100382': ['ORF_dummystart_ga0207733_100382'], 'ga0207719_100190': ['.'], 'Haloplanus': ['.'],
                                                      'jgi12330j12834_1000008': ['.'], 'ga0207718_100100': ['.']})

dummyend_block_ga0207733_100382 = pd.DataFrame.from_dict({'Halovivax': ['.'], 'Halogeometricum': ['.'],'Halobonum': ['.'], 'Natrinema': ['.'], 'Haloterrigena': ['.'],
                                                      'Haloferax': ['.'], 'ga0207733_100382': ['ORF_dummyend_ga0207733_100382'], 'ga0207719_100190': ['.'], 'Haloplanus': ['.'],
                                                      'jgi12330j12834_1000008': ['.'], 'ga0207718_100100': ['.']})

dummystart_block_ga0207719_100190 = pd.DataFrame.from_dict({'Halovivax': ['.'], 'Halogeometricum': ['.'],'Halobonum': ['.'], 'Natrinema': ['.'], 'Haloterrigena': ['.'],
                                                      'Haloferax': ['.'], 'ga0207733_100382': ['.'], 'ga0207719_100190': ['ORF_dummystart_ga0207719_100190'], 'Haloplanus': ['.'],
                                                      'jgi12330j12834_1000008': ['.'], 'ga0207718_100100': ['.']})

dummyend_block_ga0207719_100190 = pd.DataFrame.from_dict({'Halovivax': ['.'], 'Halogeometricum': ['.'],'Halobonum': ['.'], 'Natrinema': ['.'], 'Haloterrigena': ['.'],
                                                      'Haloferax': ['.'], 'ga0207733_100382': ['.'], 'ga0207719_100190': ['ORF_dummyend_ga0207719_100190'], 'Haloplanus': ['.'],
                                                      'jgi12330j12834_1000008': ['.'], 'ga0207718_100100': ['.']})

dummystart_block_haloplanus = pd.DataFrame.from_dict({'Halovivax': ['.'], 'Halogeometricum': ['.'],'Halobonum': ['.'], 'Natrinema': ['.'], 'Haloterrigena': ['.'],
                                                      'Haloferax': ['.'], 'ga0207733_100382': ['.'], 'ga0207719_100190': ['.'], 'Haloplanus': ['ORF_dummystart_Haloplanus_natans_DSM_17983'],
                                                      'jgi12330j12834_1000008': ['.'], 'ga0207718_100100': ['.']})

dummyend_block_haloplanus = pd.DataFrame.from_dict({'Halovivax': ['.'], 'Halogeometricum': ['.'],'Halobonum': ['.'], 'Natrinema': ['.'], 'Haloterrigena': ['.'],
                                                      'Haloferax': ['.'], 'ga0207733_100382': ['.'], 'ga0207719_100190': ['.'], 'Haloplanus': ['ORF_dummyend_Haloplanus_natans_DSM_17983'],
                                                      'jgi12330j12834_1000008': ['.'], 'ga0207718_100100': ['.']})

dummystart_block_jgi12330j12834_1000008 = pd.DataFrame.from_dict({'Halovivax': ['.'], 'Halogeometricum': ['.'],'Halobonum': ['.'], 'Natrinema': ['.'], 'Haloterrigena': ['.'],
                                                      'Haloferax': ['.'], 'ga0207733_100382': ['.'], 'ga0207719_100190': ['.'], 'Haloplanus': ['.'],
                                                      'jgi12330j12834_1000008': ['ORF_dummystart_jgi12330j12834_1000008'], 'ga0207718_100100': ['.']})

dummyend_block_jgi12330j12834_1000008 = pd.DataFrame.from_dict({'Halovivax': ['.'], 'Halogeometricum': ['.'],'Halobonum': ['.'], 'Natrinema': ['.'], 'Haloterrigena': ['.'],
                                                      'Haloferax': ['.'], 'ga0207733_100382': ['.'], 'ga0207719_100190': ['.'], 'Haloplanus': ['.'],
                                                      'jgi12330j12834_1000008': ['ORF_dummyend_jgi12330j12834_1000008'], 'ga0207718_100100': ['.']})

dummystart_block_ga0207718_100100 = pd.DataFrame.from_dict({'Halovivax': ['.'], 'Halogeometricum': ['.'],'Halobonum': ['.'], 'Natrinema': ['.'], 'Haloterrigena': ['.'],
                                                      'Haloferax': ['.'], 'ga0207733_100382': ['.'], 'ga0207719_100190': ['.'], 'Haloplanus': ['.'],
                                                      'jgi12330j12834_1000008': ['.'], 'ga0207718_100100': ['ORF_dummystart_ga0207718_100100']})

dummyend_block_ga0207718_100100 = pd.DataFrame.from_dict({'Halovivax': ['.'], 'Halogeometricum': ['.'],'Halobonum': ['.'], 'Natrinema': ['.'], 'Haloterrigena': ['.'],
                                                      'Haloferax': ['.'], 'ga0207733_100382': ['.'], 'ga0207719_100190': ['.'], 'Haloplanus': ['.'],
                                                      'jgi12330j12834_1000008': ['.'], 'ga0207718_100100': ['ORF_dummyend_ga0207718_100100']})

# concatenate
block_column_order = ['Halogeometricum', 'Halobonum', 'ga0207718_100100', 'jgi12330j12834_1000008', 
                                                             'Natrinema', 'Halovivax', 'Haloterrigena', 'Haloplanus', 
                                                             'Haloferax', 'ga0207719_100190', 'ga0207733_100382']

dummy_block_table = pd.concat([dummystart_block_halovivax[block_column_order],
           dummyend_block_halovivax[block_column_order],
           dummystart_block_halogeometricum[block_column_order], dummyend_block_halogeometricum[block_column_order],
           dummystart_block_halobonum[block_column_order], dummyend_block_halobonum[block_column_order],
           dummystart_block_natrinema[block_column_order], dummyend_block_natrinema[block_column_order],
           dummystart_block_haloterrigena[block_column_order], dummyend_block_haloterrigena[block_column_order],
           dummystart_block_haloferax[block_column_order], dummyend_block_haloferax[block_column_order],
           dummystart_block_ga0207733_100382[block_column_order], dummyend_block_ga0207733_100382[block_column_order],
           dummystart_block_ga0207719_100190[block_column_order], dummyend_block_ga0207719_100190[block_column_order],
           dummystart_block_haloplanus[block_column_order], dummyend_block_haloplanus[block_column_order],
           dummystart_block_jgi12330j12834_1000008[block_column_order], dummyend_block_jgi12330j12834_1000008[block_column_order],
           dummystart_block_ga0207718_100100[block_column_order], dummyend_block_ga0207718_100100[block_column_order]])

In [28]:
selection_of_species = ['Halogeometricum_borinquense_strain_wsp4', 
                  'jgi12330j12834_1000008', 
                  'Natrinema_altunense_strain_AJ2', 
                  'Haloplanus_natans_DSM_17983', 
                  'Haloferax_sp_Q22']

In [29]:
# load table
dummy_block_entries_table = pd.read_csv('../results/plotting_ME_synteny/data/block_files/dummy_features_no_homologues.block', 
                                        sep = '\t', names = ['Halogeometricum', 'Halobonum', 'ga0207718_100100', 'jgi12330j12834_1000008', 
                                                             'Natrinema', 'Halovivax', 'Haloterrigena', 'Haloplanus', 
                                                             'Haloferax', 'ga0207719_100190', 'ga0207733_100382'])

# subset
# convert selected species to blocknames column names with <species2blockcol_dict>
selection_of_species = ['Halogeometricum_borinquense_strain_wsp4', 
                  'jgi12330j12834_1000008', 
                  'Natrinema_altunense_strain_AJ2', 
                  'Haloplanus_natans_DSM_17983', 
                  'Haloferax_sp_Q22']
selected_block_columns = [species2blockcol_dict[species] for species in selection_of_species]

dummy_block_entries_table_subset = dummy_block_entries_table[selected_block_columns]

# now subsetting for dummy blocks and joining
dummy_block_table_subset = dummy_block_table[selected_block_columns]
dummy_block_entries_table_subset = pd.concat([dummy_block_entries_table_subset, dummy_block_table_subset])

# now dropping duplicates
dummy_block_entries_table_subset = dummy_block_entries_table_subset.drop_duplicates()

# save to file
dummy_block_entries_table_subset.to_csv('../results/plotting_ME_synteny/data/block_files/dummy_features_no_homologues_subset.block', sep = '\t', index = False, header = False)

In [30]:
%%bash

cat ../results/plotting_ME_synteny/data/block_files/subset_species_no_anchors_only_homologues.block \
../results/plotting_ME_synteny/data/block_files/dummy_features_no_homologues_subset.block > ../results/plotting_ME_synteny/data/block_files/all_species_with_anchors_full_features_subsetted.block 

## Layout file employed
Layout file specifying all graphical positions for mobile elements is in path *../results/plotting_ME_synteny/data/layout_files/all_species_full_with_anchor_full_features.layout*.
___

Please find attached below its content

```
# x,   y, rotation,     ha,     va, color, ratio,            label
0.27, 0.9,        0, right, center,     #fc8d62,    1, Halogeometricum borinquense strain wsp4
0.35, 0.8,        180, right, center,      #fc8d62,     1, Halobonum sp NJ-3-1
0.47, 0.7,        180, right, center,      #fc8d62,    1, ga0207718_100100
0.455, 0.6,         180, right,   center,  #fc8d62,   1, jgi12330j12834_1000008
0.45, 0.5,        180, right, center,      #fc8d62,    1, Natrinema altunense strain AJ2
0.44, 0.4,        0, right, center,      #fc8d62,     1,  Halovivax sp KZCA124
0.40, 0.32,        0, right, center,      #fc8d62,     1, Haloterrigena sp SYSU A121-1
0.437, 0.24,        180, right, center,      #fc8d62,     1, Haloplanus natans DSM 17983
0.475, 0.175,        0, right, center,      #fc8d62,     1, Haloferax sp Q22 
0.55, 0.105,        180, right, center,      #fc8d62,     1, ga0207719_100190
0.538, 0.05,        180, right, center,      #fc8d62,     1, ga0207733_100382
# edges
e, 0, 1
e, 1, 2
e, 2, 3
e, 3, 4
e, 4, 5
e, 5, 6
e, 6, 7
e, 7, 8
e, 8, 9
e, 9, 10
```

___

## Plotting

In [69]:
%%bash

python -m jcvi.graphics.synteny ../results/plotting_ME_synteny/data/block_files/all_species_with_anchors_full_features.block ../results/plotting_ME_synteny/data/BED/all_species_all_records.bed ../results/plotting_ME_synteny/data/layout_files/all_species_full_with_anchor_full_features.layout --glyphcolor=orthogroup --figsize=20x10 --dpi=100
mv ../results/plotting_ME_synteny/data/block_files/all_species_with_anchors_full_features.pdf ../results/plotting_ME_synteny/results/all_species_with_anchors_full_features.pdf

python -m jcvi.graphics.synteny ../results/plotting_ME_synteny/data/block_files/all_species_with_anchors_full_features.block ../results/plotting_ME_synteny/data/BED/all_species_all_records.bed ../results/plotting_ME_synteny/data/layout_files/all_species_full_with_anchor_full_features.layout --glyphcolor=orthogroup --figsize=20x10 --dpi=100 --format=svg
mv ../results/plotting_ME_synteny/data/block_files/all_species_with_anchors_full_features.svg ../results/plotting_ME_synteny/results/all_species_with_anchors_full_features.svg

[19:28:01] DEBUG    Load file `../results/plotting_ME_synteny/data/BE base.py:37
                    D/all_species_all_records.bed`                              
           DEBUG    Load file `../results/plotting_ME_synteny/data/bl base.py:37
                    ock_files/all_species_with_anchors_full_features.           
                    block`                                                      
           DEBUG    Load file `../results/plotting_ME_synteny/data/la base.py:37
                    yout_files/all_species_full_with_anchor_full_feat           
                    ures.layout`                                                
Column 0: ORF_dummystart_Halogeometricum_borinquense_strain_wsp4 - ORF_dummyend_Halogeometricum_borinquense_strain_wsp4 (CP048739:1-89772)
  CP048739 .. 124 (779) features .. +
Column 1: ORF_dummystart_Halobonum_sp_NJ-3-1 - ORF_dummyend_Halobonum_sp_NJ-3-1 (CP058579:1-172009)
  CP058579 .. 246 (991) features .. +
Column 2: ORF_dummystart_ga0207718_10