This notebook collects all effector candidates from the Pst_104E_genome. A gene is treated as a candidate if it meets either of the following criteria:
EffectorP prediction, done by Jana Sperschneider
Gene expression cluster analysis, done by Jana Sperschneider, picking clusters 2, 3 and 8

In [47]:
import os
from Bio import SeqIO
import pandas as pd
import re

In [82]:
#input and output folders
#every folder sits below the same assembly directory, so that directory is
#written once and the individual locations are built from it
_ASSEMBLY_ROOT = '/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1'
_VERSION_ROOT = _ASSEMBLY_ROOT + '/Pst_104E_v12'

#searched for 'Cluster*_DEs.fasta' files
CLUSTER_FOLDER = _VERSION_ROOT + '/Pst_104E_genome/gene_expression/Pst104_p_SecretomeClustering'
#searched for '*effectors.fasta' files
EFFECTORP_FOLDER = _VERSION_ROOT + '/Pst_104E_genome/Secretomes/EffectorP'
#searched for '*anno.gff3' files
GFF_FOLDER = _ASSEMBLY_ROOT + '/032017_assembly'
#searched for files with 'busco' in their name
PROTEIN_ANNO_FOLDER = _VERSION_ROOT + '/enrichment_analysis/pa_26062017'
#all lists and gff3 subsets are written here
OUT_FOLDER = _VERSION_ROOT + '/enrichment_analysis/lists'

In [66]:
#file name prefix shared by the input files and by every output file
genome = 'Pst_104E_v12_'
#accumulators for candidate sequence IDs: one for IDs containing 'pcontig',
#one for IDs containing 'hcontig'
p_effector_list, h_effector_list = [], []

In [67]:
#define what you want to take
#cluster numbers whose member sequences count as effector candidates
clusters = [2, 3, 8]


def _cluster_number(file_name):
    """Return the integer that directly follows 'Cluster' at the start of file_name.

    The complete run of digits is parsed, so 'Cluster20_DEs.fasta' gives 20
    and is not mistaken for cluster 2.

    Returns:
        The cluster number as int, or None when file_name does not start
        with 'Cluster' followed by at least one digit.
    """
    match = re.match(r'Cluster(\d+)', file_name)
    return int(match.group(1)) if match else None


#os.listdir() returns entries in arbitrary order; sorting keeps the file order
#identical between runs
clusters_files = [os.path.join(CLUSTER_FOLDER, x) for x in sorted(os.listdir(CLUSTER_FOLDER))
                  if x.endswith('_DEs.fasta') and _cluster_number(x) in clusters]
effectorp_files = [os.path.join(EFFECTORP_FOLDER, x) for x in sorted(os.listdir(EFFECTORP_FOLDER))
                   if x.endswith('effectors.fasta') and x.startswith(genome)]

In [68]:
#get all the sequence names into a list from the fasta headers
#cluster files are read first, then the EffectorP files; an ID is appended to
#the h list when it contains 'hcontig' and to the p list when it contains
#'pcontig' (the two checks are independent of each other)
for fasta_path in clusters_files + effectorp_files:
    #'with' closes the handle even when parsing fails part-way through a file
    with open(fasta_path, 'r') as fasta_handle:
        for record in SeqIO.parse(fasta_handle, 'fasta'):
            if 'hcontig' in record.id:
                h_effector_list.append(record.id)
            if 'pcontig' in record.id:
                p_effector_list.append(record.id)

In [69]:
#paths of the two candidate ID lists inside OUT_FOLDER; the later cells derive
#all other output file names from these two paths
p_effector_file, h_effector_file = (
    os.path.join(OUT_FOLDER, genome + suffix)
    for suffix in ('p_effector.list', 'h_effector.list')
)

In [70]:
#write out the sets of effector candidates, one ID per line
#the IDs are de-duplicated with set() and then sorted: iteration order of a set
#of strings can change from one interpreter run to the next, sorting makes the
#output files reproducible
for list_path, candidate_ids in ((p_effector_file, p_effector_list),
                                 (h_effector_file, h_effector_list)):
    #'with' guarantees the file is flushed and closed even if a write fails
    with open(list_path, 'w') as fh:
        for ec in sorted(set(candidate_ids)):
            print(ec, file=fh)


In [71]:
#subset the gff files as well and write those out
def _find_annotation_gff(prefix):
    """Return the path of the annotation gff3 in GFF_FOLDER for a file name prefix.

    A file matches when its name starts with prefix and ends with 'anno.gff3'.
    When several files match, the alphabetically first one is returned so the
    choice does not depend on the arbitrary order of os.listdir().

    Raises:
        FileNotFoundError: when no file in GFF_FOLDER matches.
    """
    matches = sorted(x for x in os.listdir(GFF_FOLDER)
                     if x.startswith(prefix) and x.endswith('anno.gff3'))
    if not matches:
        raise FileNotFoundError(
            "no file starting with %r and ending with 'anno.gff3' in %s" % (prefix, GFF_FOLDER))
    return os.path.join(GFF_FOLDER, matches[0])


p_gff_file = _find_annotation_gff(genome + 'p_ctg')
h_gff_file = _find_annotation_gff(genome + 'h_ctg')

In [72]:
#column names for the nine tab-separated fields of a gff record, in file order
#used as 'names' when reading the gff files and to select the columns again
#when writing gff subsets
gff_header = [
    'seqid',
    'source',
    'type',
    'start',
    'end',
    'score',
    'strand',
    'phase',
    'attributes',
]

In [99]:
#now read the gff files; they are subset for effectors in the following cells
def _read_sorted_gff(gff_path):
    """Read a header-less, tab-separated gff file into a DataFrame.

    The columns are named after gff_header. An extra 'ID' column holds the
    value of the ID attribute taken from the 'attributes' column (NaN when the
    record has no ID attribute). Rows are returned sorted by seqid, then start.

    The ID pattern only accepts 'ID=' at the start of the attribute string or
    directly after a ';' separator, so keys that merely end in 'ID' are not
    picked up, and it does not require a ';' after the value, so a record whose
    attributes end with the ID is still parsed.
    """
    gff_df = pd.read_csv(gff_path, header=None, sep='\t', names=gff_header)
    gff_df['ID'] = gff_df['attributes'].str.extract(r'(?:^|;)\s*ID=([^;]*)', expand=False)
    return gff_df.sort_values(by=['seqid', 'start'])


p_gff_df = _read_sorted_gff(p_gff_file)
h_gff_df = _read_sorted_gff(h_gff_file)

In [100]:
#show the first five rows to check the column assignment and the extracted ID
h_gff_df.head(n=5)

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,ID
147866,hcontig_000_003,EVM,gene,1023,1469,.,+,.,ID=evm.TU.hcontig_000_003.1;Name=gene_model_hc...,evm.TU.hcontig_000_003.1
147867,hcontig_000_003,EVM,mRNA,1023,1469,.,+,.,ID=evm.model.hcontig_000_003.1;Parent=evm.TU.h...,evm.model.hcontig_000_003.1
147868,hcontig_000_003,EVM,exon,1023,1469,.,+,.,ID=evm.model.hcontig_000_003.1.exon1;Parent=ev...,evm.model.hcontig_000_003.1.exon1
147869,hcontig_000_003,EVM,CDS,1023,1469,.,+,0,ID=cds.evm.model.hcontig_000_003.1;Parent=evm....,cds.evm.model.hcontig_000_003.1
147846,hcontig_000_003,EVM,gene,4850,5854,.,+,.,ID=evm.TU.hcontig_000_003.2;Name=gene_model_hc...,evm.TU.hcontig_000_003.2


In [101]:
#write the p_gff gene records split into effector candidates and all other genes
#membership in p_effector_list is tested on the gene ID with every 'TU'
#replaced by 'model' (e.g. evm.TU.x becomes evm.model.x)
p_model_ids = p_gff_df['ID'].str.replace('TU', 'model')
p_is_gene = p_gff_df['type'] == 'gene'
p_is_candidate = p_model_ids.isin(p_effector_list)
p_effector_rows = p_is_gene & p_is_candidate
p_noeffector_rows = p_is_gene & ~p_is_candidate

#gene records of the effector candidates, original nine gff columns only
file_name = p_effector_file.replace('.list', '.gene.gff3')
p_gff_df.loc[p_effector_rows, gff_header].to_csv(file_name, header=False, index=False, sep='\t')

#gene records of every gene that is not an effector candidate
file_name = p_effector_file.replace('p_effector.list', 'p_noeffector.gene.gff3')
p_gff_df.loc[p_noeffector_rows, gff_header].to_csv(file_name, header=False, index=False, sep='\t')

#the same non-effector genes as a plain list of 'model' style IDs
file_name = p_effector_file.replace('p_effector.list', 'p_noeffector.list')
p_model_ids[p_noeffector_rows].to_csv(file_name, header=False, index=False, sep='\t')

In [102]:
#write the h_gff gene records split into effector candidates and all other genes
#membership in h_effector_list is tested on the gene ID with every 'TU'
#replaced by 'model' (e.g. evm.TU.x becomes evm.model.x)
h_model_ids = h_gff_df['ID'].str.replace('TU', 'model')
h_is_gene = h_gff_df['type'] == 'gene'
h_is_candidate = h_model_ids.isin(h_effector_list)
h_effector_rows = h_is_gene & h_is_candidate
h_noeffector_rows = h_is_gene & ~h_is_candidate

#gene records of the effector candidates, original nine gff columns only
file_name = h_effector_file.replace('.list', '.gene.gff3')
h_gff_df.loc[h_effector_rows, gff_header].to_csv(file_name, header=False, index=False, sep='\t')

#gene records of every gene that is not an effector candidate
file_name = h_effector_file.replace('h_effector.list', 'h_noeffector.gene.gff3')
h_gff_df.loc[h_noeffector_rows, gff_header].to_csv(file_name, header=False, index=False, sep='\t')

#the same non-effector genes as a plain list of 'model' style IDs
file_name = h_effector_file.replace('h_effector.list', 'h_noeffector.list')
h_model_ids[h_noeffector_rows].to_csv(file_name, header=False, index=False, sep='\t')

In [103]:
#now get BUSCO list and write out stuff
def _find_busco_table(prefix):
    """Return the path of the BUSCO file in PROTEIN_ANNO_FOLDER for a file name prefix.

    A file matches when its name starts with prefix and contains 'busco'.
    When several files match, the alphabetically first one is returned so the
    choice does not depend on the arbitrary order of os.listdir().

    Raises:
        FileNotFoundError: when no file in PROTEIN_ANNO_FOLDER matches.
    """
    matches = sorted(x for x in os.listdir(PROTEIN_ANNO_FOLDER)
                     if x.startswith(prefix) and 'busco' in x)
    if not matches:
        raise FileNotFoundError(
            "no file starting with %r and containing 'busco' in %s" % (prefix, PROTEIN_ANNO_FOLDER))
    return os.path.join(PROTEIN_ANNO_FOLDER, matches[0])


#the files are read as header-less, tab-separated tables; only the first
#column is kept as the list of IDs
p_busco_file = _find_busco_table(genome + 'p_ctg')
p_busco_list = pd.read_csv(p_busco_file, header=None, sep='\t')[0].tolist()
h_busco_file = _find_busco_table(genome + 'h_ctg')
h_busco_list = pd.read_csv(h_busco_file, header=None, sep='\t')[0].tolist()

In [104]:
#write out BUSCO for pcontigs
#a gene row is selected when its ID, with every 'TU' replaced by 'model',
#appears in p_busco_list
p_busco_model_ids = p_gff_df['ID'].str.replace('TU', 'model')
p_busco_rows = (p_gff_df['type'] == 'gene') & p_busco_model_ids.isin(p_busco_list)

#list of the matching 'model' style IDs, one per line
file_name = p_effector_file.replace('effector.list', 'busco.list')
p_busco_model_ids[p_busco_rows].to_csv(file_name, header=False, index=False, sep='\t')

#gene records of the same rows, original nine gff columns only
file_name = p_effector_file.replace('effector.list', 'busco.gene.gff3')
p_gff_df.loc[p_busco_rows, gff_header].to_csv(file_name, header=False, index=False, sep='\t')

In [105]:
#write out BUSCO for hcontigs
#a gene row is selected when its ID, with every 'TU' replaced by 'model',
#appears in h_busco_list
h_busco_model_ids = h_gff_df['ID'].str.replace('TU', 'model')
h_busco_rows = (h_gff_df['type'] == 'gene') & h_busco_model_ids.isin(h_busco_list)

#list of the matching 'model' style IDs, one per line
file_name = h_effector_file.replace('effector.list', 'busco.list')
h_busco_model_ids[h_busco_rows].to_csv(file_name, header=False, index=False, sep='\t')

#gene records of the same rows, original nine gff columns only
file_name = h_effector_file.replace('effector.list', 'busco.gene.gff3')
h_gff_df.loc[h_busco_rows, gff_header].to_csv(file_name, header=False, index=False, sep='\t')