# <span style="color:yellowgreen"> Prepare TE and gene model annotation files. </span>

This notebook combines the TE model data with the gene model data, as they were previously separated.

1. Get the annotations file (.gff3) and the reference file (ctg.fa)
2. SeqIO.parse through reference genome and get a list of all contig names.
3. Make pandas dataframe of annotation file.
4. Subset dataframe to column 0 that only has values in list of contig names from reference file.
5. Use "to.csv(\t)" to convert the subset to a tab-separated GFF file.
6. Cat this subset GFF file of TE data to the gene data GFF file.

# <span style="color:mediumseagreen"> Load modules. </span>

In [2]:
import pybedtools
from pybedtools import BedTool
import os
import pysam
import pandas as pd
import glob
import wiggelen
import pybedtools
from pybedtools import BedTool
from Bio import SeqIO

# <span style="color:mediumturquoise"> Set directory paths and file handles. </span>

In [3]:
#First we need to define the base dirs
#Define initial base dir as relative path, but everything else as absolute path
DIRS ={}
DIRS['BASE'] =  '../../../../analysis/' # absolute path = '/home/anjuni/analysis/'
DIRS['GFF_INPUT'] = os.path.join(os.path.abspath(DIRS['BASE']), 'annotations')
DIRS['GFF_OUT'] = os.path.join(os.path.abspath(DIRS['BASE']), 'gff_output')
DIRS['REF'] = '../../../../Pst_104_v13_assembly/'

In [4]:
#Quick chech if directories exist
for value in DIRS.values():
    if not os.path.exists(value):
        print('%s does not exist' % value)
    else:
        print(value)

../../../../analysis/
/home/anjuni/analysis/annotations
/home/anjuni/analysis/gff_output
../../../../Pst_104_v13_assembly/


In [6]:
#Make filepaths
gff_file_list = [fn for fn in glob.iglob('%s/*.gff3' % DIRS['GFF_INPUT'], recursive=True)]

te_p_model = os.path.join(DIRS['GFF_INPUT'], 'Pst_104E_v13_p_ctg.repeatgenesLT.gff3')
te_h_model = os.path.join(DIRS['GFF_INPUT'], 'Pst_104E_v13_h_ctg.repeatgenesLT.gff3')
gene_p_model = os.path.join(DIRS['GFF_INPUT'], 'Pst_104E_v13_p_ctg.anno.gff3')
gene_h_model = os.path.join(DIRS['GFF_INPUT'], 'Pst_104E_v13_h_ctg.anno.gff3')

te_model_list = [te_p_model, te_h_model]

ref_fh = os.path.join(DIRS['REF'], 'Pst_104E_v13_ph_ctg.fa')

In [24]:
#Check that the list works
print(*gff_file_list, sep='\n')

../../../../analysis/annotations/Pst_104E_v13_p_ctg.repeatgenesLT.gff3
../../../../analysis/annotations/Pst_104E_v13_h_ctg.repeatgenesLT.gff3
../../../../analysis/annotations/Pst_104E_v13_p_ctg.anno.gff3
../../../../analysis/annotations/Pst_104E_v13_h_ctg.anno.gff3


# <span style="color:mediumslateblue"> Functions </span>

In [233]:
#Test to make a list of all contig names
def contig_list(reference_genome, reference_format):
    """Goes through reference genome and makes a list of all contig names."""
    contigs = []
    for seq in  SeqIO.parse(reference_genome, reference_format):    
        if seq.id not in contigs:
            contigs.append(seq.id)
    return contigs

In [234]:
def gff_to_df(gff_file):
    """Convert annotation gff file into a Pandas dataframe."""
    gff_header_list = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']
    df = pd.read_csv(gff_file, sep='\t', header = None, names = gff_header_list)
    return df

In [235]:
def filter_gff(gff_df, contig_list):
    """Filter out the contigs that aren't in the final Pst genome assembly from the TE annotation file."""
    filtered_df = gff_df[gff_df['seqname'].isin(contig_list)]
    return filtered_df

In [237]:
def df_to_gff(filtered_df, initial_gff_file):
    """Convert filtered dataframe from gff file back into a gff file."""
    name = 'Pst_104E_v13_' + initial_gff_file[-24] + '_ctg.TE.gff3'
    filtered_df.to_csv(path_or_buf=os.path.join(os.path.abspath(DIRS['GFF_OUT']), name), sep = '\t', header=False, index=False)

# <span style="color:mediumorchid"> Run functions </span>

In [59]:
#Get list of all contigs
all_contigs = contig_list(ref_fh, 'fasta')

In [62]:
#Check if list worked
len(all_contigs)

631

In [238]:
for gff_file in te_model_list:
    initial_df = gff_to_df(gff_file)
    filtered_pdf = filter_gff(initial_df, all_contigs)
    df_to_gff(filtered_pdf, gff_file)

# <span style="color:violet"> Catenate the TE gff files with the gene model gff files and sort. </span>

In [248]:
%%bash
# Use Bash to catenate gene and TE files
cd ../../../../analysis/
cat annotations/Pst_104E_v13_p_ctg.anno.gff3 output/Pst_104E_v13_p_ctg.TE.gff3 > output/Pst_104E_v13_p_ctg_combined_anno.gff3
cat annotations/Pst_104E_v13_h_ctg.anno.gff3 output/Pst_104E_v13_h_ctg.TE.gff3 > output/Pst_104E_v13_h_ctg_combined_anno.gff3

In [2]:
%%bash
# Use gffsort.pl to sort the catenated files
cd /home/anjuni/analysis/output
/home/anjuni/myapps/gff3sort/gff3sort.pl Pst_104E_v13_p_ctg_combined_anno.gff3 >  Pst_104E_v13_p_ctg_combined_sorted_anno.gff3
/home/anjuni/myapps/gff3sort/gff3sort.pl Pst_104E_v13_h_ctg_combined_anno.gff3 >  Pst_104E_v13_h_ctg_combined_sorted_anno.gff3

Smartmatch is experimental at /home/anjuni/myapps/gff3sort/gff3sort.pl line 68.
Smartmatch is experimental at /home/anjuni/myapps/gff3sort/gff3sort.pl line 68.


In [5]:
%%bash
# For STAR RNA-seq mapping, I need a single gff file with annotations for the whole genome, so p and h contigs will be combined
# H-contig is catenated before the P-contig, so it's automatically sorted.
cd ../../../../analysis/gff_output
cat Pst_104E_v13_h_ctg_combined_sorted_anno.gff3 Pst_104E_v13_p_ctg_combined_sorted_anno.gff3 > Pst_104E_v13_ph_ctg_combined_sorted_anno.gff3

In [6]:
%%bash
#remove intermediate combined files after the sorted combiend files were generated
cd /home/anjuni/analysis/gff_output
rm *combined_anno.gff3

rm: cannot remove '*combined_anno.gff3': No such file or directory


# <span style="color:#ff4747"> Sort and catenate the p and h TE gff files and the p and h gene model gff files. </span>

This is for the window analysis comparing TE and gene density throughout the genome :)

In [5]:
%%bash
# Sorting the files
# sort the TE annotation files
cd /home/anjuni/analysis/gff_output
/home/anjuni/myapps/gff3sort/gff3sort.pl Pst_104E_v13_h_ctg.TE.gff3 > Pst_104E_v13_h_ctg.TE.sorted.gff3
/home/anjuni/myapps/gff3sort/gff3sort.pl Pst_104E_v13_p_ctg.TE.gff3 > Pst_104E_v13_p_ctg.TE.sorted.gff3

#sort the gene annotation files
cd /home/anjuni/analysis/annotations
/home/anjuni/myapps/gff3sort/gff3sort.pl Pst_104E_v13_h_ctg.anno.gff3 > ../gff_output/Pst_104E_v13_h_ctg.anno.sorted.gff3
/home/anjuni/myapps/gff3sort/gff3sort.pl Pst_104E_v13_p_ctg.anno.gff3 > ../gff_output/Pst_104E_v13_p_ctg.anno.sorted.gff3

Smartmatch is experimental at /home/anjuni/myapps/gff3sort/gff3sort.pl line 68.
Smartmatch is experimental at /home/anjuni/myapps/gff3sort/gff3sort.pl line 68.
Smartmatch is experimental at /home/anjuni/myapps/gff3sort/gff3sort.pl line 68.
Smartmatch is experimental at /home/anjuni/myapps/gff3sort/gff3sort.pl line 68.


In [7]:
%%bash
# catenate the two TE files and two gene files.
cd /home/anjuni/analysis/gff_output
cat Pst_104E_v13_h_ctg.anno.sorted.gff3 Pst_104E_v13_p_ctg.anno.sorted.gff3 > Pst_104E_v13_ph_ctg.anno.sorted.gff3
cat Pst_104E_v13_h_ctg.TE.sorted.gff3 Pst_104E_v13_p_ctg.TE.sorted.gff3 > Pst_104E_v13_ph_ctg.TE.sorted.gff3

# <span style="color:coral"> Filter SSR lines out of the TE superfamily GFF files and combine and sort them. </span>

In [12]:
%%bash
cd /home/anjuni/analysis/annotations_with_permissions
grep -v 'SSR' Pst_104E_v12_h_ctg.REPET.sorted.superfamily.gff > ../output/Pst_104E_v13_h_ctg.REPET.sorted.filtered.superfamily.gff
grep -v 'SSR' Pst_104E_v12_p_ctg.REPET.sorted.superfamily.gff > ../output/Pst_104E_v13_p_ctg.REPET.sorted.filtered.superfamily.gff
#Use bash to filter out 'SSR' (simple sequence repeats) in the TE superfamily file. Change filtered file name to v13 from v12.

In [13]:
%%bash
cd /home/anjuni/analysis/output
cat Pst_104E_v13_h_ctg.REPET.sorted.filtered.superfamily.gff Pst_104E_v13_p_ctg.REPET.sorted.filtered.superfamily.gff > Pst_104E_v13_ph_ctg.REPET.sorted.filtered.superfamily.gff
#Use bash to catenate the two h and p TE superfamily files.

# <span style="color:orange"> Test functions </span>

In [65]:
# Test to make a list of all contig names
def contig_list_test(reference_genome, reference_format):
    """Goes through reference genome and makes a list of all contig names."""
    count = 0
    contigs = []
    for seq in  SeqIO.parse(reference_genome, reference_format):    
        if count < 10:
            if seq.id not in contigs:
                contigs.append(seq.id)
                count += 1
        else:
            break
    return contigs

In [66]:
# Test contig list function
test_list = contig_list_test(ref_fh, 'fasta')
print(*test_list, sep='\n')

pcontig_193
pcontig_225
pcontig_164
pcontig_166
pcontig_184
pcontig_189
pcontig_129
pcontig_147
pcontig_148
pcontig_095


In [101]:
# Test to convert the gff3 annotation files to pandas dataframes
def gff_to_df(gff_file):
    """Convert gff file into a Pandas dataframe."""
    gff_header_list = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']
    df = pd.read_csv(gff_file, sep='\t', header = None, names = gff_header_list)
    return df

In [166]:
# Test converting the primary contig gff file to a dataframa 
test_df = gff_to_df(te_p_model) # works

In [167]:
# Check the dataframe format
test_df.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,pcontig_000,EVM,gene,135540,135950,.,+,.,ID=evm.TU.pcontig_000.38;locus_tag=Pst104E_302...
1,pcontig_000,EVM,mRNA,135540,135950,.,+,.,ID=evm.model.pcontig_000.38;Parent=evm.TU.pcon...
2,pcontig_000,EVM,exon,135540,135950,.,+,.,ID=evm.model.pcontig_000.38.exon1;Parent=evm.m...
3,pcontig_000,EVM,CDS,135540,135950,.,+,0,ID=cds.evm.model.pcontig_000.38;Parent=evm.mod...
4,pcontig_000,EVM,gene,137365,137865,.,+,.,ID=evm.TU.pcontig_000.39;locus_tag=Pst104E_302...


In [None]:
# Test to filter out removed contigs from gff file dataframe
def filter_gff(gff_df, contig_list):
    filtered_df = gff_df[gff_df['seqname'].isin(contig_list)]
    return filtered_df

In [None]:
# Test p_contig file
test_filtered_df = filter_gff(test_df, all_contigs) # works

In [194]:
# Check the filtered dataframe format
test_filtered_df.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,pcontig_000,EVM,gene,135540,135950,.,+,.,ID=evm.TU.pcontig_000.38;locus_tag=Pst104E_302...
1,pcontig_000,EVM,mRNA,135540,135950,.,+,.,ID=evm.model.pcontig_000.38;Parent=evm.TU.pcon...
2,pcontig_000,EVM,exon,135540,135950,.,+,.,ID=evm.model.pcontig_000.38.exon1;Parent=evm.m...
3,pcontig_000,EVM,CDS,135540,135950,.,+,0,ID=cds.evm.model.pcontig_000.38;Parent=evm.mod...
4,pcontig_000,EVM,gene,137365,137865,.,+,.,ID=evm.TU.pcontig_000.39;locus_tag=Pst104E_302...


In [None]:
# Check whether primary contig gff has a different number of contigs to the number of primary contigs in the contig list
len(set(test_df.seqname.unique())) - len(set([x for x in all_contigs if x.startswith('pcon')])) == 0
#len(set(test_df.seqname.unique())) - len(set([x for x in all_contigs if x.startswith('hcon')])) == 0 # same test for h_contig

In [188]:
# Check how many primary contigs in the contig list
len(set([x for x in all_contigs if x.startswith('pcon')]))
#len(set([x for x in all_contigs if x.startswith('hcon')])) # same test for h_contig

156

In [175]:
# Check how many unique contigs in the gff file
len(set(test_df.seqname.unique()))

153

In [176]:
# Check how many unique contigs in the filtered file
len(set(test_filtered_df.seqname.unique()))

139

In [192]:
# Compare initial with filtered dataframe
print('Initial dataframe: ', len(test_df))
print('Final dataframe:   ', len(test_filtered_df))

Initial dataframe:  22056
Final dataframe:    21972


In [231]:
# Test converting filtered pandas dataframe to gff file.
def df_to_gff_test(filtered_df, initial_gff_file):
    """Convert filtered dataframe from gff file back into a gff file"""
    name = 'Pst_104E_v13_' + initial_gff_file[-24] + '_ctg.TE.gff3'
    filtered_df.to_csv(path_or_buf=os.path.join(os.path.abspath(DIRS['GFF_OUT']), name), sep = '\t', header=False, index=False)

In [232]:
# Test p_contig dataframe
df_to_gff_test(test_filtered_df, te_p_model) #works