In [1]:
%matplotlib inline
import pandas as pd
import os
import re
from Bio import SeqIO
import pysam
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import SearchIO
from pybedtools import BedTool
import numpy as np



In [2]:
source_dir = '/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/032017_assembly'

In [3]:
out_dir = '/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/Pst_104E_v12/TE_analysis'

In [4]:
# Create the output directory together with any missing parent directories.
# exist_ok=True replaces the separate existence check, so the cell can be
# re-run safely and does not race with another process creating the folder.
os.makedirs(out_dir, exist_ok=True)

In [5]:
p_repet_gff = pd.read_csv(out_dir+'/'+ "Pst_104E_v12_p_ctg.REPET.gff", sep='\t', header = None)

In [6]:
p_repet_gff[8].head()

0    ID=ms7308_pcontig_193_DTX-incomp-chim_MCL5_Pst...
1    ID=mp7308-1_pcontig_193_DTX-incomp-chim_MCL5_P...
2    ID=ms7309_pcontig_193_DTX-incomp_MCL9_Pst79_RE...
3    ID=mp7309-1_pcontig_193_DTX-incomp_MCL9_Pst79_...
4    ID=ms7310_pcontig_193_DTX-incomp_MCL9_Pst79_RE...
Name: 8, dtype: object

In [7]:
TE_post_analysis_p = '/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/REPET/Pst79_p/Pst79_p_full_annotate/postanalysis'

In [8]:
# Column names for the REPET annotStatsPerTE table, pasted from its header line.
# str.split() without an argument splits on any run of whitespace, so the
# result contains no empty strings no matter how many blanks separate the names.
TE_post_analysis_p_header = 'TE      length  covg    frags   fullLgthFrags   copies  fullLgthCopies  meanId  sdId    minId   q25Id   medId   q75Id   maxId   meanLgth        sdLgth  minLgth q25Lgth medLgth q75Lgth maxLgth meanLgthPerc    sdLgthPerc      minLgthPerc  q25LgthPerc     medLgthPerc     q75LgthPerc     maxLgthPerc'.split()

In [9]:
TE_post_analysis_p_header = [x for x in TE_post_analysis_p_header if x != '']
TE_post_analysis_p_header

['TE',
 'length',
 'covg',
 'frags',
 'fullLgthFrags',
 'copies',
 'fullLgthCopies',
 'meanId',
 'sdId',
 'minId',
 'q25Id',
 'medId',
 'q75Id',
 'maxId',
 'meanLgth',
 'sdLgth',
 'minLgth',
 'q25Lgth',
 'medLgth',
 'q75Lgth',
 'maxLgth',
 'meanLgthPerc',
 'sdLgthPerc',
 'minLgthPerc',
 'q25LgthPerc',
 'medLgthPerc',
 'q75LgthPerc',
 'maxLgthPerc']

In [10]:
p_repet_summary_df = pd.read_csv(TE_post_analysis_p+'/'+'Pst79p_anno_chr_allTEs_nr_noSSR_join_path.annotStatsPerTE.tab' ,\
                                names = TE_post_analysis_p_header, header=None, sep='\t', skiprows=1 )

In [11]:
# TODO: check whether the tab files can be filtered to remove all TEs that are on the 2000-plus contigs
# TODO: remove tRNA TEs with Infernal

In [13]:
p_repet_summary_df['Code'] = p_repet_summary_df['TE'].apply(lambda x: x.split('_')[0])

In [14]:
p_repet_summary_df['Code'].unique()

array(['DHX-comp', 'DHX-incomp-chim', 'DHX-incomp', 'DMX-incomp',
       'DTX-comp-chim', 'DTX-comp', 'DTX-incomp-chim', 'DTX-incomp',
       'DXX-MITE-chim', 'DXX-MITE', 'DXX', 'noCat', 'PotentialHostGene',
       'RIX-comp', 'RIX-incomp-chim', 'RIX-incomp', 'RLX-comp-chim',
       'RLX-comp', 'RLX-incomp-chim', 'RLX-incomp', 'RPX-incomp-chim',
       'RSX-incomp-chim', 'RSX-incomp', 'RXX-chim', 'RXX-LARD-chim',
       'RXX-LARD', 'RXX-TRIM-chim', 'RXX-TRIM', 'RXX', 'RYX-comp-chim',
       'RYX-comp', 'RYX-incomp-chim', 'RYX-incomp', 'XXX-chim'], dtype=object)

In [15]:
code_keys = p_repet_summary_df['Code'].unique()

In [16]:
code_keys.sort()

In [17]:
# Long-form category label for every REPET classification code.
# The list is positional: it is zipped against the sorted code_keys array, so
# the order and the number of entries (34) must match that array exactly.
# NOTE(review): the four 'Retrotransposon Crypton' entries appear to line up
# with the RYX-* codes; in the Wicker classification RYX usually denotes DIRS
# elements -- confirm the intended label.
code_long = (
    ['DNA_transposon Helitron'] * 3
    + ['DNA_transposon Maverick']
    + ['DNA_transposon TIR'] * 4
    + ['DNA_transposon noCat']
    + ['DNA_transposon MITE'] * 2
    + ['Potential Host Gene']
    + ['Retrotransposon LINE'] * 3
    + ['Retrotransposon LTR'] * 4
    + ['Retrotransposon PLE']
    + ['Retrotransposon SINE'] * 2
    + ['Retrotransposon noCat']
    + ['Retrotransposon LARD'] * 2
    + ['Retrotransposon TRIM'] * 2
    + ['Retrotransposon noCat']
    + ['Retrotransposon Crypton'] * 4
    + ['noCat'] * 2
)

In [18]:
code_dict = dict(zip(code_keys, code_long))

In [19]:
code_dict

{'DHX-comp': 'DNA_transposon Helitron',
 'DHX-incomp': 'DNA_transposon Helitron',
 'DHX-incomp-chim': 'DNA_transposon Helitron',
 'DMX-incomp': 'DNA_transposon Maverick',
 'DTX-comp': 'DNA_transposon TIR',
 'DTX-comp-chim': 'DNA_transposon TIR',
 'DTX-incomp': 'DNA_transposon TIR',
 'DTX-incomp-chim': 'DNA_transposon TIR',
 'DXX': 'DNA_transposon noCat',
 'DXX-MITE': 'DNA_transposon MITE',
 'DXX-MITE-chim': 'DNA_transposon MITE',
 'PotentialHostGene': 'Potential Host Gene',
 'RIX-comp': 'Retrotransposon LINE',
 'RIX-incomp': 'Retrotransposon LINE',
 'RIX-incomp-chim': 'Retrotransposon LINE',
 'RLX-comp': 'Retrotransposon LTR',
 'RLX-comp-chim': 'Retrotransposon LTR',
 'RLX-incomp': 'Retrotransposon LTR',
 'RLX-incomp-chim': 'Retrotransposon LTR',
 'RPX-incomp-chim': 'Retrotransposon PLE',
 'RSX-incomp': 'Retrotransposon SINE',
 'RSX-incomp-chim': 'Retrotransposon SINE',
 'RXX': 'Retrotransposon noCat',
 'RXX-LARD': 'Retrotransposon LARD',
 'RXX-LARD-chim': 'Retrotransposon LARD',
 'RXX

In [20]:
p_repet_summary_df['Code long'] = p_repet_summary_df['Code'].apply(lambda x: code_dict[x])

In [21]:
# Sum the 'covg' and 'copies' columns within each 'Code long' category.
# The string alias 'sum' is used instead of the np.sum callable: recent pandas
# versions warn that callables will stop being mapped to the built-in reduction.
p_repet_summary_sum_df = pd.pivot_table(
    p_repet_summary_df, values=['covg', 'copies'], index='Code long', aggfunc='sum')

In [22]:
# Mean of the 'length' column within each 'Code long' category.
# The string alias 'mean' is used instead of the np.mean callable: recent pandas
# versions warn that callables will stop being mapped to the built-in reduction.
p_repet_summary_mean_df = pd.pivot_table(
    p_repet_summary_df, values='length', index='Code long', aggfunc='mean')

In [23]:
pd.concat([p_repet_summary_sum_df,p_repet_summary_mean_df], axis=1 )

Unnamed: 0_level_0,copies,covg,length
Code long,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DNA_transposon Helitron,1075,817566,2989.764706
DNA_transposon MITE,3789,886304,490.240741
DNA_transposon Maverick,268,345406,8562.666667
DNA_transposon TIR,19166,12711595,4020.474359
DNA_transposon noCat,5286,2376456,3034.734513
Potential Host Gene,1372,1375304,6120.490566
Retrotransposon Crypton,1337,1049299,6874.526316
Retrotransposon LARD,10752,4947564,5407.736111
Retrotransposon LINE,323,237992,4446.625
Retrotransposon LTR,18893,16276421,6384.627841


In [24]:
# Now filter the GFF dataframe to delete all the high-coverage contigs.
# The contig names are read from the first column of pcontig_smaller_2000.txt;
# the path is built from source_dir instead of repeating the absolute path.
contigs_smaller_2000 = pd.read_csv(os.path.join(source_dir, 'pcontig_smaller_2000.txt'),
                                   header=None)[0].tolist()

In [25]:
p_repet_gff_filtered = p_repet_gff[p_repet_gff[0].isin(contigs_smaller_2000)].reset_index(drop=True)

In [26]:
import pybedtools
import multiprocessing
import re

In [28]:
p_repet_gff_filtered[1].unique()

array(['Pst79p_anno_REPET_TEs', 'Pst79p_anno_REPET_SSRs',
       'Pst79p_anno_REPET_tblastx', 'Pst79p_anno_REPET_blastx',
       'Pst79a_anno_REPET_TEs', 'Pst79a_anno_REPET_SSRs',
       'Pst79a_anno_REPET_tblastx', 'Pst79a_anno_REPET_blastx'], dtype=object)

In [29]:
# Shared prefix of every REPET attribute ID: "ID=<match id>_<contig word>_<number>_".
# NOTE: the commas inside the character classes are literal class members; they
# are kept exactly as in the original patterns so that matching is unchanged.
_REPET_ID_PREFIX = r'ID=[A-Z,a-z,0-9,-]*_[A-Z,a-z,0-9]*_[0-9]*_'

# One pre-compiled pattern per feature type, built once rather than on every
# call (the function is applied to each row of the GFF dataframe).
_REPET_ID_PATTERNS = {
    # two '_'-separated tokens that are followed by another '_'
    'REPET_TEs': re.compile(_REPET_ID_PREFIX + r'([A-Z,a-z,0-9,-]*_[A-Z,a-z,0-9,-]*)_'),
    # a single token
    'REPET_SSRs': re.compile(_REPET_ID_PREFIX + r'([A-Z,a-z,0-9,-]*)'),
    # everything up to the next ';', '|' or space
    'REPET_tblastx': re.compile(_REPET_ID_PREFIX + r'([^;| ]*)'),
    'REPET_blastx': re.compile(_REPET_ID_PREFIX + r'([^;| ]*)'),
}


def ID_filter_gff(_feature, _id):
    """
    Parse the top-level ID out of the 9th GFF column of a REPET GFF file.

    A specific search pattern is used for each feature type found in column 2.
    The type is the last two '_'-separated tokens of the source label,
    i.e. '_'.join(_feature.split("_")[-2:]).

    Parameters
    ----------
    _feature : str
        Source label from GFF column 2, e.g. 'Pst79p_anno_REPET_TEs'.
    _id : str
        Attribute string from GFF column 9 (contains 'ID=...').

    Returns
    -------
    str or None
        The captured ID, or None when the feature type is not one of
        REPET_TEs, REPET_SSRs, REPET_tblastx or REPET_blastx.

    Raises
    ------
    ValueError
        If the feature type is known but the attribute string does not match
        its pattern (previously an AttributeError on a None match object).
    """
    _type = '_'.join(_feature.split("_")[-2:])
    pattern = _REPET_ID_PATTERNS.get(_type)
    if pattern is None:
        # Unknown source type: keep the historical behaviour of yielding None.
        return None
    match = pattern.search(_id)
    if match is None:
        raise ValueError('Could not parse a %s ID from attribute string: %r' % (_type, _id))
    return match.group(1)


In [31]:
blast_test = p_repet_gff_filtered[p_repet_gff_filtered[1] == 'Pst79p_anno_REPET_blastx'][8][578]

In [32]:
# Earlier pattern experiments; neither blast_pattern value is used below, only
# the compiled blast_prog is. Raw strings keep regex escapes such as \w from
# being read as (invalid) Python string escapes.
blast_pattern = r'ID=[A-Z,a-z,0-9,-]*_[A-Z,a-z,0-9]*_[0-9]*_([A-Z,a-z,0-9,-]*[_]?[A-Z,a-z,0-9,-]*[_,#,0-9]+?:[A-Z,a-z,0-9,-,:]*)'
blast_pattern = r'ID=[A-Z,a-z,0-9,-]*_[A-Z,a-z,0-9]*_[0-9]*_(\w+);'
# Capture everything after the "ID=<id>_<contig>_<number>_" prefix up to the next ';', '|' or space.
blast_prog = re.compile(r'ID=[A-Z,a-z,0-9,-]*_[A-Z,a-z,0-9]*_[0-9]*_([^;| ]*)')
blast_match = blast_prog.search(blast_test)

In [33]:
blast_match.group(1)

'Copia-48_Mad-I_1p#2:ClassI:LTR:Copia'

In [35]:
p_repet_gff_filtered['ID'] = p_repet_gff_filtered.apply(lambda row: ID_filter_gff(row[1], row[8]), axis=1)

In [36]:
# Filter out potential host genes: drop rows whose attribute column (8) contains "Potential".
# .copy() makes the result an independent frame, so the later column assignment
# on it does not raise pandas' SettingWithCopyWarning.
p_repet_gff_filtered = p_repet_gff_filtered[~p_repet_gff_filtered[8].str.contains("Potential")].copy()

In [37]:
num_unique_REs = len(p_repet_gff_filtered['ID'].unique())

In [38]:
print('This is the number of unique repetitive elements: %i' % num_unique_REs)

This is the number of unique repetitive elements: 48659


In [39]:
frequency_of_REs  = p_repet_gff_filtered.groupby('ID')[0].count()

In [40]:
frequency_of_REs.sort_values(inplace=True)

In [42]:
# Write only the first nine columns (the GFF fields). The helper 'ID' column
# added to this frame would otherwise be written as a tenth column of the .gff.
# NOTE(review): confirm that no downstream step expects the extra ID column in this file.
p_repet_gff_filtered.iloc[:, 0:9].to_csv(out_dir+'/Pst_104E_v12_p_ctg.REPET.filtered.gff', sep='\t', header=False, index=False)

In [43]:
p_repet_gff_filtered[8] = p_repet_gff_filtered['ID']

In [44]:
p_repet_gff_filtered_2 = p_repet_gff_filtered.iloc[:,0:9]

In [46]:
p_repet_gff_filtered_2.to_csv(out_dir+'/Pst_104E_v12_p_ctg.REPET.ID_column.gff', sep='\t', header = None, index=None,columns=None)

In [47]:
RE_id_gff = pybedtools.BedTool(out_dir+'/Pst_104E_v12_p_ctg.REPET.ID_column.gff')

In [49]:
os.chdir(out_dir)

In [50]:
repet_gff = 'Pst_104E_v12_p_ctg.REPET.gff'

In [51]:
repet_prefix = '.'.join(repet_gff.split('.')[0:2])

In [52]:
g = RE_id_gff.remove_invalid().saveas()

In [53]:
# Next, we create a function to pass only features for a particular
# featuretype.  This is similar to a "grep" operation when applied to every
# feature in a BedTool
def id_filter(feature, _id):
    """Tell whether the field at index 8 of ``feature`` equals ``_id``.

    Used as the predicate passed to ``BedTool.filter``; returns True for a
    match and False otherwise.
    """
    return bool(feature[8] == _id)

In [111]:
def subset_id(_id):
    """Subset the repeat GFF to one ID and save it, plus its coverage, in a category folder.

    The output folder is looked up in ``TE_path_dict``:

    * IDs containing ``'ClassI:'`` (retrotransposons from blast) -> ``'Retrotransposon'``
    * IDs containing ``'ClassII'`` (DNA transposons from blast) -> ``'DNA_transposon'``
    * IDs whose prefix before the first ``'_'`` is a key of ``code_dict``
      (REPET TEs) -> the first word of the matching long code
    * IDs without any ``'_'`` (expected to be SSRs) -> ``'SSR'``
    * anything else -> ``'noCat'`` (previously this case failed with an
      UnboundLocalError because no output folder had been chosen)

    Reads the notebook-level names ``g``, ``TE_path_dict``, ``code_dict``,
    ``repet_prefix`` and ``p_genome_file``.

    Side effects: writes ``<folder>/<repet_prefix>.<_id>.gff`` and the matching
    ``.cov`` file produced by ``genome_coverage(dz=True)``. Returns None.
    """
    code = _id.split('_')[0]
    if 'ClassI:' in _id:
        out_path = TE_path_dict['Retrotransposon']
    elif 'ClassII' in _id:
        out_path = TE_path_dict['DNA_transposon']
    elif code in code_dict:
        key = code_dict[code].split(' ')[0]
        out_path = TE_path_dict[key]
    elif '_' not in _id:
        out_path = TE_path_dict['SSR']
    else:
        # Unclassifiable ID: file it under 'noCat' instead of crashing below.
        out_path = TE_path_dict['noCat']
    out_fn = out_path+'/'+repet_prefix+'.'+_id+'.gff'
    result = g.filter(id_filter, _id).saveas(out_fn)
    # Swap only the file extension; a plain str.replace would also rewrite any
    # 'gff' occurring in the directory names or in the ID itself.
    cov_fn = os.path.splitext(out_fn)[0] + '.cov'
    cov = result.genome_coverage(dz=True, g=p_genome_file)
    cov.saveas(cov_fn)

In [58]:
TE_types = ['Retrotransposon', 'DNA_transposon', 'noCat', 'SSR']
TE_path = [os.path.join(out_dir, x) for x in TE_types]
TE_path_dict = dict(zip(TE_types, TE_path))

In [60]:
# Create one output folder per TE category below out_dir.
# exist_ok=True replaces the explicit existence check and keeps the cell re-runnable.
for TE_type in TE_types:
    new_path = os.path.join(out_dir, TE_type)
    os.makedirs(new_path, exist_ok=True)

In [105]:
_id = frequency_of_REs.index.tolist()

In [None]:
# subset_id is called for its side effects only (it writes files and returns
# None), so use a plain loop rather than building a throw-away list of None.
# NOTE(review): _test_id is assigned in a later cell; run that cell first.
for x in _test_id:
    subset_id(x)

In [110]:
full_path_list[0].replace('gff','cov')

'/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/Pst_104E_v12/TE_analysis/SSR/Pst_104E_v12_p_ctg.REPETGAGCAGGTA3.cov'

In [113]:
_test_id = _id[0:5]

In [63]:
g = RE_id_gff.remove_invalid().saveas()

In [None]:
[x for x in _test_id if '_' not in x]

In [106]:
p_genome_file = 'Pst_104E_v12_p_ctg.genome_file'

In [65]:
all_cov_RE = g.genome_coverage(dz=True,g=p_genome_file)

In [66]:
all_cov_RE.saveas('Pst_104E_v12_p_ctg.REPET.ID_column.cov')

<BedTool(Pst_104E_v12_p_ctg.REPET.ID_column.cov)>

In [69]:
p_repet_gff_filtered[1].unique()

array(['Pst79p_anno_REPET_TEs', 'Pst79p_anno_REPET_SSRs',
       'Pst79p_anno_REPET_tblastx', 'Pst79p_anno_REPET_blastx',
       'Pst79a_anno_REPET_TEs', 'Pst79a_anno_REPET_SSRs',
       'Pst79a_anno_REPET_tblastx', 'Pst79a_anno_REPET_blastx'], dtype=object)

In [74]:
# List IDs from the TE source that contain no '_'.
# The label must match column 1 exactly ('Pst79p_anno_REPET_TEs', with the
# trailing 's'); without it the mask selects no rows and the result is
# trivially empty.
[x for x in p_repet_gff_filtered_2[(p_repet_gff_filtered_2[1] == 'Pst79p_anno_REPET_TEs')  ][8].unique() if '_' not in x]

[]

In [68]:
# Unique IDs from the two Pst79p blast sources (tblastx and blastx) that contain no '_'.
_is_p_blast = p_repet_gff_filtered_2[1].isin(['Pst79p_anno_REPET_tblastx', 'Pst79p_anno_REPET_blastx'])
[blast_id for blast_id in p_repet_gff_filtered_2.loc[_is_p_blast, 8].unique() if '_' not in blast_id]

['ATCOPIA43I:ClassI:LTR:Copia',
 'CFT1:ClassI:LTR:Gypsy',
 'TCN1-I:ClassI:LTR:Gypsy',
 'PsAppalachian-I:ClassI:LTR:Gypsy',
 'SUSHII:ClassI:LTR:Gypsy',
 'MtPH-A6-4-Ia:ClassII:TIR:PIF-Harbinger',
 'MtPH-M-2-Ia:ClassII:TIR:PIF-Harbinger',
 'MtPH-M-2-IIa:ClassII:TIR:PIF-Harbinger',
 'GORPI:ClassI:LTR:Gypsy',
 'YOYOI:ClassI:LTR:Gypsy',
 'SUSHIIDR1:ClassI:LTR:Gypsy',
 'CYCLO:ClassI:?:?',
 'PTERV2a:ClassI:LTR:ERV',
 'TCN2-I:ClassI:LTR:?',
 'TCN3-I:ClassI:LTR:?',
 'HERVH48I:ClassI:LTR:Retrovirus',
 'HERVR:ClassI:LTR:ERV',
 'CfERVF1:ClassI:LTR:ERV',
 'NONAUT-5:ClassI:LTR:Gypsy',
 'ATCOPIA18I:ClassI:LTR:Copia',
 'ATCOPIA2I:ClassI:LTR:Copia',
 'ATCOPIA5I:ClassI:LTR:Copia',
 'ATCOPIA6I:ClassI:LTR:Copia',
 'CIRCE:ClassI:LTR:?',
 'HERV-Fc2:ClassI:LTR:ERV',
 'RNERV19:ClassI:LTR:ERV',
 'ATCOPIA13I:ClassI:LTR:Copia',
 'CcNgaro3:ClassI:DIRS:DIRS',
 'PTERV2b:ClassI:LTR:ERV',
 'HERV46I:ClassI:?:?',
 'Howilli1:ClassII:TIR:hAT',
 'SZ-6IN:ClassI:LTR:Copia',
 'HELITRON2:ClassII:Helitron:Helitron',
 'ATCOPIA97

In [None]:
p_repet_gff[p_repet_gff[8].str.contains('Copia6-VV_I_')][8]