In [1]:
%matplotlib inline
import pandas as pd
import os
import re
from Bio import SeqIO
import pysam
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import SearchIO
from pybedtools import BedTool
import numpy as np



In [2]:
source_dir = '/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/032017_assembly'

In [3]:
out_dir = '/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/Pst_104E_v12/TE_analysis'

In [4]:
if not os.path.exists(out_dir):
    os.mkdir(out_dir)

In [5]:
p_repet_gff = pd.read_csv(source_dir+'/'+ "Pst_104E_v12_p_ctg.REPET.gff", sep='\t', header = None, skiprows=1)

In [6]:
p_repet_gff[8].head()

0    ID=ms7308_pcontig_193_DTX-incomp-chim_MCL5_Pst...
1    ID=mp7308-1_pcontig_193_DTX-incomp-chim_MCL5_P...
2    ID=ms7309_pcontig_193_DTX-incomp_MCL9_Pst79_RE...
3    ID=mp7309-1_pcontig_193_DTX-incomp_MCL9_Pst79_...
4    ID=ms7310_pcontig_193_DTX-incomp_MCL9_Pst79_RE...
Name: 8, dtype: object

In [7]:
TE_post_analysis_p = '/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/REPET/Pst79_p/Pst79_p_full_annotate/postanalysis'

In [8]:
TE_post_analysis_p_header = 'TE      length  covg    frags   fullLgthFrags   copies  fullLgthCopies  meanId  sdId    minId   q25Id   medId   q75Id   maxId   meanLgth        sdLgth  minLgth q25Lgth medLgth q75Lgth maxLgth meanLgthPerc    sdLgthPerc      minLgthPerc  q25LgthPerc     medLgthPerc     q75LgthPerc     maxLgthPerc'.split(' ')

In [9]:
TE_post_analysis_p_header = [x for x in TE_post_analysis_p_header if x != '']
TE_post_analysis_p_header

['TE',
 'length',
 'covg',
 'frags',
 'fullLgthFrags',
 'copies',
 'fullLgthCopies',
 'meanId',
 'sdId',
 'minId',
 'q25Id',
 'medId',
 'q75Id',
 'maxId',
 'meanLgth',
 'sdLgth',
 'minLgth',
 'q25Lgth',
 'medLgth',
 'q75Lgth',
 'maxLgth',
 'meanLgthPerc',
 'sdLgthPerc',
 'minLgthPerc',
 'q25LgthPerc',
 'medLgthPerc',
 'q75LgthPerc',
 'maxLgthPerc']

In [10]:
p_repet_summary_df = pd.read_csv(TE_post_analysis_p+'/'+'Pst79p_anno_chr_allTEs_nr_noSSR_join_path.annotStatsPerTE.tab' ,\
                                names = TE_post_analysis_p_header, header=None, sep='\t', skiprows=1 )

In [11]:
#check if I can filter the tab files for removing all TEs that are on the 2000 plus contigs
#remove tRNAs TEs with infernal

In [12]:
p_repet_summary_df

Unnamed: 0,TE,length,covg,frags,fullLgthFrags,copies,fullLgthCopies,meanId,sdId,minId,...,medLgth,q75Lgth,maxLgth,meanLgthPerc,sdLgthPerc,minLgthPerc,q25LgthPerc,medLgthPerc,q75LgthPerc,maxLgthPerc
0,DHX-comp_MCL7_Pst79_REPET-B-G1637-Map3_reversed,6846,83754,57,5,45,5,87.95,8.91,59.20,...,690.0,2511.0,6846.0,27.19,33.33,0.34,3.30,10.08,36.68,100.00
1,DHX-comp_MCL913_Pst79_REPET-B-R1806-Map7,5263,60250,52,2,43,2,64.67,11.98,55.80,...,1287.0,1943.0,5263.0,26.62,24.40,0.53,1.98,24.45,36.92,100.00
2,DHX-incomp-chim_MCL1088_Pst79_REPET-B-R439-Map4,6573,25628,25,2,24,2,74.05,14.87,56.10,...,141.5,1160.0,6593.0,16.25,30.71,0.38,0.64,2.15,17.65,100.30
3,DHX-incomp-chim_MCL275_Pst79_REPET-B-G1603-Map3,6321,58446,65,4,54,4,71.73,13.08,56.50,...,298.5,1276.0,6324.0,17.12,27.02,0.35,1.57,4.72,20.19,100.05
4,DHX-incomp-chim_MCL66_Pst79_REPET-B-R1526-Map5...,6436,64594,64,5,56,5,75.09,10.98,57.10,...,280.0,1485.0,6440.0,17.93,28.32,0.36,1.32,4.35,23.07,100.06
5,DHX-incomp-chim_MCL7_Pst79_REPET-L-B131-Map1_r...,14467,97263,214,1,182,1,84.33,9.19,60.00,...,188.0,788.0,14467.0,3.70,8.02,0.15,0.54,1.30,5.45,100.00
6,DHX-incomp_MCL1022_Pst79_REPET-B-R332-Map3,1957,13993,13,4,12,4,77.68,15.13,55.60,...,1221.0,1957.0,1990.0,59.59,39.75,5.16,24.48,62.39,100.00,101.69
7,DHX-incomp_MCL1036_Pst79_REPET-B-R3503-Map3,923,4019,17,1,15,1,83.39,9.49,69.10,...,172.0,257.0,910.0,29.03,33.12,5.09,5.96,18.63,27.84,98.59
8,DHX-incomp_MCL1048_Pst79_REPET-B-R3670-Map4_re...,1274,11536,23,3,22,3,74.42,14.21,59.20,...,415.0,1081.0,1301.0,41.16,36.17,3.38,7.77,32.57,84.85,102.12
9,DHX-incomp_MCL1049_Pst79_REPET-B-R3672-Map3_re...,852,7394,11,2,11,2,75.69,16.47,57.70,...,716.0,763.0,852.0,78.89,21.99,19.48,72.65,84.04,89.55,100.00


In [13]:
test_df = p_repet_summary_df.head(600)

In [14]:
test_df

Unnamed: 0,TE,length,covg,frags,fullLgthFrags,copies,fullLgthCopies,meanId,sdId,minId,...,medLgth,q75Lgth,maxLgth,meanLgthPerc,sdLgthPerc,minLgthPerc,q25LgthPerc,medLgthPerc,q75LgthPerc,maxLgthPerc
0,DHX-comp_MCL7_Pst79_REPET-B-G1637-Map3_reversed,6846,83754,57,5,45,5,87.95,8.91,59.20,...,690.0,2511.0,6846.0,27.19,33.33,0.34,3.30,10.08,36.68,100.00
1,DHX-comp_MCL913_Pst79_REPET-B-R1806-Map7,5263,60250,52,2,43,2,64.67,11.98,55.80,...,1287.0,1943.0,5263.0,26.62,24.40,0.53,1.98,24.45,36.92,100.00
2,DHX-incomp-chim_MCL1088_Pst79_REPET-B-R439-Map4,6573,25628,25,2,24,2,74.05,14.87,56.10,...,141.5,1160.0,6593.0,16.25,30.71,0.38,0.64,2.15,17.65,100.30
3,DHX-incomp-chim_MCL275_Pst79_REPET-B-G1603-Map3,6321,58446,65,4,54,4,71.73,13.08,56.50,...,298.5,1276.0,6324.0,17.12,27.02,0.35,1.57,4.72,20.19,100.05
4,DHX-incomp-chim_MCL66_Pst79_REPET-B-R1526-Map5...,6436,64594,64,5,56,5,75.09,10.98,57.10,...,280.0,1485.0,6440.0,17.93,28.32,0.36,1.32,4.35,23.07,100.06
5,DHX-incomp-chim_MCL7_Pst79_REPET-L-B131-Map1_r...,14467,97263,214,1,182,1,84.33,9.19,60.00,...,188.0,788.0,14467.0,3.70,8.02,0.15,0.54,1.30,5.45,100.00
6,DHX-incomp_MCL1022_Pst79_REPET-B-R332-Map3,1957,13993,13,4,12,4,77.68,15.13,55.60,...,1221.0,1957.0,1990.0,59.59,39.75,5.16,24.48,62.39,100.00,101.69
7,DHX-incomp_MCL1036_Pst79_REPET-B-R3503-Map3,923,4019,17,1,15,1,83.39,9.49,69.10,...,172.0,257.0,910.0,29.03,33.12,5.09,5.96,18.63,27.84,98.59
8,DHX-incomp_MCL1048_Pst79_REPET-B-R3670-Map4_re...,1274,11536,23,3,22,3,74.42,14.21,59.20,...,415.0,1081.0,1301.0,41.16,36.17,3.38,7.77,32.57,84.85,102.12
9,DHX-incomp_MCL1049_Pst79_REPET-B-R3672-Map3_re...,852,7394,11,2,11,2,75.69,16.47,57.70,...,716.0,763.0,852.0,78.89,21.99,19.48,72.65,84.04,89.55,100.00


In [15]:
test_df[test_df['TE'].str.match('DXX-MITE_MCL6_')]

Unnamed: 0,TE,length,covg,frags,fullLgthFrags,copies,fullLgthCopies,meanId,sdId,minId,...,medLgth,q75Lgth,maxLgth,meanLgthPerc,sdLgthPerc,minLgthPerc,q25LgthPerc,medLgthPerc,q75LgthPerc,maxLgthPerc
584,DXX-MITE_MCL6_Pst79_REPET-B-P544.10-Map3_reversed,587,2534,5,4,5,4,93.54,5.22,84.5,...,585.0,585.0,585.0,86.34,28.2,35.95,96.76,99.66,99.66,99.66
585,DXX-MITE_MCL6_Pst79_REPET-B-R95-Map10_reversed,594,9501,29,4,27,4,88.0,7.67,70.07,...,338.0,550.0,587.0,59.24,31.57,10.77,26.43,56.9,92.59,98.82
586,DXX-MITE_MCL6_Pst79_REPET_RS_1331,587,9958,35,5,32,6,82.97,6.3,71.5,...,287.5,552.0,641.0,53.01,34.26,8.52,21.98,48.98,94.04,109.2


In [16]:
test_df['Code'] = test_df['TE'].apply(lambda x: x.split('-')[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [17]:
test_df['Code'].unique()

array(['DHX', 'DMX', 'DTX', 'DXX'], dtype=object)

In [18]:
p_repet_summary_df['Code'] = p_repet_summary_df['TE'].apply(lambda x: x.split('_')[0])

In [19]:
p_repet_summary_df['Code'].unique()

array(['DHX-comp', 'DHX-incomp-chim', 'DHX-incomp', 'DMX-incomp',
       'DTX-comp-chim', 'DTX-comp', 'DTX-incomp-chim', 'DTX-incomp',
       'DXX-MITE-chim', 'DXX-MITE', 'DXX', 'noCat', 'PotentialHostGene',
       'RIX-comp', 'RIX-incomp-chim', 'RIX-incomp', 'RLX-comp-chim',
       'RLX-comp', 'RLX-incomp-chim', 'RLX-incomp', 'RPX-incomp-chim',
       'RSX-incomp-chim', 'RSX-incomp', 'RXX-chim', 'RXX-LARD-chim',
       'RXX-LARD', 'RXX-TRIM-chim', 'RXX-TRIM', 'RXX', 'RYX-comp-chim',
       'RYX-comp', 'RYX-incomp-chim', 'RYX-incomp', 'XXX-chim'], dtype=object)

In [20]:
code_keys = p_repet_summary_df['Code'].unique()

In [21]:
code_keys.sort()

In [22]:
code_keys

array(['DHX-comp', 'DHX-incomp', 'DHX-incomp-chim', 'DMX-incomp',
       'DTX-comp', 'DTX-comp-chim', 'DTX-incomp', 'DTX-incomp-chim', 'DXX',
       'DXX-MITE', 'DXX-MITE-chim', 'PotentialHostGene', 'RIX-comp',
       'RIX-incomp', 'RIX-incomp-chim', 'RLX-comp', 'RLX-comp-chim',
       'RLX-incomp', 'RLX-incomp-chim', 'RPX-incomp-chim', 'RSX-incomp',
       'RSX-incomp-chim', 'RXX', 'RXX-LARD', 'RXX-LARD-chim', 'RXX-TRIM',
       'RXX-TRIM-chim', 'RXX-chim', 'RYX-comp', 'RYX-comp-chim',
       'RYX-incomp', 'RYX-incomp-chim', 'XXX-chim', 'noCat'], dtype=object)

In [23]:
code_long = ['DNA transposon Helitron', 'DNA transposon Helitron', 'DNA transposon Helitron', 'DNA transposon Maverick',\
            'DNA transposon TIR', 'DNA transposon TIR', 'DNA transposon TIR', 'DNA transposon TIR', 'DNA transposon noCat',\
             'DNA transposon MITE','DNA transposon MITE', 'Potential Host Gene', 'Retrotransposon LINE', 'Retrotransposon LINE',\
             'Retrotransposon LINE','Retrotransposon LTR','Retrotransposon LTR', 'Retrotransposon LTR', 'Retrotransposon LTR', 'Retrotransposon PLE', \
             'Retrotransposon SINE',  'Retrotransposon SINE', 'Retrotransposon noCat', 'Retrotransposon LARD',\
             'Retrotransposon LARD', 'Retrotransposon TRIM', 'Retrotransposon TRIM', 'Retrotransposon noCat',  \
             'Retrotransposon Crypton','Retrotransposon Crypton','Retrotransposon Crypton','Retrotransposon Crypton',\
             'noCat', 'noCat']

In [24]:
code_dict = dict(zip(code_keys, code_long))

In [25]:
code_dict

{'DHX-comp': 'DNA transposon Helitron',
 'DHX-incomp': 'DNA transposon Helitron',
 'DHX-incomp-chim': 'DNA transposon Helitron',
 'DMX-incomp': 'DNA transposon Maverick',
 'DTX-comp': 'DNA transposon TIR',
 'DTX-comp-chim': 'DNA transposon TIR',
 'DTX-incomp': 'DNA transposon TIR',
 'DTX-incomp-chim': 'DNA transposon TIR',
 'DXX': 'DNA transposon noCat',
 'DXX-MITE': 'DNA transposon MITE',
 'DXX-MITE-chim': 'DNA transposon MITE',
 'PotentialHostGene': 'Potential Host Gene',
 'RIX-comp': 'Retrotransposon LINE',
 'RIX-incomp': 'Retrotransposon LINE',
 'RIX-incomp-chim': 'Retrotransposon LINE',
 'RLX-comp': 'Retrotransposon LTR',
 'RLX-comp-chim': 'Retrotransposon LTR',
 'RLX-incomp': 'Retrotransposon LTR',
 'RLX-incomp-chim': 'Retrotransposon LTR',
 'RPX-incomp-chim': 'Retrotransposon PLE',
 'RSX-incomp': 'Retrotransposon SINE',
 'RSX-incomp-chim': 'Retrotransposon SINE',
 'RXX': 'Retrotransposon noCat',
 'RXX-LARD': 'Retrotransposon LARD',
 'RXX-LARD-chim': 'Retrotransposon LARD',
 'RXX

In [26]:
p_repet_summary_df['Code long'] = p_repet_summary_df['Code'].apply(lambda x: code_dict[x])

In [27]:
p_repet_summary_df['Code long']

0       DNA transposon Helitron
1       DNA transposon Helitron
2       DNA transposon Helitron
3       DNA transposon Helitron
4       DNA transposon Helitron
5       DNA transposon Helitron
6       DNA transposon Helitron
7       DNA transposon Helitron
8       DNA transposon Helitron
9       DNA transposon Helitron
10      DNA transposon Helitron
11      DNA transposon Helitron
12      DNA transposon Helitron
13      DNA transposon Helitron
14      DNA transposon Helitron
15      DNA transposon Helitron
16      DNA transposon Helitron
17      DNA transposon Helitron
18      DNA transposon Helitron
19      DNA transposon Helitron
20      DNA transposon Helitron
21      DNA transposon Helitron
22      DNA transposon Helitron
23      DNA transposon Helitron
24      DNA transposon Helitron
25      DNA transposon Helitron
26      DNA transposon Helitron
27      DNA transposon Helitron
28      DNA transposon Helitron
29      DNA transposon Helitron
                 ...           
1803    

In [28]:
p_repet_summary_sum_df = pd.pivot_table(p_repet_summary_df, values=['covg', 'copies'], index='Code long', aggfunc=np.sum)

In [29]:
p_repet_summary_mean_df = pd.pivot_table(p_repet_summary_df, values='length', index='Code long', aggfunc=np.mean)

In [30]:
pd.concat([p_repet_summary_sum_df,p_repet_summary_mean_df], axis=1 )

Unnamed: 0_level_0,copies,covg,length
Code long,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DNA transposon Helitron,1075,817566,2989.764706
DNA transposon MITE,3789,886304,490.240741
DNA transposon Maverick,268,345406,8562.666667
DNA transposon TIR,19166,12711595,4020.474359
DNA transposon noCat,5286,2376456,3034.734513
Potential Host Gene,1372,1375304,6120.490566
Retrotransposon Crypton,1337,1049299,6874.526316
Retrotransposon LARD,10752,4947564,5407.736111
Retrotransposon LINE,323,237992,4446.625
Retrotransposon LTR,18893,16276421,6384.627841


In [31]:
#now filter the gff dataframe to delete all the high coverage contigs
contigs_smaller_2000 = pd.read_csv('/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/032017_assembly/pcontig_smaller_2000.txt',\
                                  header=None)[0].tolist()

In [32]:
p_repet_gff_filtered = p_repet_gff[p_repet_gff[0].isin(contigs_smaller_2000)].reset_index(drop=True)

In [33]:
p_repet_gff_filtered.loc[1000:20000,:]

Unnamed: 0,0,1,2,3,4,5,6,7,8
1000,pcontig_184,Pst79p_anno_REPET_tblastx,match_part,34368.0,34499.0,6.000000e-68,-,.,ID=mp2703-1_pcontig_184_OSCOPIA2_I:ClassI:LTR:...
1001,pcontig_184,Pst79p_anno_REPET_tblastx,match,11191.0,11295.0,0.000000e+00,+,.,ID=ms2704_pcontig_184_OSTONOR1_I:ClassI:LTR:Co...
1002,pcontig_184,Pst79p_anno_REPET_tblastx,match_part,11191.0,11295.0,2.000000e-32,+,.,ID=mp2704-1_pcontig_184_OSTONOR1_I:ClassI:LTR:...
1003,pcontig_184,Pst79p_anno_REPET_tblastx,match,28367.0,28432.0,0.000000e+00,-,.,ID=ms2705_pcontig_184_PCretro3_I:ClassI:LTR:Co...
1004,pcontig_184,Pst79p_anno_REPET_tblastx,match_part,28367.0,28432.0,1.000000e-84,-,.,ID=mp2705-1_pcontig_184_PCretro3_I:ClassI:LTR:...
1005,pcontig_184,Pst79p_anno_REPET_tblastx,match,27929.0,27994.0,0.000000e+00,-,.,ID=ms2706_pcontig_184_RETROFIT_I:ClassI:LTR:Co...
1006,pcontig_184,Pst79p_anno_REPET_tblastx,match_part,27929.0,27994.0,4.000000e-53,-,.,ID=mp2706-1_pcontig_184_RETROFIT_I:ClassI:LTR:...
1007,pcontig_184,Pst79p_anno_REPET_tblastx,match,23690.0,23881.0,0.000000e+00,-,.,ID=ms2707_pcontig_184_RLG_scAle_1_1-I:ClassI:L...
1008,pcontig_184,Pst79p_anno_REPET_tblastx,match_part,23690.0,23881.0,1.000000e-55,-,.,ID=mp2707-1_pcontig_184_RLG_scAle_1_1-I:ClassI...
1009,pcontig_184,Pst79p_anno_REPET_tblastx,match,12400.0,12437.0,0.000000e+00,-,.,ID=ms2708_pcontig_184_SHACOP11_I_MT:ClassI:LTR...


In [34]:
import pybedtools
import multiprocessing
import re

In [35]:
p_repet_summary_df[p_repet_summary_df['TE'].str.match('MuDR-1')]

Unnamed: 0,TE,length,covg,frags,fullLgthFrags,copies,fullLgthCopies,meanId,sdId,minId,...,maxLgth,meanLgthPerc,sdLgthPerc,minLgthPerc,q25LgthPerc,medLgthPerc,q75LgthPerc,maxLgthPerc,Code,Code long


In [36]:
p_repet_gff_filtered[1].unique()

array(['Pst79p_anno_REPET_TEs', 'Pst79p_anno_REPET_SSRs',
       'Pst79p_anno_REPET_tblastx', 'Pst79p_anno_REPET_blastx',
       'Pst79a_anno_REPET_TEs', 'Pst79a_anno_REPET_SSRs',
       'Pst79a_anno_REPET_tblastx', 'Pst79a_anno_REPET_blastx'], dtype=object)

In [37]:
p_repet_gff_filtered[p_repet_gff_filtered[1] == 'Pst79p_anno_REPET_TEs'][8][1]
                                                                         

'ID=mp7308-1_pcontig_193_DTX-incomp-chim_MCL5_Pst79_REPET-B-G1667-Map3;Parent=ms7308_pcontig_193_DTX-incomp-chim_MCL5_Pst79_REPET-B-G1667-Map3;Target=DTX-incomp-chim_MCL5_Pst79_REPET-B-G1667-Map3 8186 8708;Identity=80.2'

In [300]:
def ID_filter_gff(_feature, _id):
    """
    This filter parses out the top level id form the 9th gff column form a REPET gff file.
    It has a specific search pattern for each feature type in column 2.
    _type is defined by the feature '_'.join(feature.split("_")[-2:])
    """
    _type = '_'.join(_feature.split("_")[-2:])
    if _type == 'REPET_TEs':

        TE_pattern = 'ID=[A-Z,a-z,0-9,-]*_[A-Z,a-z,0-9]*_[0-9]*_([A-Z,a-z,0-9,-]*_[A-Z,a-z,0-9,-]*)_'
        TE_prog = re.compile(TE_pattern)
        TE_match = TE_prog.search(_id)
        return TE_match.group(1)
    if _type == 'REPET_SSRs':
        
        SSR_pattern = 'ID=[A-Z,a-z,0-9,-]*_[A-Z,a-z,0-9]*_[0-9]*_([A-Z,a-z,0-9,-]*)'
        SSR_prog = re.compile(SSR_pattern)
        SSR_match = SSR_prog.search(_id)
        return SSR_match.group(1)
    if _type == 'REPET_tblastx' or _type == 'REPET_blastx':
        #if "#" in _id:
        #     blast_pattern = 'ID=[A-Z,a-z,0-9,-]*_[A-Z,a-z,0-9]*_[0-9]*_([A-Z,a-z,0-9,-]*[_]?[A-Z,a-z,0-9,-]*[_|#|0-9]+?:[A-Z,a-z,0-9,-,:]*)'
        #else:
        #    blast_pattern = r'ID=[A-Z,a-z,0-9,-]*_[A-Z,a-z,0-9]*_[0-9]*_([\w+|:|-]*)'
        blast_prog = re.compile(r'ID=[A-Z,a-z,0-9,-]*_[A-Z,a-z,0-9]*_[0-9]*_([^;| ]*)')
        #blast_prog = re.compile(blast_pattern)
        blast_match = blast_prog.search(_id)
        return blast_match.group(1)


In [39]:
TE_test = p_repet_gff_filtered[p_repet_gff_filtered[1] == 'Pst79p_anno_REPET_TEs'][8][1]

In [227]:
SSR_test = p_repet_gff_filtered[p_repet_gff_filtered[1] == 'Pst79p_anno_REPET_SSRs'][8][67]

In [41]:
p_repet_gff_filtered[p_repet_gff_filtered[1] == 'Pst79p_anno_REPET_tblastx'][8].head(10-20)

69        ID=ms11576_pcontig_193_ATCOPIA69A_I:ClassI:LTR...
70        ID=mp11576-1_pcontig_193_ATCOPIA69A_I:ClassI:L...
71        ID=ms11577_pcontig_193_Copia-104_GM-I:ClassI:L...
72        ID=mp11577-1_pcontig_193_Copia-104_GM-I:ClassI...
73        ID=ms11578_pcontig_193_Copia-137_AA-I:ClassI:L...
74        ID=mp11578-1_pcontig_193_Copia-137_AA-I:ClassI...
75        ID=ms11579_pcontig_193_Copia-13_PTrit-I:ClassI...
76        ID=mp11579-1_pcontig_193_Copia-13_PTrit-I:Clas...
77        ID=ms11580_pcontig_193_Copia-150_CCri-I:ClassI...
78        ID=mp11580-1_pcontig_193_Copia-150_CCri-I:Clas...
79        ID=ms11581_pcontig_193_Copia-15_PPe-I:ClassI:L...
80        ID=mp11581-1_pcontig_193_Copia-15_PPe-I:ClassI...
81        ID=ms11582_pcontig_193_Copia-29_MLP-I:ClassI:L...
82        ID=mp11582-1_pcontig_193_Copia-29_MLP-I:ClassI...
83        ID=ms11583_pcontig_193_Copia-29_TC-I:ClassI:LT...
84        ID=mp11583-1_pcontig_193_Copia-29_TC-I:ClassI:...
85        ID=ms11584_pcontig_193_Copia-2

In [280]:
blast_test = p_repet_gff_filtered[p_repet_gff_filtered[1] == 'Pst79p_anno_REPET_blastx'][8][578]

In [297]:
blast_pattern = 'ID=[A-Z,a-z,0-9,-]*_[A-Z,a-z,0-9]*_[0-9]*_([A-Z,a-z,0-9,-]*[_]?[A-Z,a-z,0-9,-]*[_,#,0-9]+?:[A-Z,a-z,0-9,-,:]*)'
blast_pattern = 'ID=[A-Z,a-z,0-9,-]*_[A-Z,a-z,0-9]*_[0-9]*_(\w+);'
blast_prog = re.compile(r'ID=[A-Z,a-z,0-9,-]*_[A-Z,a-z,0-9]*_[0-9]*_([^;| ]*)')
blast_match = blast_prog.search(blast_test)

In [298]:
blast_match.group(1)

'Copia-48_Mad-I_1p#2:ClassI:LTR:Copia'

In [45]:
'_'.join('Pst79p_anno_REPET_TEs'.split("_")[-2:])

'REPET_TEs'

In [301]:
p_repet_gff_filtered['ID'] = p_repet_gff_filtered.apply(lambda row: ID_filter_gff(row[1], row[8]), axis=1)

In [269]:
p_repet_gff_filtered[8][578]

'ID=ms1410_pcontig_164_Copia-48_Mad-I_1p#2:ClassI:LTR:Copia;Target=Copia-48_Mad-I_1p#2:ClassI:LTR:Copia 1012 1032;TargetLength=1040;Identity=28.29'

In [305]:
num_unique_REs = len(p_repet_gff_filtered['ID'].unique())

In [307]:
print('This is the number of unique repetitive elements: %i' % num_unique_REs)

This is the number of unique repetitive elements: 48713


In [315]:
frequency_of_REs  = p_repet_gff_filtered.groupby('ID')[0].count()

In [321]:
frequency_of_REs.sort_values(inplace=True)

In [322]:
frequency_of_REs 

ID
GAGCGCGA6                               2
AAAAATTCAAG3                            2
AAAAATTGAAGTAG2                         2
AAAGAATGAGG2                            2
AAAGAATTTT2                             2
AACGGCAGCA2                             2
AACAATAATACGC2                          2
AACAACTCCC4                             2
AACAACTCTAGC3                           2
TGTCTT5                                 2
TGTCTT2                                 2
TGTCTGCTGAGTACA2                        2
AACAAG6                                 2
AACAAGA3                                2
AACAAGAAA3                              2
AACAAGAAGACCAAC3                        2
TGTCTGCTAGG2                            2
AACAAGACG2                              2
AACAAGAGAGT2                            2
AACAAGCATGGTCAA2                        2
AACAAGCTATTGATG2                        2
AACAAGCTC2                              2
AACAAGCTGAAGCAC3                        2
AACAAGCTTC2                    

In [323]:
p_repet_gff_filtered[8] = p_repet_gff_filtered['ID']

In [327]:
p_repet_gff_filtered_2 = p_repet_gff_filtered.iloc[:,0:9]

In [328]:
p_repet_gff_filtered_2

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,pcontig_193,Pst79p_anno_REPET_TEs,match,54973.0,55495.0,0.000000e+00,-,.,DTX-incomp-chim_MCL5
1,pcontig_193,Pst79p_anno_REPET_TEs,match_part,54973.0,55495.0,0.000000e+00,-,.,DTX-incomp-chim_MCL5
2,pcontig_193,Pst79p_anno_REPET_TEs,match,359.0,1264.0,0.000000e+00,-,.,DTX-incomp_MCL9
3,pcontig_193,Pst79p_anno_REPET_TEs,match_part,359.0,1264.0,0.000000e+00,-,.,DTX-incomp_MCL9
4,pcontig_193,Pst79p_anno_REPET_TEs,match,1430.0,3581.0,0.000000e+00,-,.,DTX-incomp_MCL9
5,pcontig_193,Pst79p_anno_REPET_TEs,match_part,1430.0,3581.0,0.000000e+00,-,.,DTX-incomp_MCL9
6,pcontig_193,Pst79p_anno_REPET_TEs,match,3651.0,5270.0,0.000000e+00,-,.,DTX-incomp_MCL9
7,pcontig_193,Pst79p_anno_REPET_TEs,match_part,3651.0,4720.0,0.000000e+00,-,.,DTX-incomp_MCL9
8,pcontig_193,Pst79p_anno_REPET_TEs,match_part,5048.0,5270.0,0.000000e+00,-,.,DTX-incomp_MCL9
9,pcontig_193,Pst79p_anno_REPET_TEs,match,4721.0,4764.0,0.000000e+00,-,.,DTX-incomp_MCL9
