In [1]:
%matplotlib inline
import pandas as pd
import os
import re
from Bio import SeqIO
import pysam
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import SearchIO
from pybedtools import BedTool
import numpy as np
import pybedtools
import multiprocessing
import re



In [2]:
source_dir = '/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/032017_assembly'

In [3]:
genome = 'Pst_104E_v12_p_ctg'

In [4]:
out_dir = '/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/Pst_104E_v12/TE_analysis'

In [5]:
# Create the output directory together with any missing parent directories.
# exist_ok=True makes this a no-op when the directory is already there and
# avoids the check-then-create race of os.path.exists() followed by os.mkdir().
os.makedirs(out_dir, exist_ok=True)

In [6]:
#remove all commenting lines from the initial repet file
!grep -v "^#" {source_dir}/{genome}.REPET.gff > {out_dir}/{genome}.REPET.gff

In [7]:
p_repet_gff = pd.read_csv(out_dir+'/'+genome+'.REPET.gff', sep='\t', header = None)

In [8]:
TE_post_analysis_p = '/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/REPET/Pst79_p/Pst79_p_full_annotate/postanalysis'

In [9]:
# Column names handed to pd.read_csv(names=...) when the annotStatsPerTE table is loaded.
# Spelled out as a list so that no empty strings arise from splitting a
# multi-space header line on single blanks.
TE_post_analysis_p_header = [
    'TE', 'length', 'covg', 'frags', 'fullLgthFrags', 'copies', 'fullLgthCopies',
    'meanId', 'sdId', 'minId', 'q25Id', 'medId', 'q75Id', 'maxId',
    'meanLgth', 'sdLgth', 'minLgth', 'q25Lgth', 'medLgth', 'q75Lgth', 'maxLgth',
    'meanLgthPerc', 'sdLgthPerc', 'minLgthPerc', 'q25LgthPerc', 'medLgthPerc',
    'q75LgthPerc', 'maxLgthPerc',
]

In [10]:
TE_post_analysis_p_header = [x for x in TE_post_analysis_p_header if x != '']

In [66]:
# TODO: this needs to be fixed up to pick the proper summary table (the file name is hard-coded).
# skiprows=1 drops the first line of the file; the column names come from TE_post_analysis_p_header.
p_repet_summary_df = pd.read_csv(TE_post_analysis_p+'/'+'Pst79p_anno_chr_allTEs_nr_noSSR_join_path.annotStatsPerTE.tab' ,\
                                names = TE_post_analysis_p_header, header=None, sep='\t', skiprows=1 )

# TODO: check whether the tab files can be filtered to remove all TEs that are on the 2000-plus contigs
# TODO: remove tRNA TEs with infernal

# 'Code' is the part of the TE name in front of the first underscore.
p_repet_summary_df['Code'] = p_repet_summary_df['TE'].apply(lambda x: x.split('_')[0])

# Distinct codes found in the table ...
code_keys = p_repet_summary_df['Code'].unique()

# ... sorted in place, because the labels below are assigned by position.
code_keys.sort()

# NOTE(review): the 34 labels below are paired with the sorted codes purely by position.
# zip() stops at the shorter sequence without complaint, so if the set of codes in the
# table changes, labels shift or get dropped silently — confirm len(code_keys) == len(code_long)
# and that the order still lines up.
code_long = ['DNA_transposon Helitron', 'DNA_transposon Helitron', 'DNA_transposon Helitron', 'DNA_transposon Maverick',\
            'DNA_transposon TIR', 'DNA_transposon TIR', 'DNA_transposon TIR', 'DNA_transposon TIR', 'DNA_transposon noCat',\
             'DNA_transposon MITE','DNA_transposon MITE', 'Potential Host Gene', 'Retrotransposon LINE', 'Retrotransposon LINE',\
             'Retrotransposon LINE','Retrotransposon LTR','Retrotransposon LTR', 'Retrotransposon LTR', 'Retrotransposon LTR', 'Retrotransposon PLE', \
             'Retrotransposon SINE',  'Retrotransposon SINE', 'Retrotransposon noCat', 'Retrotransposon LARD',\
             'Retrotransposon LARD', 'Retrotransposon TRIM', 'Retrotransposon TRIM', 'Retrotransposon noCat',  \
             'Retrotransposon DIRS','Retrotransposon DIRS','Retrotransposon DIRS','Retrotransposon DIRS',\
             'noCat', 'noCat']

# Lookup from short code to long label.
code_dict = dict(zip(code_keys, code_long))

In [68]:
# Attach the long label for every TE code; an unmapped code raises KeyError here.
p_repet_summary_df['Code long'] = p_repet_summary_df['Code'].apply(lambda x: code_dict[x])

# Aggregators are given by name ('sum' / 'mean') rather than as numpy callables:
# the result is the same, but passing np.sum / np.mean as aggfunc is deprecated
# in recent pandas releases.
p_repet_summary_sum_df = pd.pivot_table(p_repet_summary_df, values=['covg', 'copies'], index='Code long', aggfunc='sum')

p_repet_summary_mean_df = pd.pivot_table(p_repet_summary_df, values='length', index='Code long', aggfunc='mean')

# Summed 'copies'/'covg' next to the mean 'length', one row per long label.
pd.concat([p_repet_summary_sum_df, p_repet_summary_mean_df], axis=1)

Unnamed: 0_level_0,copies,covg,length
Code long,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DNA_transposon Helitron,1075,817566,2989.764706
DNA_transposon MITE,3789,886304,490.240741
DNA_transposon Maverick,268,345406,8562.666667
DNA_transposon TIR,19166,12711595,4020.474359
DNA_transposon noCat,5286,2376456,3034.734513
Potential Host Gene,1372,1375304,6120.490566
Retrotransposon DIRS,1337,1049299,6874.526316
Retrotransposon LARD,10752,4947564,5407.736111
Retrotransposon LINE,323,237992,4446.625
Retrotransposon LTR,18893,16276421,6384.627841


In [69]:
# Now filter the gff dataframe to delete all the high coverage contigs.
# NOTE(review): this step may be unnecessary if the input files are already filtered — confirm.
# The list sits in source_dir, so the path is built from it instead of repeating
# the absolute directory a second time.
contigs_smaller_2000 = pd.read_csv(os.path.join(source_dir, 'pcontig_smaller_2000.txt'),
                                   header=None)[0].tolist()

In [70]:
p_repet_gff_filtered = p_repet_gff[p_repet_gff[0].isin(contigs_smaller_2000)].reset_index(drop=True)

In [71]:
# Search patterns compiled once, keyed by the feature type derived from gff column 2.
# In each pattern the capture group is the top-level ID that follows the
# '<token>_<token>_<digits>_' lead-in of the ID attribute.
_ID_PATTERNS = {
    'REPET_TEs': re.compile('ID=[A-Z,a-z,0-9,-]*_[A-Z,a-z,0-9]*_[0-9]*_([A-Z,a-z,0-9,-]*_[A-Z,a-z,0-9,-]*)_'),
    'REPET_SSRs': re.compile('ID=[A-Z,a-z,0-9,-]*_[A-Z,a-z,0-9]*_[0-9]*_([A-Z,a-z,0-9,-]*)'),
    'REPET_tblastx': re.compile(r'ID=[A-Z,a-z,0-9,-]*_[A-Z,a-z,0-9]*_[0-9]*_([^;| ]*)'),
}
# blastx hits are parsed exactly like tblastx hits.
_ID_PATTERNS['REPET_blastx'] = _ID_PATTERNS['REPET_tblastx']


def ID_filter_gff(_feature, _id):
    """
    Parse the top level id out of the 9th gff column of a REPET gff file.

    A specific search pattern is used for each feature type in column 2.
    The type is the last two '_'-separated tokens of the feature name,
    i.e. '_'.join(_feature.split("_")[-2:]).

    Parameters
    ----------
    _feature : str
        Value of gff column 2, e.g. 'Pst79p_anno_REPET_TEs'.
    _id : str
        Value of gff column 9 (the attribute string containing 'ID=...').

    Returns
    -------
    str or None
        The captured id. None when the feature type is not one of
        REPET_TEs, REPET_SSRs, REPET_tblastx or REPET_blastx.

    Raises
    ------
    ValueError
        If the feature type is known but the attribute string does not
        match its pattern (previously an AttributeError on None.group).
    """
    _type = '_'.join(_feature.split("_")[-2:])
    pattern = _ID_PATTERNS.get(_type)
    if pattern is None:
        # Unknown feature types are passed through as None, as before.
        return None
    match = pattern.search(_id)
    if match is None:
        raise ValueError(
            "Cannot extract an ID for feature type %r from attribute %r" % (_type, _id)
        )
    return match.group(1)


In [72]:
p_repet_gff_filtered['ID'] = p_repet_gff_filtered.apply(lambda row: ID_filter_gff(row[1], row[8]), axis=1)

In [73]:
#filter out potential host genes
# .copy() turns the selection into an independent frame, so that assigning to one of its
# columns afterwards does not operate on a slice of the unfiltered frame
# (pandas SettingWithCopyWarning).
p_repet_gff_filtered = p_repet_gff_filtered[~p_repet_gff_filtered[8].str.contains("Potential")].copy()

In [74]:
num_unique_REs = len(p_repet_gff_filtered['ID'].unique())

In [75]:
print('This is the number of unique repetitive elements: %i' % num_unique_REs)

This is the number of unique repetitive elements: 48659


In [76]:
frequency_of_REs  = p_repet_gff_filtered.groupby('ID')[0].count()

In [77]:
frequency_of_REs.sort_values(inplace=True)

In [78]:
p_repet_gff_filtered.to_csv(out_dir+'/'+genome+'.REPET.filtered.gff', sep='\t', header = None, index=None,columns=None)

In [79]:
p_repet_gff_filtered[8] = p_repet_gff_filtered['ID']

In [80]:
# Keep only the nine gff columns. The explicit copy lets later cells add or overwrite
# columns on this frame without pandas treating it as a slice of p_repet_gff_filtered
# (SettingWithCopyWarning).
p_repet_gff_filtered_2 = p_repet_gff_filtered.iloc[:, 0:9].copy()

In [81]:
p_repet_gff_filtered_2.to_csv(out_dir+'/'+genome+'.REPET.ID_column.gff', sep='\t', header = None, index=None,columns=None)

In [82]:
#have another dataframe that only contains the REPET denovo annotation and not blast hits
repet_gff_filtered_TEs = p_repet_gff_filtered_2[~p_repet_gff_filtered_2[1].str.contains('blast')]

In [83]:
#get all the 'Code' phrases of the blast hits and make a blast_code_dict

# ID column of every blastx / tblastx row.
blast_codes = p_repet_gff_filtered_2[p_repet_gff_filtered_2[1].isin(['Pst79p_anno_REPET_tblastx', 'Pst79p_anno_REPET_blastx'])][8]

# The code is everything between the first and the last ':'-separated field of the ID.
blast_codes_list = [':'.join(x.split(':')[1:-1]) for x in blast_codes.unique()]

blast_codes_list_unique = sorted(set(blast_codes_list))

# Explicit code -> long label mapping. Pairing the sorted codes with a hand-ordered label
# list by position would silently shift every label as soon as one code is missing from,
# or added to, the data.
blast_code_dict = {
    'ClassI:?': 'Retrotransposon noCat',
    'ClassI:DIRS': 'Retrotransposon DIRS',
    'ClassI:LINE': 'Retrotransposon LINE',
    'ClassI:LTR': 'Retrotransposon LTR',
    'ClassI:PLE': 'Retrotransposon PLE',
    'ClassII:?': 'DNA_transposon noCat',
    'ClassII:Crypton': 'DNA_transposon Crypton',
    'ClassII:Helitron': 'DNA_transposon Helitron',
    'ClassII:Maverick': 'DNA_transposon Maverick',
    'ClassII:TIR': 'DNA_transposon TIR',
}

# Fail early and clearly if the data contains a code that has no label yet.
unmapped_blast_codes = [code for code in blast_codes_list_unique if code not in blast_code_dict]
if unmapped_blast_codes:
    raise ValueError("blast codes without a long label: %s" % ', '.join(unmapped_blast_codes))


In [84]:
blast_code_dict

{'ClassI:?': 'Retrotransposon noCat',
 'ClassI:DIRS': 'Retrotransposon DIRS',
 'ClassI:LINE': 'Retrotransposon LINE',
 'ClassI:LTR': 'Retrotransposon LTR',
 'ClassI:PLE': 'Retrotransposon PLE',
 'ClassII:?': 'DNA_transposon noCat',
 'ClassII:Crypton': 'DNA_transposon Crypton',
 'ClassII:Helitron': 'DNA_transposon Helitron',
 'ClassII:Maverick': 'DNA_transposon Maverick',
 'ClassII:TIR': 'DNA_transposon TIR'}

In [85]:
#write a filter function that adds a 'code long' classification

In [86]:
def code_long_filter_gff(_feature, _id):
    """
    Translate an entry of the previously generated ID column into its long label
    (class and order of the TE, following Wicker et al.).

    The feature type decides how the ID is read. It is taken from gff column 2 as
    '_'.join(_feature.split("_")[-2:]):

    * REPET_SSRs                   -> always 'SSR'
    * REPET_TEs                    -> code_dict lookup of the text before the first '_'
    * REPET_tblastx / REPET_blastx -> blast_code_dict lookup of the ':'-separated
                                      fields between the first and the last one

    Any other feature type yields None; an unmapped code raises KeyError.
    """
    feature_kind = '_'.join(_feature.split("_")[-2:])

    if feature_kind == 'REPET_SSRs':
        return 'SSR'

    if feature_kind == 'REPET_TEs':
        te_code = _id.partition('_')[0]
        return code_dict[te_code]

    if feature_kind in ('REPET_tblastx', 'REPET_blastx'):
        blast_fields = _id.split(':')
        return blast_code_dict[':'.join(blast_fields[1:-1])]

    return None

In [87]:
p_repet_gff_filtered_2['Classification'] = p_repet_gff_filtered_2.apply(lambda row: code_long_filter_gff(row[1], row[8]), axis=1)

In [88]:
p_repet_gff_filtered_2["Classification"].unique()

array(['DNA_transposon TIR', 'Retrotransposon DIRS', 'Retrotransposon LTR',
       'SSR', 'DNA_transposon noCat', 'DNA_transposon MITE',
       'Retrotransposon LARD', 'Retrotransposon SINE', 'noCat',
       'DNA_transposon Helitron', 'DNA_transposon Maverick',
       'Retrotransposon LINE', 'Retrotransposon TRIM',
       'Retrotransposon noCat', 'Retrotransposon PLE',
       'DNA_transposon Crypton'], dtype=object)

In [89]:
#in the .classification gff the attribute column (9th column, index 8) holds the Wicker classification of the transposon
p_repet_gff_filtered_2[8] = p_repet_gff_filtered_2['Classification'] 
#repet_gff_filtered_TEs[1] = repet_gff_filtered_TEs["Classification"]

In [90]:
p_repet_gff_filtered_2.iloc[:,:-1].to_csv(out_dir+'/'+genome+'.REPET.classification.gff', sep='\t', header = None, index=None,columns=None)

In [118]:
p_repet_gff_filtered_2[~p_repet_gff_filtered_2[1].str.contains('blast')].iloc[:,:-1].to_csv(out_dir+'/'+genome+'.REPET_noblast.classification.gff', sep='\t', header = None, index=None,columns=None)

In [92]:
#write a similar filter as before, using bedtools to move through all the classifications and calculate coverage
#summarize this in a table and compare it to published results

In [93]:
# Set up one sub-directory of out_dir per top-level repeat category; the
# category-specific coverage files are saved there.
os.chdir(out_dir)
TE_types = ['Retrotransposon', 'DNA_transposon', 'noCat', 'SSR']
TE_path_dict = {te_name: os.path.join(out_dir, te_name) for te_name in TE_types}
TE_path = list(TE_path_dict.values())
for category_dir in TE_path:
    if os.path.exists(category_dir):
        continue
    os.mkdir(category_dir)

In [94]:
repet_prefix = genome+'.REPET.classification'
p_genome_file = genome+'.genome_file'

In [95]:
# Predicate meant to be handed to BedTool.filter(): it lets a feature through
# only when its 9th field (index 8) equals the requested value, much like a
# "grep" on that column.
def id_filter_classification(feature, _id):
    return True if feature[8] == _id else False

In [96]:
# subset one classification and save it in its category folder
def subset_id_classification(_id, bed_object):
    """
    Write the features of one classification, and their coverage, to disk.

    The features of bed_object whose 9th field equals _id are saved as
    '<category dir>/<genome>.<tag>.classification.<_id>.gff' (blanks in the full
    path replaced by '_'); the result of genome_coverage(dz=True, g=p_genome_file)
    on that subset is saved next to it with the extension '.cov'.
    <tag> is the third-from-last '.'-separated part of bed_object.fn.

    Parameters
    ----------
    _id : str
        Classification label, e.g. 'Retrotransposon LTR'.
    bed_object : pybedtools.BedTool
        File-backed BedTool to subset.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If _id contains none of the known categories (previously this surfaced
        as an unbound-local error on out_path).
    """
    # Order matters: 'Retrotransposon noCat' and 'DNA_transposon noCat' have to be
    # routed by their class before the bare 'noCat' test gets a chance.
    for te_type in ('Retrotransposon', 'DNA_transposon', 'noCat', 'SSR'):
        if te_type in _id:
            out_path = TE_path_dict[te_type]
            break
    else:
        raise ValueError("Classification %r does not belong to any known TE category" % (_id,))

    repet_prefix = genome + '.' + bed_object.fn.split('.')[-3] + '.classification'
    out_fn = (out_path + '/' + repet_prefix + '.' + _id + '.gff').replace(" ", '_')
    result = bed_object.filter(id_filter_classification, _id).saveas(out_fn)

    # Swap the trailing extension only; str.replace('gff', 'cov') would also
    # rewrite a 'gff' that happens to occur earlier in the path.
    cov_fn = out_fn[:-len('gff')] + 'cov'
    cov = result.genome_coverage(dz=True, g=p_genome_file)
    cov.saveas(cov_fn)
    print("Done with %s " % (out_fn))

In [120]:
#pull in the classification gff, make classification array, loop over array to save all the cov_dataframes
RE_id_gff = pybedtools.BedTool(out_dir+'/'+genome+'.REPET.classification.gff')
g = RE_id_gff.remove_invalid().saveas(out_dir+'/'+genome+'.REPET.classification.bedobject')
#use the blast filtered dataframe as well
RE_id_gff_noblast = pybedtools.BedTool(out_dir+'/'+genome+'.REPET_noblast.classification.gff')
g_noblast = RE_id_gff_noblast.remove_invalid().saveas(out_dir+'/'+genome+'.REPET_noblast.classification.bedobject')

In [47]:
#filter through the whole REPET_TE bedobject
#maybe make some multiproccesses out of this
classifications = p_repet_gff_filtered_2["Classification"].unique()
[subset_id_classification(x, g) for x in classifications]

In [None]:
#filter through the whole REPET_TE bedobject having removed the blast hits
#maybe make some multiproccesses out of this
classifications_noblast = p_repet_gff_filtered_2[~p_repet_gff_filtered_2[1].str.contains('blast')]["Classification"].unique()
[subset_id_classification(x, g_noblast) for x in classifications_noblast]

In [55]:
cur_dir = os.path.abspath(os.path.curdir)

In [62]:
genome_df = pd.read_csv(p_genome_file, sep='\t', header=None,names=['contig', 'length'])

genome_size = genome_df['length'].sum()

In [56]:
# Collect the coverage files of every REPET classification, i.e. blast hits as well
# as REPET's own TE annotation.
# Caveat found here: the blast and the REPET annotation sometimes contradict each
# other and overlap.
class_cov_files = sorted(
    dirpath + '/' + fname
    for dirpath, _subdirs, filenames in os.walk(cur_dir)
    for fname in filenames
    if fname.endswith('.cov') and 'REPET.classification' in fname
)

# Stack all coverage tables into one large frame; every row is tagged with the
# second-to-last '.'-separated part of its file name as 'Code long'.
df_list = []
for cov_path in class_cov_files:
    class_label = cov_path.split('.')[-2]
    cov_df = pd.read_csv(cov_path, sep='\t', header=None)
    cov_df["Code long"] = class_label
    cov_df = cov_df.drop_duplicates()
    df_list.append(cov_df)
    print(class_label)

df_REPET_classification = pd.concat(df_list)

# Number of rows per class, and per contig and class.
cov_per_class = df_REPET_classification.pivot_table(values=1, columns='Code long', aggfunc='count')
cov_per_contig_per_class = df_REPET_classification.groupby([0, 'Code long'])[1].count()

noCat
SSR
DNA_transposon_noCat
DNA_transposon_Helitron
DNA_transposon_Crypton
DNA_transposon_Maverick
DNA_transposon_TIR
DNA_transposon_MITE
Retrotransposon_PLE
Retrotransposon_Crypton
Retrotransposon_LTR
Retrotransposon_DIRS
Retrotransposon_TRIM
Retrotransposon_LARD
Retrotransposon_noCat
Retrotransposon_LINE
Retrotransposon_SINE


In [None]:
# Collect the coverage files that stem from REPET's internal TE classification
# only; blast hits are not included.
class_cov_files = []
for dirpath, _dirnames, filenames in os.walk(cur_dir):
    class_cov_files.extend(
        dirpath + '/' + name
        for name in filenames
        if name.endswith('.cov') and 'REPET_noblast.classification' in name
    )
class_cov_files.sort()

# One frame per coverage file, labelled via the second-to-last '.'-separated part
# of the file name and stripped of duplicate rows, then everything concatenated.
df_list = []
for cov_file in class_cov_files:
    code_long_label = cov_file.split('.')[-2]
    per_file_cov = pd.read_csv(cov_file, sep='\t', header=None)
    per_file_cov["Code long"] = code_long_label
    df_list.append(per_file_cov.drop_duplicates())
    print(code_long_label)

df_REPET_noblast_classification = pd.concat(df_list)

# Number of rows per class, and per contig and class.
cov_per_class_noblast = df_REPET_noblast_classification.pivot_table(values=1, columns='Code long', aggfunc='count')
cov_per_contig_per_class_noblast = df_REPET_noblast_classification.groupby([0, 'Code long'])[1].count()

In [59]:
cov_per_class/genome_size

Code long
DNA_transposon_Crypton     0.000015
DNA_transposon_Helitron    0.014380
DNA_transposon_MITE        0.012342
DNA_transposon_Maverick    0.004510
DNA_transposon_TIR         0.180167
DNA_transposon_noCat       0.055862
Retrotransposon_Crypton    0.014513
Retrotransposon_DIRS       0.003626
Retrotransposon_LARD       0.066880
Retrotransposon_LINE       0.004594
Retrotransposon_LTR        0.258021
Retrotransposon_PLE        0.000692
Retrotransposon_SINE       0.000381
Retrotransposon_TRIM       0.003303
Retrotransposon_noCat      0.001614
SSR                        0.023948
noCat                      0.012289
Name: 1, dtype: float64

In [None]:
cov_per_class_noblast/genome_size

In [60]:
(cov_per_class/genome_size*100).sum()

65.71364309754486

In [None]:
(cov_per_class_noblast/genome_size*100).sum()

In [102]:
len(df_REPET_classification)

54776012

In [None]:
len(df_REPET_noblast_classification)

In [None]:
len(df_REPET_noblast_classification.drop_duplicates([0,1]))/genome_size

In [106]:
test = p_repet_gff_filtered_2[~p_repet_gff_filtered_2[1].str.contains('blast')].iloc[:,:-1]

In [109]:
test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
485095,pcontig_000,Pst79p_anno_REPET_TEs,match,4244,4312,0.0,+,.,DNA_transposon MITE
485096,pcontig_000,Pst79p_anno_REPET_TEs,match_part,4244,4312,0.0,+,.,DNA_transposon MITE
483287,pcontig_000,Pst79p_anno_REPET_TEs,match,10017,10551,0.0,-,.,Retrotransposon LTR
483288,pcontig_000,Pst79p_anno_REPET_TEs,match_part,10017,10551,0.0,-,.,Retrotransposon LTR
483305,pcontig_000,Pst79p_anno_REPET_TEs,match,10552,10574,0.0,-,.,Retrotransposon LTR


In [116]:
test[(test[0]== 'pcontig_000' )&(test[3] < 20000)]

Unnamed: 0,0,1,2,3,4,5,6,7,8
485095,pcontig_000,Pst79p_anno_REPET_TEs,match,4244,4312,0.0,+,.,DNA_transposon MITE
485096,pcontig_000,Pst79p_anno_REPET_TEs,match_part,4244,4312,0.0,+,.,DNA_transposon MITE
483287,pcontig_000,Pst79p_anno_REPET_TEs,match,10017,10551,0.0,-,.,Retrotransposon LTR
483288,pcontig_000,Pst79p_anno_REPET_TEs,match_part,10017,10551,0.0,-,.,Retrotransposon LTR
483305,pcontig_000,Pst79p_anno_REPET_TEs,match,10552,10574,0.0,-,.,Retrotransposon LTR
483306,pcontig_000,Pst79p_anno_REPET_TEs,match_part,10552,10574,0.0,-,.,Retrotransposon LTR
483951,pcontig_000,Pst79p_anno_REPET_TEs,match,14601,14785,0.0,-,.,Retrotransposon LARD
483952,pcontig_000,Pst79p_anno_REPET_TEs,match_part,14601,14785,0.0,-,.,Retrotransposon LARD
485309,pcontig_000,Pst79p_anno_REPET_TEs,match,14786,14834,0.0,-,.,Retrotransposon LARD
485310,pcontig_000,Pst79p_anno_REPET_TEs,match_part,14786,14834,0.0,-,.,Retrotransposon LARD


In [108]:
test.sort_values([0,3],inplace =True)

In [112]:
df_REPET_noblast_classification.sort_values([0,1], inplace=True)

In [114]:
df_REPET_noblast_classification[df_REPET_noblast_classification.duplicated([0,1], keep=False )]

Unnamed: 0,0,1,2,Code long
3960575,pcontig_000,14602,2,DNA_transposon_noCat
5030528,pcontig_000,14602,2,Retrotransposon_LARD
3960576,pcontig_000,14603,2,DNA_transposon_noCat
5030529,pcontig_000,14603,2,Retrotransposon_LARD
3960577,pcontig_000,14604,2,DNA_transposon_noCat
5030530,pcontig_000,14604,2,Retrotransposon_LARD
3960578,pcontig_000,14605,2,DNA_transposon_noCat
5030531,pcontig_000,14605,2,Retrotransposon_LARD
3960579,pcontig_000,14606,2,DNA_transposon_noCat
5030532,pcontig_000,14606,2,Retrotransposon_LARD


In [None]:
df_REPET_classification.drop_duplicates(subset=[0,1])

In [None]:
df_REPET_classification.sort_values([0,1], inplace=True)

In [None]:
df_REPET_classification_dup = df_REPET_classification[df_REPET_classification.duplicated(subset=[0,1], keep=False)]

In [None]:
p_repet_gff_filtered_2[(p_repet_gff_filtered_2[0] ==  'pcontig_000')&(p_repet_gff_filtered_2[3] < 14602)]

In [117]:
p_repet_gff_filtered_2[(p_repet_gff_filtered_2[0] ==  'pcontig_000')&(p_repet_gff_filtered_2[3] < 20000)].sort_values(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,Classification
485096,pcontig_000,Pst79p_anno_REPET_TEs,match_part,4244,4312,0.0,+,.,DNA_transposon MITE,DNA_transposon MITE
485095,pcontig_000,Pst79p_anno_REPET_TEs,match,4244,4312,0.0,+,.,DNA_transposon MITE,DNA_transposon MITE
483288,pcontig_000,Pst79p_anno_REPET_TEs,match_part,10017,10551,0.0,-,.,Retrotransposon LTR,Retrotransposon LTR
483287,pcontig_000,Pst79p_anno_REPET_TEs,match,10017,10551,0.0,-,.,Retrotransposon LTR,Retrotransposon LTR
483305,pcontig_000,Pst79p_anno_REPET_TEs,match,10552,10574,0.0,-,.,Retrotransposon LTR,Retrotransposon LTR
483306,pcontig_000,Pst79p_anno_REPET_TEs,match_part,10552,10574,0.0,-,.,Retrotransposon LTR,Retrotransposon LTR
483951,pcontig_000,Pst79p_anno_REPET_TEs,match,14601,14785,0.0,-,.,Retrotransposon LARD,Retrotransposon LARD
483952,pcontig_000,Pst79p_anno_REPET_TEs,match_part,14601,14785,0.0,-,.,Retrotransposon LARD,Retrotransposon LARD
498268,pcontig_000,Pst79p_anno_REPET_tblastx,match,14603,14784,0.0,-,.,DNA_transposon noCat,DNA_transposon noCat
498269,pcontig_000,Pst79p_anno_REPET_tblastx,match_part,14603,14690,2e-13,-,.,DNA_transposon noCat,DNA_transposon noCat


In [None]:
df_REPET_classification_dup.head(100)

In [None]:
45267875/ genome_size

In [None]:
!head 'Pst_104E_v12_p_ctg.REPET.ID_column.cov'

In [None]:
RE_id_gff = pybedtools.BedTool(out_dir+'/'+genome+'.REPET.ID_column.gff')

In [None]:
os.chdir(out_dir)

In [None]:
repet_gff = 'Pst_104E_v12_p_ctg.REPET.gff'

In [None]:
repet_prefix = '.'.join(repet_gff.split('.')[0:2])

In [None]:
g = RE_id_gff.remove_invalid().saveas()

In [None]:
# Row filter for a BedTool: comparable to a "grep" that is applied to every
# feature and keeps those whose 9th field matches.
def id_filter(feature, _id):
    """Return True when field 8 of the feature equals _id, otherwise False."""
    return bool(feature[8] == _id)

In [None]:
# subset one ID and save it in its category folder
def subset_id(_id):
    """
    Write the features carrying one ID, and their coverage, to disk.

    The features of the module-level BedTool g whose 9th field equals _id are saved
    as '<category dir>/<repet_prefix>.<_id>.gff'; the result of
    genome_coverage(dz=True, g=p_genome_file) on that subset is saved next to it
    with the extension '.cov'.

    The category directory is chosen in this order:
    'ClassI:' in the ID  -> Retrotransposon (blast hit),
    'ClassII' in the ID  -> DNA_transposon (blast hit),
    text before the first '_' is a key of code_dict -> first word of its long label,
    no '_' in the ID     -> SSR.

    Returns None. Raises ValueError when none of the rules applies (previously this
    surfaced as an unbound-local error on out_path).
    """
    prefix = _id.split('_')[0]
    #ClassI are retrotransposons from blast
    if 'ClassI:' in _id:
        out_path = TE_path_dict['Retrotransposon']
    #ClassII are DNA transposons
    elif 'ClassII' in _id:
        out_path = TE_path_dict['DNA_transposon']
    #the rest with '_' should be REPET_TEs
    elif prefix in code_dict:
        # NOTE(review): a 'Potential Host Gene' label would give the key 'Potential',
        # which TE_path_dict does not contain — presumably those IDs are filtered out
        # before this point; confirm.
        out_path = TE_path_dict[code_dict[prefix].split(' ')[0]]
    #everything without '_' should be SSR
    elif '_' not in _id:
        out_path = TE_path_dict['SSR']
    else:
        raise ValueError("ID %r does not belong to any known TE category" % (_id,))

    out_fn = out_path + '/' + repet_prefix + '.' + _id + '.gff'
    result = g.filter(id_filter, _id).saveas(out_fn)

    # Swap the trailing extension only; str.replace('gff', 'cov') would also
    # rewrite a 'gff' occurring inside the ID or the directory part.
    cov_fn = out_fn[:-len('gff')] + 'cov'
    cov = result.genome_coverage(dz=True, g=p_genome_file)
    cov.saveas(cov_fn)

In [None]:
TE_types = ['Retrotransposon', 'DNA_transposon', 'noCat', 'SSR']
TE_path = [os.path.join(out_dir, x) for x in TE_types]
TE_path_dict = dict(zip(TE_types, TE_path))

In [None]:
for TE_type in TE_types:
    new_path = os.path.join(out_dir, TE_type)
    if not os.path.exists(new_path):
        os.mkdir(new_path)

In [None]:
_id = frequency_of_REs.index.tolist()

In [None]:
#this is REALLY slow for now. This step would need to be parallelized. Look at the Pool class of
#multiprocessing

In [None]:
[subset_id(x) for x in _id]

In [None]:
full_path_list[0].replace('gff','cov')

In [None]:
_test_id = _id[0:5]

In [None]:
g = RE_id_gff.remove_invalid().saveas()

In [None]:
[x for x in _test_id if '_' not in x]

In [None]:
p_genome_file = 'Pst_104E_v12_p_ctg.genome_file'

In [None]:
all_cov_RE = g.genome_coverage(dz=True,g=p_genome_file)

In [None]:
all_cov_RE.saveas('Pst_104E_v12_p_ctg.REPET.ID_column.cov')

In [None]:
p_repet_gff_filtered[1].unique()

In [None]:
# Entries of column 8 on REPET TE rows that contain no underscore.
# The source name in column 1 is 'Pst79p_anno_REPET_TEs' (with the trailing 's');
# without it the selection is always empty.
[x for x in p_repet_gff_filtered_2[p_repet_gff_filtered_2[1] == 'Pst79p_anno_REPET_TEs'][8].unique() if '_' not in x]

In [None]:
[x for x in p_repet_gff_filtered_2[(p_repet_gff_filtered_2[1] == 'Pst79p_anno_REPET_tblastx') | (p_repet_gff_filtered_2[1] == 'Pst79p_anno_REPET_blastx') ][8].unique() if '_' not in x]

In [None]:
p_repet_gff[p_repet_gff[8].str.contains('Copia6-VV_I_')][8]