In [1]:
# DEPENDENCIES

import pysam
import pandas as pd
import HTSeq
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# GLOBAL VARS
# Absolute paths to every input file used by this notebook.

# FC30 sample: full alignment file and its "filtered_max_AS" counterpart.
# NOTE(review): the *_MAX_AS_* files are presumably produced by filter_max_AS_reads — confirm.
FC30_DMGOTH_BAMFILE = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/bam/FC30.against_dmgoth.bam"
FC30_DMGOTH_MAX_AS_BAMFILE = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/bam/FC30.against_dmgoth.filtered_max_AS.bam"

# FC29 sample: same pair of files as for FC30.
FC29_DMGOTH_BAMFILE = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/bam/FC29.against_dmgoth.bam"
FC29_DMGOTH_MAX_AS_BAMFILE = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/bam/FC29.against_dmgoth.filtered_max_AS.bam"

# Subset of the FC30 sample.
# NOTE(review): this filtered file is named "filtered.bam", not "filtered_max_AS.bam" like the two above — confirm it is the intended file.
FC30_DMGOTH_SUBSET_BAMFILE = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/bam/FC30.subset.against_dmgoth.bam"
FC30_DMGOTH_SUBSET_MAX_AS_BAMFILE = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/bam/FC30.subset.against_dmgoth.filtered.bam"

# Annotation files: TE insertions (GTF) and genes (GFF).
REPEATMASKER_TE_ANNOTATIONS = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/annotations/dmgoth101.onecode.v2.gtf"
DMGOTH_GENE_ANNOTATIONS = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/annotations/Dm_Goth_10-1.dmel6.23LiftOff.sorted.gff"

# FASTA whose headers are parsed by get_TE_hierarchy to build the family hierarchy.
DFAM_TE_ANNOTATION = "/data2/eric/TE_LR_RNAseq/data/families.flanked_LTR.hierarchy.fa"

In [2]:
def build_index(bamfile):
    """Build and return a `pysam.IndexedReads` index over `bamfile`.

    The BAM handle opened here is not closed by this function.
    """
    alignment_file = pysam.AlignmentFile(bamfile, 'rb')
    index_by_name = pysam.IndexedReads(alignment_file)
    index_by_name.build()
    return index_by_name

def get_query_names(bamfile):
    """Return the set of distinct `query_name` values found in `bamfile`."""
    with pysam.AlignmentFile(bamfile, 'rb') as alignment_file:
        return {record.query_name for record in alignment_file}


In [3]:
def get_TE_hierarchy(consensus_fasta):
    """Parse the FASTA headers of `consensus_fasta` into a hierarchy table.

    Headers are expected to look like ``>Family#Subclass/Superfamily``.
    A header with no ``#`` part, or with a classification that has no
    ``/Superfamily`` part, no longer raises: the missing level is reported
    as "Unknown" instead.

    Parameters
    ----------
    consensus_fasta : str
        Path to the FASTA file; only lines starting with ">" are read.

    Returns
    -------
    pandas.DataFrame
        One row per header, with columns 'Subclass', 'Superfamily', 'Family',
        in file order.
    """
    records = []
    with open(consensus_fasta, 'r') as consensus:
        for line in consensus:
            if not line.startswith(">"):
                continue
            # Drop the leading ">" and the trailing newline before splitting.
            family, _, classification = line.strip()[1:].partition("#")
            # partition() never fails: absent pieces come back as "".
            subclass, _, superfamily = classification.partition("/")
            records.append((subclass or "Unknown", superfamily or "Unknown", family))
    hierarchy_df = pd.DataFrame(records, columns=['Subclass', 'Superfamily', 'Family'])
    return hierarchy_df
    
# Subclass / Superfamily / Family table built from the headers of DFAM_TE_ANNOTATION.
HIERARCHY_DF = get_TE_hierarchy(DFAM_TE_ANNOTATION)

In [4]:
# Filter out reads:

## We discard unmapped and supplementary alignments and, for each read, every alignment whose AS score is lower than that read's best AS score.

def filter_max_AS_reads(bamfile, output_bamfile):
    """Write to `output_bamfile` only the best-scoring alignments of each read.

    For every query name in `bamfile`, unmapped and supplementary alignments
    are dropped; among the remaining ones, every alignment whose 'AS' tag
    equals the highest 'AS' value for that read is written (ties are all kept).

    Parameters
    ----------
    bamfile : str
        Path of the input BAM file.
    output_bamfile : str
        Path of the BAM file to create; its header is copied from `bamfile`.

    Returns
    -------
    str
        `output_bamfile`, unchanged.

    Notes
    -----
    NOTE(review): reads are written in the iteration order of a set of names,
    so the output is presumably not coordinate-sorted; the region queries done
    later in this notebook would need a sorted, indexed file — confirm this is
    handled outside the notebook.
    """
    query_names = get_query_names(bamfile)
    read_index = build_index(bamfile)
    # The header template is opened in the same `with` so that its handle is
    # closed too (it used to be opened inline and never closed).
    with pysam.AlignmentFile(bamfile, 'rb') as template_bam, \
            pysam.AlignmentFile(output_bamfile, 'wb', template=template_bam) as output:
        for read in query_names:
            candidates = [
                ali for ali in read_index.find(read)
                if not (ali.is_unmapped or ali.is_supplementary)
            ]
            if not candidates:
                continue
            # Read each AS tag once and reuse it for both the max and the filter.
            scored = [(ali.get_tag('AS'), ali) for ali in candidates]
            max_AS = max(score for score, _ in scored)
            for score, ali in scored:
                if score == max_AS:
                    output.write(ali)
    return output_bamfile


In [5]:
class TransposableElementInsertion:
    """A single annotated insertion: identifier, contig and coordinates.

    `start` and `end` are converted with `int()`. Each instance starts with an
    empty `alignments` list and a `counts` value of 0.
    """

    def __init__(self, insertion_id, chrom, start, end):
        self.id, self.chrom = insertion_id, chrom
        self.start, self.end = int(start), int(end)
        self.alignments, self.counts = [], 0

In [6]:
def initiate_transposable_element_family_dict(annotations, min_length=100):
    """Read a tab-separated annotation file into a {family: [insertions]} dict.

    For each data line, the contig is column 1, the start column 4 and the end
    column 5. The family name and the insertion id are the first and second
    double-quoted values of the last column. Blank lines and lines starting
    with "#" are skipped (they used to raise an IndexError).

    Parameters
    ----------
    annotations : str
        Path to the annotation file.
    min_length : int, optional
        Only insertions with ``end - start`` strictly greater than this value
        are kept. Defaults to 100, the previously hard-coded threshold.

    Returns
    -------
    dict
        Maps each family name to a list of `TransposableElementInsertion`.
    """
    transposable_element_family_dict = {}
    with open(annotations) as annot:
        for line in annot:
            stripped_line = line.strip()
            # Header/comment lines and empty lines carry no feature.
            if not stripped_line or stripped_line.startswith("#"):
                continue
            fields = stripped_line.split("\t")
            chrom, start, end = fields[0], fields[3], fields[4]
            quoted_values = fields[-1].split('"')
            family_name = quoted_values[1]
            if int(end) - int(start) > min_length:
                new_insertion = TransposableElementInsertion(quoted_values[3], chrom, start, end)
                transposable_element_family_dict.setdefault(family_name, []).append(new_insertion)
    return transposable_element_family_dict


In [7]:
def get_subject_coverage(alignment, insertion):
    """Return the fraction of `insertion` spanned by `alignment`.

    The overlap between [alignment.reference_start, alignment.reference_end]
    and [insertion.start, insertion.end] is divided by the insertion length
    (``insertion.end - insertion.start``).

    Returns 0 when the two intervals do not overlap (the raw difference would
    be negative) and 0.0 when the insertion length is not positive (which
    would otherwise divide by zero).
    """
    overlap_start = max(alignment.reference_start, insertion.start)
    overlap_end = min(alignment.reference_end, insertion.end)
    # Disjoint intervals give overlap_end < overlap_start: clamp to zero.
    overlap_length = max(0, overlap_end - overlap_start)
    insertion_length = insertion.end - insertion.start
    if insertion_length <= 0:
        return 0.0
    subject_coverage = overlap_length / insertion_length
    return subject_coverage

def default_filter(alignment, insertion):
    """Accept an alignment that covers enough of the insertion.

    True when the subject coverage is above 0.1 and
    `alignment.get_overlap(insertion.start, insertion.end)` is above 1.
    """
    coverage = get_subject_coverage(alignment, insertion)
    overlap_size = alignment.get_overlap(insertion.start, insertion.end)
    if not coverage > 0.1:
        return False
    return overlap_size > 1

def get_exceeding_alignment_length(alignment, insertion):
    """Return how many reference positions of `alignment` lie outside `insertion`.

    The part that starts before `insertion.start` and the part that ends after
    `insertion.end` are added together; either is 0 when there is no overhang.
    """
    left_overhang = max(0, insertion.start - alignment.reference_start)
    right_overhang = max(0, alignment.reference_end - insertion.end)
    return left_overhang + right_overhang

def filter_out_co_expressed(alignment, insertion):
    """Accept an alignment that stays almost entirely within the insertion.

    True when the length exceeding the insertion boundaries is strictly less
    than 10% of the alignment's reference span.
    """
    aligned_span = alignment.reference_end - alignment.reference_start
    overhang = get_exceeding_alignment_length(alignment, insertion)
    return overhang < (0.1 * aligned_span)

def no_filter(alignment, insertion):
    """Filter strategy that accepts every alignment; both arguments are ignored."""
    return True

def match_reads_with_insertions(bamfile, family_dict, filter_strategy):
    """Fill in `counts` for every insertion of `family_dict`.

    For each insertion, `bam.count` is run over the insertion's contig, start
    and end, with `filter_strategy(read, insertion)` as the read callback. The
    result is stored in `insertion.counts`.

    The insertions are modified in place and the same `family_dict` is returned.
    """
    with pysam.AlignmentFile(bamfile, 'rb') as bam:
        for insertion_list in family_dict.values():
            for insertion in insertion_list:
                # Bind the current insertion as a default argument of the callback.
                insertion.counts = bam.count(
                    contig=insertion.chrom,
                    start=insertion.start,
                    end=insertion.end,
                    read_callback=lambda read, target=insertion: filter_strategy(read, target),
                )

    return family_dict


In [8]:
def merge_family_splitted_between_intern_and_LTR(family_dict, TE_hierarchy_df):
    """Merge families annotated separately as internal part and LTR.

    A family name that is not in ``TE_hierarchy_df["Family"]`` and ends with
    one of the internal/LTR suffixes is renamed by removing that suffix; the
    insertions of all names that map to the same family are pooled. Every
    resulting family that is absent from the hierarchy is added to it with
    "Unknown" as Subclass and Superfamily.

    Fixes over the previous version:
    * a successfully renamed family is no longer also recorded as "Unknown"
      under its old name (the `family_name_fixed` flag was never set);
    * the new hierarchy rows are actually returned (the result of
      `DataFrame.append` was discarded, and `append` no longer exists in
      pandas 2);
    * a family found in the hierarchy no longer overwrites insertions already
      merged under the same name;
    * the lists held by `family_dict` are no longer extended in place.

    Parameters
    ----------
    family_dict : dict
        Maps a family name to a list of insertions.
    TE_hierarchy_df : pandas.DataFrame
        Must contain the columns 'Subclass', 'Superfamily' and 'Family'.

    Returns
    -------
    tuple
        ``(new_family_dict, hierarchy_df)``; `hierarchy_df` is the input frame
        when nothing had to be added, otherwise a new frame with the extra rows.
    """
    bad_suffix_list = ["_I", "-I_DM", "_I_DM", "_LTR", "-LTR_DM", "_LTR_DM"]
    # Build the lookup once instead of rebuilding a list at every iteration.
    known_families = set(TE_hierarchy_df["Family"])
    new_family_dict = {}
    new_row_list = []
    for family, insertion_list in family_dict.items():
        merged_name = family
        if family not in known_families:
            for bad_suffix in bad_suffix_list:
                # Require a non-empty stem so a name equal to the suffix is kept as is.
                if family.endswith(bad_suffix) and len(family) > len(bad_suffix):
                    merged_name = family[:-len(bad_suffix)]
                    break
        # setdefault + extend pools insertions into a fresh list, leaving the input lists untouched.
        new_family_dict.setdefault(merged_name, []).extend(insertion_list)
        if merged_name not in known_families:
            # Register each unknown family once so it is not lost in later merges.
            known_families.add(merged_name)
            new_row_list.append({"Subclass": "Unknown", "Superfamily": "Unknown", "Family": merged_name})
    if new_row_list:
        TE_hierarchy_df = pd.concat([TE_hierarchy_df, pd.DataFrame(new_row_list)], ignore_index=True)
    return (new_family_dict, TE_hierarchy_df)


In [88]:
def get_TE_counting(filtered_bamfile, annotations, TE_hierarchy_df, filter_strategy):
    """Run the counting pipeline for one BAM file.

    Loads the insertions from `annotations`, counts the reads of
    `filtered_bamfile` on each of them with `filter_strategy`, then merges the
    families split between internal part and LTR.

    Returns a tuple ``(family_dict, TE_hierarchy_df)``.
    """
    counted_families = match_reads_with_insertions(
        filtered_bamfile,
        initiate_transposable_element_family_dict(annotations),
        filter_strategy,
    )
    merged_families, merged_hierarchy = merge_family_splitted_between_intern_and_LTR(
        counted_families, TE_hierarchy_df
    )
    return (merged_families, merged_hierarchy)
# Count reads per insertion for both samples with the `filter_out_co_expressed` strategy.
# HIERARCHY_DF is reassigned by each call, so the second call receives the hierarchy returned by the first.
FC30_FAMILY_DICT, HIERARCHY_DF = get_TE_counting(FC30_DMGOTH_MAX_AS_BAMFILE, REPEATMASKER_TE_ANNOTATIONS, HIERARCHY_DF, filter_out_co_expressed)
FC29_FAMILY_DICT, HIERARCHY_DF = get_TE_counting(FC29_DMGOTH_MAX_AS_BAMFILE, REPEATMASKER_TE_ANNOTATIONS, HIERARCHY_DF, filter_out_co_expressed)

HEY
HEY


In [89]:
def from_counting_to_df(counting_dict):
    """Summarise a {family: [insertions]} dict as one DataFrame row per family.

    Columns:
    * Family, Min_TE_length, Max_TE_length (lengths are ``end - start``);
    * Counts: sum of the insertions' `counts`;
    * nb_of_insertion and nb_of_expressed_insertion (insertions with counts > 0);
    * Most_expressed_insertion: id of the insertion with the highest count.
      When several insertions tie, the last one in the list is reported; this
      is also the case when every count is 0.
    """
    column_names = ["Family", "Min_TE_length","Max_TE_length", "Counts", "nb_of_insertion", "nb_of_expressed_insertion", "Most_expressed_insertion"]
    rows = []
    for family, insertion_list in counting_dict.items():
        lengths = [insertion.end - insertion.start for insertion in insertion_list]
        counts = [insertion.counts for insertion in insertion_list]
        # The extra first element reproduces the starting values of the running min / max.
        shortest = min([9999999999] + lengths)
        longest = max([0] + lengths)
        peak_count = max([0] + counts)
        top_insertion = None
        for insertion in insertion_list:
            # ">=" (not ">") so that the last insertion reaching the peak wins.
            if insertion.counts >= peak_count:
                top_insertion = insertion.id
        expressed = sum(1 for count in counts if count > 0)
        rows.append((family, shortest, longest, sum(counts), len(insertion_list), expressed, top_insertion))

    counting_df = pd.DataFrame(rows, columns=column_names)
    return counting_df


In [90]:
# One summary row per TE family, for each sample.
FC30_DMGOTH_COUNTING_DF = from_counting_to_df(FC30_FAMILY_DICT)
FC29_DMGOTH_COUNTING_DF = from_counting_to_df(FC29_FAMILY_DICT)

In [91]:
# Save both per-family tables as tab-separated files in the current working directory.
FC30_DMGOTH_COUNTING_DF.to_csv('FC30_DMGOTH_COUNTING_DF.filter_no_co-expressed.tsv', sep = '\t', index=False)
FC29_DMGOTH_COUNTING_DF.to_csv('FC29_DMGOTH_COUNTING_DF.filter_no_co-expressed.tsv', sep = '\t', index=False)

In [92]:
def add_hierarchy_to_counting_df(counting_df, hierarchy_df):
    """Join `hierarchy_df` with `counting_df` on the 'Family' column.

    Only families present in both frames are kept; the hierarchy columns
    come first in the result.
    """
    return hierarchy_df.merge(right=counting_df, how='inner', on='Family')
# Preview: FC29 counting table annotated with the TE hierarchy.
new_df = add_hierarchy_to_counting_df(FC29_DMGOTH_COUNTING_DF, HIERARCHY_DF)
new_df.head()

Unnamed: 0,Subclass,Superfamily,Family,Min_TE_length,Max_TE_length,Counts,nb_of_insertion,nb_of_expressed_insertion,Most_expressed_insertion
0,LINE,R1-LOA,Baggins1,121,10648,824,137,97,Baggins1_2R_RaGOO_986154_989558
1,DNA,TcMar-Tc1,BARI1,114,1563,60,9,9,BARI1_2L_RaGOO_21112657_21113769
2,DNA,TcMar-Tc1,BARI_DM,1232,1732,30,16,3,BARI_DM_3R_RaGOO_18377955_18379687
3,LINE,I-Jockey,BS,101,7501,848,66,50,BS_3L_RaGOO_26154944_26155049
4,LINE,I-Jockey,BS2,120,9056,612,117,90,BS2_2R_RaGOO_3464852_3465309


In [93]:
def create_table_from_counting(female_counting, male_counting, hierarchy):
    """Combine the female and male counting tables with the TE hierarchy.

    The per-sample columns are renamed by name before merging, so the result
    no longer depends on the column order of `hierarchy` (columns used to be
    relabelled by position after the merge). Insertion lengths and the
    number of insertions are taken from `female_counting`; those columns are
    dropped from `male_counting`.

    Parameters
    ----------
    female_counting, male_counting : pandas.DataFrame
        Tables with the columns produced by `from_counting_to_df`.
    hierarchy : pandas.DataFrame
        Table with the columns 'Subclass', 'Superfamily' and 'Family'.

    Returns
    -------
    pandas.DataFrame
        One row per family present in all three inputs.
    """
    female_table = female_counting.rename(columns={
        "Counts": "Female_Counts",
        "nb_of_insertion": "Nb_of_insertion",
        "nb_of_expressed_insertion": "Female_nb_of_expressed_insertion",
        "Most_expressed_insertion": "Female_most_expressed_insertion",
    })
    male_table = male_counting.drop(columns=["Min_TE_length", "Max_TE_length", "nb_of_insertion"]).rename(columns={
        "Counts": "Male_Counts",
        "nb_of_expressed_insertion": "Male_nb_of_expressed_insertion",
        "Most_expressed_insertion": "Male_most_expressed_insertion",
    })
    TE_data_table = hierarchy.merge(female_table, on='Family').merge(male_table, on='Family')
    TE_data_table = TE_data_table[["Subclass", "Superfamily", "Family", "Nb_of_insertion", "Min_TE_length", "Max_TE_length", "Female_Counts", "Female_nb_of_expressed_insertion", "Female_most_expressed_insertion", "Male_Counts", "Male_nb_of_expressed_insertion", "Male_most_expressed_insertion"]]
    return TE_data_table

# FC30 is passed as the female table and FC29 as the male table.
TE_data_table = create_table_from_counting(FC30_DMGOTH_COUNTING_DF, FC29_DMGOTH_COUNTING_DF, HIERARCHY_DF)



In [94]:
# Display the table ordered by decreasing male counts; the sorted copy is not stored.
TE_data_table.sort_values(by="Male_Counts", axis=0,  ascending=False)
# TE_data_table.to_csv('TE_data_table.tsv', sep = '\t', index=False)

Unnamed: 0,Subclass,Superfamily,Family,Nb_of_insertion,Min_TE_length,Max_TE_length,Female_Counts,Female_nb_of_expressed_insertion,Female_most_expressed_insertion,Male_Counts,Male_nb_of_expressed_insertion,Male_most_expressed_insertion
15,RC,Helitron,DNAREP1_DM,3013,101,1186,48742,1979,DNAREP1_DM_2R_RaGOO_4105978_4106120,55820,2512,DNAREP1_DM_3L_RaGOO_21163504_21163657
124,LTR,Pao,ROO,439,101,15803,5917,332,ROO_I_2R_RaGOO_14213942_14227652,19755,403,ROO_I_3R_RaGOO_14005164_14015074
22,DNA,TcMar-Tc1,FB4_DM,197,102,7578,6038,157,FB4_DM_2R_RaGOO_737484_743345,7244,171,FB4_DM_3R_RaGOO_30986386_30987925
54,DNA,P,PROTOP_B,328,101,2140,5863,205,PROTOP_B_3L_RaGOO_23779716_23780536,5054,247,PROTOP_B_2L_RaGOO_21755433_21756077
10,LINE,CR1,DMCR1A,565,101,8863,7836,314,DMCR1A_2R_RaGOO_755292_755591,4695,354,DMCR1A_2R_RaGOO_4646976_4654778
...,...,...,...,...,...,...,...,...,...,...,...,...
6,LINE,I-Jockey,BS4_DM,3,103,750,26,2,BS4_DM_2R_RaGOO_9449597_9449700,18,3,BS4_DM_2R_RaGOO_9449597_9449700
49,DNA,P,PLACW_DM,1,927,927,9,1,PLACW_DM_3L_RaGOO_13917593_13918520,17,1,PLACW_DM_3L_RaGOO_13917593_13918520
57,LINE,R2,R2_DM,3,267,566,2,1,R2_DM_X_RaGOO_21998019_21998557,1,1,R2_DM_X_RaGOO_22026333_22026899
59,LTR,Gypsy,Stalker3_LTR,4,329,387,0,0,Stalker3_LTR_3L_RaGOO_26480807_26481192,0,0,Stalker3_LTR_3L_RaGOO_26480807_26481192


In [95]:
# Display the Subclass column of the combined table.
TE_data_table["Subclass"]

0      LINE
1       DNA
2       DNA
3      LINE
4      LINE
       ... 
126     LTR
127     LTR
128     LTR
129     LTR
130     LTR
Name: Subclass, Length: 131, dtype: object

In [41]:
def plot_TE_expression_profile_by_family(TE_data_table):
    """Show two donut charts (female, male) of read counts per Superfamily.

    Despite the function's name, slices are labelled with the "Superfamily"
    column. Slice values come from "Female_Counts" (left chart) and
    "Male_Counts" (right chart). The figure is displayed with `fig.show()`
    and nothing is returned.
    """
    fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])

    panels = (("Female", "Female_Counts"), ("Male", "Male_Counts"))
    for column_index, (sex, counts_column) in enumerate(panels, start=1):
        donut = go.Pie(
            labels=TE_data_table["Superfamily"],
            values=TE_data_table[counts_column],
            textinfo='value',
            name=sex,
        )
        fig.add_trace(donut, 1, column_index)
    fig.update_traces(hole=.4, hoverinfo="label+value")

    # Text placed in the hole of each donut.
    center_labels = [
        dict(text='Female', x=0.18, y=0.5, font_size=20, showarrow=False),
        dict(text='Male', x=0.80, y=0.5, font_size=20, showarrow=False),
    ]
    fig.update_layout(
        title_text= "Superfamily expression profile",
        width=1200,
        height=500,
        annotations=center_labels)
    fig.show()

In [42]:
def plot_TE_expression_profile_by_superfamily(TE_data_table):
    """Show two donut charts (female, male) of read counts per Subclass.

    Despite the function's name, slices are labelled with the "Subclass"
    column. Slice values come from "Female_Counts" (left chart) and
    "Male_Counts" (right chart). The figure is displayed with `fig.show()`
    and nothing is returned.
    """
    fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])

    panels = (("Female", "Female_Counts"), ("Male", "Male_Counts"))
    for column_index, (sex, counts_column) in enumerate(panels, start=1):
        donut = go.Pie(
            labels=TE_data_table["Subclass"],
            values=TE_data_table[counts_column],
            textinfo='value',
            name=sex,
        )
        fig.add_trace(donut, 1, column_index)
    fig.update_traces(hole=.4, hoverinfo="label+value")

    # Text placed in the hole of each donut.
    center_labels = [
        dict(text='Female', x=0.18, y=0.5, font_size=20, showarrow=False),
        dict(text='Male', x=0.80, y=0.5, font_size=20, showarrow=False),
    ]
    fig.update_layout(
        title_text= "Subclass expression profile",
        width=1200,
        height=500,
        annotations=center_labels)
    fig.show()

In [96]:
# Donut charts of female / male counts grouped by Superfamily.
plot_TE_expression_profile_by_family(TE_data_table)

In [97]:
# Donut charts of female / male counts grouped by Subclass.
plot_TE_expression_profile_by_superfamily(TE_data_table)