In [57]:
# DEPENDENCIES

import pysam
import pandas as pd
import HTSeq
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# GLOBAL VARS

# Alignments of each sample against the dmgoth101 genome. In this notebook
# FC30 is passed as the female sample and FC29 as the male sample when the
# final table is built. The "*.filtered.bam" paths are the ones given as
# output to filter_max_AS_reads (see the commented-out calls below that
# function).
# NOTE(review): these are absolute, machine-specific paths; consider deriving
# them from a single configurable data directory.
FC30_DMGOTH_BAMFILE = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/bam/FC30.against_dmgoth.bam"
FC30_DMGOTH_MAX_AS_BAMFILE = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/bam/FC30.against_dmgoth.filtered.bam"

FC29_DMGOTH_BAMFILE = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/bam/FC29.against_dmgoth.bam"
FC29_DMGOTH_MAX_AS_BAMFILE = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/bam/FC29.against_dmgoth.filtered.bam"

# Subset of FC30 (presumably a smaller file for quick test runs -- confirm).
FC30_DMGOTH_SUBSET_BAMFILE = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/bam/FC30.subset.against_dmgoth.bam"
FC30_DMGOTH_SUBSET_MAX_AS_BAMFILE = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/bam/FC30.subset.against_dmgoth.filtered.bam"

# Tab-separated TE insertion annotations (parsed by
# initiate_transposable_element_family_dict) and gene annotations.
REPEATMASKER_TE_ANNOTATIONS = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/annotations/dmgoth101.onecode.v2.gtf"
DMGOTH_GENE_ANNOTATIONS = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/annotations/Dm_Goth_10-1.dmel6.23LiftOff.sorted.gff"

# FASTA of TE consensus sequences whose headers carry the
# "family#subclass/superfamily" hierarchy read by get_TE_hierarchy.
DFAM_TE_ANNOTATION = "/data2/eric/TE_LR_RNAseq/data/families.flanked_LTR.hierarchy.fa"

In [16]:
def build_index(bamfile):
    """Build a query-name index over the alignments of a BAM file.

    Parameters
    ----------
    bamfile : str
        Path of the BAM file to index.

    Returns
    -------
    pysam.IndexedReads
        The built index; callers use its ``find(name)`` method to retrieve
        the alignments of a given read.
    """
    # The AlignmentFile handle is intentionally not closed here: the index is
    # constructed on top of it and is returned to the caller.
    indexed_reads = pysam.IndexedReads(pysam.AlignmentFile(bamfile, 'rb'))
    indexed_reads.build()
    return indexed_reads

def get_query_names(bamfile):
    """Return the set of distinct query (read) names found in a BAM file.

    Parameters
    ----------
    bamfile : str
        Path of the BAM file to scan.

    Returns
    -------
    set
        Every ``query_name`` seen while iterating over the file's alignments.
    """
    with pysam.AlignmentFile(bamfile, 'rb') as alignment_file:
        return {record.query_name for record in alignment_file}


In [17]:
# Filter alignments:

## For each read, we drop unmapped and supplementary alignments, then keep only the alignment(s) with the highest AS score.

def filter_max_AS_reads(bamfile, output_bamfile):
    """Write, for every read, only its best-scoring alignments to a new BAM.

    For each query name, unmapped and supplementary alignments are discarded;
    among the remaining (primary and secondary) alignments, those whose ``AS``
    tag equals the maximum ``AS`` for that read are written to the output.
    Reads with no remaining alignment are skipped entirely.

    Parameters
    ----------
    bamfile : str
        Path of the input BAM file.
    output_bamfile : str
        Path of the BAM file to create; its header is copied from ``bamfile``.

    Returns
    -------
    str
        ``output_bamfile``, unchanged.

    Raises
    ------
    KeyError
        If a retained alignment has no ``AS`` tag (raised by ``get_tag``).
    """
    query_names = get_query_names(bamfile)
    read_index = build_index(bamfile)
    # Open the header template in the same ``with`` statement so that its file
    # handle is closed too (it used to be opened inline and never closed).
    with pysam.AlignmentFile(bamfile, 'rb') as template_bam, \
            pysam.AlignmentFile(output_bamfile, 'wb', template=template_bam) as output:
        # Iterate in sorted order: iterating the raw set would make the record
        # order of the output file differ from one run to the next.
        # NOTE(review): records are therefore grouped by read name, not by
        # coordinate -- sort and index the output before running region
        # queries on it.
        for read in sorted(query_names):
            primary_and_secondary_alignments = [
                ali for ali in read_index.find(read)
                if not (ali.is_unmapped or ali.is_supplementary)
            ]
            if not primary_and_secondary_alignments:
                continue
            max_AS = max(ali.get_tag('AS') for ali in primary_and_secondary_alignments)
            for ali in primary_and_secondary_alignments:
                # Ties are all kept: every alignment reaching the best score is written.
                if ali.get_tag('AS') == max_AS:
                    output.write(ali)
    return output_bamfile
# filter_max_AS_reads(FC30_DMGOTH_SUBSET_BAMFILE, FC30_DMGOTH_SUBSET_MAX_AS_BAMFILE)
# filter_max_AS_reads(FC29_DMGOTH_BAMFILE, FC29_DMGOTH_MAX_AS_BAMFILE)


In [18]:
class TransposableElementInsertion:
    """One annotated transposable-element insertion and its read count.

    Attributes are set in the constructor: ``id``, ``chrom``, integer
    ``start``/``end`` coordinates, an initially empty ``alignments`` list and
    a ``counts`` counter starting at 0.
    """

    def __init__(self, insertion_id, chrom, start, end):
        """Store the insertion identity and coordinates.

        ``start`` and ``end`` may be given as strings; they are converted
        with ``int``.
        """
        self.id, self.chrom = insertion_id, chrom
        self.start, self.end = int(start), int(end)
        # Per-instance containers, filled in later by the counting code.
        self.alignments = list()
        self.counts = 0

        



In [19]:
def initiate_transposable_element_family_dict(annotations, min_length=80):
    """Parse a tab-separated TE annotation file into per-family insertion lists.

    Each data line is split on tabs: column 1 is the chromosome, columns 4 and
    5 are the start and end coordinates, and the last column is split on
    double quotes so that the first quoted value is the family name and the
    second quoted value is the insertion identifier. Blank lines and lines
    starting with ``#`` (GTF/GFF comment or header lines) are skipped instead
    of crashing the parser.

    Parameters
    ----------
    annotations : str
        Path of the annotation file.
    min_length : int, optional
        Only insertions with ``end - start`` strictly greater than this value
        are kept. Defaults to 80, the threshold previously hard-coded here.

    Returns
    -------
    dict
        Maps each family name to a list of ``TransposableElementInsertion``
        objects, in file order.
    """
    # NOTE(review): coordinates are stored exactly as written in the file;
    # GTF is normally 1-based inclusive while pysam positions are 0-based --
    # confirm whether an offset is needed downstream.
    transposable_element_family_dict = {}
    with open(annotations) as annot:
        for line in annot:
            record = line.strip()
            if not record or record.startswith("#"):
                continue
            splitted_line = record.split("\t")
            chrom, start, end = splitted_line[0], splitted_line[3], splitted_line[4]
            ids = splitted_line[-1].split('"')
            family_name = ids[1]
            if int(end) - int(start) > min_length:
                new_insertion = TransposableElementInsertion(ids[3], chrom, start, end)
                transposable_element_family_dict.setdefault(family_name, []).append(new_insertion)
    return transposable_element_family_dict
# Parse the RepeatMasker TE annotations once at module level.
# NOTE(review): this dictionary is not referenced by any other visible cell
# (get_TE_counting re-parses the annotations itself) -- confirm it is still needed.
TE_ANNOTATION_DICT = initiate_transposable_element_family_dict(REPEATMASKER_TE_ANNOTATIONS)

In [103]:
def get_subject_coverage(alignment, insertion):
    """Return the fraction of the insertion spanned by the alignment.

    The overlap is computed from the reference interval of the alignment
    (``reference_start``..``reference_end``) and the insertion interval
    (``start``..``end``), then divided by the insertion length.

    Parameters
    ----------
    alignment
        Object exposing ``reference_start`` and ``reference_end``.
    insertion
        Object exposing ``start`` and ``end``.

    Returns
    -------
    float
        Overlap length divided by insertion length. Disjoint intervals give 0
        (the raw difference would be negative), and an insertion of
        non-positive length gives 0.0 instead of raising ZeroDivisionError.
    """
    insertion_length = insertion.end - insertion.start
    if insertion_length <= 0:
        return 0.0
    overlap_start = max(alignment.reference_start, insertion.start)
    overlap_end = min(alignment.reference_end, insertion.end)
    # Clamp at 0: when the intervals do not intersect, end - start is negative.
    overlap_length = max(0, overlap_end - overlap_start)
    return overlap_length / insertion_length

def default_filter(alignment, insertion):
    """Decide whether an alignment counts towards an insertion.

    An alignment is accepted when it spans more than 10% of the insertion
    (see ``get_subject_coverage``) and ``alignment.get_overlap`` over the
    insertion interval is greater than 1.

    Returns
    -------
    bool
        True when both conditions hold.
    """
    coverage = get_subject_coverage(alignment, insertion)
    overlapping_bases = alignment.get_overlap(insertion.start, insertion.end)
    if not coverage > 0.1:
        return False
    return overlapping_bases > 1

def get_exceeding_alignment_length(alignment, insertion):
    """Return how many reference positions of the alignment lie outside the insertion.

    The result is the sum of the overhang before ``insertion.start`` and the
    overhang after ``insertion.end``; a side that does not stick out
    contributes 0.
    """
    upstream_overhang = insertion.start - alignment.reference_start
    downstream_overhang = alignment.reference_end - insertion.end
    # A negative difference means the alignment does not extend past that edge.
    return max(upstream_overhang, 0) + max(downstream_overhang, 0)

def filter_out_co_expressed(alignment, insertion):
    """Accept an alignment only if it lies almost entirely inside the insertion.

    Returns True (keep the alignment) when the part of the alignment that
    extends beyond the insertion boundaries is strictly less than 10% of the
    alignment's reference span, and False otherwise.
    """
    overhang = get_exceeding_alignment_length(alignment, insertion)
    reference_span = alignment.reference_end - alignment.reference_start
    return overhang < 0.1 * reference_span

def match_reads_with_insertions(bamfile, family_dict, filter_strategy):
    """Count, for every insertion, the alignments accepted by a filter.

    Parameters
    ----------
    bamfile : str
        Path of the BAM file queried by region.
        NOTE(review): region queries in pysam normally need an indexed BAM --
        confirm the index exists next to the file.
    family_dict : dict
        Maps family names to lists of insertion objects; each insertion's
        ``counts`` attribute is overwritten in place.
    filter_strategy : callable
        Called as ``filter_strategy(read, insertion)``; a read is counted
        when it returns a true value.

    Returns
    -------
    dict
        The same ``family_dict`` object, with updated ``counts``.
    """
    with pysam.AlignmentFile(bamfile, 'rb') as bam:
        for insertions in family_dict.values():
            for insertion in insertions:
                insertion.counts = bam.count(
                    contig=insertion.chrom,
                    start=insertion.start,
                    end=insertion.end,
                    # Bind the current insertion so the callback only needs the read.
                    read_callback=lambda read, target=insertion: filter_strategy(read, target),
                )
    return family_dict
# FC30_SUBSET_FAMILY_DICT = match_reads_with_insertions(FC30_DMGOTH_SUBSET_BAMFILE, FC30_SUBSET_FAMILY_DICT, filter_out_co_expressed)

In [111]:
def get_TE_counting(filtered_bamfile, annotations, filter_strategy):
    """Parse the TE annotations and count reads per insertion in one step.

    Parameters
    ----------
    filtered_bamfile : str
        BAM file in which alignments are counted.
    annotations : str
        TE annotation file used to build the family -> insertions dictionary.
    filter_strategy : callable
        Read/insertion predicate forwarded to ``match_reads_with_insertions``.

    Returns
    -------
    dict
        Family name -> list of insertions, each with its ``counts`` filled in.
    """
    # match_reads_with_insertions updates the dictionary in place and returns it.
    return match_reads_with_insertions(
        filtered_bamfile,
        initiate_transposable_element_family_dict(annotations),
        filter_strategy,
    )
# Per-insertion read counts for both samples, computed on the max-AS filtered
# BAM files and keeping only reads accepted by filter_out_co_expressed.
FC30_FAMILY_DICT = get_TE_counting(FC30_DMGOTH_MAX_AS_BAMFILE, REPEATMASKER_TE_ANNOTATIONS, filter_out_co_expressed)
FC29_FAMILY_DICT = get_TE_counting(FC29_DMGOTH_MAX_AS_BAMFILE, REPEATMASKER_TE_ANNOTATIONS, filter_out_co_expressed)

In [102]:
# for family, insertion_list in FC30_FAMILY_DICT.items():
#     for insertion in insertion_list:
#         if insertion.id == "Copia_LTR_2L_RaGOO_9004503_9009641":
#             print(insertion.count)

# for family, insertion_list in FC30_FAMILY_DICT.items():
#     for insertion in insertion_list:
#         if insertion.id == "Copia_LTR_2L_RaGOO_9004503_9009641":
#             # print(family)
#             # print(insertion.id)
#             print(insertion.count)

5
5


In [42]:
def get_TE_hierarchy(consensus_fasta):
    """Extract the TE classification from the headers of a consensus FASTA.

    Header lines are expected to look like ``>Family#Subclass/Superfamily``;
    all other lines are ignored.

    Parameters
    ----------
    consensus_fasta : str
        Path of the FASTA file.

    Returns
    -------
    pandas.DataFrame
        One row per header, with columns ``Subclass``, ``Superfamily`` and
        ``Family``, in file order.

    Raises
    ------
    IndexError
        If a header contains no ``#``.
    ValueError
        If the part after ``#`` does not split into exactly two fields on ``/``.
    """
    records = []
    with open(consensus_fasta, 'r') as consensus:
        for line in consensus:
            if not line.startswith(">"):
                continue
            header_fields = line.strip().split("#")
            # Drop the leading ">" from the family name.
            family = header_fields[0][1:]
            subclass, superfamily = header_fields[1].split("/")
            records.append((subclass, superfamily, family))
    return pd.DataFrame(records, columns=['Subclass', 'Superfamily', 'Family'])
# Subclass / Superfamily / Family table parsed from the consensus FASTA headers.
HIERARCHY_DF = get_TE_hierarchy(DFAM_TE_ANNOTATION)

In [105]:
def from_counting_to_df(counting_dict):
    """Summarise per-insertion counts into one row per TE family.

    Parameters
    ----------
    counting_dict : dict
        Maps family names to lists of insertions exposing ``start``, ``end``
        and ``counts``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Family``, ``Min_TE_length``, ``Max_TE_length`` and
        ``Counts`` (sum of the insertion counts of the family).
    """
    rows = []
    for family, insertions in counting_dict.items():
        # Single pass over the insertions: (length, counts) pairs.
        stats = [(ins.end - ins.start, ins.counts) for ins in insertions]
        lengths = [length for length, _ in stats]
        rows.append((
            family,
            # 1500000 and 0 are the starting bounds of the running min / max.
            min([1500000, *lengths]),
            max([0, *lengths]),
            sum(counts for _, counts in stats),
        ))
    return pd.DataFrame(rows, columns=["Family", "Min_TE_length","Max_TE_length", "Counts"])


In [112]:
# One row per TE family (length range and summed counts) for each sample.
FC30_DMGOTH_COUNTING_DF = from_counting_to_df(FC30_FAMILY_DICT)
FC29_DMGOTH_COUNTING_DF = from_counting_to_df(FC29_FAMILY_DICT)

In [43]:
HIERARCHY_DF

Unnamed: 0,Subclass,Superfamily,Family
0,Unknown,Unknown,XDMR_DM
1,Unknown,Unknown,XDMR
2,rRNA,Unknown,5S_DM
3,Unknown,Unknown,ALA_DM
4,Unknown,Unknown,ARS406_DM
...,...,...,...
142,LTR,Gypsy,Stalker2
143,LTR,Gypsy,STALKER4
144,LTR,Gypsy,TABOR
145,LTR,Gypsy,TIRANT


In [113]:
def add_hierarchy_to_counting_df(counting_df, hierarchy_df):
    """Attach the TE classification to a per-family counting table.

    The two frames are joined on their ``Family`` column with the default
    (inner) merge, hierarchy columns first, so families absent from either
    frame are dropped.
    """
    return pd.merge(hierarchy_df, counting_df, on='Family')
# Quick look at the FC30 counts annotated with their TE hierarchy.
new_df = add_hierarchy_to_counting_df(FC30_DMGOTH_COUNTING_DF, HIERARCHY_DF)
new_df.head()

Unnamed: 0,Subclass,Superfamily,Family,Min_TE_length,Max_TE_length,Counts
0,LINE,R1-LOA,Baggins1,121,10648,2
1,DNA,TcMar-Tc1,BARI1,98,1563,0
2,DNA,TcMar-Tc1,BARI_DM,1232,1732,1
3,LINE,I-Jockey,BS,87,7501,2
4,LINE,I-Jockey,BS2,82,9056,1


In [114]:
def create_table_from_counting(female_counting, male_counting, hierarchy):
    """Combine female and male family counts with the TE hierarchy.

    Parameters
    ----------
    female_counting, male_counting : pandas.DataFrame
        Per-family tables with ``Family``, ``Min_TE_length``,
        ``Max_TE_length`` and ``Counts`` columns. The length columns are
        taken from the female table only.
    hierarchy : pandas.DataFrame
        Table with ``Subclass``, ``Superfamily`` and ``Family`` columns.

    Returns
    -------
    pandas.DataFrame
        Inner join of the three tables on ``Family`` with columns
        ``Subclass``, ``Superfamily``, ``Family``, ``Min_TE_length``,
        ``Max_TE_length``, ``Female_Counts`` and ``Male_Counts``.
    """
    # Keep only Family + Counts on the male side so lengths are not duplicated.
    male_counts_only = male_counting.drop(columns=["Min_TE_length", "Max_TE_length"])
    with_female = pd.merge(hierarchy, female_counting, on='Family')
    merged = pd.merge(with_female, male_counts_only, on='Family')
    # Columns are renamed by position; the two "Counts" columns come last,
    # female first.
    merged.columns = ["Subclass", "Superfamily", "Family", "Min_TE_length", "Max_TE_length", "Female_Counts", "Male_Counts"]
    return merged

# Final table: FC30 counts are used as the female sample, FC29 as the male sample.
TE_data_table = create_table_from_counting(FC30_DMGOTH_COUNTING_DF, FC29_DMGOTH_COUNTING_DF, HIERARCHY_DF)



In [115]:
TE_data_table.sort_values(by="Female_Counts", axis=0,  ascending=False).head()

Unnamed: 0,Subclass,Superfamily,Family,Min_TE_length,Max_TE_length,Female_Counts,Male_Counts
50,DNA,TcMar-Pogo,POGO,233,3577,220,30
10,LINE,CR1,DMCR1A,83,8863,84,117
68,DNA,CMC-Transib,TRANSIB3,94,5046,16,7
61,LINE,I-Jockey,TART-A,83,8595,15,210
37,LINE,I-Jockey,HETA,109,8920,14,787


In [62]:
def plot_TE_expression_profile_by_family(TE_data_table):
    """Show side-by-side donut charts of female and male counts per Superfamily.

    Despite the function name, slices are labelled with the ``Superfamily``
    column of ``TE_data_table``; values come from ``Female_Counts`` (left)
    and ``Male_Counts`` (right). The figure is displayed and nothing is
    returned.
    """
    fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])

    # (counts column, trace name, subplot column) for each donut.
    panels = (("Female_Counts", "Female", 1), ("Male_Counts", "Male", 2))
    for counts_column, trace_name, subplot_col in panels:
        donut = go.Pie(
            labels=TE_data_table["Superfamily"],
            values=TE_data_table[counts_column],
            textinfo='value',
            name=trace_name,
        )
        fig.add_trace(donut, 1, subplot_col)
    fig.update_traces(hole=.4, hoverinfo="label+value")

    # Text placed in the hole of each donut.
    center_labels = [
        dict(text='Female', x=0.18, y=0.5, font_size=20, showarrow=False),
        dict(text='Male', x=0.80, y=0.5, font_size=20, showarrow=False),
    ]
    fig.update_layout(
        title_text="Superfamily expression profile",
        width=1200,
        height=500,
        annotations=center_labels,
    )
    fig.show()

In [117]:
def plot_TE_expression_profile_by_superfamily(TE_data_table):
    """Show side-by-side donut charts of female and male counts per Subclass.

    Despite the function name, slices are labelled with the ``Subclass``
    column of ``TE_data_table``; values come from ``Female_Counts`` (left)
    and ``Male_Counts`` (right). The figure is displayed and nothing is
    returned.
    """
    fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])

    # (counts column, trace name, subplot column) for each donut.
    panels = (("Female_Counts", "Female", 1), ("Male_Counts", "Male", 2))
    for counts_column, trace_name, subplot_col in panels:
        donut = go.Pie(
            labels=TE_data_table["Subclass"],
            values=TE_data_table[counts_column],
            textinfo='value',
            name=trace_name,
        )
        fig.add_trace(donut, 1, subplot_col)
    fig.update_traces(hole=.4, hoverinfo="label+value")

    # Text placed in the hole of each donut.
    center_labels = [
        dict(text='Female', x=0.18, y=0.5, font_size=20, showarrow=False),
        dict(text='Male', x=0.80, y=0.5, font_size=20, showarrow=False),
    ]
    fig.update_layout(
        title_text="Subclass expression profile",
        width=1200,
        height=500,
        annotations=center_labels,
    )
    fig.show()

In [116]:
# Female vs. male donut charts; this function labels slices by the Superfamily column.
plot_TE_expression_profile_by_family(TE_data_table)

In [118]:
# Female vs. male donut charts; this function labels slices by the Subclass column.
plot_TE_expression_profile_by_superfamily(TE_data_table)
