In [64]:
# DEPENDENCIES

import pysam
import pandas as pd
import HTSeq

# GLOBAL VARS

# FC30 alignments: full BAM and its max-AS filtered counterpart.
# The filtered file is the BAM handed to main() in this notebook.
FC30_DMGOTH_BAMFILE = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/bam/FC30.against_dmgoth.bam"
FC30_DMGOTH_MAX_AS_BAMFILE = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/bam/FC30.against_dmgoth.filtered.bam"

# FC29 alignments: input and output paths of the filter_max_AS_reads call in this notebook.
FC29_DMGOTH_BAMFILE = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/bam/FC29.against_dmgoth.bam"
FC29_DMGOTH_MAX_AS_BAMFILE = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/bam/FC29.against_dmgoth.filtered.bam"

# FC30 subset alignments: only referenced by commented-out calls in this notebook.
FC30_DMGOTH_SUBSET_BAMFILE = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/bam/FC30.subset.against_dmgoth.bam"
FC30_DMGOTH_SUBSET_MAX_AS_BAMFILE = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/bam/FC30.subset.against_dmgoth.filtered.bam"

# Annotation files. The TE annotation is the file parsed by
# initiate_transposable_element_family_dict; the gene annotation is not
# referenced in the visible part of the notebook.
REPEATMASKER_TE_ANNOTATIONS = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/annotations/dmgoth101.onecode.v2.gtf"
DMGOTH_GENE_ANNOTATIONS = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/annotations/Dm_Goth_10-1.dmel6.23LiftOff.sorted.gff"

In [52]:
def build_index(bamfile):
    """Return a built pysam.IndexedReads index for the BAM file at `bamfile`.

    The underlying AlignmentFile is opened here and deliberately left open,
    since the returned index is queried after this function returns.
    """
    read_index = pysam.IndexedReads(pysam.AlignmentFile(bamfile, 'rb'))
    read_index.build()
    return read_index

def get_query_names(bamfile):
    """Return the set of distinct query names found in the BAM file `bamfile`."""
    with pysam.AlignmentFile(bamfile, 'rb') as bam:
        return {record.query_name for record in bam}


In [65]:
# Filter alignments:

## We discard unmapped and supplementary alignments, and any alignment whose AS score is lower than the best AS score of its read.

def filter_max_AS_reads(bamfile, output_bamfile):
    """Write, for every read, only its best-scoring alignments to a new BAM.

    For each query name in `bamfile`, unmapped and supplementary alignments
    are dropped; among the remaining (primary and secondary) alignments only
    those whose 'AS' tag equals the read's maximum 'AS' are written. Ties are
    all kept.

    Parameters
    ----------
    bamfile : str
        Path of the input BAM file.
    output_bamfile : str
        Path of the BAM file to create; its header is copied from `bamfile`.

    Returns
    -------
    str
        `output_bamfile`, unchanged.

    Notes
    -----
    Alignments are written read by read, in the iteration order of a set of
    query names, so the output is not coordinate-sorted.
    NOTE(review): region queries on the output need a sorted and indexed
    BAM -- confirm this is done outside the notebook.
    """
    query_names = get_query_names(bamfile)
    read_index = build_index(bamfile)
    # Open the header template in the same `with` so its handle is closed too.
    with pysam.AlignmentFile(bamfile, 'rb') as template_bam, \
            pysam.AlignmentFile(output_bamfile, 'wb', template=template_bam) as output:
        for read in query_names:
            # Pair each kept alignment with its AS so the tag is read only once.
            scored_alignments = [
                (ali, ali.get_tag('AS'))
                for ali in read_index.find(read)
                if not (ali.is_unmapped or ali.is_supplementary)
            ]
            if not scored_alignments:
                continue
            max_AS = max(score for _, score in scored_alignments)
            for ali, score in scored_alignments:
                if score == max_AS:
                    output.write(ali)
    return output_bamfile
# Subset run, currently disabled:
# filter_max_AS_reads(FC30_DMGOTH_SUBSET_BAMFILE, FC30_DMGOTH_SUBSET_MAX_AS_BAMFILE)
# Filter the FC29 alignments; the returned output path is what the cell displays.
filter_max_AS_reads(FC29_DMGOTH_BAMFILE, FC29_DMGOTH_MAX_AS_BAMFILE)


'/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/bam/FC29.against_dmgoth.filtered.bam'

In [10]:
class TransposableElementInsertion:
    """One annotated transposable-element insertion and its read count.

    Attributes
    ----------
    id : insertion identifier as given by the caller.
    chrom : name of the sequence carrying the insertion.
    start, end : insertion coordinates, converted to int.
    alignments : list, created empty.
    count : number of reads assigned to the insertion, 0 until it is set.
    counts : backward-compatible alias of `count`.
    """

    def __init__(self, insertion_id, chrom, start, end):
        self.id = insertion_id
        self.chrom = chrom
        self.start = int(start)
        self.end = int(end)
        self.alignments = []
        # The rest of the notebook reads and writes `count`, so that is the
        # attribute that must exist from construction.
        self.count = 0

    @property
    def counts(self):
        """Alias of `count`, kept for code written against the old name."""
        return self.count

    @counts.setter
    def counts(self, value):
        self.count = value

        



In [67]:
def initiate_transposable_element_family_dict(annotations, min_length=80):
    """Parse a tab-separated TE annotation file into a family -> insertions dict.

    Each data line provides the sequence name (column 1), start (column 4)
    and end (column 5). The last column is split on double quotes: the first
    quoted value is used as the family name, the second as the insertion id.
    Blank lines and lines starting with '#' are skipped.

    Parameters
    ----------
    annotations : str
        Path of the annotation file.
    min_length : int, optional
        Only insertions with `end - start` strictly greater than this value
        are kept (default 80).

    Returns
    -------
    dict
        Maps each family name to the list of its TransposableElementInsertion
        objects, in file order.
    """
    transposable_element_family_dict = {}
    with open(annotations) as annot:
        for line in annot:
            stripped_line = line.strip()
            # Header/comment lines and blank lines carry no feature.
            if not stripped_line or stripped_line.startswith("#"):
                continue
            fields = stripped_line.split("\t")
            chrom, start, end = fields[0], fields[3], fields[4]
            ids = fields[-1].split('"')
            family_name = ids[1]
            if int(end) - int(start) > min_length:
                new_insertion = TransposableElementInsertion(ids[3], chrom, start, end)
                transposable_element_family_dict.setdefault(family_name, []).append(new_insertion)
    return transposable_element_family_dict
# Build the family -> insertions dict from the TE annotation file.
FC30_SUBSET_FAMILY_DICT = initiate_transposable_element_family_dict(REPEATMASKER_TE_ANNOTATIONS)

In [60]:
def get_subject_coverage(alignment, insertion):
    """Return the fraction of the insertion's length overlapped by the alignment.

    The overlap is the span between the larger of the two starts and the
    smaller of the two ends; it is negative when the intervals are disjoint,
    in which case the returned fraction is negative as well.
    """
    shared_span = (
        min(alignment.reference_end, insertion.end)
        - max(alignment.reference_start, insertion.start)
    )
    return shared_span / (insertion.end - insertion.start)

def default_filter(alignment, insertion):
    """Tell whether an alignment is accepted for an insertion.

    Accepted when the alignment covers more than 10% of the insertion and
    `alignment.get_overlap` over the insertion interval is greater than 1.
    """
    covered_fraction = get_subject_coverage(alignment, insertion)
    overlapping_bases = alignment.get_overlap(insertion.start, insertion.end)
    if not covered_fraction > 0.1:
        return False
    return overlapping_bases > 1

def get_exceeding_alignment_length(alignment, insertion):
    """Return how many reference positions of the alignment lie outside the insertion.

    The overhang before the insertion start and the overhang after the
    insertion end are summed; each side contributes 0 when the alignment
    does not extend past that boundary.
    """
    left_overhang = max(0, insertion.start - alignment.reference_start)
    right_overhang = max(0, alignment.reference_end - insertion.end)
    return left_overhang + right_overhang

def filter_out_co_expressed(alignment, insertion):
    """Tell whether an alignment stays (almost) entirely inside an insertion.

    Accepted when the part of the alignment lying outside the insertion is
    strictly less than 10% of the alignment's reference span.
    """
    overhang = get_exceeding_alignment_length(alignment, insertion)
    aligned_span = alignment.reference_end - alignment.reference_start
    return overhang < (0.1 * aligned_span)

def match_reads_with_insertions(bamfile, family_dict, filter_strategy):
    """Count, for every insertion, the reads of `bamfile` accepted by `filter_strategy`.

    For each insertion of each family, the reads found over the insertion
    interval are counted with `filter_strategy(read, insertion)` as the
    acceptance test, and the result is stored in `insertion.count`.

    Returns the same `family_dict`, with its insertions updated in place.
    """
    with pysam.AlignmentFile(bamfile, 'rb') as bam:
        for insertion_list in family_dict.values():
            for insertion in insertion_list:
                insertion.count = bam.count(
                    contig=insertion.chrom,
                    start=insertion.start,
                    end=insertion.end,
                    # Bind the current insertion so the callback only takes the read.
                    read_callback=lambda read, target=insertion: filter_strategy(read, target),
                )
    return family_dict
# FC30_SUBSET_FAMILY_DICT = match_reads_with_insertions(FC30_DMGOTH_SUBSET_BAMFILE, FC30_SUBSET_FAMILY_DICT, filter_out_co_expressed)

In [69]:
# Print the id and read count of every insertion with more than 3 reads.
# NOTE: FC30_FAMILY_DICT is assigned by the main(...) call in a cell located
# further down in this notebook, so that cell has to be executed first.
for insertion_list in FC30_FAMILY_DICT.values():
    for insertion in insertion_list:
        if insertion.count <= 3:
            continue
        print(insertion.id)
        print(insertion.count)


TART-A_3L_RaGOO_24811786_24820381
5
TART-A_3L_RaGOO_24818846_24827437
4
HETA_3L_RaGOO_18403_20679
4
HETA_X_RaGOO_85920_94840
4
Copia_LTR_2L_RaGOO_9004503_9009641
5
Copia_LTR_2L_RaGOO_13770083_13775227
9
Copia_LTR_2L_RaGOO_14753817_14758950
19
Copia_LTR_2L_RaGOO_21771835_21776956
19
Copia_LTR_2R_RaGOO_20482857_20488005
14
Copia_LTR_3L_RaGOO_10022428_10027467
34
Copia_LTR_3L_RaGOO_10052506_10057644
26
Copia_LTR_3L_RaGOO_17623155_17628299
20
Copia_LTR_3R_RaGOO_1372041_1377188
9
ROO_I_2R_RaGOO_13238874_13250807
18
ROO_I_2R_RaGOO_14213942_14227652
45
ROO_I_3R_RaGOO_14005164_14015074
85
ROO_I_X_RaGOO_4684190_4696799
353
ROO_I_X_RaGOO_6590807_6605123
7
BLOOD_LTR_2L_RaGOO_2148055_2155527
7
BLOOD_LTR_2L_RaGOO_3268611_3276063
4
BLOOD_LTR_2R_RaGOO_416001_421270
4
POGO_2L_RaGOO_2955877_2958004
6
POGO_2R_RaGOO_7201268_7202754
37
POGO_3L_RaGOO_9733928_9735150
68
POGO_3L_RaGOO_17298517_17299810
4
POGO_3L_RaGOO_20073927_20075365
4
POGO_3R_RaGOO_11285980_11288111
4
POGO_X_RaGOO_14998652_15000110
4
POGO

In [68]:
def main(filtered_bamfile, annotations, filter_strategy):
    """Build the family dict from `annotations` and fill in its read counts.

    The insertions parsed from `annotations` are matched against the reads of
    `filtered_bamfile` using `filter_strategy`; the resulting family dict is
    returned.
    """
    return match_reads_with_insertions(
        filtered_bamfile,
        initiate_transposable_element_family_dict(annotations),
        filter_strategy,
    )
# Per-insertion read counts for the max-AS filtered FC30 alignments, keeping
# only the reads accepted by filter_out_co_expressed.
FC30_FAMILY_DICT = main(FC30_DMGOTH_MAX_AS_BAMFILE, REPEATMASKER_TE_ANNOTATIONS, filter_out_co_expressed)