In [21]:
# DEPENDENCIES

import pysam
import pandas as pd
import HTSeq

# GLOBAL VARS

# Root directory of the dmgoth101 genome-alignment data. Every path below is derived
# from it, so relocating the data only requires editing this one line.
DMGOTH_ALIGNMENT_DIR = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments"
DMGOTH_BAM_DIR = DMGOTH_ALIGNMENT_DIR + "/bam"
DMGOTH_ANNOTATION_DIR = DMGOTH_ALIGNMENT_DIR + "/annotations"

# BAM alignments against the dmgoth genome (full FC30 / FC29 files and an FC30 subset).
FC30_DMGOTH_BAMFILE = DMGOTH_BAM_DIR + "/FC30.against_dmgoth.bam"
FC29_DMGOTH_BAMFILE = DMGOTH_BAM_DIR + "/FC29.against_dmgoth.bam"
FC30_DMGOTH_SUBSET_BAMFILE = DMGOTH_BAM_DIR + "/FC30.subset.against_dmgoth.bam"
# Output path written by filter_max_AS_reads() when run on the subset BAM.
FC30_DMGOTH_SUBSET_MAX_AS_BAMFILE = DMGOTH_BAM_DIR + "/FC30.subset.against_dmgoth.filtered.bam"

# Annotation files: the TE GTF is parsed by initiate_transposable_element_family_dict();
# the gene GFF is not used by any cell visible in this part of the notebook.
REPEATMASKER_TE_ANNOTATIONS = DMGOTH_ANNOTATION_DIR + "/dmgoth101.onecode.v2.gtf"
DMGOTH_GENE_ANNOTATIONS = DMGOTH_ANNOTATION_DIR + "/Dm_Goth_10-1.dmel6.23LiftOff.sorted.gff"

In [6]:
# Look up every alignment stored for one read name.
# NOTE(review): `alignment_index` is not assigned in any cell visible here; presumably it
# is the object returned by build_index(...) — confirm, and define it before this lookup
# so the notebook survives Restart Kernel -> Run All.
alignment_index.find("ceafd9ce-643e-4b66-835a-5af47d256ce1")

<pysam.libcalignmentfile.IteratorRowSelection at 0x7fd882b036d0>

In [11]:
def build_index(bamfile):
    """Open a BAM file and return a built ``pysam.IndexedReads`` index over it.

    Parameters
    ----------
    bamfile : str
        Path to the BAM file to index.

    Returns
    -------
    pysam.IndexedReads
        The index, after ``build()`` has been called on it.

    Notes
    -----
    The ``AlignmentFile`` opened here is not closed by this function.
    NOTE(review): presumably the returned index still reads from that handle when
    ``find`` is called — confirm against the pysam documentation before closing it.
    """
    index = pysam.IndexedReads(pysam.AlignmentFile(bamfile, 'rb'))
    index.build()
    return index

def get_query_names(bamfile):
    """Return the set of distinct query (read) names found in a BAM file.

    Parameters
    ----------
    bamfile : str
        Path to the BAM file to scan.

    Returns
    -------
    set
        The ``query_name`` of every record yielded by iterating the file,
        with duplicates collapsed.
    """
    with pysam.AlignmentFile(bamfile, 'rb') as bam:
        # One pass over the file; the set removes names seen more than once.
        return {record.query_name for record in bam}


In [20]:
# Filter alignments:

## For each read, drop unmapped and supplementary alignments, then keep only the
## alignment(s) whose AS (alignment score) tag equals that read's maximum.

def filter_max_AS_reads(bamfile, output_bamfile):
    """Write, for every read, only its best-scoring primary/secondary alignments.

    For each distinct query name in ``bamfile`` the alignments are looked up through
    a read-name index, unmapped and supplementary records are discarded, and every
    remaining alignment whose ``AS`` tag equals the maximum ``AS`` for that read is
    written to ``output_bamfile`` (ties are all kept). Reads left with no alignment
    after the discard step produce no output.

    Parameters
    ----------
    bamfile : str
        Path to the input BAM file.
    output_bamfile : str
        Path of the BAM file to create; its header is copied from ``bamfile``.

    Returns
    -------
    str
        ``output_bamfile``, unchanged.

    Notes
    -----
    * ``get_tag('AS')`` is called on every retained alignment, so an alignment
      lacking that tag makes pysam raise.
    * Records are written grouped by read name (in sorted name order), not by
      genomic position.
      NOTE(review): the output would presumably need sorting and indexing before
      region queries such as ``fetch`` — confirm.
    """
    query_names = get_query_names(bamfile)
    read_index = build_index(bamfile)
    # Open the header template in the same `with` so its handle is closed on exit
    # instead of leaking an anonymous AlignmentFile.
    with pysam.AlignmentFile(bamfile, 'rb') as template, \
            pysam.AlignmentFile(output_bamfile, 'wb', template=template) as output:
        # Set iteration order of strings varies between interpreter runs; sorting
        # makes the output file reproducible.
        for read in sorted(query_names):
            candidates = [
                ali for ali in read_index.find(read)
                if not (ali.is_unmapped or ali.is_supplementary)
            ]
            if not candidates:
                continue
            max_AS = max(ali.get_tag('AS') for ali in candidates)
            for ali in candidates:
                if ali.get_tag('AS') == max_AS:
                    output.write(ali)
    return output_bamfile
# Apply the max-AS filter to the FC30 subset BAM; the result is written to
# FC30_DMGOTH_SUBSET_MAX_AS_BAMFILE.
filter_max_AS_reads(FC30_DMGOTH_SUBSET_BAMFILE, FC30_DMGOTH_SUBSET_MAX_AS_BAMFILE)


In [44]:
class TransposableElementInsertion:
    """A single annotated transposable-element insertion.

    Attributes
    ----------
    id
        Identifier of the insertion.
    chrom, start, end
        Location of the insertion, stored exactly as passed to the constructor
        (no type conversion is done here).
    alignments : list
        Starts empty. NOTE(review): presumably meant to collect the alignments
        matched to this insertion — nothing visible in this notebook fills it yet.
    """

    def __init__(self, insertion_id, chrom, start, end):
        self.id, self.chrom = insertion_id, chrom
        self.start, self.end = start, end
        # A new list per instance, so insertions never share alignment storage.
        self.alignments = list()

        



In [45]:
def initiate_transposable_element_family_dict(annotations):
    """Parse a tab-separated TE annotation file into insertions grouped by family.

    Each data line is split on tabs. Column 1 is the chromosome, columns 4 and 5
    are the start and end, and the last column is the attribute string. In the
    attribute string, the text inside the first pair of double quotes is used as
    the family name and the text inside the second pair as the insertion id.

    Parameters
    ----------
    annotations : str
        Path to the annotation file.

    Returns
    -------
    dict
        Maps each family name to a list of ``TransposableElementInsertion``
        objects, in file order.

    Raises
    ------
    IndexError
        If a data line has too few columns or fewer than two quoted attribute values.
    ValueError
        If a start or end column is not an integer.

    Notes
    -----
    Coordinates are converted to ``int`` and otherwise kept as written in the file.
    NOTE(review): GTF coordinates are conventionally 1-based inclusive, whereas
    pysam's ``fetch`` expects 0-based half-open — convert at the call site if needed.
    """
    transposable_element_family_dict = {}
    with open(annotations) as annot:
        for line in annot:
            line = line.strip()
            # Blank lines and '#' comment/header lines carry no annotation record.
            if not line or line.startswith("#"):
                continue
            fields = line.split("\t")
            chrom = fields[0]
            # Store numeric coordinates so they can be compared or passed to fetch.
            start, end = int(fields[3]), int(fields[4])
            ids = fields[-1].split('"')
            family_name, insertion_id = ids[1], ids[3]
            new_insertion = TransposableElementInsertion(insertion_id, chrom, start, end)
            transposable_element_family_dict.setdefault(family_name, []).append(new_insertion)
    return transposable_element_family_dict
# Parse the TE annotation file once into {family name: [insertions]}.
# NOTE(review): despite the FC30_SUBSET prefix, this dict is built from the annotation
# file alone; no BAM is read here.
FC30_SUBSET_FAMILY_DICT = initiate_transposable_element_family_dict(REPEATMASKER_TE_ANNOTATIONS)

In [50]:
def match_reads_with_insertions(bamfile, family_dict):
    """Probe a BAM file for the first insertion of every TE family (work in progress).

    For each family in ``family_dict`` only the first insertion is visited: a
    ``fetch`` is issued on its chromosome and the insertion id is printed.
    Families with an empty insertion list are skipped.

    Parameters
    ----------
    bamfile : str
        Path to the BAM file to query.
    family_dict : dict
        Maps family name to a list of objects exposing ``chrom`` and ``id``.

    Returns
    -------
    None

    Notes
    -----
    NOTE(review): the ``fetch`` result is discarded and no start/end is passed, so
    no read is actually matched to an insertion yet, and ``insertion.alignments``
    is never filled. The per-family ``break`` looks like a debugging shortcut.
    """
    # Only `import pysam` is in scope, so the bare name `AlignmentFile` raised
    # NameError on a fresh kernel; qualify it with the module.
    with pysam.AlignmentFile(bamfile, 'rb') as bam:
        for insertion_list in family_dict.values():
            for insertion in insertion_list:
                bam.fetch(insertion.chrom)
                print(insertion.id)
                # Stop after the first insertion of this family.
                break
# Trial run on the FC30 subset BAM; the output below lists the first insertion id of
# each TE family.
match_reads_with_insertions(FC30_DMGOTH_SUBSET_BAMFILE, FC30_SUBSET_FAMILY_DICT)

IVK_DM_2L_RaGOO_1_356
DM297_I_2L_RaGOO_366_2025
TAHRE_2L_RaGOO_4083_4246
TART-A_2L_RaGOO_4304_8214
HETA_2L_RaGOO_5123_5232
MDG1_LTR_2L_RaGOO_8208_9729
ZAM_I_2L_RaGOO_10009_10694
QUASIMODO_LTR_2L_RaGOO_10703_23126
Copia_LTR_2L_RaGOO_12650_17604
QUASIMODO_I_2L_RaGOO_23127_26120
BLASTOPIA_I_2L_RaGOO_26796_29201
HOBO_2L_RaGOO_29304_30758
M4DM_2L_RaGOO_30759_32079
IDEFIX_LTR_2L_RaGOO_76396_76529
DNAREP1_DM_2L_RaGOO_76559_76663
BS2_2L_RaGOO_128782_129373
QUASIMODO2-LTR_DM_2L_RaGOO_187955_188043
I_DM_2L_RaGOO_241126_241241
ROO_I_2L_RaGOO_279695_279751
ROO_LTR_2L_RaGOO_833659_841384
FW2_DM_2L_RaGOO_1444418_1448277
IDEFIX_I_2L_RaGOO_1826802_1833031
BS_2L_RaGOO_1957044_1957171
BLOOD_LTR_2L_RaGOO_2148055_2155527
TRANSPAC_LTR_2L_RaGOO_2221225_2226474
POGO_2L_RaGOO_2955877_2958004
DM412_2L_RaGOO_3096440_3096922
LINEJ1_DM_2L_RaGOO_3156645_3161540
PROTOP_B_2L_RaGOO_3199598_3199632
NOMAD_LTR_2L_RaGOO_3400786_3408481
ACCORD2_I_2L_RaGOO_3969176_3969396
MICROPIA_I_2L_RaGOO_3970137_3974092
FB4_DM_2L_RaGOO