In [1]:
# DEPENDENCIES


#### WARNING : TE WITH UNDERSCORE IN THEIR NAMES !!!

import pysam
import pandas as pd
import HTSeq
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
# from tqdm.notebook import tqdm, trange
from tqdm import tqdm as tqdm
import cProfile
import plotly.io as pio
pio.renderers.default = 'notebook_connected'
import numpy as np

# GLOBAL VARS

FC30_FASTQFILE = "/data2/eric/TE_LR_RNAseq/data/reads/FC30.fastq.gz"

FC30_DMGOTH_BAMFILE = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/bam/FC30.against_dmgoth.bam"
FC30_DMGOTH_MAX_AS_BAMFILE = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/bam/FC30.against_dmgoth.filtered_max_AS.bam"
FC30_DMGOTH_MAX_AS_PRIMARY_ONLY_BAMFILE ="/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/bam/FC30.against_dmgoth.filtered_max_AS.primary_only.bam"

FC29_DMGOTH_BAMFILE = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/bam/FC29.against_dmgoth.bam"
FC29_DMGOTH_MAX_AS_BAMFILE = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/bam/FC29.against_dmgoth.filtered_max_AS.bam"
FC29_DMGOTH_MAX_AS_PRIMARY_ONLY_BAMFILE ="/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/bam/FC29.against_dmgoth.filtered_max_AS.primary_only.bam"

FC30_DMGOTH_SUBSET_BAMFILE = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/bam/FC30.subset.against_dmgoth.bam"
FC30_DMGOTH_SUBSET_MAX_AS_BAMFILE = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/bam/FC30.subset.against_dmgoth.filtered.bam"

REPEATMASKER_TE_ANNOTATIONS = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/annotations/dmgoth101.onecode.v2.gtf"
DMGOTH_GENE_ANNOTATIONS = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/annotations/Dm_Goth_10-1.dmel6.23LiftOff.sorted.gff"

DFAM_TE_ANNOTATION = "/data2/eric/TE_LR_RNAseq/data/families.flanked_LTR.hierarchy.fa"
TE_HIERARCHY_FILE= "/data2/eric/TE_LR_RNAseq/TE_hierarchy.tsv"
BLASTN_FILE="/data2/eric/TE_LR_RNAseq/Dm_Goth_10-1_insertions_vsTEdb.bln"
NEW_ANNOTATION_FILE = "/data2/eric/TE_LR_RNAseq/Dm_Goth_10-1_insertions_vsTEdb.gtf"
INSERTION_TABLE = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/insertion_table.tsv"

VALID_CHROM_LIST = ["2L_RaGOO", "2R_RaGOO", "3L_RaGOO", "3R_RaGOO", "4_RaGOO", "X_RaGOO"]

In [4]:




def create_gtf_attributes(subjectId, chrom, start, end):
	# gene_id "IVK_DM"; transcript_id "IVK_DM_2L_RaGOO_1_356";
	gtf_attributes = 'gene_id "{}"; transcript_id "{}:{}:{}:{}";'.format(subjectId, subjectId, chrom, start, end)
	return gtf_attributes

def blastn_line_to_gtf_line(line):
	# +::2L_RaGOO:20942953-20943027   G5A     88.732  71      8       0       1
    #   71      795     865     1.86e-19        87.9

	# 2L_RaGOO        dmgoth101       exon    1       356     2558    +       .
    #   gene_id "IVK_DM"; transcript_id "IVK_DM_2L_RaGOO_1_356";
	(queryId, subjectId, percIdentity, alnLength, mismatchCount, 
    gapOpenCount, queryStart, queryEnd, subjectStart, 
    subjectEnd, eVal, bitScore) = line.split("\t")
	strand = queryId.split(":")[0]
	seq = queryId.split(":")[2]
	startQueryId, endQueryId = queryId.split(':')[3].split("-")
	source = "dmgoth101"
	feature = "exon"
	score = "."
	phase = "."
	start = min(int(subjectStart), int(subjectEnd)) + int(startQueryId)
	end = max(int(subjectStart), int(subjectEnd)) + int(startQueryId)
	# print(subjectStart, subjectEnd)
	attributes = create_gtf_attributes(subjectId, seq, str(start), str(end))
	print(attributes)
	new_gtf_line = "\t".join([seq, source, feature, str(start), str(end), score, strand, phase, attributes]) + "\n"
	# print(new_gtf_line)
	return new_gtf_line
	
#seq, source, feature, start, end, score, strand, phase, attributes

def from_blastn_to_gtf(blastn_file, new_gtf_file):
	with open(new_gtf_file, 'w') as output:
		with open(blastn_file, 'r') as input:
			for line in input:
				gtf_line = blastn_line_to_gtf_line(line)
				output.write(gtf_line)



# def from_blastn_to_gtf_no_Y_chrom(blastn_file, new_gtf_file):
# 	with open(new_gtf_file, 'w') as output:
# 		with open(blastn_file, 'r') as input:
# 			for line in input:
# 				gtf_line = blastn_line_to_gtf_line(line)
# 				if not line.startswith("Y"):
# 					output.write(gtf_line)

In [5]:
def create_TE_hierarchy(te_hierarchy_file):
	family_list = []
	superfamily_list = []
	subclass_list = []
	with open(TE_HIERARCHY_FILE, 'r') as input:
		input.readline()
		for line in input:
			if len(line.split()) == 3:
				line = line.strip()
				family, superfamily, subclass = line.split("\t")
				family_list.append(family)
				superfamily_list.append(superfamily)
				subclass_list.append(subclass)
	hierarchy_df = pd.DataFrame(list(zip(subclass_list, superfamily_list, family_list)), columns =['Subclass', 'Superfamily', 'Family'])
	return hierarchy_df	

HIERARCHY_DF  = create_TE_hierarchy(TE_HIERARCHY_FILE)

from_blastn_to_gtf(BLASTN_FILE, NEW_ANNOTATION_FILE)



gene_id "G5A"; transcript_id "G5A:2L_RaGOO:20943748:20943818";
gene_id "INE-1"; transcript_id "INE-1:2L_RaGOO:21704135:21704222";
gene_id "FB"; transcript_id "FB:2L_RaGOO:21709101:21709130";
gene_id "FB"; transcript_id "FB:2L_RaGOO:21709255:21709284";
gene_id "FB"; transcript_id "FB:2L_RaGOO:21709409:21709438";
gene_id "INE-1"; transcript_id "INE-1:2L_RaGOO:22133389:22133459";
gene_id "P-element"; transcript_id "P-element:2R_RaGOO:864782:865590";
gene_id "P-element"; transcript_id "P-element:2R_RaGOO:867342:867688";
gene_id "P-element"; transcript_id "P-element:2R_RaGOO:864782:864812";
gene_id "P-element"; transcript_id "P-element:2R_RaGOO:867658:867688";
gene_id "Dbif_P-element_M"; transcript_id "Dbif_P-element_M:2R_RaGOO:867389:867716";
gene_id "springer"; transcript_id "springer:2R_RaGOO:3159014:3159530";
gene_id "gypsy3"; transcript_id "gypsy3:2R_RaGOO:3158708:3159049";
gene_id "Quasimodo"; transcript_id "Quasimodo:2R_RaGOO:3561220:3565107";
gene_id "Quasimodo"; transcript_id "Quas

In [40]:
# def get_TE_hierarchy(consensus_fasta):
#     subclass_list = []
#     superfamily_list = []
#     family_list = []
#     with open(consensus_fasta, 'r') as consensus:
#         for line in consensus:
#             if line.startswith(">"):
#                 family = line.split("#")[0][1:]
#                 subclass, superfamily= line.strip().split("#")[1].split("/")
#                 subclass_list.append(subclass)
#                 superfamily_list.append(superfamily)
#                 family_list.append(family)
#     hierarchy_df = pd.DataFrame(list(zip(subclass_list, superfamily_list, family_list)), columns =['Subclass', 'Superfamily', 'Family'])
#     return hierarchy_df

# def manually_fix_hierarchy(hierarchy_df):
#     gypsy6A_row = {"Subclass":"LTR", "Superfamily":"Gypsy", "Family":"Gypsy6A"}
#     gypsy12A_row = {"Subclass":"LTR", "Superfamily":"Gypsy", "Family":"Gypsy12A"}
#     dm412B_row = {"Subclass":"LTR", "Superfamily":"Gypsy", "Family":"DM412B"}
#     hmsbeagle_row = {"Subclass":"LTR", "Superfamily":"Gypsy", "Family":"HMSBEAGLE"}
#     dmtom1_row = {"Subclass":"LTR", "Superfamily":"Gypsy", "Family":"DMTOM1"}
#     mudr1_row = {"Subclass":"Unknown", "Superfamily":"Unknown", "Family":"MuDR-1_DEl"}
#     stalker3_row = {"Subclass":"LTR", "Superfamily":"Gypsy", "Family":"Stalker3"}
#     P_row = {"Subclass":"TIR", "Superfamily":"P", "Family":"P-1_DY"}
#     rehavkus_row = {"Subclass":"Unknown", "Superfamily":"Unknown", "Family":"Rehavkus-1_DY"}
#     new_row_list = [gypsy6A_row, gypsy12A_row, dm412B_row, hmsbeagle_row, dmtom1_row, mudr1_row, stalker3_row, P_row,rehavkus_row]
#     for new_row in new_row_list :
#         hierarchy_df = hierarchy_df.append(new_row, ignore_index=True)
#     return hierarchy_df

In [41]:
# HIERARCHY_DF = get_TE_hierarchy(DFAM_TE_ANNOTATION)
# HIERARCHY_DF = manually_fix_hierarchy(HIERARCHY_DF)



In [42]:
### REBOOT
def gene_id_to_family_name(gene_id):
    family_name = gene_id
    suffix_list = ["_I", "-I_DM", "_I_DM", "_LTR", "-LTR_DM", "_LTR_DM"]
    for suffix in suffix_list:
        if gene_id[-len(suffix):] == suffix:
            family_name = gene_id[:-len(suffix)]
            break
    return family_name

class TE_feature:
    def __init__(self, chrom, start, end, gene_id, insertion_id):
        self.chrom = chrom
        self.start = start
        self.end = end
        # self.gene_id = gene_id
        self.insertion_id = insertion_id
        self.count = 0
        self.family = gene_id_to_family_name(gene_id)
        self.counted_reads = {}
    
    def __len__(self):
        return self.end - self.start
    
    def __repr__(self):
        return self.insertion_id
    
    def is_valid_feature(self):
        return len(self) > 150
    
def parse_TE_annotation_line(line):
    sline = line.strip().split("\t")
    chrom = sline[0]
    start = int(sline[3])
    end = int(sline[4])
    gene_id = sline[-1].split(";")[0].split('"')[1]
    insertion_id = sline[-1].strip().split('transcript_id "')[-1][:-2]

    return TE_feature(chrom, start, end, gene_id, insertion_id)

def parse_TE_annotation_file(annotations):
    ### Read an annotation file and return a dict of filled TE feature per chromosome
    dict_of_valid_TE = {}
    with open(annotations, 'r') as annot:
        for line in annot :
            new_feature = parse_TE_annotation_line(line)
            if new_feature.is_valid_feature and new_feature.chrom in VALID_CHROM_LIST:
                if new_feature.chrom not in dict_of_valid_TE:
                    dict_of_valid_TE[new_feature.chrom] = [new_feature]
                else :
                    dict_of_valid_TE[new_feature.chrom].append(new_feature)
    for chrom in dict_of_valid_TE :
        dict_of_valid_TE[chrom].sort(key=lambda x: x.start, reverse=False)
    return dict_of_valid_TE

def get_reads_mapped_on_TE_feature(alignment_file, TE_annotation):
    ### return a set of all reads that map a TE feature
    reads_mapped_on_TE = []
    dict_of_valid_TE = parse_TE_annotation_file(TE_annotation)
    with pysam.AlignmentFile(alignment_file, 'rb') as bam:
        for chrom, insertion_list in tqdm(dict_of_valid_TE.items(), desc="Chromosome", position=0, leave=True):
            for insertion in tqdm(insertion_list, desc =chrom, position=0, leave=True):
                try :
                    reads_mapped_on_TE += list(bam.fetch(contig=insertion.chrom, start=insertion.start, stop=insertion.end))
                except ValueError:
                    # print("{} chromosome not found in bamfile ! Skipping...".format(insertion.chrom))
                    pass
    return set(reads_mapped_on_TE)

In [43]:
class Gene:
    def __init__(self, chrom, start, end, gene_id):
        self.chrom = chrom
        self.start = start
        self.end = end
        self.gene_id = gene_id
    
    def __len__(self):
        return self.end - self.start
    
    def __repr__(self):
        return self.gene_id

def parse_gene_annotation_line(line):
    sline = line.strip().split("\t")
    chrom = sline[0]
    start = int(sline[3])
    end = int(sline[4])
    gene_id = sline[-1].split(";")[0].split('"')[1]
    return Gene(chrom, start, end, gene_id)


def is_overlapped(start1, end1, start2, end2):
    return end1 >= start2 and end2 >= start1

def get_overlapped_TE(read, dict_of_valid_TE):
    list_of_overlapped_TE =[]
    chrom = read.reference_name
    for insertion in dict_of_valid_TE[chrom]:
        if read.reference_end < insertion.start:
            continue
        if read.reference_end >= insertion.start and insertion.end >= read.reference_start :
            list_of_overlapped_TE.append(insertion)
    return list_of_overlapped_TE

def get_dict_of_overlapped_gene(dict_of_valid_TE, gene_annotation):
    dict_of_overlapped_gene = {}
    with open(gene_annotation, 'r') as gene_annot:
        for line in gene_annot:
            sline = line.strip().split('\t')
            if sline[2] == "gene" :
                chrom = sline[0]
                gene_start = int(sline[3])
                gene_stop = int(sline[4])
                if chrom in dict_of_valid_TE:
                    for insertion in dict_of_valid_TE[chrom]:
                        if is_overlapped(insertion.start, insertion.end, gene_start, gene_stop):
                            new_gene = parse_gene_annotation_line(line)
                            if chrom in dict_of_overlapped_gene:
                                dict_of_overlapped_gene[chrom].append(new_gene)
                            else:
                                dict_of_overlapped_gene[chrom] = [new_gene]
    return dict_of_overlapped_gene   
    
def get_overlapped_gene(read, dict_of_overlapped_gene):
    list_of_overlapped_gene =[]
    for gene in dict_of_overlapped_gene[read.reference_name]:
        if read.reference_end < gene.start:
            continue
        if is_overlapped(read.reference_start, read.reference_end, gene.start, gene.end):
            list_of_overlapped_gene.append(gene)
    return list_of_overlapped_gene

def choose_optimal_feature(read, list_of_overlapped_feature):
    min_dist = 999999999999
    optimal_feature = None
    for overlapped_feature in list_of_overlapped_feature:
        start_dist = abs(overlapped_feature.start - read.reference_start)
        end_dist = abs(overlapped_feature.end - read.reference_end)
        total_dist = start_dist + end_dist
        if total_dist < min_dist:
            min_dist = total_dist
            optimal_feature = overlapped_feature
    return optimal_feature

def get_subject_coverage(alignment, insertion):
    overlap_start = max(alignment.reference_start, insertion.start)
    overlap_end = min(alignment.reference_end, insertion.end)
    overlap_length = overlap_end - overlap_start
    insertion_length = insertion.end - insertion.start
    subject_coverage = overlap_length / insertion_length
    return subject_coverage

def default_filter(alignment, insertion):
    subject_coverage = get_subject_coverage(alignment, insertion)
    nb_aligned_pairs =  alignment.get_overlap(insertion.start, insertion.end)
    is_ok = (subject_coverage > 0.1 and nb_aligned_pairs > 1)
    return is_ok

def get_exceeding_alignment_length(alignment, insertion):
    overflow_length = 0
    if alignment.reference_start < insertion.start:
        overflow_length += insertion.start -alignment.reference_start
    if alignment.reference_end > insertion.end:
        overflow_length += alignment.reference_end - insertion.end
    return overflow_length

def is_co_expressed(alignment, list_of_overlapped_TE, list_of_overlapped_gene):
    # We need to check if the alignment get at least 10% of its aligned base on a TE and 10% on a gene
    # Subject coverage of 10% on a TE and on a gene !
    overlapping_TE = False
    overlapping_gene = False
    for TE in list_of_overlapped_TE :
        if get_subject_coverage(alignment, TE) > 0.1:
            overlapping_TE = True
            break
    for gene in list_of_overlapped_gene :
        if get_subject_coverage(alignment, gene) > 0.1:
            overlapping_gene = True
            break
    return overlapping_gene and overlapping_TE

def filter_out_co_expressed(alignment, insertion):
    exceeding_length = get_exceeding_alignment_length(alignment, insertion)
    alignment_length = alignment.reference_end - alignment.reference_start
    is_ok = (exceeding_length < (0.1 * alignment_length))
    return is_ok

def map_reads_to_TE(reads_mapped_on_TE_feature, bamfile, TE_annotations, gene_annotations):
    insertion_countings = {}
    dict_of_valid_TE = parse_TE_annotation_file(TE_annotations)
    dict_of_overlapped_gene = get_dict_of_overlapped_gene(dict_of_valid_TE, gene_annotations)
    for read in tqdm(reads_mapped_on_TE_feature, position=0, leave=True):
        ### CHIMERIC READS GOT AN "SA" FLAG : WE ARE FILTERING THEM HERE 
        if not read.has_tag("SA") :
            list_of_overlapped_gene = get_overlapped_gene(read, dict_of_overlapped_gene)
            list_of_overlapped_TE = get_overlapped_TE(read, dict_of_valid_TE)
            best_feature = choose_optimal_feature(read, list_of_overlapped_TE + list_of_overlapped_gene)
            if isinstance(best_feature, TE_feature):
                ################### FILTERS GO HERE ########################
                # if default_filter(read, best_feature) and filter_out_co_expressed(read, best_feature):
                if default_filter(read, best_feature) and filter_out_co_expressed(read, best_feature) and is_co_expressed(read, list_of_overlapped_TE, list_of_overlapped_gene):
                    best_feature.count += 1
                    best_feature.counted_reads[read.query_name] = read
    for insertion_list in dict_of_valid_TE.values():
        for insertion in insertion_list:
            if insertion.family not in insertion_countings:
                insertion_countings[insertion.family] = {insertion.insertion_id : insertion}
            else :
                insertion_countings[insertion.family][insertion.insertion_id] = insertion
    return insertion_countings

In [44]:
def get_reads_counted_on_feature(feature_id, reads_mapped_on_TE_feature, bamfile, TE_annotations, gene_annotations):
    read_list = []
    dict_of_valid_TE = parse_TE_annotation_file(TE_annotations)
    dict_of_overlapped_gene = get_dict_of_overlapped_gene(dict_of_valid_TE, gene_annotations)
    feature_chrom = None
    for chrom, feature_list in dict_of_valid_TE.items():
        if feature_id in [f.insertion_id for f in feature_list]:
            feature_chrom = chrom
    if feature_chrom == None :
        return 0
    for read in tqdm(reads_mapped_on_TE_feature, position=0, leave=True):
        if read.reference_name == feature_chrom:
            list_of_overlapped_gene = get_overlapped_gene(read, dict_of_overlapped_gene)
            list_of_overlapped_TE = get_overlapped_TE(read, dict_of_valid_TE)
            best_feature = choose_optimal_feature(read, list_of_overlapped_TE + list_of_overlapped_gene)
            if isinstance(best_feature, TE_feature):
                if best_feature.insertion_id == feature_id:
                    if default_filter(read, best_feature) and filter_out_co_expressed(read, best_feature):
                        read_list.append(read)
    return read_list

def from_counting_to_dataframe(insertion_countings):
    subclass_list = []
    superfamily_list = []
    family_list = []
    insertion_list = []
    counting_list = []
    counted_reads = []
    for family, family_insertion_dict in insertion_countings.items() :
        try:
            subclass = HIERARCHY_DF.loc[HIERARCHY_DF['Family'] == family, 'Subclass'].values[0]
            superfamily = HIERARCHY_DF.loc[HIERARCHY_DF['Family'] == family, 'Superfamily'].values[0]
        except KeyError:
            print(family + " hierarchy unknown in the consensus file : tagged as Unknown...")
            subclass = "Unknown"
            superfamily = "Unknown"
        for insertion_id, insertion in family_insertion_dict.items():
            family_list.append(family)
            insertion_list.append(insertion_id)
            counting_list.append(insertion.count)
            superfamily_list.append(superfamily)
            subclass_list.append(subclass)
            counted_reads.append(insertion.counted_reads)
    df = pd.DataFrame(list(zip(subclass_list, superfamily_list, family_list, insertion_list, counting_list, counted_reads)),
               columns =['Subclass', 'Superfamily', "Family", "Insertion", "Counting", "Reads"])
    return df

In [45]:
def get_subject_coverage(alignment, insertion):
    overlap_start = max(alignment.reference_start, insertion.start)
    overlap_end = min(alignment.reference_end, insertion.end)
    overlap_length = overlap_end - overlap_start
    insertion_length = insertion.end - insertion.start
    if insertion_length != 0 :
        subject_coverage = overlap_length / insertion_length
    else :
        subject_coverage = 0
    return subject_coverage

def get_insertion_mean_subject_coverage(insertion_id, counted_reads):
    insertion = DICT_OF_INSERTION[insertion_id]
    mean_subject_coverage = 0
    for read in counted_reads.values():
        mean_subject_coverage += get_subject_coverage(read, insertion)
    if len(counted_reads) == 0 :
        return 0
    else:
        return mean_subject_coverage/len(counted_reads)

def add_mean_subject_coverage_to_df(counting_df):
    counting_df["mean_subcov"] = counting_df.apply(lambda x: get_insertion_mean_subject_coverage(x['Insertion'], x['Reads']), axis=1)
    return counting_df

def count_TE_feature(alignment_file, TE_annotation, gene_annotation):
    reads_mapped_on_TE_feature = get_reads_mapped_on_TE_feature(alignment_file, TE_annotation)
    insertion_countings = map_reads_to_TE(reads_mapped_on_TE_feature, alignment_file, TE_annotation, gene_annotation)
    counting_df = from_counting_to_dataframe(insertion_countings)
    add_mean_subject_coverage_to_df(counting_df)
    return counting_df




# with cProfile.Profile() as pr:
#     FC30_counting_df = count_TE_feature(FC29_DMGOTH_MAX_AS_BAMFILE, REPEATMASKER_TE_ANNOTATIONS, DMGOTH_GENE_ANNOTATIONS)
# stats = pstats.Stats(pr)
# stats.sort_stats(pstats.SortKey.TIME)
# stats.dump_stats(filename="profiling_test.prof")
# pd.set_option('display.max_rows', 100)
# FC30_counting_df.sort_values(by=['Counting'], ascending=False).head(100)

In [46]:
# dict_of_valid_TE = parse_TE_annotation_file(NEW_ANNOTATION_FILE)
# DICT_OF_INSERTION = {}
# for TE_list in dict_of_valid_TE.values():
#     for feature in TE_list:
#         DICT_OF_INSERTION[feature.insertion_id] = feature 

FC30_counting_df = count_TE_feature(FC30_DMGOTH_MAX_AS_BAMFILE, NEW_ANNOTATION_FILE, DMGOTH_GENE_ANNOTATIONS)
FC29_counting_df = count_TE_feature(FC29_DMGOTH_MAX_AS_BAMFILE, NEW_ANNOTATION_FILE, DMGOTH_GENE_ANNOTATIONS)


2L_RaGOO: 100%|██████████| 302/302 [00:02<00:00, 120.12it/s]
2R_RaGOO: 100%|██████████| 282/282 [00:01<00:00, 155.15it/s]
3L_RaGOO: 100%|██████████| 337/337 [00:00<00:00, 419.22it/s]
3R_RaGOO: 100%|██████████| 231/231 [00:00<00:00, 248.28it/s]
4_RaGOO: 100%|██████████| 6/6 [00:00<00:00, 32.46it/s]
X_RaGOO: 100%|██████████| 273/273 [00:00<00:00, 286.32it/s]
Chromosome: 100%|██████████| 6/6 [00:07<00:00,  1.21s/it]
100%|██████████| 3650/3650 [00:00<00:00, 6172.03it/s]
2L_RaGOO: 100%|██████████| 302/302 [00:03<00:00, 94.01it/s]
2R_RaGOO: 100%|██████████| 282/282 [00:04<00:00, 60.59it/s]
3L_RaGOO: 100%|██████████| 337/337 [00:02<00:00, 115.43it/s]
3R_RaGOO: 100%|██████████| 231/231 [00:03<00:00, 65.08it/s]
4_RaGOO: 100%|██████████| 6/6 [00:00<00:00, 79.25it/s]
X_RaGOO: 100%|██████████| 273/273 [00:01<00:00, 162.98it/s]
Chromosome: 100%|██████████| 6/6 [00:16<00:00,  2.69s/it]
100%|██████████| 6452/6452 [00:01<00:00, 6062.61it/s]


In [49]:
FC29_counting_df.sort_values(by=['Counting'], ascending=False).head(100)

Unnamed: 0,Subclass,Superfamily,Family,Insertion,Counting,Reads,mean_subcov
0,hAT,DNA,hobo,hobo_2L_RaGOO_30168_30255,0,{},0
384,P,DNA,1360,1360_2R_RaGOO_4287136_4287230,0,{},0
378,R1,LINE,R1A1-element,R1A1-element_2L_RaGOO_17786455_17786842,0,{},0
379,P,DNA,1360,1360_2L_RaGOO_19901738_19919995,0,{},0
380,P,DNA,1360,1360_2L_RaGOO_22496894_22498382,0,{},0
...,...,...,...,...,...,...,...
291,Tc1-Mariner,DNA,pogo,pogo_3L_RaGOO_6793749_6793933,0,{},0
292,Tc1-Mariner,DNA,pogo,pogo_3L_RaGOO_9733924_9735149,0,{},0
293,Tc1-Mariner,DNA,pogo,pogo_3L_RaGOO_14849741_14849929,0,{},0
295,Tc1-Mariner,DNA,pogo,pogo_3L_RaGOO_15962202_15962333,0,{},0


In [5]:
def save_counting_df(counting_df, csv_file):
    counting_df.to_csv(csv_file, sep = '\t', index=False)

# save_counting_df(FC29_counting_df, "FC29_counting_df.v2.tsv")
# save_counting_df(FC30_counting_df, "FC30_counting_df.v2.tsv")

### SAVING COUNTING_DF WITH CHIMERIC READS EXCLUDED
# save_counting_df(FC29_counting_df, "FC29_counting_df.v2.chimeric_reads_excluded.tsv")
# save_counting_df(FC30_counting_df, "FC30_counting_df.v2.chimeric_reads_excluded.tsv")

### SAVING COUNTING_DF co_expressed
save_counting_df(FC29_counting_df, "FC29_counting_df.v2.co_expressed.tsv")
save_counting_df(FC30_counting_df, "FC30_counting_df.v2.co_expressed.tsv")

In [6]:
save_counting_df

<function __main__.save_counting_df(counting_df, csv_file)>

In [7]:
# FC29_no_chimere_counting_df = pd.read_csv("FC29_counting_df.v2.chimeric_reads_excluded.tsv", sep = "\t")
# FC30_no_chimere_counting_df = pd.read_csv("FC30_counting_df.v2.chimeric_reads_excluded.tsv", sep = "\t")


FC29_counting_df = pd.read_csv("FC29_counting_df.v2.tsv", sep = "\t")
FC30_counting_df = pd.read_csv("FC30_counting_df.v2.tsv", sep = "\t")



FC29_counting_df = pd.read_csv("FC29_counting_df.v2.co_expressed.tsv", sep = "\t")
FC30_counting_df = pd.read_csv("FC30_counting_df.v2.co_expressed.tsv", sep = "\t")

In [8]:
FC29_counting_df

Unnamed: 0,Subclass,Superfamily,Family,Insertion,Counting,Reads,mean_subcov
0,LINE,I,IVK_DM,IVK_DM_2L_RaGOO_1_356,0,{},0.0
1,LINE,I,IVK_DM,IVK_DM_2L_RaGOO_3202985_3203074,0,{},0.0
2,LINE,I,IVK_DM,IVK_DM_2L_RaGOO_14609439_14614803,0,{},0.0
3,LINE,I,IVK_DM,IVK_DM_2L_RaGOO_19834403_19834598,0,{},0.0
4,LINE,I,IVK_DM,IVK_DM_2L_RaGOO_21063001_21067348,0,{},0.0
...,...,...,...,...,...,...,...
14747,DNA,P,PLACW_DM,PLACW_DM_3L_RaGOO_13917593_13918520,0,{},0.0
14748,Unknown,Unknown,Rehavkus-1_DY,Rehavkus-1_DY_4_RaGOO_569164_569238,0,{},0.0
14749,Unknown,Unknown,Rehavkus-1_DY,Rehavkus-1_DY_4_RaGOO_569377_569421,0,{},0.0
14750,Unknown,Unknown,Rehavkus-1_DY,Rehavkus-1_DY_4_RaGOO_569556_569600,0,{},0.0


In [14]:
print(sum(FC29_counting_df["Counting"]))
print(sum(FC30_counting_df["Counting"]))

104
89


In [47]:
### Get expression ratio for each insertion (chimeric excluded)

## First, we merge male and female dataset : 

# print(FC29_no_chimere_counting_df.head())
# print(FC29_counting_df.head())

expression_df = FC29_counting_df.copy().drop(columns=["mean_subcov", "Reads"])
expression_df["Counting_no_chimere"] = FC29_no_chimere_counting_df["Counting"]

excluded_families = ["TAHRE", "TART_B1", "TART-A", "HETA"]
# excluded_families = []

expression_df = expression_df[~expression_df["Family"].isin(excluded_families)]

nb_of_reads = sum(expression_df["Counting"])
nb_of_reads_no_chimere = sum(expression_df["Counting_no_chimere"])
# print(expression_df)
def get_expression_ratio(counting):
	return counting / nb_of_reads

def get_expression_ratio_bis(counting):
	return counting / nb_of_reads

expression_df["Expression"] = expression_df["Counting"].apply(get_expression_ratio)
expression_df["Expression_no_chimere"] = expression_df["Counting_no_chimere"].apply(get_expression_ratio_bis)

# get length of each insertion
length_list = []

for i in expression_df["Insertion"].values:
	length_list.append(DICT_OF_INSERTION[i].end - DICT_OF_INSERTION[i].start + 1)
expression_df["Insertion_length"] = length_list


save_counting_df(expression_df, "FC29_expression_df.tsv")



In [42]:

# 

Unnamed: 0,Subclass,Superfamily,Family,Insertion,Counting,Counting_no_chimere,Expression,Expression_no_chimere,Insertion_length
0,LINE,I,IVK_DM,IVK_DM_2L_RaGOO_1_356,0,0,0.0,0.0,356
1,LINE,I,IVK_DM,IVK_DM_2L_RaGOO_3202985_3203074,0,0,0.0,0.0,90
2,LINE,I,IVK_DM,IVK_DM_2L_RaGOO_14609439_14614803,0,0,0.0,0.0,5365
3,LINE,I,IVK_DM,IVK_DM_2L_RaGOO_19834403_19834598,0,0,0.0,0.0,196
4,LINE,I,IVK_DM,IVK_DM_2L_RaGOO_21063001_21067348,0,0,0.0,0.0,4348
...,...,...,...,...,...,...,...,...,...
14747,DNA,P,PLACW_DM,PLACW_DM_3L_RaGOO_13917593_13918520,0,0,0.0,0.0,928
14748,Unknown,Unknown,Rehavkus-1_DY,Rehavkus-1_DY_4_RaGOO_569164_569238,0,0,0.0,0.0,75
14749,Unknown,Unknown,Rehavkus-1_DY,Rehavkus-1_DY_4_RaGOO_569377_569421,0,0,0.0,0.0,45
14750,Unknown,Unknown,Rehavkus-1_DY,Rehavkus-1_DY_4_RaGOO_569556_569600,0,0,0.0,0.0,45


In [44]:
import plotly.express as px
df = expression_df.copy()
df = df[df["Counting"] > 2]
df["exp_diff"] = df["Expression"] - df["Expression_no_chimere"]
fig = px.scatter(df, y="exp_diff", x="Insertion", color="Family", size="Insertion_length")
# fig.update_traces(marker_size=10)
fig.show()

In [51]:
import plotly.express as px
df = expression_df.copy()
df = df[df["Counting"] > 2]
df["count_diff"] = df["Counting"] - df["Counting_no_chimere"]
fig = px.scatter(df, y="count_diff", x="Insertion", color="Family", size="Insertion_length")
# fig.update_traces(marker_size=10)
fig.show()

### IMPORT COUNTINGS

In [15]:

saved_FC29_counting_df = pd.read_csv("FC29_counting_df.v2.tsv", sep = "\t")
filtered_FC29_counting_df = saved_FC29_counting_df[saved_FC29_counting_df['Counting'] != 0]
co_expressed_FC29_counting_df = pd.read_csv("FC29_counting_df.v2.co_expressed.tsv", sep = "\t")

saved_FC30_counting_df = pd.read_csv("FC30_counting_df.v2.tsv", sep = "\t")
filtered_FC30_counting_df = saved_FC30_counting_df[saved_FC30_counting_df['Counting'] != 0]
co_expressed_FC30_counting_df = pd.read_csv("FC30_counting_df.v2.co_expressed.tsv", sep = "\t")



In [11]:
def draw_icicle(df):
    fig = px.icicle(df, path=['Subclass', 'Superfamily', 'Family', 'Insertion'], values='Counting',
                    color='mean_subcov',
                    hover_data=['Counting'],
                    color_continuous_scale='RdBu',
                    color_continuous_midpoint=np.average(df['mean_subcov'],weights=df['Counting'])
                    )
    fig.update_traces(root_color="lightgrey")
    fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
    fig.show()

def draw_sunburst(df):
    fig = px.sunburst(df, path=['Subclass', 'Superfamily', 'Family', 'Insertion'], values='Counting',
                    color='mean_subcov',
                    hover_data=['Counting'],
                    color_continuous_scale='RdBu',
                    color_continuous_midpoint=np.average(df['mean_subcov'],weights=df['Counting'])
                    )
    fig.update_traces(root_color="lightgrey")
    fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
    fig.show()

In [12]:
draw_icicle(filtered_FC30_counting_df)

NameError: name 'filtered_FC30_counting_df' is not defined

In [10]:
draw_sunburst(filtered_FC30_counting_df)

In [4]:
draw_sunburst(filtered_FC29_counting_df)


NameError: name 'filtered_FC29_counting_df' is not defined

In [17]:
draw_sunburst(co_expressed_FC29_counting_df)


ZeroDivisionError: Weights sum to zero, can't be normalized

In [55]:
pd.set_option('display.max_rows', 100)
# filtered_FC29_counting_df.sort_values(by=['Counting'], ascending=False).head(100)
# print(saved_FC29_counting_df[saved_FC29_counting_df['Insertion'] == "Gypsy_I_2R_RaGOO_2048940_2052346"])
# print(saved_FC29_counting_df[saved_FC29_counting_df['Insertion'] == "Gypsy6_LTR_2R_RaGOO_2049422_2049472"])

Unnamed: 0,Family,Min_TE_length,Max_TE_length,Counts,nb_of_insertion,nb_of_expressed_insertion,Most_expressed_insertion
4,HETA,37,8921,558,18,13,HETA_3L_RaGOO_12349_18402
2,TAHRE,52,17305,461,28,11,TAHRE_2R_RaGOO_1145909_1151824
8,Copia,80,8723,280,50,11,Copia_LTR_3L_RaGOO_10022428_10027467
33,TART_B1,28,6179,247,48,1,TART_B1_2R_RaGOO_23920095_23923160
9,BLASTOPIA,61,5029,159,49,10,BLASTOPIA_LTR_2R_RaGOO_2878638_2883652
3,TART-A,33,8596,141,27,6,TART-A_X_RaGOO_3_3271
64,Jockey2,39,3419,104,49,4,Jockey2_X_RaGOO_21275337_21276483
21,TRANSPAC,245,5252,94,14,11,TRANSPAC_LTR_3R_RaGOO_21217775_21223024
70,TRANSIB2,38,4053,81,53,13,TRANSIB2_2R_RaGOO_3421509_3423952
107,Gypsy12,56,11395,76,80,6,Gypsy12_LTR_3R_RaGOO_842982_854376


In [11]:
pd.set_option('display.max_rows', 100)
filtered_FC30_counting_df.sort_values(by=['Counting'], ascending=False).head(100)
print(saved_FC30_counting_df[saved_FC30_counting_df['Insertion'] == "Gypsy6_LTR_2R_RaGOO_2049422_2049472"])

      Subclass Superfamily  Family                            Insertion  \
12296      LTR       Gypsy  Gypsy6  Gypsy6_LTR_2R_RaGOO_2049422_2049472   

       Counting Reads  mean_subcov  
12296         0    {}          0.0  


In [88]:
def get_insertion_length(insertion_name):
	start, end = insertion_name.split('_')[-2:]
	return int(end) - int(start) + 1

def create_summary_table(counting_table):
	subclass_list = []
	superfamily_list = []
	family_list = counting_table["Family"].unique()
	min_TE_length_list = []
	max_TE_length_list = []
	counting_list = []
	nb_expressed_insertion_list = []
	nb_insertion_list = []
	most_expressed_insertion_list = []
	for family in family_list :
		family_df = counting_table[counting_table["Family"] == family]
		subclass_list.append(family_df["Subclass"].values[0])
		superfamily_list.append(family_df["Superfamily"].values[0])
		min_TE_length = 9999999
		max_TE_length = 0
		for insertion_name in family_df["Insertion"]:
			insertion_length = get_insertion_length(insertion_name)
			if insertion_length < min_TE_length :
				min_TE_length = insertion_length
			if insertion_length > max_TE_length :
				max_TE_length = insertion_length
		min_TE_length_list.append(min_TE_length)
		max_TE_length_list.append(max_TE_length)
		nb_insertion_list.append(len(family_df["Insertion"]))
		counting_list.append(family_df["Counting"].sum())
		nb_expressed_insertion_list.append(len(family_df[family_df["Counting"] > 0]))
		most_expressed_insertion_list.append(family_df[family_df["Counting"] == family_df["Counting"].max()]["Insertion"].values[0])
    
	summary_df = pd.DataFrame(list(zip(subclass_list, superfamily_list, family_list, min_TE_length_list, max_TE_length_list, counting_list, nb_insertion_list, nb_expressed_insertion_list, most_expressed_insertion_list)), columns=["Subclass", "Superfamily", "Family", "Min_TE_length","Max_TE_length", "Counts", "nb_of_insertion", "nb_of_expressed_insertion", "Most_expressed_insertion"])
	return summary_df


def merge_female_and_male_df(female_counting, male_counting):
	female_df = create_summary_table(female_counting)
	male_df = create_summary_table(male_counting)
	hierarchy_cols = female_df[["Subclass", "Superfamily", "Family", "nb_of_insertion", "Min_TE_length","Max_TE_length"]]
	hierarchy_df = hierarchy_cols.copy()
	full_summary_df = hierarchy_df.merge(female_df.drop(columns=["Subclass", "Superfamily", "Min_TE_length","Max_TE_length", "nb_of_insertion"]), on='Family').merge(male_df.drop(columns=["Subclass", "Superfamily", "Min_TE_length","Max_TE_length", "nb_of_insertion"]), on='Family')
	full_summary_df.columns = ["Subclass", "Superfamily", "Family", "nb_of_insertion", "Min_TE_length", "Max_TE_length", "Female_Counts", "Female_nb_of_expressed_insertion", "Female_most_expressed_insertion", "Male_Counts", "Male_nb_of_expressed_insertion", "Male_most_expressed_insertion"]

	return full_summary_df

merged_full_summary_df = merge_female_and_male_df(saved_FC30_counting_df, saved_FC29_counting_df)
# print(merged_full_summary_df)
save_counting_df(merged_full_summary_df, "merged_full_summary.v2.tsv")

# def create_table_from_counting(female_counting, male_counting, hierarchy):
#     male_counting = male_counting.drop(columns=["Min_TE_length", "Max_TE_length", "nb_of_insertion"])
#     TE_data_table = hierarchy.merge(female_counting, on='Family').merge(male_counting, on='Family')
#     TE_data_table.columns = ["Subclass", "Superfamily", "Family", "Min_TE_length", "Max_TE_length", "Female_Counts", "Nb_of_insertion", "Female_nb_of_expressed_insertion", "Female_most_expressed_insertion", "Male_Counts", "Male_nb_of_expressed_insertion", "Male_most_expressed_insertion"]
#     TE_data_table = TE_data_table[["Subclass", "Superfamily", "Family", "Nb_of_insertion", "Min_TE_length", "Max_TE_length", "Female_Counts", "Female_nb_of_expressed_insertion", "Female_most_expressed_insertion", "Male_Counts", "Male_nb_of_expressed_insertion", "Male_most_expressed_insertion"]]
#     return TE_data_table

In [None]:
def from_counting_to_df(counting_dict):
    family_list = []
    min_TE_length_list = []
    max_TE_length_list = []
    counting_list = []
    nb_expressed_insertion_list = []
    nb_insertion_list = []
    most_expressed_insertion_list = []
    for family, insertion_list in counting_dict.items():
        family_list.append(family)
        nb_insertion_list.append(len(insertion_list))
        nb_expressed_insertion = 0
        min_TE_length = 9999999999
        max_TE_length = 0
        family_counts = 0
        most_expressed_insertion = None
        max_count = 0
        for insertion in insertion_list :
            insertion_length = insertion.end - insertion.start
            min_TE_length = min(min_TE_length, insertion_length)
            max_TE_length = max(max_TE_length, insertion_length)
            family_counts += insertion.counts
            max_count = max(max_count,insertion.counts)
            if insertion.counts >= max_count :
                most_expressed_insertion = insertion.id
            if insertion.counts > 0 :
                nb_expressed_insertion += 1
        nb_expressed_insertion_list.append(nb_expressed_insertion)
        min_TE_length_list.append(min_TE_length)
        max_TE_length_list.append(max_TE_length)
        counting_list.append(family_counts)
        most_expressed_insertion_list.append(most_expressed_insertion)
        
    counting_df = pd.DataFrame(list(zip(family_list, min_TE_length_list, max_TE_length_list, counting_list, nb_insertion_list, nb_expressed_insertion_list, most_expressed_insertion_list)), columns=["Family", "Min_TE_length","Max_TE_length", "Counts", "nb_of_insertion", "nb_of_expressed_insertion", "Most_expressed_insertion"])
    return counting_df

In [37]:
### Export reads mapped on the 3 expressed insertions of POGO.

#### Get ID of reads mapped on POGO

# insertion = "POGO_3L_RaGOO_9733928_9735150"
# insertion = "POGO_X_RaGOO_21863530_21864880"
# insertion = "POGO_2R_RaGOO_7201268_7202754"

# mapped_reads = saved_FC30_counting_df[saved_FC30_counting_df["Insertion"] == insertion]["Reads"]
# mapped_reads = mapped_reads.iloc[0].split("'")
# reads_IDs = []
# for i, j in enumerate(mapped_reads):
# 	if i % 2 :
# 		reads_IDs.append(j)
# test =  pysam.AlignmentFile(FC30_DMGOTH_MAX_AS_BAMFILE, "r")
# read_it = test.fetch(contig = "2R_RaGOO", start = 7201268, end = 7202754)
# with open(insertion + ".mapped_read.fasta", 'w') as output:
# 	for read in read_it:
# 		if read.query_name in reads_IDs:
# 			if read.query_sequence != None:
# 				new_line = "> " + read.query_name + "\n" + read.query_sequence + "\n"
# 				output.write(new_line)


