In [10]:
import pysam

TE_ANNOTATION_FILE = "/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/annotations/dmgoth101.onecode.v3.gtf"
FC30_DMGOTH_MAX_AS_PRIMARY_ONLY_BAMFILE ="/data2/eric/TE_LR_RNAseq/data/dmgoth101_genome_alignments/bam/FC30.against_dmgoth.filtered_max_AS.primary_only.bam"



In [12]:
class TE_feature:
	def __init__(self, chrom, start, end, gene_id, insertion_id):
		self.chrom = chrom
		self.start = start
		self.end = end
		self.insertion_id = insertion_id
		self.count = 0
		# self.family = gene_id_to_family_name(gene_id)
		self.counted_reads = set()

	def __len__(self):
		return self.end - self.start

	def __repr__(self):
		return self.insertion_id

	def is_valid(self, bam_chromosomes):
		return len(self) > 150 and self.chrom in bam_chromosomes

def build_TE(line):
    sline = line.strip().split("\t")
    chrom = sline[0]
    start = int(sline[3])
    end = int(sline[4])
    gene_id = sline[-1].split(";")[0].split('"')[1]
    insertion_id = sline[-1].strip().split('transcript_id "')[-1][:-2]

    return TE_feature(chrom, start, end, gene_id, insertion_id)

def regroup_TE_by_chrom(TE_feature_list):
	"""Return a dict of key:chrom and values:list of TE

	Args:
		TE_feature_list (list): flat list of TE_feature
	"""
	TE_dict = dict()
	for insertion in TE_feature_list:
		if insertion.chrom not in TE_dict:
			TE_dict[insertion.chrom] = [insertion]
		else:
			TE_dict[insertion.chrom].append(insertion)
	return TE_dict



def filter_relevant_TE_feature(bamfile, TE_annotation_file, min_TE_size=150):
	"""Generate list of TE objects that will next be counted.
	TE are filtered : we discard those which are on chromosome absent from the bamfile
	and those with length (in number of base) below a certain threshold.

	Args:
		bamfile (str): path to the alignment file
		TE_annotation_file (str): path the TE annotation file (gtf format)
		min_TE_size(int): minimal number of base for a TE to be considered as valid. Default = 150
	"""
	# Enumerating chromosomes present in the bamfile
	bam_chromosomes = pysam.AlignmentFile(bamfile).references
	# Then iterating through TE_annotation_file, creating and checking TE objects
	valid_TE_list = list()
	with open(TE_annotation_file, "r") as TE_annot:
		for line in TE_annot:
			new_TE = build_TE(line)
			if new_TE.is_valid(bam_chromosomes):
				valid_TE_list.append(new_TE)
	return valid_TE_list




In [13]:
valid_TE_list = filter_relevant_TE_feature(FC30_DMGOTH_MAX_AS_PRIMARY_ONLY_BAMFILE, TE_ANNOTATION_FILE)
TE_by_chrom = regroup_TE_by_chrom(valid_TE_list)

In [18]:
superimposed_TE_list = list()
nb_of_superimposed_TE = 0
i = 0
for TE_list in TE_by_chrom.values():
	TE_list = sorted(TE_list, key= lambda e: e.start)
	for index, TE in enumerate(TE_list[:-1]):
		if TE.end >= TE_list[index + 1].start :
			nb_of_superimposed_TE += 1
			superimposed_TE_list.append(TE.insertion_id)
print(nb_of_superimposed_TE)
print(len(valid_TE_list))

1702
10665


In [19]:
print(superimposed_TE_list[:10])


['TART-A$2L_RaGOO$4304$8214', 'TAHRE$2L_RaGOO$5229$7346', 'QUASIMODO_LTR$2L_RaGOO$10703$23126', 'QUASIMODO_I$2L_RaGOO$23127$26120', 'DM412$2L_RaGOO$3096923$3097499', 'FB4_DM$2L_RaGOO$7955778$7956846', 'ROO_LTR$2L_RaGOO$9909301$9918711', 'FB4_DM$2L_RaGOO$13428801$13430992', 'FW2_DM$2L_RaGOO$13594663$13598929', 'DOC4_DM$2L_RaGOO$16629887$16632082']


In [20]:
# Run this app with `python app.py` and
# visit http://127.0.0.1:8050/ in your web browser.

import dash
import dash_core_components as dcc
import dash_html_components as html
import plotly.express as px
import pandas as pd

app = dash.Dash(__name__)

# assume you have a "long-form" data frame
# see https://plotly.com/python/px-arguments/ for more options
df = pd.DataFrame({
    "Fruit": ["Apples", "Oranges", "Bananas", "Apples", "Oranges", "Bananas"],
    "Amount": [4, 1, 2, 2, 4, 5],
    "City": ["SF", "SF", "SF", "Montreal", "Montreal", "Montreal"]
})

fig = px.bar(df, x="Fruit", y="Amount", color="City", barmode="group")

app.layout = html.Div(children=[
    html.H1(children='Hello Dash'),

    html.Div(children='''
        Dash: A web application framework for your data.
    '''),

    dcc.Graph(
        id='example-graph',
        figure=fig
    )
])

if __name__ == '__main__':
    app.run_server(debug=True)

Dash is running on http://127.0.0.1:8050/



INFO:__main__:Dash is running on http://127.0.0.1:8050/



 * Serving Flask app '__main__' (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: on


SystemExit: 1


To exit: use 'exit', 'quit', or Ctrl-D.

