In [None]:
import gzip
import json
import os
import shlex
import shutil
import subprocess
from pathlib import Path, PurePosixPath

import numpy as np
import pandas as pd
import plotly.express as px
import pyranges as pr

In [None]:
# Notebook-wide pandas display settings: show every column, allow long cell
# text, and render floats with 10 decimal places.
display_options = {
    'display.max_columns': None,
    'display.max_colwidth': 500,
    'display.float_format': '{:.10f}'.format,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# UTR3

## Docs / notes

**Docs:**  
* [PyRanges v1.x GitHub](https://github.com/pyranges/pyranges_1.x) / [PyRanges v1.x Docs](https://pyranges1.readthedocs.io/en/latest/index.html)

**Notes:**  
 - **Важно:** нужно иметь в виду, что позиции фич в GFF (например, 3'-UTR) могут быть разбиты на несколько интервалов, так как фича может быть разделена, например, интронами.

**Sources**
 - [miRBase](https://www.mirbase.org): hsa.gff - human
 - [multiMiR](http://multimir.org)
 - [TarBase](https://dianalab.e-ce.uth.gr/tarbasev9/downloads)
 - [miRTarBase](https://awi.cuhk.edu.cn/~miRTarBase/miRTarBase_2025/php/index.php)
 - [MiRanda](http://mirtoolsgallery.tech/mirtoolsgallery/node/1055): interaction prediction

**Tasks:**  
1. Привести все к одной нумерации позиций: GFF - 1-based, BED - 0-based
2. 
3. Проверить правильность offtarget_id. Сейчас это по факту позиции SV.

**Commands:**  
```{bash}
mamba env export -n utr3.venv > environment.yml
```

## Settings

In [None]:
# Project root: every data directory below is created relative to the
# directory the notebook kernel was started in.
main_path = Path.cwd()

In [None]:
# Create tree
# One directory per data source, all located under <main_path>/data.
refs_dir = main_path / 'data' / 'refs'
gnomad_dir = main_path / 'data' / 'gnomad'
clinvar_dir = main_path / 'data' / 'clinvar'
mirna_dir = main_path / 'data' / 'mirna'
mirbase_dir = mirna_dir / 'mirbase'
tarbase_dir = mirna_dir / 'tarbase'
other_dir = main_path / 'data' / 'other'
output_dir = main_path / 'data' / 'output'

# mirna_dir itself is created implicitly as the parent of its two subdirectories.
for data_dir in (refs_dir, gnomad_dir, clinvar_dir, mirbase_dir, tarbase_dir, other_dir, output_dir):
    data_dir.mkdir(parents=True, exist_ok=True)

## Functions

In [None]:
def fetch_file(link, output_dir):
    """Download ``link`` into ``output_dir`` with ``wget`` and return the local path.

    ``wget`` is run with ``--no-clobber``, so a file that is already present is
    not downloaded again. The exit status of ``wget`` is not checked: the
    returned path is built from the URL and may not exist if the download failed.

    Args:
        link: URL of the file to download.
        output_dir: ``pathlib.Path`` of the destination directory.

    Returns:
        ``output_dir`` joined with the last path component of ``link``.
    """
    # Quote both arguments so that spaces or shell metacharacters in the
    # directory or URL cannot split or alter the command line.
    command = f'wget --no-clobber -P {shlex.quote(str(output_dir))} {shlex.quote(str(link))}'
    subprocess.run(command, shell=True)
    filename = PurePosixPath(link).name
    return output_dir / filename

def index_gff(input_filepath):
    """Run ``igvtools index`` on ``input_filepath``.

    The exit status of the command is not checked and nothing is returned.

    NOTE(review): this file defines ``index_gff`` a second time further down
    with the same body; that later definition shadows this one.

    Args:
        input_filepath: path of the file to index.
    """
    # Quote the path so that spaces or shell metacharacters do not break the command.
    command = f'igvtools index {shlex.quote(str(input_filepath))}'
    subprocess.run(command, shell=True)

def gunzip_file(input_filepath):
    """Decompress a gzip file next to the original and return the new path.

    The output path is ``input_filepath`` with its last suffix removed
    (``a.gff3.gz`` -> ``a.gff3``). The compressed file is kept, and an
    existing output file is overwritten.

    Decompression is done in Python instead of ``gunzip -c ... > ...`` in a
    shell, so paths containing spaces work and a missing or corrupt input
    raises instead of silently leaving an empty output file.

    Args:
        input_filepath: ``pathlib.Path`` of the gzip-compressed file.

    Returns:
        ``pathlib.Path`` of the decompressed file.

    Raises:
        ValueError: if ``input_filepath`` has no suffix to strip, because the
            output would then overwrite the input.
        OSError: if the input cannot be read or is not valid gzip data.
    """
    output_filepath = input_filepath.with_suffix('')
    if output_filepath == input_filepath:
        raise ValueError(f'Cannot derive an output name from {input_filepath}: it has no suffix')

    # Stream in chunks so that large files are never held fully in memory.
    with gzip.open(input_filepath, 'rb') as compressed, open(output_filepath, 'wb') as decompressed:
        shutil.copyfileobj(compressed, decompressed)
    return output_filepath

def sort_gff(input_filepath):
    """Run ``igvtools sort`` on ``input_filepath`` and return the output path.

    The output is written next to the input with ``.sorted`` inserted before
    the final suffix (``a.gff3`` -> ``a.sorted.gff3``). The exit status of
    ``igvtools`` is not checked, so the returned path may not exist if the
    command failed.

    Args:
        input_filepath: ``pathlib.Path`` of the file to sort.

    Returns:
        ``pathlib.Path`` of the sorted file.
    """
    output_filepath = input_filepath.with_stem(input_filepath.stem + ".sorted")
    # Quote both paths so that spaces or shell metacharacters do not break the command.
    command = f'igvtools sort {shlex.quote(str(input_filepath))} {shlex.quote(str(output_filepath))}'
    subprocess.run(command, shell=True)
    return output_filepath

def index_gff(input_filepath):
    """Run ``igvtools index`` on ``input_filepath``.

    The exit status of the command is not checked and nothing is returned.

    NOTE(review): ``index_gff`` is also defined earlier in this file with the
    same body; this later definition is the one in effect. One of the two
    should be removed.

    Args:
        input_filepath: path of the file to index.
    """
    # Quote the path so that spaces or shell metacharacters do not break the command.
    command = f'igvtools index {shlex.quote(str(input_filepath))}'
    subprocess.run(command, shell=True)

def define_join_type(row):
    """Classify how an SV interval lies relative to a feature interval.

    Args:
        row: mapping (e.g. a DataFrame row) with the feature coordinates in
            ``Start``/``End`` and the SV coordinates in ``Start_sv``/``End_sv``.

    Returns:
        One of the labels below, checked in this order, or ``np.nan`` when no
        condition holds (e.g. a coordinate is NaN):

        * ``'sv_in_feature'``  - the SV lies completely inside the feature.
          This includes the case where both intervals are identical.
        * ``'feature_in_sv'``  - the feature lies completely inside the SV.
        * ``'sv_left_free'``   - the SV starts before the feature and does not
          extend past the feature end.
        * ``'sv_right_free'``  - the SV starts after the feature start and
          reaches or passes the feature end.

    NOTE(review): the previous version had a final ``'full_join'`` branch for
    identical intervals, but it could never be reached because identical
    intervals already match the first condition. It also compared ``sv_end``
    with itself instead of with ``feature_end``. The labels returned for
    non-NaN input are unchanged; if identical intervals should get their own
    label, that check has to come first and the downstream filter on
    ``join_type`` must be updated accordingly.
    """
    feature_start = row['Start']
    feature_end = row['End']
    sv_start = row['Start_sv']
    sv_end = row['End_sv']

    if feature_start <= sv_start and sv_end <= feature_end:  # SV completely inside the feature
        return 'sv_in_feature'
    if sv_start <= feature_start and feature_end <= sv_end:  # feature completely inside the SV
        return 'feature_in_sv'
    if sv_start < feature_start and sv_end <= feature_end:  # SV overhangs on the left side only
        return 'sv_left_free'
    if sv_start > feature_start and sv_end >= feature_end:  # SV overhangs on the right side only
        return 'sv_right_free'
    return np.nan

## Fetch data

In [None]:
# Download every input file into its data directory. fetch_file runs wget with
# --no-clobber, so files that are already present are not downloaded again.

# Ensembl release 115 (GRCh38): gene annotation (GFF3) and genome sequence (FASTA)
gff_filepath = fetch_file('https://ftp.ensembl.org/pub/release-115/gff3/homo_sapiens/Homo_sapiens.GRCh38.115.gff3.gz', refs_dir)
fasta_filepath = fetch_file('https://ftp.ensembl.org/pub/release-115/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz', refs_dir)
# gnomAD v4.1 structural variant sites (BED)
gnomad_sv_filepath = fetch_file('https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/genome_sv/gnomad.v4.1.sv.sites.bed.gz', gnomad_dir)
# ClinVar variant summary (tab-delimited)
clinvar_filepath = fetch_file('https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz', clinvar_dir)
# miRBase human GFF3 and TarBase v9 human table
mirbase_filepath = fetch_file('https://www.mirbase.org/download/hsa.gff3', mirbase_dir)
tarbase_filepath = fetch_file('https://dianalab.e-ce.uth.gr/tarbasev9/data/Homo_sapiens_TarBase-v9.tsv.gz', tarbase_dir)

# Sequence Ontology terms (JSON); used below to attach definitions to GFF feature and biotype names
so_terms_filepath = fetch_file('https://raw.githubusercontent.com/The-Sequence-Ontology/SO-Ontologies/refs/heads/master/Ontology_Files/so.json', other_dir)

## Prepare data

In [None]:
# Decompress the Ensembl GFF3, then sort and index the decompressed copy with igvtools.
# NOTE(review): gff_sorted_filepath is not read by any other cell visible in this
# notebook (the analysis loads gff_filepath directly); presumably the sorted and
# indexed copy is meant to be opened in IGV - confirm.
gff_gunzipped_filepath = gunzip_file(gff_filepath)
gff_sorted_filepath = sort_gff(gff_gunzipped_filepath)
index_gff(gff_sorted_filepath)

## <span style="color:#00ff00;">Main</span>

### <span style="color:#00ff00;">Create PR/DF</span>

In [None]:
# GFF
# Load the downloaded (still gzip-compressed) Ensembl GFF3 into a PyRanges object.
gff_pr = pr.read_gff3(str(gff_filepath))

In [None]:
# gnomAD SV
# Keep only the necessary columns (63 of 600...) in SV
# The header line starts with '#', so it is read separately here and skipped
# as a comment by read_csv below.
with gzip.open(str(gnomad_sv_filepath), 'rt') as sv_file:
    sv_header = sv_file.readline().strip().split('\t')

target_columns = ['#chrom', 'start', 'end', 'name', 'svtype', 'samples', 'MULTIALLELIC', 'ALGORITHMS', 'BOTHSIDES_SUPPORT', 'CHR2', 'CPX_INTERVALS', 'CPX_TYPE', 'END', 'END2', 'EVIDENCE', 'LOW_CONFIDENCE_REPETITIVE_LARGE_DUP', 'MEMBERS', 'NCR', 'OUTLIER_SAMPLE_ENRICHED_LENIENT', 'PAR', 'PCRMINUS_NCR', 'PCRPLUS_NCR', 'PESR_GT_OVERDISPERSION', 'POS2', 'PREDICTED_BREAKEND_EXONIC', 'PREDICTED_COPY_GAIN', 'PREDICTED_DUP_PARTIAL', 'PREDICTED_INTERGENIC', 'PREDICTED_INTRAGENIC_EXON_DUP', 'PREDICTED_INTRONIC', 'PREDICTED_INV_SPAN', 'PREDICTED_LOF', 'PREDICTED_MSV_EXON_OVERLAP', 'PREDICTED_NEAREST_TSS', 'PREDICTED_NONCODING_BREAKPOINT', 'PREDICTED_NONCODING_SPAN', 'PREDICTED_PARTIAL_DISPERSED_DUP', 'PREDICTED_PARTIAL_EXON_DUP', 'PREDICTED_PROMOTER', 'PREDICTED_TSS_DUP', 'PREDICTED_UTR', 'RESOLVED_POSTHOC', 'SOURCE', 'SVLEN', 'SVTYPE', 'UNRESOLVED_TYPE', 'AN', 'AC', 'AF', 'N_BI_GENOS', 'N_HOMREF', 'N_HET', 'N_HOMALT', 'FREQ_HOMREF', 'FREQ_HET', 'FREQ_HOMALT', 'CN_NUMBER', 'CN_COUNT', 'CN_STATUS', 'CN_FREQ', 'CN_NONREF_COUNT', 'CN_NONREF_FREQ', 'FILTER']

# pandas ignores the order of `usecols` and returns the selected columns in
# file order, while `names` is applied positionally. Sort the wanted columns
# by their position in the header so that every name is attached to the right
# column even if the list above is not in file order.
# sv_header.index raises ValueError if a wanted column is missing from the file.
target_columns = sorted(target_columns, key=sv_header.index)
target_columns_indexes = [sv_header.index(i) for i in target_columns]

# Rename the coordinate columns to the names PyRanges expects
column_renames = {'#chrom': 'Chromosome', 'start': 'Start', 'end': 'End'}
target_columns = [column_renames.get(column, column) for column in target_columns]

sv_df = pd.read_csv(str(gnomad_sv_filepath), sep='\t', usecols=target_columns_indexes, names=target_columns, comment='#')
# Drop the 'chr' prefix so chromosome names match the Ensembl GFF
sv_df['Chromosome'] = sv_df['Chromosome'].str.replace('chr', '')

sv_pr = pr.PyRanges(sv_df)

In [None]:
# ClinVar
# Read the gzip-compressed, tab-delimited variant summary into a DataFrame.
clinvar_df = pd.read_csv(clinvar_filepath, sep='\t', compression='gzip')

In [None]:
# Treat the placeholder values 'na' and '-' as missing, rename Stop -> End to
# match the Start/End column naming used for the other tables, and keep only
# the records reported on the GRCh38 assembly.
clinvar_df = clinvar_df.replace(['na', '-'], np.nan)
clinvar_df = clinvar_df.rename(columns={'Stop': 'End'})
clinvar_df_filtered = clinvar_df.query('Assembly == "GRCh38"')

In [None]:
# Number of missing values per column in the GRCh38 ClinVar subset.
clinvar_df_filtered.isna().sum().to_frame()

In [None]:
# Display the GRCh38 ClinVar subset.
clinvar_df_filtered

In [None]:
# miRBase
# Read the miRBase GFF3 and remove 'chr' from the chromosome names; the same
# normalisation is applied to the gnomAD SV and TarBase tables in this notebook.
mirbase_pr = pr.read_gff3(str(mirbase_filepath))
mirbase_pr['Chromosome'] = mirbase_pr['Chromosome'].str.replace('chr', '')

In [None]:
# TarBase
# Read the gzip-compressed TSV and rename the coordinate columns to the
# Chromosome/Start/End/Strand names used by the PyRanges objects in this notebook.
tarbase_df = pd.read_csv(str(tarbase_filepath), sep='\t', compression='gzip')
tarbase_df = tarbase_df.rename(columns={'chromosome': 'Chromosome', 'start': 'Start', 'end': 'End', 'strand': 'Strand'})
tarbase_df['Chromosome'] = tarbase_df['Chromosome'].str.replace('chr', '')

# Drop records that lack any coordinate, then cast the positions to integers
# (the cast is only possible once the missing values are gone).
tarbase_pr = pr.PyRanges(tarbase_df)
tarbase_pr = tarbase_pr.dropna(subset=['Chromosome', 'Start', 'End'], how='any')
tarbase_pr[['Start', 'End']] = tarbase_pr[['Start', 'End']].astype('int64')

In [None]:
# SO terms
# Build a label -> definition lookup table from the Sequence Ontology JSON.
with open(so_terms_filepath, 'r') as so_handle:
    so_terms = json.load(so_handle)

# Only the first graph is read; nodes without a label or definition get NaN.
so_nodes = so_terms['graphs'][0]['nodes']
so_terms_filtered = {
    'lbl': [node.get('lbl', np.nan) for node in so_nodes],
    'definition': [node.get('meta', {}).get('definition', {}).get('val', np.nan) for node in so_nodes],
}

so_terms_df = pd.DataFrame.from_dict(so_terms_filtered)
# NOTE(review): `.replace('_', '')` here is Series.replace, which only replaces
# values that are exactly '_'; it does not strip underscores inside labels
# (that would be `.str.replace`). The cells that merge on `lbl` build their
# keys the same way, so the keys still agree with each other.
so_terms_df['lbl'] = so_terms_df['lbl'].str.capitalize().replace('_', '')

### <span style="color:#00ff00;">EDA (Exploratory Data Analysis)</span>

In [None]:
# GFF features stats
# Count GFF records per feature type and attach the Sequence Ontology
# definition of each type.
gff_features = gff_pr.Feature.value_counts().to_frame().reset_index()

# Merge key: capitalised name with underscores removed. `.str.replace` is
# required here; plain `Series.replace('_', '')` only replaces values that are
# exactly '_' and leaves underscores inside names untouched.
gff_features['tmp_id'] = gff_features['Feature'].str.capitalize().str.replace('_', '', regex=False)

# Build the same key on the SO side inside this cell, so the two keys agree
# regardless of how `lbl` was normalised when so_terms_df was created.
# Keep one row per key so a label collision cannot duplicate feature rows.
so_terms_lookup = (
    so_terms_df
    .assign(tmp_key=so_terms_df['lbl'].str.capitalize().str.replace('_', '', regex=False))
    .drop_duplicates(subset='tmp_key')
)

gff_features = gff_features.merge(so_terms_lookup, how='left', left_on='tmp_id', right_on='tmp_key')
gff_features = gff_features.drop(['lbl', 'tmp_id', 'tmp_key'], axis=1)
pd.DataFrame(gff_features)

In [None]:
# GFF biotypes stats
# Count GFF records per biotype and attach the Sequence Ontology definition
# of each biotype where one exists.
gff_biotypes = gff_pr.biotype.value_counts().to_frame().reset_index()

# Merge key: capitalised name with underscores removed. `.str.replace` is
# required here; plain `Series.replace('_', '')` only replaces values that are
# exactly '_' and leaves underscores inside names untouched.
gff_biotypes['tmp_id'] = gff_biotypes['biotype'].str.capitalize().str.replace('_', '', regex=False)

# Build the same key on the SO side inside this cell, so the two keys agree
# regardless of how `lbl` was normalised when so_terms_df was created.
# Keep one row per key so a label collision cannot duplicate biotype rows.
so_terms_lookup = (
    so_terms_df
    .assign(tmp_key=so_terms_df['lbl'].str.capitalize().str.replace('_', '', regex=False))
    .drop_duplicates(subset='tmp_key')
)

gff_biotypes = gff_biotypes.merge(so_terms_lookup, how='left', left_on='tmp_id', right_on='tmp_key')
gff_biotypes = gff_biotypes.drop(['lbl', 'tmp_id', 'tmp_key'], axis=1)
pd.DataFrame(gff_biotypes)

In [None]:
# Number of gnomAD SV records per svtype.
sv_pr.svtype.value_counts().to_frame().reset_index()

In [None]:
# Number of TarBase records per gene_location value.
tarbase_pr['gene_location'].value_counts()

### <span style="color:#00ff00;">Prepare GFF3 data</span>

In [None]:
# Add additional metrics
# feature_len is End - Start.
# NOTE(review): this equals the feature length only if the coordinates are
# 0-based half-open - confirm (see task 1 in the notes above).
# Both columns are added to gff_pr in place.
gff_pr['feature_len'] = (gff_pr['End'] - gff_pr['Start']).astype('int')
gff_pr['feature_id'] = 'feature_' + gff_pr['Chromosome'].astype('str') + ':' + gff_pr['Start'].astype('str') + '-' + gff_pr['End'].astype('str')

# Self annotation of GFF with mRNA (parent) data
# Child records are joined to the record whose ID equals their Parent, and
# missing values in the target columns are filled from that parent record.
target_columns = ['ID', 'Name', 'biotype', 'Parent', 'tag', 'transcript_support_level', 'feature_len', 'feature_id'] # Replace the data in these empty columns with mRNA data
annotation_gff_pr = gff_pr[target_columns].dropna(subset='ID')

# Every parent column carries this suffix after the merge, because all target
# columns exist on both sides.
annotation_suffix = '_annotation'
annotated_gff_pr = gff_pr.merge(annotation_gff_pr, how='left', left_on='Parent', right_on='ID', suffixes=['', annotation_suffix])

# Fill gaps from the parent columns, then drop the helper columns.
annotated_columns = [f'{column}{annotation_suffix}' for column in target_columns]
fill_dict = dict(zip(target_columns, annotated_columns))
for target, annotation in fill_dict.items():
    annotated_gff_pr[target] = annotated_gff_pr[target].fillna(annotated_gff_pr[annotation])
annotated_gff_pr = annotated_gff_pr.drop(annotated_columns, axis=1)

In [None]:
# Create specific PRs (mRNA, 3'-UTR, offtargets)
mrna_pr = annotated_gff_pr.query('Feature == "mRNA"')
three_utrs_pr = annotated_gff_pr.query('Feature == "three_prime_UTR"')

# Feature types treated as off-targets: an SV overlapping any of them is
# excluded later. The commented-out entries are currently not treated as
# off-targets.
offtarget_features = ['five_prime_UTR',
                               'CDS',
                               #'lnc_RNA',
                               'ncRNA_gene',
                               #'snRNA',
                               #'snoRNA',
                               #'scRNA',
                               'rRNA',
                               'tRNA',
                               'processed_transcript',
                               ]
# Taken from gff_pr (not annotated_gff_pr), i.e. without the parent annotation.
offtarget_features_pr = gff_pr.query('Feature in @offtarget_features')

In [None]:
# 3'-UTR biotype stats
# Count unique IDs rather than rows, because a single 3'-UTR can be split into
# several intervals in the GFF (separated by introns).
# This is especially pronounced for nonsense_mediated_decay transcripts (they
# are not needed here, but still).
three_utrs_pr.drop_duplicates(subset='ID')['biotype'].value_counts().to_frame().reset_index()


In [None]:
# Keep only MANE_Select / MANE_Select|Ensembl_canonical
# Rows are kept when their tag contains 'MANE_Select'; rows without a tag are dropped (na=False).
three_utrs_pr_filtered = three_utrs_pr[three_utrs_pr['tag'].str.contains('MANE_Select', na=False, regex=True)]

# Stats
# MANE_Select genes count = 19437 (release_1.5) from summary file: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.5/
# Counted per row, so a 3'-UTR split into several intervals is counted more than once.
three_utrs_pr_filtered['biotype'].value_counts().to_frame().reset_index()

### <span style="color:#00ff00;">Prepare SV data</span>

#### <span style="color:green;">gnomAD</span>

In [None]:
# Keep target SVs (DEL)
# Only deletions that passed all gnomAD filters are analysed.
sv_targets = ['DEL']
sv_filter = ['PASS']
# Take an explicit copy: columns are added below, and assigning to the result
# of query() can trigger pandas' SettingWithCopyWarning.
target_sv_pr = sv_pr.query('svtype in @sv_targets and FILTER in @sv_filter').copy()

# Add columns
# sv_len is End - Start; sv_id is built from the coordinates only, so two SVs
# with identical coordinates share the same sv_id.
target_sv_pr['sv_len'] = target_sv_pr.End - target_sv_pr.Start
target_sv_pr['sv_id'] = 'sv_' + target_sv_pr['Chromosome'].astype('str') + ':' + target_sv_pr['Start'].astype('str') + '-' + target_sv_pr['End'].astype('str')

In [None]:
# SV percentiles by AF
# Summary statistics of the AF column of the selected SVs, with extra upper-tail percentiles.
target_sv_pr['AF'].describe(percentiles=[.1, .25, .5, .75, .9, .95, .99, .999]).to_frame().reset_index()

In [None]:
# SV percentiles by SV length
# Summary statistics of sv_len for the selected SVs, with extra upper-tail percentiles.
target_sv_pr['sv_len'].describe(percentiles=[.1, .25, .5, .75, .9, .95, .99, .999]).to_frame().reset_index()

In [None]:
# Disabled plot: the histogram code below is wrapped in a string literal, so it
# is not executed; the cell only evaluates (and displays) the string.
'''fig = px.histogram( 
    x=target_sv_pr['sv_len'],
    nbins=300,
    log_y=True,
    title='gnomAD SV DEL lenght distribution (log)'
)
fig.show()'''

### <span style="color:#00ff00;">Join GFF3 x gnomAD SV</span>

In [None]:
# Join GFF x SV
# Pair every MANE Select 3'-UTR interval with each selected SV that overlaps it.
# Columns coming from the SV table that clash with GFF columns get the '_sv'
# suffix; join_len is the column requested through report_overlap_column.
features_sv_joined_pr = three_utrs_pr_filtered.join_overlaps(target_sv_pr, suffix='_sv', report_overlap_column='join_len')

In [None]:
# Add additional columns (join_start, join_end, join_id)
# join_start/join_end delimit the intersection of the feature and the SV:
# the larger of the two starts and the smaller of the two ends.
features_sv_joined_pr['join_start'] = np.maximum(features_sv_joined_pr['Start'], features_sv_joined_pr['Start_sv'])
features_sv_joined_pr['join_end'] = np.minimum(features_sv_joined_pr['End'], features_sv_joined_pr['End_sv'])
features_sv_joined_pr['join_id'] = 'join_' + features_sv_joined_pr['Chromosome'].astype('str') + ':' + features_sv_joined_pr['join_start'].astype('str') + '-' + features_sv_joined_pr['join_end'].astype('str')

In [None]:
# Define join type
# Label every feature/SV pair with define_join_type (row-wise), then show how
# many pairs fall into each label.
features_sv_joined_pr['join_type'] = features_sv_joined_pr.apply(define_join_type, axis=1)
features_sv_joined_pr['join_type'].value_counts().to_frame().reset_index()

In [None]:
# Count joins by SV
# joins_count = number of joined rows that share the same sv_id.
# NOTE(review): renaming 'count' relies on value_counts() naming its result
# column 'count' - confirm against the pandas version in use.
joints_count_by_sv = features_sv_joined_pr['sv_id'].value_counts().to_frame().reset_index()
joints_count_by_sv = joints_count_by_sv.rename(columns={'count': 'joins_count'})

# Add join count data
# No `on=` is given, so the merge uses the columns the two tables share.
features_sv_joined_pr = features_sv_joined_pr.merge(joints_count_by_sv, how='left')
#len(features_sv_joined_pr)

# Filter joins by join type
# Keep a pair when the SV lies inside the feature, or overhangs it on the
# right for '+' strand features / on the left for '-' strand features, and
# only when the SV takes part in exactly one join (joins_count == 1).
targey_joint_types = ['sv_in_feature', 'sv_right_free', 'sv_left_free']
features_sv_joined_pr_filtered = features_sv_joined_pr.query('join_type in @targey_joint_types and ((Strand == "+" and join_type == "sv_right_free") or (Strand == "-" and join_type == "sv_left_free") or join_type == "sv_in_feature") and joins_count == 1')

#### <span style="color:green;">Join type stats</span>

In [None]:
# Stats by join type
# Summary statistics of sv_len per join_type, transposed so each join type is a column.
features_sv_joined_pr_filtered.groupby('join_type')['sv_len'].describe(percentiles=[.1, .25, .5, .75, .9, .95, .99, .999]).T

In [None]:
# Number of filtered feature/SV pairs per biotype.
features_sv_joined_pr_filtered['biotype'].value_counts().to_frame().reset_index()

### <span style="color:#00ff00;">Offtarget features exclusion</span>

#### <span style="color:green;">features_sv_joined_pr_filtered copy</span>

In [None]:
# Prepare a copy for the off-target overlap checks
# The overlap operations below work on Start/End, so the feature coordinates
# are parked in Start_tmp/End_tmp and replaced by the SV coordinates.
features_sv_joined_pr_filtered_cp = features_sv_joined_pr_filtered.copy()
features_sv_joined_pr_filtered_cp[['Start_tmp', 'End_tmp']] = features_sv_joined_pr_filtered_cp[['Start', 'End']] # Save old positions
features_sv_joined_pr_filtered_cp[['Start', 'End']] = features_sv_joined_pr_filtered_cp[['Start_sv', 'End_sv']] # SV positions to main positions

#### <span style="color:green;">Filter offtargets</span>

In [None]:
# Exclude offtargets (SV x offtarget overlaps)
# invert=True keeps only the rows whose SV interval overlaps no off-target
# feature; strand is ignored for this check.
features_sv_no_offtereget_joined_pr = features_sv_joined_pr_filtered_cp.overlap(offtarget_features_pr, invert=True, strand_behavior='ignore')

# Restore the feature coordinates saved in Start_tmp/End_tmp and drop the helper columns.
features_sv_no_offtereget_joined_pr[['Start', 'End']] = features_sv_no_offtereget_joined_pr[['Start_tmp', 'End_tmp']]
features_sv_no_offtereget_joined_pr = features_sv_no_offtereget_joined_pr.drop(['Start_tmp', 'End_tmp'], axis=1)

# Number of feature/SV pairs left after the exclusion.
len(features_sv_no_offtereget_joined_pr)

In [None]:
# Join type stats before and after the off-target exclusion
print('Before offtargets filtering')
print(features_sv_joined_pr_filtered['join_type'].value_counts().to_frame().reset_index())
print('\nAfter offtargets filtering')
print(features_sv_no_offtereget_joined_pr['join_type'].value_counts().to_frame().reset_index())

#### <span style="color:green;">Offtarget stats</span>

In [None]:
# features_sv_offtereget_joined_pr is used only to inspect the overlaps with
# off-target features. The table that is actually filtered from off-targets is
# features_sv_no_offtereget_joined_pr; it is built with overlap() because a
# left join also keeps the original row when it finds off-target overlaps.
features_sv_offtereget_joined_pr = features_sv_joined_pr_filtered_cp.join_overlaps(offtarget_features_pr, join_type='left', suffix='_offtarget', report_overlap_column='offtarget_join_len', strand_behavior='ignore')

# Restore the feature coordinates saved in Start_tmp/End_tmp and drop the helper columns.
features_sv_offtereget_joined_pr[['Start', 'End']] = features_sv_offtereget_joined_pr[['Start_tmp', 'End_tmp']]
features_sv_offtereget_joined_pr = features_sv_offtereget_joined_pr.drop(['Start_tmp', 'End_tmp'], axis=1)

# The id is built from the coordinates of the joined off-target feature.
# NOTE(review): rows without an off-target have no off-target coordinates, so
# their id is built from missing values; such rows are filtered out before the
# IGV export - confirm the resulting id format is the intended one (see task 3).
features_sv_offtereget_joined_pr['offtarget_id'] = 'offtarget_' + features_sv_offtereget_joined_pr['Chromosome'].astype('str') + ':' + features_sv_offtereget_joined_pr['Start_offtarget'].astype('str') + '-' + features_sv_offtereget_joined_pr['End_offtarget'].astype('str')

In [None]:
# Features offtargets stats
# The totals here will be higher than the difference between all joins and
# the joins left after off-target exclusion, because one and the same SV
# (e.g. a DEL) can overlap different isoforms of the same transcript.
features_sv_offtereget_joined_pr.groupby(['join_type', 'Feature_offtarget'])['Feature_offtarget'].agg(['count'])
#features_sv_offtereget_joined_pr.drop_duplicates(subset='sv_id').groupby(['join_type', 'Feature_offtarget'])['Feature_offtarget'].agg(['count'])

### <span style="color:#00ff00;">miRNA join</span>

#### <span style="color:green;">features_sv_no_offtereget_joined_pr copy</span>

In [None]:
# Prepare a copy for the miRNA (TarBase) join
# The join works on Start/End, so the feature coordinates are parked in
# Start_tmp/End_tmp and replaced by the feature/SV intersection coordinates.
features_sv_no_offtereget_joined_pr_cp = features_sv_no_offtereget_joined_pr.copy()
features_sv_no_offtereget_joined_pr_cp[['Start_tmp', 'End_tmp']] = features_sv_no_offtereget_joined_pr_cp[['Start', 'End']] # Save old positions
features_sv_no_offtereget_joined_pr_cp[['Start', 'End']] = features_sv_no_offtereget_joined_pr_cp[['join_start', 'join_end']] # Join (intersection) positions to main positions

#### <span style="color:green;">Join</span>

In [None]:
# Join TarBase
# Pair every feature/SV intersection with the TarBase records that overlap it,
# ignoring strand. Clashing TarBase columns get the '_tarbase' suffix.
features_sv_no_offtereget_tarbase_joined_pr = features_sv_no_offtereget_joined_pr_cp.join_overlaps(tarbase_pr, suffix='_tarbase', strand_behavior='ignore', report_overlap_column='tarbase_join_len')

# Restore the feature coordinates saved in Start_tmp/End_tmp and drop the helper columns.
features_sv_no_offtereget_tarbase_joined_pr[['Start', 'End']] = features_sv_no_offtereget_tarbase_joined_pr[['Start_tmp', 'End_tmp']]
features_sv_no_offtereget_tarbase_joined_pr = features_sv_no_offtereget_tarbase_joined_pr.drop(['Start_tmp', 'End_tmp'], axis=1)

# The id combines the TarBase coordinates with the miRNA name.
features_sv_no_offtereget_tarbase_joined_pr['tarbase_id'] = 'tarbase_' + features_sv_no_offtereget_tarbase_joined_pr['Chromosome'].astype('str') + ':' + features_sv_no_offtereget_tarbase_joined_pr['Start_tarbase'].astype('str') + '-' + features_sv_no_offtereget_tarbase_joined_pr['End_tarbase'].astype('str') + ':' + features_sv_no_offtereget_tarbase_joined_pr['mirna_name']

In [None]:
# Summary statistics of AF over the rows of the TarBase join (one row per feature/SV/TarBase combination).
features_sv_no_offtereget_tarbase_joined_pr['AF'].describe(percentiles=[.1, .25, .5, .75, .9, .95, .99, .999]).to_frame().reset_index()

In [None]:
# Joined rows per gene_location value.
features_sv_no_offtereget_tarbase_joined_pr['gene_location'].value_counts()

In [None]:
# Joined rows per interaction_group value.
features_sv_no_offtereget_tarbase_joined_pr['interaction_group'].value_counts()

In [None]:
# Joined rows per regulation value.
features_sv_no_offtereget_tarbase_joined_pr['regulation'].value_counts()

In [None]:
# Joined rows per transcript_name: computed once, saved to
# <output_dir>/mirna_genes.csv and then displayed.
transcript_name_counts = features_sv_no_offtereget_tarbase_joined_pr['transcript_name'].value_counts().to_frame().reset_index()
transcript_name_counts.to_csv(output_dir / 'mirna_genes.csv', index=False)
transcript_name_counts

In [None]:
# Joined rows per experimental_method value.
features_sv_no_offtereget_tarbase_joined_pr['experimental_method'].value_counts()

In [None]:
# Joined rows per join_type label.
features_sv_no_offtereget_tarbase_joined_pr['join_type'].value_counts()

In [None]:
# Joined rows per join_id, i.e. how many TarBase records hit each feature/SV intersection.
features_sv_no_offtereget_tarbase_joined_pr['join_id'].value_counts()

In [None]:
# Display the TarBase join result as a plain DataFrame.
pd.DataFrame(features_sv_no_offtereget_tarbase_joined_pr)

### <span style="color:#00ff00;">GFF output for IGV</span>

In [None]:
# Table exported for IGV. This is an alias of the TarBase join result, not a copy.
pr_for_igv = features_sv_no_offtereget_tarbase_joined_pr

In [None]:
# Display the table selected for the IGV export as a plain DataFrame.
pd.DataFrame(pr_for_igv)

In [None]:
# Build one table per track (3'-UTR features, SVs, feature/SV intersections,
# TarBase sites, off-target features), rename each to the common
# Chromosome/Start/End/Feature/ID layout, tag it with a Source and a colour,
# and stack them into a single table for the GFF3 export.

# Add GFF
output_features_pr = pr_for_igv[['Chromosome', 'Feature', 'Start', 'End', 'Strand', 'Name', 'biotype', 'tag', 'ID']]
output_features_pr = output_features_pr.drop_duplicates()
output_features_pr['Source'] = 'GFF'
output_features_pr['color'] = 'blue'

# Add gnomAD SV
output_sv_pr = pr_for_igv[['Chromosome', 'Start_sv', 'End_sv', 'svtype', 'name', 'sv_id', 'AF', 'N_HET', 'N_HOMALT']]
output_sv_pr = output_sv_pr.drop_duplicates()
output_sv_pr['Source'] = 'gnomAD_SV'
output_sv_pr['color'] = 'red'
output_sv_pr = pr.PyRanges(output_sv_pr.rename(columns={'Start_sv': 'Start', 'End_sv': 'End', 'svtype': 'Feature', 'name': 'Name', 'sv_id': 'ID'}))

# Add joins data
output_joins_pr = pr_for_igv[['Chromosome', 'join_start', 'join_end', 'join_type', 'join_id']]
output_joins_pr = output_joins_pr.drop_duplicates()
output_joins_pr['Source'] = 'join'
output_joins_pr['color'] = 'orange'
output_joins_pr = pr.PyRanges(output_joins_pr.rename(columns={'join_start': 'Start', 'join_end': 'End', 'join_type': 'Feature', 'join_id': 'ID'}))

# Add miRNA TarBase
output_tarbase_pr = pr_for_igv[['Chromosome', 'Start_tarbase', 'End_tarbase', 'Strand_tarbase', 'mirna_name', 'interaction_group', 'mirna_id', 'tarbase_id']]
output_tarbase_pr = output_tarbase_pr.drop_duplicates(subset=['mirna_id', 'tarbase_id'])
output_tarbase_pr['Source'] = 'tarbase'
output_tarbase_pr['color'] = 'green'
output_tarbase_pr = pr.PyRanges(output_tarbase_pr.rename(columns={'Start_tarbase': 'Start', 'End_tarbase': 'End', 'Strand_tarbase': 'Strand', 'interaction_group': 'Feature', 'mirna_name': 'Name', 'tarbase_id': 'ID'}))

# Add offtarget GFF (features) data
output_offtargets_pr = features_sv_offtereget_joined_pr[['Chromosome', 'Start_offtarget', 'End_offtarget', 'Feature_offtarget', 'offtarget_id']]
# Drop rows that have no off-target (NaN != NaN, so the self-comparison is False for them)
output_offtargets_pr = output_offtargets_pr.query('Start_offtarget == Start_offtarget')
output_offtargets_pr = output_offtargets_pr.drop_duplicates()
# The left join leaves missing values in these columns, which forces a float
# dtype. Cast back to integers now that the missing rows are gone; otherwise
# the concatenation below turns every Start/End into a float.
output_offtargets_pr = output_offtargets_pr.astype({'Start_offtarget': 'int64', 'End_offtarget': 'int64'})
output_offtargets_pr['Source'] = 'GFF_offtarget'
output_offtargets_pr['color'] = '#d9d9d9'
output_offtargets_pr = pr.PyRanges(output_offtargets_pr.rename(columns={'Start_offtarget': 'Start', 'End_offtarget': 'End', 'Feature_offtarget': 'Feature', 'offtarget_id': 'ID'}))

# Columns that a track does not have (e.g. Strand for SVs) are NaN after stacking.
output_igv_pr = pd.concat([output_features_pr, output_sv_pr, output_joins_pr, output_tarbase_pr, output_offtargets_pr], ignore_index=True)

In [None]:
# Stats
# Number of exported records per track (Source).
output_igv_pr['Source'].value_counts().to_frame().reset_index()

In [None]:
# Save
# The NA filling step is currently disabled (commented out below), so missing
# values are passed to the writer as they are.
# NOTE(review): to_gff3 is called on the result of pd.concat; this assumes the
# concatenation returns a PyRanges object rather than a plain DataFrame - confirm.
output_igv_pr = output_igv_pr.reset_index(drop=True)
#output_igv_pr = pr.PyRanges(output_igv_pr.fillna('.'))

output_igv_pr.to_gff3(output_dir / 'regions_for_igv.gff3')