In [1]:
import os, subprocess
from pathlib import Path, PurePosixPath
import gzip
import json
import pandas as pd, numpy as np, pyranges as pr
import plotly.express as px
import clickhouse_connect

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.float_format', '{:.10f}'.format)

# UTR3

## Docs / notes

**Docs:**  
* [PyRanges v1.x GitHub](https://github.com/pyranges/pyranges_1.x) / [PyRanges v1.x Docs](https://pyranges1.readthedocs.io/en/latest/index.html)

**Notes:**  
 - !!! Нужно иметь ввиду, что позиции ФИЧ в GFF, например 3'-UTR, могут быть разделены на несколько интервалов, так как разделены, например, интронами.

**Sources**
 - [miRBase](https://www.mirbase.org): hsa.gff - human
 - [multiMiR](http://multimir.org)
 - [TarBase](https://dianalab.e-ce.uth.gr/tarbasev9/downloads)
 - [miRTarBase](https://awi.cuhk.edu.cn/~miRTarBase/miRTarBase_2025/php/index.php)
 - [MiRanda](http://mirtoolsgallery.tech/mirtoolsgallery/node/1055): ineractions prediction

**Tasks:**  
1. Привести все к одной нумерации позиций: GFF - 1-based, BED - 0-based
2. 
3. Проверить правильность offtarget_id. Сейчас это по факту позиции SV.

**Commands:**  
```{bash}
mamba env export -n utr3.venv > environment.yml
```

## Settings

In [50]:
main_path = Path.cwd()

In [52]:
# Create tree
refs_dir = main_path / 'data/refs'
gnomad_dir = main_path / 'data/gnomad'
rcmg_btk_sv_dir = main_path / 'data/rcmg_btk_sv'
clinvar_dir = main_path / 'data/clinvar'
mirna_dir = main_path / 'data/mirna'
mirbase_dir = mirna_dir / 'mirbase'
tarbase_dir = mirna_dir / 'tarbase'
omim_dir = main_path / 'data/omim'
domino_dir = main_path / 'data/domino'
other_dir = main_path / 'data/other'
output_dir = main_path / 'data/output'

Path(refs_dir).mkdir(parents=True, exist_ok=True)
Path(gnomad_dir).mkdir(parents=True, exist_ok=True)
Path(rcmg_btk_sv_dir).mkdir(parents=True, exist_ok=True)
Path(clinvar_dir).mkdir(parents=True, exist_ok=True)
Path(mirbase_dir).mkdir(parents=True, exist_ok=True)
Path(tarbase_dir).mkdir(parents=True, exist_ok=True)
Path(omim_dir).mkdir(parents=True, exist_ok=True)
Path(domino_dir).mkdir(parents=True, exist_ok=True)
Path(other_dir).mkdir(parents=True, exist_ok=True)
Path(output_dir).mkdir(parents=True, exist_ok=True)

## Functions

In [140]:
def fetch_file(link, output_dir):
    command = f'wget --no-clobber -P {output_dir} {link}'
    subprocess.run(command, shell=True)
    filename = PurePosixPath(link).name
    return output_dir / filename

def index_gff(input_filepath):
    command = f'igvtools index {input_filepath}'
    subprocess.run(command, shell=True)

def gunzip_file(input_filepath):
    output_filepath = input_filepath.with_suffix('')
    command = f'gunzip -c {input_filepath} > {output_filepath}'
    subprocess.run(command, shell=True)
    return output_filepath

def sort_gff(input_filepath):
    output_filepath = input_filepath.with_stem(input_filepath.stem + ".sorted")
    command = f'igvtools sort {input_filepath} {output_filepath}'
    subprocess.run(command, shell=True)
    return output_filepath

def index_gff(input_filepath):
    command = f'igvtools index {input_filepath}'
    subprocess.run(command, shell=True)

def define_join_type(row):
    feature_start = row['Start']
    feature_end = row['End']
    sv_start = row['Start_sv']
    sv_end = row['End_sv']

    join_type = np.nan
    if feature_start <= sv_start and feature_end >= sv_end: # SV полностью в feature
        join_type = 'sv_in_feature'
    elif sv_start <= feature_start and sv_end >= feature_end: # feature полностью в sv
        join_type = 'feature_in_sv'
    elif sv_start < feature_start and sv_end <= sv_end:
        join_type = 'sv_left_free'
    elif sv_start > feature_start and sv_end >= sv_end:
        join_type = 'sv_right_free'
    elif sv_start == feature_start and sv_end == sv_end:
        join_type = 'full_join'

    return join_type

def explode_json_column(column):
    column = json.loads(column) if pd.notna(column) else np.nan

    aggregated_data = {}
    if isinstance(column, list):
        for dict_ in column:
            for key, value in dict_.items():
                if key not in aggregated_data:
                    aggregated_data[key] = [value]
                else:
                    aggregated_data[key].append(value)
        aggregated_data['Inheritance_unique'] = list(set([x for x in aggregated_data['Inheritance'] if x != '']))
        for key, value in aggregated_data.items():
            new_value = [x if x != '' else 'None' for x in value]
            aggregated_data[key] = ' / '.join(new_value)
    return pd.Series(aggregated_data)



## <span style="color:#00ff00;">Fetch data</span>

In [6]:
gff_filepath = fetch_file('https://ftp.ensembl.org/pub/release-115/gff3/homo_sapiens/Homo_sapiens.GRCh38.115.gff3.gz', refs_dir)
fasta_filepath = fetch_file('https://ftp.ensembl.org/pub/release-115/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz', refs_dir)
gnomad_sv_filepath = fetch_file('https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/genome_sv/gnomad.v4.1.sv.sites.bed.gz', gnomad_dir)
clinvar_filepath = fetch_file('https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz', clinvar_dir)
mirbase_filepath = fetch_file('https://www.mirbase.org/download/hsa.gff3', mirbase_dir)
tarbase_filepath = fetch_file('https://dianalab.e-ce.uth.gr/tarbasev9/data/Homo_sapiens_TarBase-v9.tsv.gz', tarbase_dir)
domino_filepath = fetch_file('https://domino.iob.ch/score_all_final_19.02.19.txt', domino_dir)

so_terms_filepath = fetch_file('https://raw.githubusercontent.com/The-Sequence-Ontology/SO-Ontologies/refs/heads/master/Ontology_Files/so.json', other_dir)

File '/Users/andrejnekrasov/pro/my/utr3/data/refs/Homo_sapiens.GRCh38.115.gff3.gz' already there; not retrieving.

File '/Users/andrejnekrasov/pro/my/utr3/data/refs/Homo_sapiens.GRCh38.dna.toplevel.fa.gz' already there; not retrieving.

File '/Users/andrejnekrasov/pro/my/utr3/data/gnomad/gnomad.v4.1.sv.sites.bed.gz' already there; not retrieving.

File '/Users/andrejnekrasov/pro/my/utr3/data/clinvar/variant_summary.txt.gz' already there; not retrieving.

File '/Users/andrejnekrasov/pro/my/utr3/data/mirna/mirbase/hsa.gff3' already there; not retrieving.

File '/Users/andrejnekrasov/pro/my/utr3/data/mirna/tarbase/Homo_sapiens_TarBase-v9.tsv.gz' already there; not retrieving.

File '/Users/andrejnekrasov/pro/my/utr3/data/domino/score_all_final_19.02.19.txt' already there; not retrieving.

File '/Users/andrejnekrasov/pro/my/utr3/data/other/so.json' already there; not retrieving.



In [7]:
# RCMG BTK SV
chromosomes = list(range(1, 23)) + ['X', 'Y', 'M']
rcmg_btk_sv_filepaths = []

try:
    client = clickhouse_connect.get_client(host='192.168.86.236', user='rdr', password='Dru_Todd3')
    db_name = "ngs"
    client.database = db_name
except:
    for chrom in chromosomes:
        output_filename =  f'rcmg_btk_sv_chr{chrom}.csv.gz'
        output_filepath = rcmg_btk_sv_dir / output_filename
        rcmg_btk_sv_filepaths.append(output_filepath)

        if not output_filepath.exists():
            rcmg_bkt_sv_df = client.query_df(f"SELECT * from ngs.GENOME_hg38_2_sv_chr{chrom} WHERE SVTYPE='DEL';")

            rcmg_bkt_sv_df['Chromosome'] = str(chrom)
            rcmg_bkt_sv_df.to_csv(output_filepath, index=False, compression='gzip')
    try:
        client.close()
    except:
        pass



## <span style="color:#00ff00;">Main</span>

In [8]:
gff_gunzipped_filepath = gunzip_file(gff_filepath)
gff_sorted_filepath = sort_gff(gff_gunzipped_filepath)
index_gff(gff_sorted_filepath)

Sorting /Users/andrejnekrasov/pro/my/utr3/data/refs/Homo_sapiens.GRCh38.115.gff3  -> /Users/andrejnekrasov/pro/my/utr3/data/refs/Homo_sapiens.GRCh38.115.sorted.gff3





Done
Done


### <span style="color:#00ff00;">Prepare data</span>

#### <span style="color:green;">Prepare GFF3 data</span>

In [211]:
# GFF
gff_pr = pr.read_gff3(str(gff_filepath))

# ==============================================================================================================================================

# Add additional columns
gff_pr['feature_len'] = (gff_pr['End'] - gff_pr['Start']).astype('int')
gff_pr['feature_id'] = 'feature_' + gff_pr['Chromosome'].astype('str') + ':' + gff_pr['Start'].astype('str') + '-' + gff_pr['End'].astype('str')

# Self annotation of GFF with mRNA (parent) data
target_columns = ['ID', 'Name', 'biotype', 'Parent', 'tag', 'transcript_support_level', 'feature_len', 'feature_id'] # Replace the data in these empty columns with mRNA data
annotation_gff_pr = gff_pr[target_columns].dropna(subset='ID')

annotation_suffix = '_annotation'
annotated_gff_pr = gff_pr.merge(annotation_gff_pr, how='left', left_on='Parent', right_on='ID', suffixes=['', annotation_suffix])

annotated_columns = [f'{column}{annotation_suffix}' for column in target_columns]
fill_dict = dict(zip(target_columns, annotated_columns))
for target, annotation in fill_dict.items():
    annotated_gff_pr[target] = annotated_gff_pr[target].fillna(annotated_gff_pr[annotation])
annotated_gff_pr = annotated_gff_pr.drop(annotated_columns, axis=1)
annotated_gff_pr['gene_name'] = annotated_gff_pr['Name'].str.split('-').str[0]

# ==============================================================================================================================================

# Create specific PRs (mRNAm, 3'-UTR, offtargets)
mrna_pr = annotated_gff_pr.query('Feature == "mRNA"')
three_utrs_pr = annotated_gff_pr.query('Feature == "three_prime_UTR"')

offtarget_features = ['five_prime_UTR',
                               'CDS',
                               #'lnc_RNA',
                               'ncRNA_gene',
                               #'snRNA',
                               #'snoRNA',
                               #'scRNA',
                               'rRNA',
                               'tRNA',
                               'processed_transcript',
                               ]
offtarget_features_pr = gff_pr.query('Feature in @offtarget_features')

# ==============================================================================================================================================

# 3'-UTR biotype stats
# Считаем именно уникальные вхождения, так как один 3'-UTR может быть разбить на несколько интервалов в GFF (разделен интронами).
# В особенности это выражено у nonsense_mediated_decay транскриптов (нам они не пригодятся, но тем не менее).
three_utrs_pr.drop_duplicates(subset='ID')['biotype'].value_counts().to_frame().reset_index()

# ==============================================================================================================================================

# Keep only MANE_select / MANE_Select|Ensembl_canonical
three_utrs_pr_filtered = three_utrs_pr[three_utrs_pr['tag'].str.contains('MANE_Select', na=False, regex=True)]

# Stats
# MANE_select genes count = 19437 (release_1.5) from summary file: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.5/
three_utrs_pr_filtered['biotype'].value_counts().to_frame().reset_index()

Unnamed: 0,biotype,count
0,protein_coding,19757


#### <span style="color:green;">gnomAD SV</span>

In [10]:
# gnomAD SV
# Keep only the necessary columns (63 of 600...) in SV
with gzip.open(str(gnomad_sv_filepath), 'rt') as sv_file:
    sv_header = sv_file.readline().strip().split('\t')

target_columns = ['#chrom', 'start', 'end', 'name', 'svtype', 'samples', 'MULTIALLELIC', 'ALGORITHMS', 'BOTHSIDES_SUPPORT', 'CHR2', 'CPX_INTERVALS', 'CPX_TYPE', 'END', 'END2', 'EVIDENCE', 'LOW_CONFIDENCE_REPETITIVE_LARGE_DUP', 'MEMBERS', 'NCR', 'OUTLIER_SAMPLE_ENRICHED_LENIENT', 'PAR', 'PCRMINUS_NCR', 'PCRPLUS_NCR', 'PESR_GT_OVERDISPERSION', 'POS2', 'PREDICTED_BREAKEND_EXONIC', 'PREDICTED_COPY_GAIN', 'PREDICTED_DUP_PARTIAL', 'PREDICTED_INTERGENIC', 'PREDICTED_INTRAGENIC_EXON_DUP', 'PREDICTED_INTRONIC', 'PREDICTED_INV_SPAN', 'PREDICTED_LOF', 'PREDICTED_MSV_EXON_OVERLAP', 'PREDICTED_NEAREST_TSS', 'PREDICTED_NONCODING_BREAKPOINT', 'PREDICTED_NONCODING_SPAN', 'PREDICTED_PARTIAL_DISPERSED_DUP', 'PREDICTED_PARTIAL_EXON_DUP', 'PREDICTED_PROMOTER', 'PREDICTED_TSS_DUP', 'PREDICTED_UTR', 'RESOLVED_POSTHOC', 'SOURCE', 'SVLEN', 'SVTYPE', 'UNRESOLVED_TYPE', 'AN', 'AC', 'AF', 'N_BI_GENOS', 'N_HOMREF', 'N_HET', 'N_HOMALT', 'FREQ_HOMREF', 'FREQ_HET', 'FREQ_HOMALT', 'CN_NUMBER', 'CN_COUNT', 'CN_STATUS', 'CN_FREQ', 'CN_NONREF_COUNT', 'CN_NONREF_FREQ', 'FILTER']
target_columns_indexes = [sv_header.index(i) for i in target_columns]
target_columns[0:3] = ['Chromosome', 'Start', 'End'] # Rename columns

sv_df = pd.read_csv(str(gnomad_sv_filepath), sep='\t', usecols=target_columns_indexes, names=target_columns, comment='#')
sv_df['Chromosome'] = sv_df['Chromosome'].str.replace('chr', '')

sv_pr = pr.PyRanges(sv_df)

# ==============================================================================================================================================

# Keep target SVs (DEL)
sv_targets = ['DEL']
sv_filter = ['PASS']
target_sv_pr = sv_pr.query('svtype in @sv_targets and FILTER in @sv_filter')

# Add columns
target_sv_pr['sv_len'] = target_sv_pr.End - target_sv_pr.Start
target_sv_pr['sv_id'] = 'sv_' + target_sv_pr['Chromosome'].astype('str') + ':' + target_sv_pr['Start'].astype('str') + '-' + target_sv_pr['End'].astype('str')

# ==============================================================================================================================================

# SV percentiles by AF
target_sv_pr['AF'].describe(percentiles=[.1, .25, .5, .75, .9, .95, .99, .999]).to_frame().reset_index()

# SV percentiles by SV length
target_sv_pr['sv_len'].describe(percentiles=[.1, .25, .5, .75, .9, .95, .99, .999]).to_frame().reset_index()

  sv_df = pd.read_csv(str(gnomad_sv_filepath), sep='\t', usecols=target_columns_indexes, names=target_columns, comment='#')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target_sv_pr['sv_len'] = target_sv_pr.End - target_sv_pr.Start
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target_sv_pr['sv_id'] = 'sv_' + target_sv_pr['Chromosome'].astype('str') + ':' + target_sv_pr['Start'].astype('str') + '-' + target_sv_pr['End'].astype('str')


Unnamed: 0,index,sv_len
0,count,627321.0
1,mean,8529.8884988706
2,std,139720.7127188754
3,min,51.0
4,10%,76.0
5,25%,137.0
6,50%,702.0
7,75%,4073.0
8,90%,10309.0
9,95%,23359.0


#### <span style="color:green;">ClinVar</span>

In [11]:
# ClinVar
clinvar_df = pd.read_csv(clinvar_filepath, sep='\t', compression='gzip')

# ==============================================================================================================================================

clinvar_df = clinvar_df.replace(['na', '-'], np.nan)
clinvar_df = clinvar_df.rename(columns={'Stop': 'End'})
clinvar_df_filtered = clinvar_df.query('Assembly == "GRCh38"')

  clinvar_df = pd.read_csv(clinvar_filepath, sep='\t', compression='gzip')


#### <span style="color:green;">RCMG BTK SV</span>

In [12]:
# RCMG BTK SV
rcmg_btk_sv_filepath = rcmg_btk_sv_dir / 'rcmg_btk_sv.csv.gz'

if not rcmg_btk_sv_filepath.exists():
    rcmg_bkt_sv_df = pd.DataFrame() # concated filtered aggregated
    for rcmg_btk_sv_file in rcmg_btk_sv_filepaths:
        print(rcmg_btk_sv_file)
        current_rcmg_bkt_sv_df = pd.read_csv(str(rcmg_btk_sv_file), compression='gzip')

        # Add columns
        current_rcmg_bkt_sv_df['AF'] = current_rcmg_bkt_sv_df['AD'] / current_rcmg_bkt_sv_df['DP']
        current_rcmg_bkt_sv_df['sv_len'] = abs(current_rcmg_bkt_sv_df['END'] - current_rcmg_bkt_sv_df['POS'])
        current_rcmg_bkt_sv_df['sv_id'] = 'sv_' + current_rcmg_bkt_sv_df['Chromosome'].astype('str') + ':' + current_rcmg_bkt_sv_df['POS'].astype('str') + '-' + current_rcmg_bkt_sv_df['END'].astype('str')

        # Drop columns
        current_rcmg_bkt_sv_df = current_rcmg_bkt_sv_df.drop(columns=['REF', 'ALT'])

        # Filter
        current_rcmg_bkt_sv_df_filtered = current_rcmg_bkt_sv_df.query('caller == "manta"')

        # Aggregate
        current_rcmg_bkt_sv_df_filtered_aggregated = current_rcmg_bkt_sv_df_filtered.groupby(['Chromosome', 'POS', 'END', 'SVTYPE', 'caller', 'sv_len', 'sv_id'], as_index=False).agg({
            'SID': [list, 'count'],
            'DP': [list, 'min', 'median', 'mean', 'max'],
            'AD': [list, 'min', 'median', 'mean', 'max'],
            'AF': [list, 'min', 'median', 'mean', 'max']
        })

        # Rename columns
        current_rcmg_bkt_sv_df_filtered_aggregated.columns = ['_'.join(col).strip('_') for col in current_rcmg_bkt_sv_df_filtered_aggregated.columns]
        current_rcmg_bkt_sv_df_filtered_aggregated = current_rcmg_bkt_sv_df_filtered_aggregated.rename(columns={'POS': 'Start', 'END': 'End'})
        current_rcmg_bkt_sv_df_filtered_aggregated = current_rcmg_bkt_sv_df_filtered_aggregated.reset_index(drop=True)

        # Concat
        rcmg_bkt_sv_df['Chromosome'] = rcmg_bkt_sv_df['Chromosome'].astype('str')
        rcmg_bkt_sv_df = pd.concat([rcmg_bkt_sv_df, current_rcmg_bkt_sv_df_filtered_aggregated], ignore_index=False)
    rcmg_bkt_sv_df.to_csv(str(rcmg_btk_sv_filepath), index=False, compression='gzip')
else:
    rcmg_bkt_sv_df = pd.read_csv(str(rcmg_btk_sv_filepath), compression='gzip')
    rcmg_bkt_sv_df['Chromosome'] = rcmg_bkt_sv_df['Chromosome'].astype('str')

rcmg_btk_sv_pr = pr.PyRanges(rcmg_bkt_sv_df)
rcmg_btk_sv_pr_filtered = rcmg_btk_sv_pr.query('SID_count < 30 and sv_len < 1000000')
rcmg_btk_sv_pr_filtered = rcmg_btk_sv_pr_filtered.reset_index(drop=True)

  rcmg_bkt_sv_df = pd.read_csv(str(rcmg_btk_sv_filepath), compression='gzip')


In [13]:
rcmg_btk_sv_pr['sv_len'].describe(percentiles=[.1, .25, .5, .75, .9, .95, .99, .999]).T

count     3810648.0000000000
mean       910374.0638663031
std       7319054.6422773376
min            49.0000000000
10%           631.0000000000
25%          2244.0000000000
50%         23863.5000000000
75%        256919.2500000000
90%       1169973.3000000003
95%       2045701.0000000000
99%      20410049.0000000000
99.9%    66126493.7060000002
max     242323073.0000000000
Name: sv_len, dtype: float64

#### <span style="color:green;">miRBase</span>

In [14]:
# miRBase
mirbase_pr = pr.read_gff3(str(mirbase_filepath))
mirbase_pr['Chromosome'] = mirbase_pr['Chromosome'].str.replace('chr', '')

#### <span style="color:green;">TarBase</span>

In [15]:
# TarBase
tarbase_df = pd.read_csv(str(tarbase_filepath), sep='\t', compression='gzip')
tarbase_df = tarbase_df.rename(columns={'chromosome': 'Chromosome', 'start': 'Start', 'end': 'End', 'strand': 'Strand'})
tarbase_df['Chromosome'] = tarbase_df['Chromosome'].str.replace('chr', '')

tarbase_pr = pr.PyRanges(tarbase_df)
tarbase_pr = tarbase_pr.dropna(subset=['Chromosome', 'Start', 'End'], how='any')
tarbase_pr[['Start', 'End']] = tarbase_pr[['Start', 'End']].astype('int64')

  tarbase_df = pd.read_csv(str(tarbase_filepath), sep='\t', compression='gzip')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tarbase_pr[['Start', 'End']] = tarbase_pr[['Start', 'End']].astype('int64')


#### <span style="color:green;">OMIM</span>

In [315]:
omim_df = pd.read_csv(str(omim_dir / 'omim.csv'))

target_mim_types = ['gene', 'gene/phenotype']

omim_df['combinedMap'] = omim_df['phenotypeMap']
omim_df['combinedMap'] = omim_df['combinedMap'].fillna(omim_df['geneMap'])

omim_df_filtered = omim_df.query('combinedMap == combinedMap and hgnc_gene_symbol == hgnc_gene_symbol and mim_type in @target_mim_types')

combined_map_exploded = omim_df_filtered['combinedMap'].apply(explode_json_column)
omim_df_filtered_exploded = pd.concat([omim_df_filtered, combined_map_exploded], axis=1)

omim_df_short = omim_df_filtered_exploded[['mim_number', 'title', 'ensembl_gene_id', 'hgnc_gene_symbol', 'Phenotype', 'Inheritance', 'Inheritance_unique']]

#### <span style="color:green;">Sequence Ontology terms</span>

In [16]:
# SO terms
with open(so_terms_filepath, 'r') as so_terms_file:
    so_terms = json.load(so_terms_file)

so_terms_filtered = {
    'lbl': [],
    'definition': []
}
for i in so_terms['graphs'][0]['nodes']:
    lbl = i.get('lbl', np.nan)
    definition = i.get('meta', {}).get('definition', {}).get('val', np.nan)
    so_terms_filtered['lbl'].append(lbl)
    so_terms_filtered['definition'].append(definition)

so_terms_df = pd.DataFrame.from_dict(so_terms_filtered)
so_terms_df['lbl'] = so_terms_df['lbl'].str.capitalize().replace('_', '')

### <span style="color:#00ff00;">EDA (Exploratory Data Analysis)</span>

In [17]:
# GFF features stats
gff_features = gff_pr.Feature.value_counts().to_frame().reset_index()
gff_features['tmp_id'] = gff_features['Feature'].str.capitalize().replace('_', '')
gff_features = gff_features.merge(so_terms_df, how='left', left_on='tmp_id', right_on='lbl')
gff_features = gff_features.drop(['lbl', 'tmp_id'], axis=1)
#pd.DataFrame(gff_features)

In [18]:
# GFF biotypes stats
gff_biotypes = gff_pr.biotype.value_counts().to_frame().reset_index()
gff_biotypes['tmp_id'] = gff_biotypes['biotype'].str.capitalize().replace('_', '')
gff_biotypes = gff_biotypes.merge(so_terms_df, how='left', left_on='tmp_id', right_on='lbl')
gff_biotypes = gff_biotypes.drop(['lbl', 'tmp_id'], axis=1)
#pd.DataFrame(gff_biotypes)

In [19]:
#sv_pr.svtype.value_counts().to_frame().reset_index()

In [20]:
tarbase_pr['gene_location'].value_counts()

gene_location
CDS     2502793
3UTR    2056551
5UTR        175
Name: count, dtype: int64

### <span style="color:#00ff00;">Join GFF3 x SV</span>

In [213]:
sv_for_join_pr = rcmg_btk_sv_pr_filtered # target_sv_pr / rcmg_btk_sv_pr_filtered

In [214]:
# Join GFF x SV
features_sv_joined_pr = three_utrs_pr_filtered.join_overlaps(sv_for_join_pr, suffix='_sv', report_overlap_column='join_len', strand_behavior='ignore')

In [215]:
# Add additional columns (join_start, join_end, join_id)
features_sv_joined_pr['join_start'] = np.maximum(features_sv_joined_pr['Start'], features_sv_joined_pr['Start_sv'])
features_sv_joined_pr['join_end'] = np.minimum(features_sv_joined_pr['End'], features_sv_joined_pr['End_sv'])
features_sv_joined_pr['join_id'] = 'join_' + features_sv_joined_pr['Chromosome'].astype('str') + ':' + features_sv_joined_pr['join_start'].astype('str') + '-' + features_sv_joined_pr['join_end'].astype('str')

In [216]:
# Define joint type
features_sv_joined_pr['join_type'] = features_sv_joined_pr.apply(define_join_type, axis=1)
features_sv_joined_pr['join_type'].value_counts().to_frame().reset_index()

Unnamed: 0,join_type,count
0,feature_in_sv,536774
1,sv_right_free,14759
2,sv_left_free,3118
3,sv_in_feature,1722


In [217]:
# Count joints by SV
joints_count_by_sv = features_sv_joined_pr['sv_id'].value_counts().to_frame().reset_index()
joints_count_by_sv = joints_count_by_sv.rename(columns={'count': 'joins_count'})

# Add joints count data
features_sv_joined_pr = features_sv_joined_pr.merge(joints_count_by_sv, how='left')
#len(features_sv_joined_pr)

# Filter joins by join type
targey_joint_types = ['sv_in_feature', 'sv_right_free', 'sv_left_free']
features_sv_joined_pr_filtered = features_sv_joined_pr.query('join_type in @targey_joint_types and ((Strand == "+" and join_type == "sv_right_free") or (Strand == "-" and join_type == "sv_left_free") or join_type == "sv_in_feature") and joins_count == 1')

#### <span style="color:green;">Join type stats</span>

In [218]:
# Stats by join type
features_sv_joined_pr_filtered.groupby('join_type')['sv_len'].describe(percentiles=[.1, .25, .5, .75, .9, .95, .99, .999]).T

join_type,sv_in_feature,sv_left_free,sv_right_free
count,1704.0,362.0,768.0
mean,372.1161971831,5941.4171270718,5219.3528645833
std,821.6904352966,13646.6310140371,7519.1198588396
min,50.0,77.0,60.0
10%,55.0,1072.5,1802.9
25%,65.0,1721.25,4493.0
50%,106.0,4332.0,4567.0
75%,282.5,5025.5,4620.0
90%,821.5,9749.0,6080.8
95%,1724.05,14848.5,9262.85


In [219]:
features_sv_joined_pr_filtered['biotype'].value_counts().to_frame().reset_index()

Unnamed: 0,biotype,count
0,protein_coding,2834


### <span style="color:#00ff00;">Offtarget features exclusion</span>

#### <span style="color:green;">features_sv_joined_pr_filtered copy</span>

In [220]:
# Offtarget stats
features_sv_joined_pr_filtered_cp = features_sv_joined_pr_filtered.copy()
features_sv_joined_pr_filtered_cp[['Start_tmp', 'End_tmp']] = features_sv_joined_pr_filtered_cp[['Start', 'End']] # Save old positions
features_sv_joined_pr_filtered_cp[['Start', 'End']] = features_sv_joined_pr_filtered_cp[['Start_sv', 'End_sv']] # SV positions to main positions

#### <span style="color:green;">Filter offtargets</span>

In [221]:
# Exclude offtargets (SV x offtarget overlaps)
features_sv_no_offtereget_joined_pr = features_sv_joined_pr_filtered_cp.overlap(offtarget_features_pr, invert=True, strand_behavior='ignore')

features_sv_no_offtereget_joined_pr[['Start', 'End']] = features_sv_no_offtereget_joined_pr[['Start_tmp', 'End_tmp']]
features_sv_no_offtereget_joined_pr = features_sv_no_offtereget_joined_pr.drop(['Start_tmp', 'End_tmp'], axis=1)

len(features_sv_no_offtereget_joined_pr)

2331

In [222]:
# Joint type stats after offtargets exclution
print('Before offtargets filtering')
print(features_sv_joined_pr_filtered['join_type'].value_counts().to_frame().reset_index())
print('\nAfter offtargets filtering')
print(features_sv_no_offtereget_joined_pr['join_type'].value_counts().to_frame().reset_index())

Before offtargets filtering
       join_type  count
0  sv_in_feature   1704
1  sv_right_free    768
2   sv_left_free    362

After offtargets filtering
       join_type  count
0  sv_in_feature   1380
1  sv_right_free    662
2   sv_left_free    289


#### <span style="color:green;">Offtarget stats</span>

In [223]:
# features_sv_offtereget_joined_pr - только для отслеживания пересечений с offtarget. Фильтрованный от offtarget pr - features_sv_no_offtereget_joined_pr (через overlap, потому что join находя пересечения с offtarget (через left) также оставляет оригинальную строку).DS_Storefeatures_sv_offtereget_joined_pr = features_sv_joined_pr_filtered_cp.join_overlaps(offtarget_features_pr, join_type='left', suffix='_offtarget', report_overlap_column='offtarget_join_len')
features_sv_offtereget_joined_pr = features_sv_joined_pr_filtered_cp.join_overlaps(offtarget_features_pr, join_type='left', suffix='_offtarget', report_overlap_column='offtarget_join_len', strand_behavior='ignore')

features_sv_offtereget_joined_pr[['Start', 'End']] = features_sv_offtereget_joined_pr[['Start_tmp', 'End_tmp']]
features_sv_offtereget_joined_pr = features_sv_offtereget_joined_pr.drop(['Start_tmp', 'End_tmp'], axis=1)

features_sv_offtereget_joined_pr['offtarget_id'] = 'offtarget_' + features_sv_offtereget_joined_pr['Chromosome'].astype('str') + ':' + features_sv_offtereget_joined_pr['Start_offtarget'].astype('str') + '-' + features_sv_offtereget_joined_pr['End_offtarget'].astype('str')

In [224]:
# Features offtargets stats
# Суммарные значения данной статистики будут выше чем разница всех джоинов - оффтартет джоинов, так как один и тот же SV (напр. DEL) могут пересекать разные изоформы одного и того же транскрипта.
features_sv_offtereget_joined_pr.groupby(['join_type', 'Feature_offtarget'])['Feature_offtarget'].agg(['count'])
#features_sv_offtereget_joined_pr.drop_duplicates(subset='sv_id').groupby(['join_type', 'Feature_offtarget'])['Feature_offtarget'].agg(['count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,count
join_type,Feature_offtarget,Unnamed: 2_level_1
sv_in_feature,CDS,129
sv_in_feature,five_prime_UTR,27
sv_in_feature,ncRNA_gene,308
sv_left_free,CDS,688
sv_left_free,five_prime_UTR,213
sv_left_free,ncRNA_gene,71
sv_right_free,CDS,607
sv_right_free,five_prime_UTR,161
sv_right_free,ncRNA_gene,113


### <span style="color:#00ff00;">miRNA join</span>

#### <span style="color:green;">features_sv_no_offtereget_joined_pr copy</span>

In [287]:
# Offtarget stats
features_sv_no_offtereget_joined_pr_cp = features_sv_no_offtereget_joined_pr.copy()
features_sv_no_offtereget_joined_pr_cp[['Start_tmp', 'End_tmp']] = features_sv_no_offtereget_joined_pr_cp[['Start', 'End']] # Save old positions
features_sv_no_offtereget_joined_pr_cp[['Start', 'End']] = features_sv_no_offtereget_joined_pr_cp[['join_start', 'join_end']] # SV positions to main positions

#### <span style="color:green;">Join</span>

In [288]:
# Join TarBase
features_sv_no_offtereget_tarbase_joined_pr = features_sv_no_offtereget_joined_pr_cp.join_overlaps(tarbase_pr, suffix='_tarbase', strand_behavior='ignore', report_overlap_column='tarbase_join_len')

features_sv_no_offtereget_tarbase_joined_pr[['Start', 'End']] = features_sv_no_offtereget_tarbase_joined_pr[['Start_tmp', 'End_tmp']]
features_sv_no_offtereget_tarbase_joined_pr = features_sv_no_offtereget_tarbase_joined_pr.drop(['Start_tmp', 'End_tmp'], axis=1)

features_sv_no_offtereget_tarbase_joined_pr['tarbase_id'] = 'tarbase_' + features_sv_no_offtereget_tarbase_joined_pr['Chromosome'].astype('str') + ':' + features_sv_no_offtereget_tarbase_joined_pr['Start_tarbase'].astype('str') + '-' + features_sv_no_offtereget_tarbase_joined_pr['End_tarbase'].astype('str') + ':' + features_sv_no_offtereget_tarbase_joined_pr['mirna_name']

### <span style="color:#00ff00;">OMIM merge</span>

In [289]:
features_sv_no_offtereget_tarbase_joined_pr = features_sv_no_offtereget_tarbase_joined_pr.merge(omim_df_short, how='left', left_on='gene_name', right_on='hgnc_gene_symbol', suffixes=['', '_omim'])

In [290]:
features_sv_no_offtereget_tarbase_joined_pr_ad = features_sv_no_offtereget_tarbase_joined_pr[features_sv_no_offtereget_tarbase_joined_pr['Inheritance_unique'].str.contains('AD', na=False)]
features_sv_no_offtereget_tarbase_joined_pr_ad['join_len'] = features_sv_no_offtereget_tarbase_joined_pr_ad['join_len'].astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features_sv_no_offtereget_tarbase_joined_pr_ad['join_len'] = features_sv_no_offtereget_tarbase_joined_pr_ad['join_len'].astype('int')


In [291]:
features_sv_no_offtereget_tarbase_joined_pr_ad_filtered = features_sv_no_offtereget_tarbase_joined_pr_ad.sort_values(by=['join_len', 'confidence'], ascending=False).drop_duplicates(subset='sv_id')

In [301]:
target_columns = ['feature_id', 'Feature', 'Name', 'biotype', 'tag', 'feature_len', 'sv_id', 'sv_len', 'join_id', 'join_type', 'join_len', 'SID_list', 'SID_count', 'DP_median', 'AD_median', 'AF_median', 'mirna_name', 'interaction_group', 'mim_number', 'title', 'Phenotype', 'Inheritance', 'Inheritance_unique', 'Name']

In [None]:
features_sv_no_offtereget_tarbase_joined_pr_ad_filtered_short = features_sv_no_offtereget_tarbase_joined_pr_ad_filtered[target_columns]
features_sv_no_offtereget_tarbase_joined_pr_ad_filtered_short.to_csv(output_dir / 'output.csv', index=False)

In [304]:
features_sv_no_offtereget_tarbase_joined_pr_ad_filtered_short

Unnamed: 0,feature_id,Feature,Name,biotype,tag,feature_len,sv_id,sv_len,join_id,join_type,join_len,SID_list,SID_count,DP_median,AD_median,AF_median,mirna_name,interaction_group,mim_number,title,Phenotype,Inheritance,Inheritance_unique,Name.1
7353,feature_14:77505996-77512283,three_prime_UTR,SPTLC2-201,protein_coding,"gencode_basic,gencode_primary,Ensembl_canonical,MANE_Select",6287,sv_14:77501851-77510595,8744,join_14:77505996-77510595,sv_left_free,4599,['000007167740'],1,76.0,23.0,0.3026315789,hsa-miR-31-5p,primary,605713.0,"SERINE PALMITOYLTRANSFERASE, LONG-CHAIN BASE SUBUNIT 2; SPTLC2","Neuropathy, hereditary sensory and autonomic, type IC",AD,AD,SPTLC2-201
14616,feature_21:37512531-37526358,three_prime_UTR,DYRK1A-221,protein_coding,"gencode_basic,gencode_primary,Ensembl_canonical,MANE_Select",13827,sv_21:37521863-37542473,20610,join_21:37521863-37526358,sv_right_free,4495,['000007159580'],1,149.0,72.0,0.4832214765,hsa-miR-34a-5p,primary,600855.0,DUAL-SPECIFICITY TYROSINE PHOSPHORYLATION-REGULATED KINASE 1A; DYRK1A,"Mental retardation, autosomal dominant 7",AD,AD,DYRK1A-221
4642,feature_11:128916731-128921163,three_prime_UTR,KCNJ5-202,protein_coding,"gencode_basic,gencode_primary,Ensembl_canonical,MANE_Select",4432,sv_11:128917892-128922376,4484,join_11:128917892-128921163,sv_right_free,3271,['000027020850'],1,54.0,24.0,0.4444444444,hsa-miR-25-3p,primary,600734.0,"POTASSIUM CHANNEL, INWARDLY RECTIFYING, SUBFAMILY J, MEMBER 5; KCNJ5","Hyperaldosteronism, familial, type III / Long QT syndrome 13",AD / AD,AD,KCNJ5-202
20162,feature_6:26094226-26098343,three_prime_UTR,HFE-207,protein_coding,"gencode_basic,gencode_primary,Ensembl_canonical,MANE_Select",4117,sv_6:26095362-26100273,4911,join_6:26095362-26098343,sv_right_free,2981,['000007079870'],1,190.0,46.0,0.2421052632,hsa-miR-4521,primary,613609.0,HOMEOSTATIC IRON REGULATOR; HFE,"[Transferrin serum level QTL2] / {Alzheimer disease, susceptibility to} / {Microvascular complications of diabetes 7} / {Porphyria cutanea tarda, susceptibility to} / {Porphyria variegata, susceptibility to} / Hemochromatosis","None / AD / None / AD, AR / AD / AR","AD / AD, AR / AR",HFE-207
16747,feature_3:188874479-188890671,three_prime_UTR,LPP-220,protein_coding,"gencode_basic,gencode_primary,Ensembl_canonical,MANE_Select",16192,sv_3:188880144-188882852,2708,join_3:188880144-188882852,sv_in_feature,2708,['000007120620'],1,45.0,13.0,0.2888888889,hsa-miR-15a-5p,secondary,600700.0,LIM DOMAIN-CONTAINING PREFERRED TRANSLOCATION PARTNER IN LIPOMA; LPP,"Leukemia, acute myeloid / Lipoma","AD, SMu / None","AD, SMu",LPP-220
11031,feature_18:31058839-31068014,three_prime_UTR,DSC2-202,protein_coding,"gencode_basic,gencode_primary,Ensembl_canonical,MANE_Select",9175,sv_18:31058896-31061589,2693,join_18:31058896-31061589,sv_in_feature,2693,['000007178280'],1,142.0,60.0,0.4225352113,hsa-miR-182-5p,primary,125645.0,DESMOCOLLIN 2; DSC2,Arrhythmogenic right ventricular dysplasia 11 / Arrhythmogenic right ventricular dysplasia 11 with mild palmoplantar keratoderma and woolly hair,"AD, AR / AD, AR","AD, AR",DSC2-202
6803,feature_13:48261224-48270357,three_prime_UTR,ITM2B-204,protein_coding,"gencode_basic,gencode_primary,Ensembl_canonical,MANE_Select",9133,sv_13:48268413-48271073,2660,join_13:48268413-48270357,sv_right_free,1944,['000029003700'],1,25.0,6.0,0.24,hsa-miR-18a-5p,secondary,603904.0,INTEGRAL MEMBRANE PROTEIN 2B; ITM2B,"?Retinal dystrophy with inner retinal dysfunction and ganglion cell abnormalities / Dementia, familial British / Dementia, familial Danish",AD / AD / AD,AD,ITM2B-204
11233,feature_19:11131316-11133820,three_prime_UTR,LDLR-208,protein_coding,"gencode_basic,gencode_primary,Ensembl_canonical,MANE_Select",2504,sv_19:11132469-11136375,3906,join_19:11132469-11133820,sv_right_free,1351,"['FND00002475', 'FND00002476']",2,42.5,21.5,0.506097561,hsa-miR-29a-3p,primary,606945.0,LOW DENSITY LIPOPROTEIN RECEPTOR; LDLR,"Hypercholesterolemia, familial, 1 / LDL cholesterol level QTL2","AD, AR / AD, AR","AD, AR",LDLR-208
4619,feature_11:118133376-118137026,three_prime_UTR,SCN4B-201,protein_coding,"gencode_basic,gencode_primary,Ensembl_canonical,MANE_Select",3650,sv_11:118130083-118134547,4464,join_11:118133376-118134547,sv_left_free,1171,['000005511400'],1,73.0,29.0,0.397260274,hsa-miR-124-3p,primary,608256.0,"SODIUM VOLTAGE-GATED CHANNEL, BETA SUBUNIT 4; SCN4B","Atrial fibrillation, familial, 17 / Long QT syndrome 10",AD / AD,AD,SCN4B-201
14602,feature_20:62967500-62969585,three_prime_UTR,SLC17A9-202,protein_coding,"gencode_basic,gencode_primary,Ensembl_canonical,MANE_Select",2085,sv_20:62968511-62972457,3946,join_20:62968511-62969585,sv_right_free,1074,['000027021860'],1,76.0,25.0,0.3289473684,hsa-miR-27b-3p,primary,612107.0,"SOLUTE CARRIER FAMILY 17 (VESICULAR NUCLEOTIDE TRANSPORTER), MEMBER 9; SLC17A9","Porokeratosis 8, disseminated superficial actinic type",AD,AD,SLC17A9-202


### <span style="color:#00ff00;">STATS</span>

In [286]:
#features_sv_no_offtereget_tarbase_joined_pr['AF'].describe(percentiles=[.1, .25, .5, .75, .9, .95, .99, .999]).to_frame().reset_index()

In [228]:
features_sv_no_offtereget_tarbase_joined_pr['gene_location'].value_counts()

gene_location
3UTR    23027
Name: count, dtype: int64

In [229]:
features_sv_no_offtereget_tarbase_joined_pr['interaction_group'].value_counts()

interaction_group
primary      17043
secondary     5984
Name: count, dtype: int64

In [230]:
features_sv_no_offtereget_tarbase_joined_pr['regulation'].value_counts()

regulation
Negative    23027
Name: count, dtype: int64

In [231]:
features_sv_no_offtereget_tarbase_joined_pr['transcript_name'].value_counts().to_frame().reset_index().to_csv(output_dir / 'mirna_genes.csv', index=False)
features_sv_no_offtereget_tarbase_joined_pr['transcript_name'].value_counts().to_frame().reset_index()

Unnamed: 0,transcript_name,count
0,LNPEP-201,974
1,ARPP19-208,795
2,SDE2-201,646
3,LDLR-208,525
4,GNS-201,511
...,...,...
443,BHLHA15-202,1
444,TIMELESS-203,1
445,NUB1-201,1
446,RBM33-207,1


In [232]:
features_sv_no_offtereget_tarbase_joined_pr['experimental_method'].value_counts()

experimental_method
HITS-CLIP                    13914
PAR-CLIP                      8906
Chimeric fragments             103
qCLASH                          51
Luciferase Reporter Assay       41
CLASH                            9
IMPACT-Seq                       2
3LIFE                            1
Name: count, dtype: int64

In [233]:
features_sv_no_offtereget_tarbase_joined_pr['join_type'].value_counts()

join_type
sv_in_feature    10804
sv_left_free      6299
sv_right_free     5924
Name: count, dtype: int64

In [234]:
features_sv_no_offtereget_tarbase_joined_pr['join_id'].value_counts()

join_id
join_5:97031222-97037155      974
join_15:52547044-52550754     795
join_19:11132469-11133820     720
join_12:64713448-64715411     511
join_1:179103702-179105699    445
                             ... 
join_16:89233549-89233623       1
join_2:138669156-138669681      1
join_19:47870466-47870968       1
join_11:36272291-36273283       1
join_19:47870466-47871130       1
Name: count, Length: 781, dtype: int64

In [235]:
features_sv_no_offtereget_tarbase_joined_pr['Chromosome'].value_counts()

Chromosome
1     2984
19    2102
12    2059
5     2007
15    1434
8     1392
3     1352
10    1194
4     1079
2     1076
17     909
6      905
14     760
22     664
16     547
11     469
18     394
7      393
20     368
13     314
9      291
21     177
X      157
Name: count, dtype: int64

In [236]:
features_sv_no_offtereget_tarbase_joined_pr['sv_len'].describe(percentiles=[.1, .25, .5, .75, .9, .95, .99, .999]).T

count   23027.0000000000
mean     3780.4828245104
std      4439.3120471445
min        50.0000000000
10%       226.0000000000
25%       950.0000000000
50%      2567.0000000000
75%      5933.0000000000
90%      8139.0000000000
95%     10396.0000000000
99%     15269.0000000000
99.9%   78230.0000000000
max     78230.0000000000
Name: sv_len, dtype: float64

In [237]:
pd.DataFrame(features_sv_no_offtereget_tarbase_joined_pr)

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,ID,Alias,external_name,logic_name,Name,biotype,description,gene_id,version,Parent,transcript_id,constitutive,ensembl_end_phase,ensembl_phase,exon_id,rank,tag,transcript_support_level,ccdsid,protein_id,feature_len,feature_id,gene_name,Start_sv,End_sv,SVTYPE,caller,sv_len,sv_id,SID_list,SID_count,DP_list,DP_min,DP_median,DP_mean,DP_max,AD_list,AD_min,AD_median,AD_mean,AD_max,AF_list,AF_min,AF_median,AF_mean,AF_max,join_len,join_start,join_end,join_id,join_type,joins_count,species,mirna_name,mirna_id,gene_name_tarbase,gene_id_tarbase,gene_location,transcript_name,transcript_id_tarbase,Start_tarbase,End_tarbase,Strand_tarbase,experimental_method,regulation,tissue,cell_line,article_pubmed_id,confidence,interaction_group,cell_type,microt_score,comment,tarbase_join_len,tarbase_id
0,1,ensembl_havana,three_prime_UTR,1327763,1328896,.,+,.,transcript:ENST00000343938,,,,CPTP-201,protein_coding,,,,transcript:ENST00000343938,,,,,,,"gencode_basic,gencode_primary,Ensembl_canonical,MANE_Select",2 (assigned to previous version 8),,,1133,feature_1:1327763-1328896,CPTP,1328245,1328606,DEL,manta,361,sv_1:1328245-1328606,"['000007143790', '000007094630']",2,"[121, 88]",88,104.5000000000,104.5000000000,121,"[34, 26]",26,30.0000000000,30.0000000000,34,"[0.2809917355371901, 0.29545454545454547]",0.2809917355,0.2882231405,0.2882231405,0.2954545455,361,1328245,1328606,join_1:1328245-1328606,sv_in_feature,1,Homo sapiens,hsa-miR-1843,MIMAT0039764,CPTP,ENSG00000224051,3UTR,CPTP-201,ENST00000343938,1328265,1328277,+,HITS-CLIP,Negative,Brain - Primary Visual Cortex,,30455455.0000000000,1,primary,,0.4500000000,Gestation week 16.5,12,tarbase_1:1328265-1328277:hsa-miR-1843
0,1,ensembl_havana,three_prime_UTR,1327763,1328896,.,+,.,transcript:ENST00000343938,,,,CPTP-201,protein_coding,,,,transcript:ENST00000343938,,,,,,,"gencode_basic,gencode_primary,Ensembl_canonical,MANE_Select",2 (assigned to previous version 8),,,1133,feature_1:1327763-1328896,CPTP,1328245,1328606,DEL,manta,361,sv_1:1328245-1328606,"['000007143790', '000007094630']",2,"[121, 88]",88,104.5000000000,104.5000000000,121,"[34, 26]",26,30.0000000000,30.0000000000,34,"[0.2809917355371901, 0.29545454545454547]",0.2809917355,0.2882231405,0.2882231405,0.2954545455,361,1328245,1328606,join_1:1328245-1328606,sv_in_feature,1,Homo sapiens,hsa-miR-148a-3p,MIMAT0000243,CPTP,ENSG00000224051,3UTR,CPTP-201,ENST00000343938,1328306,1328318,+,PAR-CLIP,Negative,Intestine,HCT116,26701625.0000000000,1,primary,Epithelial cells,0.3900000000,,12,tarbase_1:1328306-1328318:hsa-miR-148a-3p
0,1,ensembl_havana,three_prime_UTR,1327763,1328896,.,+,.,transcript:ENST00000343938,,,,CPTP-201,protein_coding,,,,transcript:ENST00000343938,,,,,,,"gencode_basic,gencode_primary,Ensembl_canonical,MANE_Select",2 (assigned to previous version 8),,,1133,feature_1:1327763-1328896,CPTP,1328245,1328606,DEL,manta,361,sv_1:1328245-1328606,"['000007143790', '000007094630']",2,"[121, 88]",88,104.5000000000,104.5000000000,121,"[34, 26]",26,30.0000000000,30.0000000000,34,"[0.2809917355371901, 0.29545454545454547]",0.2809917355,0.2882231405,0.2882231405,0.2954545455,361,1328245,1328606,join_1:1328245-1328606,sv_in_feature,1,Homo sapiens,hsa-miR-148b-3p,MIMAT0000759,CPTP,ENSG00000224051,3UTR,CPTP-201,ENST00000343938,1328306,1328318,+,PAR-CLIP,Negative,Intestine,HCT116,26701625.0000000000,1,primary,Epithelial cells,0.3900000000,,12,tarbase_1:1328306-1328318:hsa-miR-148b-3p
0,1,ensembl_havana,three_prime_UTR,1327763,1328896,.,+,.,transcript:ENST00000343938,,,,CPTP-201,protein_coding,,,,transcript:ENST00000343938,,,,,,,"gencode_basic,gencode_primary,Ensembl_canonical,MANE_Select",2 (assigned to previous version 8),,,1133,feature_1:1327763-1328896,CPTP,1328245,1328606,DEL,manta,361,sv_1:1328245-1328606,"['000007143790', '000007094630']",2,"[121, 88]",88,104.5000000000,104.5000000000,121,"[34, 26]",26,30.0000000000,30.0000000000,34,"[0.2809917355371901, 0.29545454545454547]",0.2809917355,0.2882231405,0.2882231405,0.2954545455,361,1328245,1328606,join_1:1328245-1328606,sv_in_feature,1,Homo sapiens,hsa-miR-101-3p,MIMAT0000099,CPTP,ENSG00000224051,3UTR,CPTP-201,ENST00000343938,1328307,1328319,+,PAR-CLIP,Negative,Intestine,HCT116,26701625.0000000000,1,primary,Epithelial cells,,,12,tarbase_1:1328307-1328319:hsa-miR-101-3p
0,1,ensembl_havana,three_prime_UTR,1327763,1328896,.,+,.,transcript:ENST00000343938,,,,CPTP-201,protein_coding,,,,transcript:ENST00000343938,,,,,,,"gencode_basic,gencode_primary,Ensembl_canonical,MANE_Select",2 (assigned to previous version 8),,,1133,feature_1:1327763-1328896,CPTP,1328245,1328606,DEL,manta,361,sv_1:1328245-1328606,"['000007143790', '000007094630']",2,"[121, 88]",88,104.5000000000,104.5000000000,121,"[34, 26]",26,30.0000000000,30.0000000000,34,"[0.2809917355371901, 0.29545454545454547]",0.2809917355,0.2882231405,0.2882231405,0.2954545455,361,1328245,1328606,join_1:1328245-1328606,sv_in_feature,1,Homo sapiens,hsa-miR-423-5p,MIMAT0004748,CPTP,ENSG00000224051,3UTR,CPTP-201,ENST00000343938,1328344,1328356,+,HITS-CLIP,Negative,Prostate,22RV1,33406413.0000000000,1,primary,Epithelial cells,0.5900000000,,12,tarbase_1:1328344-1328356:hsa-miR-423-5p
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2313,X,havana,three_prime_UTR,151924618,151925170,.,+,.,transcript:ENST00000276344,,,,MAGEA4-201,protein_coding,,,,transcript:ENST00000276344,,,,,,,"gencode_basic,gencode_primary,Ensembl_canonical,MANE_Select",2,,,552,feature_X:151924618-151925170,MAGEA4,151925098,151927335,DEL,manta,2237,sv_X:151925098-151927335,"['000007090080', '000007140250']",2,"[29, 153]",29,91.0000000000,91.0000000000,153,"[29, 53]",29,41.0000000000,41.0000000000,53,"[1.0, 0.3464052287581699]",0.3464052288,0.6732026144,0.6732026144,1.0000000000,72,151925098,151925170,join_X:151925098-151925170,sv_right_free,1,Homo sapiens,hsa-miR-191-5p,MIMAT0000440,MAGEA4,ENSG00000147381,3UTR,,ENST00000360243,151925106,151925118,+,PAR-CLIP,Negative,Prostate,22RV1,27292025.0000000000,1,primary,Epithelial cells,,,12,tarbase_X:151925106-151925118:hsa-miR-191-5p
2313,X,havana,three_prime_UTR,151924618,151925170,.,+,.,transcript:ENST00000276344,,,,MAGEA4-201,protein_coding,,,,transcript:ENST00000276344,,,,,,,"gencode_basic,gencode_primary,Ensembl_canonical,MANE_Select",2,,,552,feature_X:151924618-151925170,MAGEA4,151925098,151927335,DEL,manta,2237,sv_X:151925098-151927335,"['000007090080', '000007140250']",2,"[29, 153]",29,91.0000000000,91.0000000000,153,"[29, 53]",29,41.0000000000,41.0000000000,53,"[1.0, 0.3464052287581699]",0.3464052288,0.6732026144,0.6732026144,1.0000000000,72,151925098,151925170,join_X:151925098-151925170,sv_right_free,1,Homo sapiens,hsa-miR-181a-5p,MIMAT0000256,MAGEA4,ENSG00000147381,3UTR,,ENST00000360243,151925120,151925132,+,PAR-CLIP,Negative,Prostate,22RV1,27292025.0000000000,1,primary,Epithelial cells,0.6000000000,,12,tarbase_X:151925120-151925132:hsa-miR-181a-5p
2316,X,ensembl_havana,three_prime_UTR,153508445,153509546,.,+,.,transcript:ENST00000331595,,,,BGN-201,protein_coding,,,,transcript:ENST00000331595,,,,,,,"gencode_basic,gencode_primary,Ensembl_canonical,MANE_Select",1 (assigned to previous version 8),,,1101,feature_X:153508445-153509546,BGN,153509176,153509287,DEL,manta,111,sv_X:153509176-153509287,"['000027009070', '000028000360', '000028000370', '000029004340', '000029003420', '000029003430', '000007202770']",7,"[77, 63, 11, 40, 38, 16, 44]",11,40.0000000000,41.2857142857,77,"[18, 18, 11, 13, 7, 16, 8]",7,13.0000000000,13.0000000000,18,"[0.23376623376623376, 0.2857142857142857, 1.0, 0.325, 0.18421052631578946, 1.0, 0.18181818181818182]",0.1818181818,0.2857142857,0.4586441754,1.0000000000,111,153509176,153509287,join_X:153509176-153509287,sv_in_feature,1,Homo sapiens,hsa-miR-423-5p,MIMAT0004748,BGN,ENSG00000182492,3UTR,BGN-201,ENST00000331595,153509179,153509191,+,HITS-CLIP,Negative,Brain - Motor cortex,,24389009.0000000000,1,primary,,0.5700000000,Male - 44 years,12,tarbase_X:153509179-153509191:hsa-miR-423-5p
2317,X,ensembl_havana,three_prime_UTR,155054672,155060304,.,+,.,transcript:ENST00000369498,,,,FUNDC2-201,protein_coding,,,,transcript:ENST00000369498,,,,,,,"gencode_basic,gencode_primary,Ensembl_canonical,MANE_Select",1 (assigned to previous version 7),,,5632,feature_X:155054672-155060304,FUNDC2,155055979,155056738,DEL,manta,759,sv_X:155055979-155056738,"['000007090320', '000027021000']",2,"[108, 48]",48,78.0000000000,78.0000000000,108,"[50, 48]",48,49.0000000000,49.0000000000,50,"[0.46296296296296297, 1.0]",0.4629629630,0.7314814815,0.7314814815,1.0000000000,759,155055979,155056738,join_X:155055979-155056738,sv_in_feature,1,Homo sapiens,hsa-miR-32-5p,MIMAT0000090,FUNDC2,ENSG00000165775,3UTR,FUNDC2-201,ENST00000369498,155056053,155056065,+,HITS-CLIP,Negative,Brain - Motor cortex,,24389009.0000000000,1,primary,,0.4200000000,Male - 44 years,12,tarbase_X:155056053-155056065:hsa-miR-32-5p


### <span style="color:#00ff00;">GFF output for IGV</span>

In [259]:
pr_for_igv = features_sv_no_offtereget_tarbase_joined_pr

# ==============================================================================================================================================

# Add GFF
output_features_pr = pr_for_igv[['Chromosome', 'Feature', 'Start', 'End', 'Strand', 'Name', 'biotype', 'tag', 'ID']]
output_features_pr = output_features_pr.drop_duplicates()
output_features_pr['Source'] = 'GFF'
output_features_pr['color'] = 'blue'

# Add gnomAD SV
'''output_sv_pr = pr_for_igv[['Chromosome', 'Start_sv', 'End_sv', 'svtype', 'name', 'sv_id', 'AF', 'N_HET', 'N_HOMALT']]
output_sv_pr = output_sv_pr.drop_duplicates()
output_sv_pr['Source'] = 'gnomAD_SV'
output_sv_pr['color'] = 'red'
output_sv_pr = pr.PyRanges(output_sv_pr.rename(columns={'Start_sv': 'Start', 'End_sv': 'End', 'svtype': 'Feature', 'name': 'Name', 'sv_id': 'ID'}))'''

# Add RCMG BTK SV
output_sv_pr = pr_for_igv[['Chromosome', 'Start_sv', 'End_sv', 'SVTYPE', 'sv_len', 'SID_count', 'DP_median', 'AF_median', 'SID_list', 'sv_id']]
output_sv_pr = output_sv_pr.drop_duplicates()
output_sv_pr['Source'] = 'RCMG_BTK_SV'
output_sv_pr['color'] = 'red'
output_sv_pr = pr.PyRanges(output_sv_pr.rename(columns={'Start_sv': 'Start', 'End_sv': 'End', 'svtype': 'Feature', 'name': 'Name', 'sv_id': 'ID'}))

# Add joins data
output_joins_pr = pr_for_igv[['Chromosome', 'join_start', 'join_end', 'join_type', 'join_id']]
output_joins_pr = output_joins_pr.drop_duplicates()
output_joins_pr['Source'] = 'join'
output_joins_pr['color'] = 'orange'
output_joins_pr = pr.PyRanges(output_joins_pr.rename(columns={'join_start': 'Start', 'join_end': 'End', 'join_type': 'Feature', 'join_id': 'ID'}))

# Add miRNA TarBase
output_tarbase_pr = pr_for_igv[['Chromosome', 'Start_tarbase', 'End_tarbase', 'Strand_tarbase', 'mirna_name', 'interaction_group', 'mirna_id', 'tarbase_id']]
output_tarbase_pr = output_tarbase_pr.drop_duplicates(subset=['mirna_id', 'tarbase_id'])
output_tarbase_pr['Source'] = 'tarbase'
output_tarbase_pr['color'] = 'green'
output_tarbase_pr = pr.PyRanges(output_tarbase_pr.rename(columns={'Start_tarbase': 'Start', 'End_tarbase': 'End', 'Strand_tarbase': 'Strand', 'interaction_group': 'Feature', 'mirna_name': 'Name', 'tarbase_id': 'ID'}))

# Add offtarget GFF (features) data
output_offtargets_pr = features_sv_offtereget_joined_pr[['Chromosome', 'Start_offtarget', 'End_offtarget', 'Feature_offtarget', 'offtarget_id']]
output_offtargets_pr = output_offtargets_pr.query('Start_offtarget == Start_offtarget')
output_offtargets_pr = output_offtargets_pr.drop_duplicates()
output_offtargets_pr['Source'] = 'GFF_offtarget'
output_offtargets_pr['color'] = '#d9d9d9'
output_offtargets_pr = pr.PyRanges(output_offtargets_pr.rename(columns={'Start_offtarget': 'Start', 'End_offtarget': 'End', 'Feature_offtarget': 'Feature', 'offtarget_id': 'ID'}))

output_igv_pr = pd.concat([output_features_pr, output_sv_pr, output_joins_pr, output_tarbase_pr, output_offtargets_pr], ignore_index=True)

# ==============================================================================================================================================

# Fill NA and save
output_igv_pr = output_igv_pr.reset_index(drop=True)
#output_igv_pr = pr.PyRanges(output_igv_pr.fillna('.'))

output_igv_pr.to_gff3(output_dir / 'regions_for_igv.gff3')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_features_pr['Source'] = 'GFF'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_features_pr['color'] = 'blue'


In [239]:
# Stats
output_igv_pr['Source'].value_counts().to_frame().reset_index()

Unnamed: 0,Source,count
0,tarbase,10440
1,RCMG_BTK_SV,809
2,GFF_offtarget,794
3,join,781
4,GFF,503
