In [None]:
import os, subprocess
from pathlib import Path, PurePosixPath
import gzip
import pandas as pd, numpy as np, pyranges as pr
import plotly.express as px

In [None]:
# Notebook-wide pandas display settings:
#   - never truncate the column list when a frame is rendered
#   - render floats as fixed-point numbers with ten decimals
pd.options.display.max_columns = None
pd.options.display.float_format = '{:.10f}'.format

# UTR3

## Settings

[PyRanges v1.x GitHub](https://github.com/pyranges/pyranges_1.x)  
[PyRanges v1.x Docs](https://pyranges1.readthedocs.io/en/latest/index.html)

```{bash}
mamba env export -n utr3.venv > environment.yml
```

In [None]:
# Root of the project tree: the directory the notebook kernel was started in.
main_path = Path(os.getcwd())

In [None]:
# Data directories under the project root:
#   data/refs   - target directory for the reference annotation download
#   data/gnomad - target directory for the gnomAD download
refs_dir = main_path.joinpath('data', 'refs')
gnomad_dir = main_path.joinpath('data', 'gnomad')

# Create both directories (and any missing parents); re-running is a no-op.
refs_dir.mkdir(parents=True, exist_ok=True)
gnomad_dir.mkdir(parents=True, exist_ok=True)

## Functions

In [None]:
def fetch_file(link, output_dir):
    """Download ``link`` into ``output_dir`` with ``wget`` and return the local path.

    ``wget`` is called with ``--no-clobber``, so a file that is already present
    in ``output_dir`` is not downloaded again.

    Parameters
    ----------
    link : str
        URL of the file to fetch. The local file name is the last path
        component of this URL.
    output_dir : str or pathlib.Path
        Directory the file is saved into (passed to ``wget -P``).

    Returns
    -------
    pathlib.Path
        ``output_dir / <file name taken from link>``.

    Raises
    ------
    FileNotFoundError
        If the expected file does not exist after ``wget`` has run, or if the
        ``wget`` executable itself cannot be found.
    """
    output_dir = Path(output_dir)  # accept plain strings as well as Path objects
    target = output_dir / PurePosixPath(link).name

    # Pass an argument list and no shell: spaces in the directory or shell
    # metacharacters in the URL ('&', ';', '?') can neither break nor inject
    # into the command.
    subprocess.run(['wget', '--no-clobber', '-P', str(output_dir), link])

    # Fail here, with a clear message, rather than later when the file is read.
    if not target.is_file():
        raise FileNotFoundError(f'wget did not produce {target} from {link}')
    return target

def define_join_type(row):
    """Classify how an SV interval overlaps a 3'-UTR interval in one joined row.

    NOTE(review): the previous body was an unfinished stub that indexed the
    row with empty column names (``row['']``) and returned nothing. The column
    names and category labels below are a proposed completion - confirm them
    against the joined table before relying on the result.

    Parameters
    ----------
    row : mapping
        One row of the 3'-UTR x SV join (e.g. a pandas Series from
        ``DataFrame.apply(..., axis=1)``). Must provide ``Start``/``End`` for
        the 3'-UTR and ``Start_sv``/``End_sv`` for the SV, following the
        ``'_sv'`` suffix used in the join. Both intervals are assumed to be on
        the same chromosome and half-open ``[Start, End)`` - TODO confirm.

    Returns
    -------
    str
        ``'no_overlap'``            - the intervals do not intersect
        ``'sv_spans_utr'``          - the SV covers the whole 3'-UTR
        ``'sv_within_utr'``         - the SV lies inside the 3'-UTR
        ``'sv_overlaps_utr_start'`` - the SV covers the lower-coordinate end only
        ``'sv_overlaps_utr_end'``   - the SV covers the higher-coordinate end only

        "start"/"end" refer to genomic coordinates, not to transcript strand.

    Raises
    ------
    KeyError
        If one of the four coordinate columns is missing from ``row``.
    """
    gff_start, gff_end = row['Start'], row['End']
    sv_start, sv_end = row['Start_sv'], row['End_sv']

    if sv_end <= gff_start or sv_start >= gff_end:
        return 'no_overlap'
    if sv_start <= gff_start and sv_end >= gff_end:
        return 'sv_spans_utr'
    if sv_start >= gff_start and sv_end <= gff_end:
        return 'sv_within_utr'
    if sv_start < gff_start:
        return 'sv_overlaps_utr_start'
    return 'sv_overlaps_utr_end'



## Fetch data

In [None]:
# Ensembl release 115 GFF3 annotation (GRCh38), saved under the refs directory.
gff_filepath = fetch_file(
    link='https://ftp.ensembl.org/pub/release-115/gff3/homo_sapiens/Homo_sapiens.GRCh38.115.chr.gff3.gz',
    output_dir=refs_dir,
)

# gnomAD v4.1 genome SV sites (BED), saved under the gnomAD directory.
gnomad_sv_filepath = fetch_file(
    link='https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/genome_sv/gnomad.v4.1.sv.sites.bed.gz',
    output_dir=gnomad_dir,
)

## Main

### Create PR/DF

In [None]:
# Parse the downloaded GFF3 annotation into a PyRanges object.
gff_pr = pr.read_gff3(os.fspath(gff_filepath))

In [None]:
# Keep only the necessary columns (62 of 600...)
# The names below are applied BY POSITION to the first len(target_columns)
# columns of the file (see `usecols`/`names` in read_csv), so their order must
# match the column order of the BED file.
target_columns = [
    '#chrom', 'start', 'end', 'name', 'svtype', 'samples',
    'MULTIALLELIC', 'ALGORITHMS', 'BOTHSIDES_SUPPORT', 'CHR2',
    'CPX_INTERVALS', 'CPX_TYPE', 'END', 'END2', 'EVIDENCE',
    'LOW_CONFIDENCE_REPETITIVE_LARGE_DUP', 'MEMBERS', 'NCR',
    'OUTLIER_SAMPLE_ENRICHED_LENIENT', 'PAR', 'PCRMINUS_NCR', 'PCRPLUS_NCR',
    'PESR_GT_OVERDISPERSION', 'POS2',
    'PREDICTED_BREAKEND_EXONIC', 'PREDICTED_COPY_GAIN', 'PREDICTED_DUP_PARTIAL',
    'PREDICTED_INTERGENIC', 'PREDICTED_INTRAGENIC_EXON_DUP', 'PREDICTED_INTRONIC',
    'PREDICTED_INV_SPAN', 'PREDICTED_LOF', 'PREDICTED_MSV_EXON_OVERLAP',
    'PREDICTED_NEAREST_TSS', 'PREDICTED_NONCODING_BREAKPOINT',
    'PREDICTED_NONCODING_SPAN', 'PREDICTED_PARTIAL_DISPERSED_DUP',
    'PREDICTED_PARTIAL_EXON_DUP', 'PREDICTED_PROMOTER', 'PREDICTED_TSS_DUP',
    'PREDICTED_UTR',
    'RESOLVED_POSTHOC', 'SOURCE', 'SVLEN', 'SVTYPE', 'UNRESOLVED_TYPE',
    'AN', 'AC', 'AF',
    'N_BI_GENOS', 'N_HOMREF', 'N_HET', 'N_HOMALT',
    'FREQ_HOMREF', 'FREQ_HET', 'FREQ_HOMALT',
    'CN_NUMBER', 'CN_COUNT', 'CN_STATUS', 'CN_FREQ',
    'CN_NONREF_COUNT', 'CN_NONREF_FREQ',
]
# PyRanges requires these exact names for the coordinate columns.
target_columns[0:3] = ['Chromosome', 'Start', 'End']

sv_df = pd.read_csv(
    str(gnomad_sv_filepath),
    sep='\t',
    usecols=list(range(len(target_columns))),
    names=target_columns,
    comment='#',  # skips '#'-prefixed lines (presumably the '#chrom ...' header - confirm)
)

# Strip only a leading 'chr'. `.str.replace('chr', '')` would remove the
# substring anywhere in the name and its regex default differs between pandas
# versions. Presumably needed so names match the Ensembl GFF3 - confirm.
sv_df['Chromosome'] = sv_df['Chromosome'].str.removeprefix('chr')

sv_pr = pr.PyRanges(sv_df)

### EDA (Exploratory Data Analysis)

In [None]:
# Number of annotation records per feature type.
gff_pr['Feature'].value_counts()

In [None]:
# Number of annotation records per biotype.
gff_pr['biotype'].value_counts()

In [None]:
# Number of gnomAD SV records per SV type.
sv_pr['svtype'].value_counts()

### Prepare GFF3 data

In [None]:
# Split the annotation by feature type: mRNA records and 3'-UTR records.
mrna_pr = gff_pr[gff_pr['Feature'].eq('mRNA')]
three_utrs_pr = gff_pr[gff_pr['Feature'].eq('three_prime_UTR')]

In [None]:
# Annotate every 3'-UTR with the attributes of its parent mRNA.
# These columns are empty on the 3'-UTR rows and are filled from the mRNA rows.
target_columns = [
    'ID',
    'Name',
    'biotype',
    'Parent',
    'tag',
    'transcript_support_level',
]
mrna_pr_short = mrna_pr[target_columns]

# Left join: 3'-UTR `Parent` -> mRNA `ID`. On name clashes the 3'-UTR column
# gets the '_3utr' suffix and the mRNA column keeps the plain name.
three_utrs_pr_annotated = three_utrs_pr.merge(
    mrna_pr_short,
    left_on='Parent',
    right_on='ID',
    how='left',
    suffixes=('_3utr', ''),
)

# Turn the '.' placeholder into NaN so that all-placeholder columns count as
# empty, then drop every column that has no values at all.
three_utrs_pr_annotated[['Score', 'Frame']] = (
    three_utrs_pr_annotated[['Score', 'Frame']].replace('.', np.nan)
)
three_utrs_pr_annotated = three_utrs_pr_annotated.dropna(axis='columns', how='all')

# Stats
three_utrs_pr_annotated['biotype'].value_counts()


In [None]:
# Keep only rows whose `tag` contains MANE_Select
# (plain "MANE_Select" as well as combined tags such as "MANE_Select|Ensembl_canonical").
# Rows with a missing tag are excluded (na=False).
mane_select_mask = three_utrs_pr_annotated['tag'].str.contains(
    'MANE_Select',
    regex=True,
    na=False,
)
three_utrs_pr_filtered = three_utrs_pr_annotated[mane_select_mask]

# Stats
# Reference figure: MANE Select genes count = 19437 (release_1.5), from the summary file at
# https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.5/
three_utrs_pr_filtered['biotype'].value_counts()

In [None]:
# Display the MANE_Select-filtered, mRNA-annotated 3'-UTR table for inspection.
three_utrs_pr_filtered

### Prepare gnomAD SV data 

In [None]:
# Deletions only. `.copy()` makes the subset independent of `sv_pr`, so adding
# a column below neither touches the parent table nor triggers pandas'
# chained-assignment warning.
sv_del = sv_pr[sv_pr['svtype'] == 'DEL'].copy()

# Deletion length from the interval coordinates.
# This must be item assignment: `sv_del.del_len = ...` only sets a Python
# attribute on the object (pandas warns about it) and creates no column, so
# `del_len` would be missing from the table and from any later join.
# Reading it back as `sv_del.del_len` still works.
sv_del['del_len'] = sv_del['End'] - sv_del['Start']

In [None]:
# Display the deletion-only (svtype == "DEL") subset of the gnomAD SV table.
sv_del

In [None]:
# Summary statistics of deletion length, with extra percentiles in the upper tail.
sv_del.del_len.describe(
    percentiles=[0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99, 0.999],
)

In [None]:
# Histogram of deletion lengths. Counts are on a log axis so sparsely
# populated bins stay visible next to the dominant ones.
fig = px.histogram(
    x=sv_del.del_len,
    nbins=300,
    log_y=True,
    title='gnomAD SV DEL length distribution (log)',
)
# Explicit axis titles so the figure is readable on its own.
fig.update_layout(
    xaxis_title='Deletion length (End - Start)',
    yaxis_title='Number of deletions (log scale)',
)
fig.show()

### Join GFF3 x gnomAD SV DEL

In [None]:
# Join the filtered 3'-UTRs with the deletions by interval overlap.
# `suffix='_sv'` and `report_overlap_column='overlap_length'` are passed
# through to PyRanges; see its join_overlaps docs for the resulting column
# names.
three_utr_sv_joined_pr = three_utrs_pr_filtered.join_overlaps(
    sv_del,
    report_overlap_column='overlap_length',
    suffix='_sv',
)

In [None]:
# Display the result of joining the filtered 3'-UTRs with the gnomAD deletions.
three_utr_sv_joined_pr