In [1]:
import os, subprocess
from pathlib import Path, PurePosixPath
import gzip
import pandas as pd, numpy as np, pyranges as pr

# UTR3

## Settings

In [2]:
main_path = Path.cwd()

In [3]:
# Create tree
refs_dir = main_path / 'data/refs'
gnomad_dir = main_path / 'data/gnomad'

Path(refs_dir).mkdir(parents=True, exist_ok=True)
Path(gnomad_dir).mkdir(parents=True, exist_ok=True)

## Functions

In [4]:
def fetch_file(link, output_dir):
    command = f'wget --no-clobber -P {output_dir} {link}'
    subprocess.run(command, shell=True)
    filename = PurePosixPath(link).name
    return output_dir / filename

## Fetch data

In [5]:
gff_filepath = fetch_file('https://ftp.ensembl.org/pub/release-115/gff3/homo_sapiens/Homo_sapiens.GRCh38.115.chr.gff3.gz', refs_dir)
gnomad_sv_filepath = fetch_file('https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/genome_sv/gnomad.v4.1.sv.sites.bed.gz', gnomad_dir)

File '/Users/andrejnekrasov/pro/my/utr3/data/refs/Homo_sapiens.GRCh38.115.chr.gff3.gz' already there; not retrieving.

File '/Users/andrejnekrasov/pro/my/utr3/data/gnomad/gnomad.v4.1.sv.sites.bed.gz' already there; not retrieving.



## Main

In [None]:
gff_pr = pr.read_gff3(str(gff_filepath))

In [None]:
target_columns = ['#chrom', 'start', 'end', 'name', 'svtype', 'samples', 'MULTIALLELIC', 'ALGORITHMS', 'BOTHSIDES_SUPPORT', 'CHR2', 'CPX_INTERVALS', 'CPX_TYPE', 'END', 'END2', 'EVIDENCE', 'LOW_CONFIDENCE_REPETITIVE_LARGE_DUP', 'MEMBERS', 'NCR', 'OUTLIER_SAMPLE_ENRICHED_LENIENT', 'PAR', 'PCRMINUS_NCR', 'PCRPLUS_NCR', 'PESR_GT_OVERDISPERSION', 'POS2', 'PREDICTED_BREAKEND_EXONIC', 'PREDICTED_COPY_GAIN', 'PREDICTED_DUP_PARTIAL', 'PREDICTED_INTERGENIC', 'PREDICTED_INTRAGENIC_EXON_DUP', 'PREDICTED_INTRONIC', 'PREDICTED_INV_SPAN', 'PREDICTED_LOF', 'PREDICTED_MSV_EXON_OVERLAP', 'PREDICTED_NEAREST_TSS', 'PREDICTED_NONCODING_BREAKPOINT', 'PREDICTED_NONCODING_SPAN', 'PREDICTED_PARTIAL_DISPERSED_DUP', 'PREDICTED_PARTIAL_EXON_DUP', 'PREDICTED_PROMOTER', 'PREDICTED_TSS_DUP', 'PREDICTED_UTR', 'RESOLVED_POSTHOC', 'SOURCE', 'SVLEN', 'SVTYPE', 'UNRESOLVED_TYPE', 'AN', 'AC', 'AF', 'N_BI_GENOS', 'N_HOMREF', 'N_HET', 'N_HOMALT', 'FREQ_HOMREF', 'FREQ_HET', 'FREQ_HOMALT', 'CN_NUMBER', 'CN_COUNT', 'CN_STATUS', 'CN_FREQ', 'CN_NONREF_COUNT', 'CN_NONREF_FREQ']
target_columns[0:3] = ['Chromosome', 'Start', 'End']
gnomad_sv_df = pd.read_csv(str(gnomad_sv_filepath), sep='\t', usecols=list(range(len(target_columns))), names=target_columns, comment='#')
gnomad_sv_df['Chromosome'] = gnomad_sv_df['Chromosome'].str.replace('chr', '')

  gnomad_sv_df = pd.read_csv(str(gnomad_sv_filepath), sep='\t', usecols=list(range(len(target_columns))), names=target_columns, comment='#')


In [20]:
gnomad_sv_df

Unnamed: 0,Chromosome,Start,End,name,svtype,samples,MULTIALLELIC,ALGORITHMS,BOTHSIDES_SUPPORT,CHR2,...,N_HOMALT,FREQ_HOMREF,FREQ_HET,FREQ_HOMALT,CN_NUMBER,CN_COUNT,CN_STATUS,CN_FREQ,CN_NONREF_COUNT,CN_NONREF_FREQ
0,chr1,9999,295666,gnomAD-SV_v3_DUP_chr1_01c2781c,DUP,,False,depth,False,chr1,...,0.0,0.996005,0.003995,0.000000,,,,,,
1,chr1,10433,10434,gnomAD-SV_v3_BND_chr1_1a45f73a,BND,,False,manta,True,chr2,...,24.0,0.772378,0.226975,0.000646,,,,,,
2,chr1,10439,10440,gnomAD-SV_v3_BND_chr1_3fa36917,BND,,False,manta,True,chr3,...,0.0,0.991599,0.008401,0.000000,,,,,,
3,chr1,10449,10450,gnomAD-SV_v3_BND_chr1_933a2971,BND,,False,manta,True,chr18,...,75.0,0.354435,0.643333,0.002232,,,,,,
4,chr1,10463,10464,gnomAD-SV_v3_BND_chr1_7bbf34b5,BND,,False,manta,True,chr12,...,2.0,0.926078,0.073875,0.000047,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2154481,chrY,56878257,56878816,gnomAD-SV_v3_DEL_chrY_2d25b4a6,DEL,,False,wham,False,chrY,...,0.0,0.999726,0.000274,0.000000,,,,,,
2154482,chrY,56878270,56878718,gnomAD-SV_v3_DEL_chrY_ab4f307f,DEL,,False,wham,False,chrY,...,4.0,0.582192,0.390411,0.027397,,,,,,
2154483,chrY,56878973,56879563,gnomAD-SV_v3_DEL_chrY_0db85cf8,DEL,,False,wham,False,chrY,...,0.0,0.999896,0.000104,0.000000,,,,,,
2154484,chrY,56880080,56880081,gnomAD-SV_v3_BND_chrY_0151e635,BND,,False,wham,False,chrY,...,0.0,0.999967,0.000033,0.000000,,,,,,


In [21]:
gnomad_sv_pr = pr.PyRanges(gnomad_sv_df)

In [None]:
gff_pr

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,ID,Alias,...,transcript_id,constitutive,ensembl_end_phase,ensembl_phase,exon_id,rank,tag,transcript_support_level,ccdsid,protein_id
0,1,GRCh38,chromosome,0,248956422,.,.,.,chromosome:1,"CM000663.2,chr1,NC_000001.11",...,,,,,,,,,,
1,1,.,biological_region,10468,11240,1.3e+03,.,.,,,...,,,,,,,,,,
2,1,.,biological_region,10649,10657,0.999,+,.,,,...,,,,,,,,,,
3,1,.,biological_region,10654,10657,0.999,-,.,,,...,,,,,,,,,,
4,1,.,biological_region,10677,10687,0.999,+,.,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7491838,Y,.,biological_region,57215803,57215813,0.999,-,.,,,...,,,,,,,,,,
7491839,Y,.,biological_region,57215831,57215840,0.999,+,.,,,...,,,,,,,,,,
7491840,Y,.,biological_region,57215832,57215842,0.999,-,.,,,...,,,,,,,,,,
7491841,Y,.,biological_region,57215861,57215867,0.999,+,.,,,...,,,,,,,,,,


In [23]:
gnomad_sv_df

Unnamed: 0,Chromosome,Start,End,name,svtype,samples,MULTIALLELIC,ALGORITHMS,BOTHSIDES_SUPPORT,CHR2,...,N_HOMALT,FREQ_HOMREF,FREQ_HET,FREQ_HOMALT,CN_NUMBER,CN_COUNT,CN_STATUS,CN_FREQ,CN_NONREF_COUNT,CN_NONREF_FREQ
0,chr1,9999,295666,gnomAD-SV_v3_DUP_chr1_01c2781c,DUP,,False,depth,False,chr1,...,0.0,0.996005,0.003995,0.000000,,,,,,
1,chr1,10433,10434,gnomAD-SV_v3_BND_chr1_1a45f73a,BND,,False,manta,True,chr2,...,24.0,0.772378,0.226975,0.000646,,,,,,
2,chr1,10439,10440,gnomAD-SV_v3_BND_chr1_3fa36917,BND,,False,manta,True,chr3,...,0.0,0.991599,0.008401,0.000000,,,,,,
3,chr1,10449,10450,gnomAD-SV_v3_BND_chr1_933a2971,BND,,False,manta,True,chr18,...,75.0,0.354435,0.643333,0.002232,,,,,,
4,chr1,10463,10464,gnomAD-SV_v3_BND_chr1_7bbf34b5,BND,,False,manta,True,chr12,...,2.0,0.926078,0.073875,0.000047,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2154481,chrY,56878257,56878816,gnomAD-SV_v3_DEL_chrY_2d25b4a6,DEL,,False,wham,False,chrY,...,0.0,0.999726,0.000274,0.000000,,,,,,
2154482,chrY,56878270,56878718,gnomAD-SV_v3_DEL_chrY_ab4f307f,DEL,,False,wham,False,chrY,...,4.0,0.582192,0.390411,0.027397,,,,,,
2154483,chrY,56878973,56879563,gnomAD-SV_v3_DEL_chrY_0db85cf8,DEL,,False,wham,False,chrY,...,0.0,0.999896,0.000104,0.000000,,,,,,
2154484,chrY,56880080,56880081,gnomAD-SV_v3_BND_chrY_0151e635,BND,,False,wham,False,chrY,...,0.0,0.999967,0.000033,0.000000,,,,,,
