In [1]:
import os, subprocess
from pathlib import Path, PurePosixPath
import gzip
import pandas as pd, numpy as np, pyranges as pr

In [39]:
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.10f}'.format)

# UTR3

## Settings

In [2]:
main_path = Path.cwd()

In [3]:
# Create tree
refs_dir = main_path / 'data/refs'
gnomad_dir = main_path / 'data/gnomad'

Path(refs_dir).mkdir(parents=True, exist_ok=True)
Path(gnomad_dir).mkdir(parents=True, exist_ok=True)

## Functions

In [4]:
def fetch_file(link, output_dir):
    command = f'wget --no-clobber -P {output_dir} {link}'
    subprocess.run(command, shell=True)
    filename = PurePosixPath(link).name
    return output_dir / filename

## Fetch data

In [5]:
gff_filepath = fetch_file('https://ftp.ensembl.org/pub/release-115/gff3/homo_sapiens/Homo_sapiens.GRCh38.115.chr.gff3.gz', refs_dir)
gnomad_sv_filepath = fetch_file('https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/genome_sv/gnomad.v4.1.sv.sites.bed.gz', gnomad_dir)

File '/Users/andrejnekrasov/pro/my/utr3/data/refs/Homo_sapiens.GRCh38.115.chr.gff3.gz' already there; not retrieving.

File '/Users/andrejnekrasov/pro/my/utr3/data/gnomad/gnomad.v4.1.sv.sites.bed.gz' already there; not retrieving.



## Main

### Create PR/DF

In [37]:
gff_pr = pr.read_gff3(str(gff_filepath))

In [None]:
target_columns = ['#chrom', 'start', 'end', 'name', 'svtype', 'samples', 'MULTIALLELIC', 'ALGORITHMS', 'BOTHSIDES_SUPPORT', 'CHR2', 'CPX_INTERVALS', 'CPX_TYPE', 'END', 'END2', 'EVIDENCE', 'LOW_CONFIDENCE_REPETITIVE_LARGE_DUP', 'MEMBERS', 'NCR', 'OUTLIER_SAMPLE_ENRICHED_LENIENT', 'PAR', 'PCRMINUS_NCR', 'PCRPLUS_NCR', 'PESR_GT_OVERDISPERSION', 'POS2', 'PREDICTED_BREAKEND_EXONIC', 'PREDICTED_COPY_GAIN', 'PREDICTED_DUP_PARTIAL', 'PREDICTED_INTERGENIC', 'PREDICTED_INTRAGENIC_EXON_DUP', 'PREDICTED_INTRONIC', 'PREDICTED_INV_SPAN', 'PREDICTED_LOF', 'PREDICTED_MSV_EXON_OVERLAP', 'PREDICTED_NEAREST_TSS', 'PREDICTED_NONCODING_BREAKPOINT', 'PREDICTED_NONCODING_SPAN', 'PREDICTED_PARTIAL_DISPERSED_DUP', 'PREDICTED_PARTIAL_EXON_DUP', 'PREDICTED_PROMOTER', 'PREDICTED_TSS_DUP', 'PREDICTED_UTR', 'RESOLVED_POSTHOC', 'SOURCE', 'SVLEN', 'SVTYPE', 'UNRESOLVED_TYPE', 'AN', 'AC', 'AF', 'N_BI_GENOS', 'N_HOMREF', 'N_HET', 'N_HOMALT', 'FREQ_HOMREF', 'FREQ_HET', 'FREQ_HOMALT', 'CN_NUMBER', 'CN_COUNT', 'CN_STATUS', 'CN_FREQ', 'CN_NONREF_COUNT', 'CN_NONREF_FREQ']
target_columns[0:3] = ['Chromosome', 'Start', 'End']
gnomad_sv_df = pd.read_csv(str(gnomad_sv_filepath), sep='\t', usecols=list(range(len(target_columns))), names=target_columns, comment='#')
gnomad_sv_df['Chromosome'] = gnomad_sv_df['Chromosome'].str.replace('chr', '')

gnomad_sv_pr = pr.PyRanges(gnomad_sv_df)

  gnomad_sv_df = pd.read_csv(str(gnomad_sv_filepath), sep='\t', usecols=list(range(len(target_columns))), names=target_columns, comment='#')


### EDA (Exploratory Data Analysis)

In [68]:
gff_pr.Feature.value_counts()

Feature
exon                      3673949
CDS                       2284258
five_prime_UTR             426597
three_prime_UTR            340566
mRNA                       233574
lnc_RNA                    223403
biological_region          180392
ncRNA_gene                  41946
transcript                  28799
gene                        21547
pseudogenic_transcript      15201
pseudogene                  15198
snRNA                        1906
miRNA                        1879
unconfirmed_transcript       1108
snoRNA                        942
V_gene_segment                253
J_gene_segment                 97
scRNA                          49
rRNA                           49
D_gene_segment                 42
C_gene_segment                 29
chromosome                     25
tRNA                           22
processed_transcript           12
Name: count, dtype: int64

In [69]:
gff_pr.biotype.value_counts()

biotype
protein_coding                        231543
lncRNA                                224032
retained_intron                        34239
protein_coding_CDS_not_defined         26573
nonsense_mediated_decay                21949
processed_pseudogene                   18975
misc_RNA                                4414
unprocessed_pseudogene                  3898
snRNA                                   3802
miRNA                                   3758
transcribed_unprocessed_pseudogene      3176
transcribed_processed_pseudogene        2298
TEC                                     2127
snoRNA                                  1884
rRNA_pseudogene                          994
transcribed_unitary_pseudogene           402
IG_V_pseudogene                          374
IG_V_gene                                292
TR_V_gene                                214
unitary_pseudogene                       178
TR_J_gene                                158
non_stop_decay                           105
sc

### Prepare data

In [135]:
mrna_pr = gff_pr[gff_pr.Feature == 'mRNA']
three_utrs_pr = gff_pr[gff_pr.Feature == 'three_prime_UTR']

mrna_df = mrna_pr.df
three_utrs_df = three_utrs_pr.df

In [None]:
# Annotation of 3'-UTRs with mRNA data
target_columns = ['ID', 'Name', 'biotype', 'Parent', 'tag', 'transcript_support_level'] # Replace the data in these empty 3'-UTRs columns with mRNA data
mrna_df_short = mrna_df[target_columns]
three_utrs_df_annotated = three_utrs_df.merge(mrna_df_short, how='left', left_on='Parent', right_on='ID', suffixes=['_3utr', '']) 
three_utrs_df_annotated = three_utrs_df_annotated.drop(['Score', 'Frame'], axis=1) # Remove columns with "."
three_utrs_df_annotated = three_utrs_df_annotated.dropna(axis=1, how='all') # Remove empty columns
three_utrs_df_annotated = pr.PyRanges(three_utrs_df_annotated)


In [145]:
three_utrs_df_annotated

Unnamed: 0,Chromosome,Source,Feature,Start,End,Strand,Parent_3utr,ID,Name,biotype,Parent,tag,transcript_support_level
0,1,havana,three_prime_UTR,70008,71585,+,transcript:ENST00000641515,transcript:ENST00000641515,OR4F5-201,protein_coding,gene:ENSG00000186092,"gencode_basic,gencode_primary,Ensembl_canonica...",
1,1,havana_tagene,three_prime_UTR,944153,944572,+,transcript:ENST00000968544,transcript:ENST00000968544,SAMD11-218,protein_coding,gene:ENSG00000187634,"gencode_basic,gencode_primary",
2,1,havana,three_prime_UTR,944153,944574,+,transcript:ENST00000616016,transcript:ENST00000616016,SAMD11-209,protein_coding,gene:ENSG00000187634,"gencode_basic,gencode_primary,Ensembl_canonica...",5 (assigned to previous version 4)
3,1,havana,three_prime_UTR,944153,944574,+,transcript:ENST00000618323,transcript:ENST00000618323,SAMD11-213,protein_coding,gene:ENSG00000187634,gencode_basic,5 (assigned to previous version 4)
4,1,havana_tagene,three_prime_UTR,944153,944574,+,transcript:ENST00000968542,transcript:ENST00000968542,SAMD11-216,protein_coding,gene:ENSG00000187634,"gencode_basic,gencode_primary",
...,...,...,...,...,...,...,...,...,...,...,...,...,...
340561,Y,ensembl_havana,three_prime_UTR,24763068,24764925,-,transcript:ENST00000446723,transcript:ENST00000446723,DAZ3-203,protein_coding,gene:ENSG00000187191,"gencode_basic,gencode_primary",1
340562,Y,ensembl_havana,three_prime_UTR,24768898,24768933,-,transcript:ENST00000446723,transcript:ENST00000446723,DAZ3-203,protein_coding,gene:ENSG00000187191,"gencode_basic,gencode_primary",1
340563,Y,havana,three_prime_UTR,25030900,25031222,-,transcript:ENST00000382287,transcript:ENST00000382287,BPY2C-201,protein_coding,gene:ENSG00000185894,"gencode_basic,gencode_primary,Ensembl_canonica...",1
340564,Y,havana,three_prime_UTR,25031316,25031441,-,transcript:ENST00000382287,transcript:ENST00000382287,BPY2C-201,protein_coding,gene:ENSG00000185894,"gencode_basic,gencode_primary,Ensembl_canonica...",1
