In [1]:
import os, subprocess
from pathlib import Path, PurePosixPath
import gzip
import pandas as pd, numpy as np, pyranges as pr
import plotly.express as px

In [24]:
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.10f}'.format)

# UTR3

## Settings

[PyRanges v1.x GitHub](https://github.com/pyranges/pyranges_1.x)  
[PyRanges v1.x Docs](https://pyranges1.readthedocs.io/en/latest/index.html)

```{bash}
mamba env export -n utr3.venv > environment.yml
```

In [3]:
main_path = Path.cwd()

In [4]:
# Create tree
refs_dir = main_path / 'data/refs'
gnomad_dir = main_path / 'data/gnomad'

Path(refs_dir).mkdir(parents=True, exist_ok=True)
Path(gnomad_dir).mkdir(parents=True, exist_ok=True)

## Functions

In [139]:
def fetch_file(link, output_dir):
    command = f'wget --no-clobber -P {output_dir} {link}'
    subprocess.run(command, shell=True)
    filename = PurePosixPath(link).name
    return output_dir / filename


def define_join_type(row):
    feature_start = row['Start']
    feature_end = row['End']
    sv_start = row['Start_sv']
    sv_end = row['End_sv']

    join_type = np.nan
    if feature_start <= sv_start and feature_end >= sv_end: # SV полностью в feature
        join_type = 'sv_in_feature'
    elif sv_start <= feature_start and sv_end >= feature_end: # feature полностью в sv
        join_type = 'feature_in_sv'
    elif sv_start < feature_start and sv_end <= sv_end:
        join_type = 'sv_left_free'
    elif sv_start > feature_start and sv_end >= sv_end:
        join_type = 'sv_right_free'
    elif sv_start == feature_start and sv_end == sv_end:
        join_type = 'full_join'

    return join_type

## Fetch data

In [6]:
gff_filepath = fetch_file('https://ftp.ensembl.org/pub/release-115/gff3/homo_sapiens/Homo_sapiens.GRCh38.115.chr.gff3.gz', refs_dir)
gnomad_sv_filepath = fetch_file('https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/genome_sv/gnomad.v4.1.sv.sites.bed.gz', gnomad_dir)

File '/Users/andrejnekrasov/pro/my/utr3/data/refs/Homo_sapiens.GRCh38.115.chr.gff3.gz' already there; not retrieving.

File '/Users/andrejnekrasov/pro/my/utr3/data/gnomad/gnomad.v4.1.sv.sites.bed.gz' already there; not retrieving.



## Main

### Create PR/DF

In [7]:
gff_pr = pr.read_gff3(str(gff_filepath))

In [8]:
# Keep only the necessary columns (62 of 600...)
target_columns = ['#chrom', 'start', 'end', 'name', 'svtype', 'samples', 'MULTIALLELIC', 'ALGORITHMS', 'BOTHSIDES_SUPPORT', 'CHR2', 'CPX_INTERVALS', 'CPX_TYPE', 'END', 'END2', 'EVIDENCE', 'LOW_CONFIDENCE_REPETITIVE_LARGE_DUP', 'MEMBERS', 'NCR', 'OUTLIER_SAMPLE_ENRICHED_LENIENT', 'PAR', 'PCRMINUS_NCR', 'PCRPLUS_NCR', 'PESR_GT_OVERDISPERSION', 'POS2', 'PREDICTED_BREAKEND_EXONIC', 'PREDICTED_COPY_GAIN', 'PREDICTED_DUP_PARTIAL', 'PREDICTED_INTERGENIC', 'PREDICTED_INTRAGENIC_EXON_DUP', 'PREDICTED_INTRONIC', 'PREDICTED_INV_SPAN', 'PREDICTED_LOF', 'PREDICTED_MSV_EXON_OVERLAP', 'PREDICTED_NEAREST_TSS', 'PREDICTED_NONCODING_BREAKPOINT', 'PREDICTED_NONCODING_SPAN', 'PREDICTED_PARTIAL_DISPERSED_DUP', 'PREDICTED_PARTIAL_EXON_DUP', 'PREDICTED_PROMOTER', 'PREDICTED_TSS_DUP', 'PREDICTED_UTR', 'RESOLVED_POSTHOC', 'SOURCE', 'SVLEN', 'SVTYPE', 'UNRESOLVED_TYPE', 'AN', 'AC', 'AF', 'N_BI_GENOS', 'N_HOMREF', 'N_HET', 'N_HOMALT', 'FREQ_HOMREF', 'FREQ_HET', 'FREQ_HOMALT', 'CN_NUMBER', 'CN_COUNT', 'CN_STATUS', 'CN_FREQ', 'CN_NONREF_COUNT', 'CN_NONREF_FREQ']
target_columns[0:3] = ['Chromosome', 'Start', 'End']
sv_df = pd.read_csv(str(gnomad_sv_filepath), sep='\t', usecols=list(range(len(target_columns))), names=target_columns, comment='#')
sv_df['Chromosome'] = sv_df['Chromosome'].str.replace('chr', '')

sv_pr = pr.PyRanges(sv_df)

  sv_df = pd.read_csv(str(gnomad_sv_filepath), sep='\t', usecols=list(range(len(target_columns))), names=target_columns, comment='#')


### EDA (Exploratory Data Analysis)

In [9]:
gff_pr.Feature.value_counts()

Feature
exon                      3673949
CDS                       2284258
five_prime_UTR             426597
three_prime_UTR            340566
mRNA                       233574
lnc_RNA                    223403
biological_region          180392
ncRNA_gene                  41946
transcript                  28799
gene                        21547
pseudogenic_transcript      15201
pseudogene                  15198
snRNA                        1906
miRNA                        1879
unconfirmed_transcript       1108
snoRNA                        942
V_gene_segment                253
J_gene_segment                 97
scRNA                          49
rRNA                           49
D_gene_segment                 42
C_gene_segment                 29
chromosome                     25
tRNA                           22
processed_transcript           12
Name: count, dtype: int64

In [10]:
gff_pr.biotype.value_counts()

biotype
protein_coding                        231543
lncRNA                                224032
retained_intron                        34239
protein_coding_CDS_not_defined         26573
nonsense_mediated_decay                21949
processed_pseudogene                   18975
misc_RNA                                4414
unprocessed_pseudogene                  3898
snRNA                                   3802
miRNA                                   3758
transcribed_unprocessed_pseudogene      3176
transcribed_processed_pseudogene        2298
TEC                                     2127
snoRNA                                  1884
rRNA_pseudogene                          994
transcribed_unitary_pseudogene           402
IG_V_pseudogene                          374
IG_V_gene                                292
TR_V_gene                                214
unitary_pseudogene                       178
TR_J_gene                                158
non_stop_decay                           105
sc

In [11]:
sv_pr.svtype.value_counts()

svtype
DEL             1197080
BND              356035
DUP              269326
INS:ME:ALU       173374
INS               83441
INS:ME:LINE1      30223
INS:ME:SVA        17607
CPX               15189
DEL:ME:LINE1       8505
INV                2193
CNV                 721
DEL:ME:HERVK        693
CTX                  99
Name: count, dtype: int64

### Prepare GFF3 data

In [60]:
gff_pr['feature_len'] = (gff_pr['End'] - gff_pr['Start']).astype('int')

In [61]:
mrna_pr = gff_pr[gff_pr.Feature == 'mRNA']
three_utrs_pr = gff_pr[gff_pr.Feature == 'three_prime_UTR']

In [88]:
# Annotation of 3'-UTRs with mRNA (parent) data
target_columns = ['ID', 'Name', 'biotype', 'Parent', 'tag', 'transcript_support_level', 'feature_len'] # Replace the data in these empty 3'-UTRs columns with mRNA data
mrna_pr_short = mrna_pr[target_columns]
three_utrs_pr_annotated = three_utrs_pr.merge(mrna_pr_short, how='left', left_on='Parent', right_on='ID', suffixes=['_feature_suffix', ''])
three_utrs_pr_annotated[['Score', 'Frame']] = three_utrs_pr_annotated[['Score', 'Frame']].replace('.', np.nan)
three_utrs_pr_annotated = three_utrs_pr_annotated.dropna(axis=1, how='all') # Remove empty columns

# Stats
three_utrs_pr_annotated['biotype'].value_counts()



Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



biotype
protein_coding             202218
nonsense_mediated_decay    138264
protein_coding_LoF             50
Name: count, dtype: int64

In [89]:
# Keep only MANE_select / MANE_Select|Ensembl_canonical
three_utrs_pr_filtered = three_utrs_pr_annotated[three_utrs_pr_annotated['tag'].str.contains('MANE_Select', na=False, regex=True)]

# Stats
# MANE_select genes count = 19437 (release_1.5) from summary file: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.5/
three_utrs_pr_filtered['biotype'].value_counts()

biotype
protein_coding    19757
Name: count, dtype: int64

In [90]:
pd.DataFrame(three_utrs_pr_filtered)

Unnamed: 0,Chromosome,Source,Feature,Start,End,Strand,Parent_feature_suffix,feature_len_feature_suffix,ID,Name,biotype,Parent,tag,transcript_support_level,feature_len
0,1,havana,three_prime_UTR,70008,71585,+,transcript:ENST00000641515,1577,transcript:ENST00000641515,OR4F5-201,protein_coding,gene:ENSG00000186092,"gencode_basic,gencode_primary,Ensembl_canonica...",,6167.0000000000
2,1,havana,three_prime_UTR,944153,944574,+,transcript:ENST00000616016,421,transcript:ENST00000616016,SAMD11-209,protein_coding,gene:ENSG00000187634,"gencode_basic,gencode_primary,Ensembl_canonica...",5 (assigned to previous version 4),20652.0000000000
13,1,ensembl_havana,three_prime_UTR,944202,944693,-,transcript:ENST00000327044,491,transcript:ENST00000327044,NOC2L-201,protein_coding,gene:ENSG00000188976,"gencode_basic,gencode_primary,Ensembl_canonica...",1 (assigned to previous version 6),15054.0000000000
75,1,ensembl_havana,three_prime_UTR,965191,965719,+,transcript:ENST00000338591,528,transcript:ENST00000338591,KLHL17-201,protein_coding,gene:ENSG00000187961,"gencode_basic,gencode_primary,Ensembl_canonica...",1 (assigned to previous version 7),5136.0000000000
87,1,ensembl_havana,three_prime_UTR,974575,975865,+,transcript:ENST00000379410,1290,transcript:ENST00000379410,PLEKHN1-203,protein_coding,gene:ENSG00000187583,"gencode_basic,gencode_primary,Ensembl_canonica...",1 (assigned to previous version 7),9384.0000000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340532,Y,havana,three_prime_UTR,24905183,24907040,+,transcript:ENST00000682740,1857,transcript:ENST00000682740,DAZ4-208,protein_coding,gene:ENSG00000205916,"gencode_basic,gencode_primary,Ensembl_canonica...",,73122.0000000000
340541,Y,havana,three_prime_UTR,25030900,25031222,-,transcript:ENST00000382287,322,transcript:ENST00000382287,BPY2C-201,protein_coding,gene:ENSG00000185894,"gencode_basic,gencode_primary,Ensembl_canonica...",1,21204.0000000000
340542,Y,havana,three_prime_UTR,25031316,25031441,-,transcript:ENST00000382287,125,transcript:ENST00000382287,BPY2C-201,protein_coding,gene:ENSG00000185894,"gencode_basic,gencode_primary,Ensembl_canonica...",1,21204.0000000000
340543,Y,havana,three_prime_UTR,25037991,25038097,-,transcript:ENST00000382287,106,transcript:ENST00000382287,BPY2C-201,protein_coding,gene:ENSG00000185894,"gencode_basic,gencode_primary,Ensembl_canonica...",1,21204.0000000000


### Prepare SV data 

In [119]:
sv_del = sv_pr[sv_pr.svtype == "DEL"]
sv_del['sv_len'] = sv_del.End - sv_del.Start
sv_del['sv_id'] = sv_del['Chromosome'].astype('str') + ':' + sv_del['Start'].astype('str') + '-' + sv_del['End'].astype('str')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [120]:
pd.DataFrame(sv_del)

Unnamed: 0,Chromosome,Start,End,name,svtype,samples,MULTIALLELIC,ALGORITHMS,BOTHSIDES_SUPPORT,CHR2,CPX_INTERVALS,CPX_TYPE,END,END2,EVIDENCE,LOW_CONFIDENCE_REPETITIVE_LARGE_DUP,MEMBERS,NCR,OUTLIER_SAMPLE_ENRICHED_LENIENT,PAR,PCRMINUS_NCR,PCRPLUS_NCR,PESR_GT_OVERDISPERSION,POS2,PREDICTED_BREAKEND_EXONIC,PREDICTED_COPY_GAIN,PREDICTED_DUP_PARTIAL,PREDICTED_INTERGENIC,PREDICTED_INTRAGENIC_EXON_DUP,PREDICTED_INTRONIC,PREDICTED_INV_SPAN,PREDICTED_LOF,PREDICTED_MSV_EXON_OVERLAP,PREDICTED_NEAREST_TSS,PREDICTED_NONCODING_BREAKPOINT,PREDICTED_NONCODING_SPAN,PREDICTED_PARTIAL_DISPERSED_DUP,PREDICTED_PARTIAL_EXON_DUP,PREDICTED_PROMOTER,PREDICTED_TSS_DUP,PREDICTED_UTR,RESOLVED_POSTHOC,SOURCE,SVLEN,SVTYPE,UNRESOLVED_TYPE,AN,AC,AF,N_BI_GENOS,N_HOMREF,N_HET,N_HOMALT,FREQ_HOMREF,FREQ_HET,FREQ_HOMALT,CN_NUMBER,CN_COUNT,CN_STATUS,CN_FREQ,CN_NONREF_COUNT,CN_NONREF_FREQ,sv_len,sv_id
9,1,21999,30000,gnomAD-SV_v3_DEL_chr1_fa103016,DEL,,False,depth,False,chr1,,,30000,,RD,False,,0.2886910141,False,False,0.2986469865,0.1756809950,False,,,,,True,,,,,,OR4F5,,,,,,,,False,,8000,DEL,,88892.0000000000,1262.0000000000,0.0141970003,44446.0000000000,43184.0000000000,1262.0000000000,0.0000000000,0.9716060162,0.0283940006,0.0000000000,,,,,,,8001,1:21999-30000
10,1,39999,47000,gnomAD-SV_v3_DEL_chr1_b26f63f7,DEL,,False,depth,False,chr1,,,47000,,RD,False,,0.0000000000,False,False,0.0000000000,0.0000000000,False,,,,,True,,,,,,OR4F5,,,,,,,,False,,7000,DEL,,126092.0000000000,12574.0000000000,0.0997209996,63046.0000000000,50477.0000000000,12564.0000000000,5.0000000000,0.8006380200,0.1992830038,0.0000793072,,,,,,,7001,1:39999-47000
16,1,59100,59202,gnomAD-SV_v3_DEL_chr1_b0fb90d2,DEL,,False,manta,True,chr1,,,59202,,SR,False,,0.0004363670,False,False,0.0004498200,0.0002836610,False,,,,,True,,,,,,OR4F5,,,,,,,,False,,101,DEL,,126018.0000000000,1.0000000000,0.0000080000,63009.0000000000,63008.0000000000,1.0000000000,0.0000000000,0.9999840260,0.0000158707,0.0000000000,,,,,,,102,1:59100-59202
18,1,62398,62489,gnomAD-SV_v3_DEL_chr1_7ef5c6e8,DEL,,False,manta,True,chr1,,,62489,,SR,False,,0.0007196230,False,False,0.0006664000,0.0013237500,False,,,,,True,,,,,,OR4F5,,,,,,,,False,,90,DEL,,125984.0000000000,2.0000000000,0.0000160000,62992.0000000000,62990.0000000000,2.0000000000,0.0000000000,0.9999679923,0.0000317501,0.0000000000,,,,,,,91,1:62398-62489
22,1,66128,66613,gnomAD-SV_v3_DEL_chr1_ebfa4400,DEL,,False,manta,True,chr1,,,66613,,SR,False,,0.0172403008,False,False,0.0177679006,0.0112519003,False,,,,,False,,OR4F5,,,,,,,,,,,,False,,484,DEL,,123744.0000000000,61.0000000000,0.0004930000,61872.0000000000,61811.0000000000,61.0000000000,0.0000000000,0.9990140200,0.0009859060,0.0000000000,,,,,,,485,1:66128-66613
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2154479,Y,56878224,56878748,gnomAD-SV_v3_DEL_chrY_a1db9958,DEL,,False,wham,False,chrY,,,56878748,,SR,False,,0.5515679717,False,False,,0.5222939849,False,,,,,True,,,,,,CDY1,,,,,,,,False,,523,DEL,,26156.0000000000,27.0000000000,0.0010322700,26156.0000000000,26129.0000000000,27.0000000000,0.0000000000,0.9989680052,0.0010322700,0.0000000000,,,,,,,524,Y:56878224-56878748
2154480,Y,56878242,56878635,gnomAD-SV_v3_DEL_chrY_193296dc,DEL,,False,wham,False,chrY,,,56878635,,SR,False,,0.5075640082,True,False,,0.4839549959,False,,,,,True,,,,,,CDY1,,,,,,,,False,,392,DEL,,28950.0000000000,7.0000000000,0.0002417960,28950.0000000000,28943.0000000000,7.0000000000,0.0000000000,0.9997580051,0.0002417960,0.0000000000,,,,,,,393,Y:56878242-56878635
2154481,Y,56878257,56878816,gnomAD-SV_v3_DEL_chrY_2d25b4a6,DEL,,False,wham,False,chrY,,,56878816,,SR,False,,0.5053740144,True,False,,0.4834490120,False,,,,,True,,,,,,CDY1,,,,,,,,False,,558,DEL,,29172.0000000000,8.0000000000,0.0002742360,29172.0000000000,29164.0000000000,8.0000000000,0.0000000000,0.9997259974,0.0002742360,0.0000000000,,,,,,,559,Y:56878257-56878816
2154482,Y,56878270,56878718,gnomAD-SV_v3_DEL_chrY_ab4f307f,DEL,,False,wham,False,chrY,,,56878718,,PE,False,,0.9981470108,False,False,,0.9983959794,False,,,,,True,,,,,,CDY1,,,,,,,,False,,447,DEL,,146.0000000000,61.0000000000,0.4178079963,146.0000000000,85.0000000000,57.0000000000,4.0000000000,0.5821920037,0.3904109895,0.0273972992,,,,,,,448,Y:56878270-56878718


In [111]:
sv_del['AF'].describe(percentiles=[.1, .25, .5, .75, .9, .95, .99, .999])

count   1197080.0000000000
mean          0.0388642708
std           0.1348195999
min           0.0000080000
10%           0.0000080000
25%           0.0000160000
50%           0.0000500000
75%           0.0003650000
90%           0.0076041001
95%           0.5000000000
99%           0.5000000000
99.9%         0.9285709858
max           1.0000000000
Name: AF, dtype: float64

In [106]:
sv_del['sv_len'].describe(percentiles=[.1, .25, .5, .75, .9, .95, .99, .999])

count    1197080.0000000000
mean        5086.8619674541
std       154413.4211853280
min           51.0000000000
10%           99.0000000000
25%          324.0000000000
50%          609.0000000000
75%          964.0000000000
90%         5847.1000000001
95%        10927.0000000000
99%        82869.0499999998
99.9%     412913.1350000836
max     87762440.0000000000
Name: sv_len, dtype: float64

In [108]:
fig = px.histogram( 
    x=sv_del['sv_len'],
    nbins=300,
    log_y=True,
    title='gnomAD SV DEL lenght distribution (log)'
)
fig.show()

### Join GFF3 x gnomAD SV DEL

In [None]:
# Join GFF x SV
three_utr_sv_joined_pr = three_utrs_pr_filtered.join_overlaps(sv_del, suffix='_sv', report_overlap_column='overlap_len')

In [141]:
# Define joint type
three_utr_sv_joined_pr['join_type'] = three_utr_sv_joined_pr.apply(define_join_type, axis=1)
three_utr_sv_joined_pr['join_type'].value_counts()

join_type
feature_in_sv    20428
sv_in_feature     8318
sv_right_free     3701
sv_left_free      3455
Name: count, dtype: int64

In [142]:
# Count joints by SV
joints_count_by_sv = three_utr_sv_joined_pr['sv_id'].value_counts().to_frame().reset_index()
joints_count_by_sv = joints_count_by_sv.rename(columns={'count': 'joins_count'})
joints_count_by_sv

Unnamed: 0,sv_id,joins_count
0,2:49127869-136890309,447
1,5:85136274-161113193,436
2,X:35433370-85745406,314
3,7:101145532-150530080,301
4,1:58957768-111971259,270
...,...,...
24904,17:21241460-21245422,1
24905,17:21239766-21240394,1
24906,17:21239063-21240340,1
24907,17:21187110-21193500,1


In [143]:
# Add joints count data
three_utr_sv_joined_pr = three_utr_sv_joined_pr.merge(joints_count_by_sv, how='left')

In [144]:
pd.DataFrame(three_utr_sv_joined_pr).sort_values(by='sv_len')

Unnamed: 0,Chromosome,Source,Feature,Start,End,Strand,Parent_feature_suffix,feature_len_feature_suffix,ID,Name,biotype,Parent,tag,transcript_support_level,feature_len,Start_sv,End_sv,name,svtype,samples,MULTIALLELIC,ALGORITHMS,BOTHSIDES_SUPPORT,CHR2,CPX_INTERVALS,CPX_TYPE,END,END2,EVIDENCE,LOW_CONFIDENCE_REPETITIVE_LARGE_DUP,MEMBERS,NCR,OUTLIER_SAMPLE_ENRICHED_LENIENT,PAR,PCRMINUS_NCR,PCRPLUS_NCR,PESR_GT_OVERDISPERSION,POS2,PREDICTED_BREAKEND_EXONIC,PREDICTED_COPY_GAIN,PREDICTED_DUP_PARTIAL,PREDICTED_INTERGENIC,PREDICTED_INTRAGENIC_EXON_DUP,PREDICTED_INTRONIC,PREDICTED_INV_SPAN,PREDICTED_LOF,PREDICTED_MSV_EXON_OVERLAP,PREDICTED_NEAREST_TSS,PREDICTED_NONCODING_BREAKPOINT,PREDICTED_NONCODING_SPAN,PREDICTED_PARTIAL_DISPERSED_DUP,PREDICTED_PARTIAL_EXON_DUP,PREDICTED_PROMOTER,PREDICTED_TSS_DUP,PREDICTED_UTR,RESOLVED_POSTHOC,SOURCE,SVLEN,SVTYPE,UNRESOLVED_TYPE,AN,AC,AF,N_BI_GENOS,N_HOMREF,N_HET,N_HOMALT,FREQ_HOMREF,FREQ_HET,FREQ_HOMALT,CN_NUMBER,CN_COUNT,CN_STATUS,CN_FREQ,CN_NONREF_COUNT,CN_NONREF_FREQ,sv_len,sv_id,overlap_len,join_type,joins_count
16210,19,ensembl_havana,three_prime_UTR,16234435,16245906,+,transcript:ENST00000291439,11471,transcript:ENST00000291439,AP1M1-201,protein_coding,gene:ENSG00000072958,"gencode_basic,gencode_primary,Ensembl_canonica...",1 (assigned to previous version 7),47996.0000000000,16235860,16235911,gnomAD-SV_v3_DEL_chr19_3fff43cc,DEL,,False,manta,False,chr19,,,16235911,,SR,False,,0.0011330200,False,False,0.0007247100,0.0057677799,False,,,,,False,,,,,,,,,,,,,AP1M1,False,,50,DEL,,125962.0000000000,3.0000000000,0.0000240000,62981.0000000000,62978.0000000000,3.0000000000,0.0000000000,0.9999520183,0.0000476334,0.0000000000,,,,,,,51,19:16235860-16235911,51,sv_in_feature,1
33313,9,ensembl_havana,three_prime_UTR,86287732,86288676,-,transcript:ENST00000375963,944,transcript:ENST00000375963,TUT7-206,protein_coding,gene:ENSG00000083223,"gencode_basic,gencode_primary,Ensembl_canonica...",5 (assigned to previous version 7),66678.0000000000,86288323,86288374,gnomAD-SV_v3_DEL_chr9_a14a9bb6,DEL,,False,manta,False,chr9,,,86288374,,SR,False,,0.0013703500,False,False,0.0011078900,0.0043494701,False,,,,,False,,,,,,,,,,,,,TUT7,False,,50,DEL,,125896.0000000000,1.0000000000,0.0000080000,62948.0000000000,62947.0000000000,1.0000000000,0.0000000000,0.9999840260,0.0000158861,0.0000000000,,,,,,,51,9:86288323-86288374,51,sv_in_feature,1
29363,6,havana,three_prime_UTR,143940299,143941423,-,transcript:ENST00000674357,1124,transcript:ENST00000674357,PLAGL1-227,protein_coding,gene:ENSG00000118495,"gencode_basic,gencode_primary,Ensembl_canonica...",,67960.0000000000,143940528,143940579,gnomAD-SV_v3_DEL_chr6_d6ce38d8,DEL,,False,manta,False,chr6,,,143940579,,"PE,SR",False,,0.0013473800,True,False,0.0011828600,0.0032148301,False,,,,,False,,,,,,,,,,,,,PLAGL1,False,,50,DEL,,125896.0000000000,1.0000000000,0.0000080000,62948.0000000000,62947.0000000000,1.0000000000,0.0000000000,0.9999840260,0.0000158861,0.0000000000,,,,,,,51,6:143940528-143940579,51,sv_in_feature,1
16151,19,havana,three_prime_UTR,15088133,15092970,+,transcript:ENST00000641398,4837,transcript:ENST00000641398,OR1I1-202,protein_coding,gene:ENSG00000094661,"gencode_basic,gencode_primary,Ensembl_canonica...",,10760.0000000000,15088976,15089027,gnomAD-SV_v3_DEL_chr19_3360b1d2,DEL,,False,manta,False,chr19,,,15089027,,SR,False,,0.0014239301,False,False,0.0011328800,0.0047276900,False,,,,,False,,,,,,,,,,,,,OR1I1,False,,50,DEL,,125896.0000000000,1.0000000000,0.0000080000,62948.0000000000,62947.0000000000,1.0000000000,0.0000000000,0.9999840260,0.0000158861,0.0000000000,,,,,,,51,19:15088976-15089027,51,sv_in_feature,1
20647,20,havana,three_prime_UTR,326872,330224,+,transcript:ENST00000342665,3352,transcript:ENST00000342665,SOX12-201,protein_coding,gene:ENSG00000177732,"gencode_basic,gencode_primary,Ensembl_canonica...",NA (assigned to previous version 4),4673.0000000000,329782,329833,gnomAD-SV_v3_DEL_chr20_7f2d6f86,DEL,,False,manta,False,chr20,,,329833,,SR,False,,0.0010105300,False,False,0.0005997600,0.0056732199,False,,,,,False,,,,,,,,,,,,,SOX12,False,,50,DEL,,125984.0000000000,2.0000000000,0.0000160000,62992.0000000000,62990.0000000000,2.0000000000,0.0000000000,0.9999679923,0.0000317501,0.0000000000,,,,,,,51,20:329782-329833,51,sv_in_feature,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19293,2,ensembl_havana,three_prime_UTR,98793845,98794531,-,transcript:ENST00000397899,686,transcript:ENST00000397899,CRACDL-201,protein_coding,gene:ENSG00000196872,"gencode_basic,gencode_primary,Ensembl_canonica...",1 (assigned to previous version 6),142380.0000000000,49127869,136890309,gnomAD-SV_v3_DEL_chr2_5c4df4d6,DEL,,False,manta,False,chr2,,,136890309,,"PE,RD,SR",False,,0.0004669890,False,False,0.0005081300,0.0000000000,False,,,,,False,,,,"AAK1,ACMSD,ACOXL,ACTG2,ACTR1B,ACTR2,ACTR3,ACYP...",,,,,,,,,,False,,87762439,DEL,,126028.0000000000,61285.0000000000,0.4862810075,63014.0000000000,1740.0000000000,61263.0000000000,11.0000000000,0.0276129004,0.9722129703,0.0001745640,,,,,,,87762440,2:49127869-136890309,686,feature_in_sv,447
19294,2,ensembl_havana,three_prime_UTR,98997260,98998196,-,transcript:ENST00000393483,936,transcript:ENST00000393483,TSGA10-203,protein_coding,gene:ENSG00000135951,"gencode_basic,gencode_primary,Ensembl_canonica...",1 (assigned to previous version 7),157682.0000000000,49127869,136890309,gnomAD-SV_v3_DEL_chr2_5c4df4d6,DEL,,False,manta,False,chr2,,,136890309,,"PE,RD,SR",False,,0.0004669890,False,False,0.0005081300,0.0000000000,False,,,,,False,,,,"AAK1,ACMSD,ACOXL,ACTG2,ACTR1B,ACTR2,ACTR3,ACYP...",,,,,,,,,,False,,87762439,DEL,,126028.0000000000,61285.0000000000,0.4862810075,63014.0000000000,1740.0000000000,61263.0000000000,11.0000000000,0.0276129004,0.9722129703,0.0001745640,,,,,,,87762440,2:49127869-136890309,936,feature_in_sv,447
19296,2,havana,three_prime_UTR,99150834,99151669,+,transcript:ENST00000650052,835,transcript:ENST00000650052,C2orf15-203,protein_coding,gene:ENSG00000273045,"gencode_basic,gencode_primary,Ensembl_canonica...",,9963.0000000000,49127869,136890309,gnomAD-SV_v3_DEL_chr2_5c4df4d6,DEL,,False,manta,False,chr2,,,136890309,,"PE,RD,SR",False,,0.0004669890,False,False,0.0005081300,0.0000000000,False,,,,,False,,,,"AAK1,ACMSD,ACOXL,ACTG2,ACTR1B,ACTR2,ACTR3,ACYP...",,,,,,,,,,False,,87762439,DEL,,126028.0000000000,61285.0000000000,0.4862810075,63014.0000000000,1740.0000000000,61263.0000000000,11.0000000000,0.0276129004,0.9722129703,0.0001745640,,,,,,,87762440,2:49127869-136890309,835,feature_in_sv,447
19302,2,ensembl_havana,three_prime_UTR,99169262,99169374,-,transcript:ENST00000289359,112,transcript:ENST00000289359,MITD1-201,protein_coding,gene:ENSG00000158411,"gencode_basic,gencode_primary,Ensembl_canonica...",1,11796.0000000000,49127869,136890309,gnomAD-SV_v3_DEL_chr2_5c4df4d6,DEL,,False,manta,False,chr2,,,136890309,,"PE,RD,SR",False,,0.0004669890,False,False,0.0005081300,0.0000000000,False,,,,,False,,,,"AAK1,ACMSD,ACOXL,ACTG2,ACTR1B,ACTR2,ACTR3,ACYP...",,,,,,,,,,False,,87762439,DEL,,126028.0000000000,61285.0000000000,0.4862810075,63014.0000000000,1740.0000000000,61263.0000000000,11.0000000000,0.0276129004,0.9722129703,0.0001745640,,,,,,,87762440,2:49127869-136890309,112,feature_in_sv,447


In [None]:
# Filter joins by join type
targey_joint_types = ['sv_in_feature', 'sv_right_free', 'sv_left_free']
three_utr_sv_joined_pr_filtered = three_utr_sv_joined_pr.query('join_type in @targey_joint_types and ((Strand == "+" and join_type == "sv_right_free") or (Strand == "-" and join_type == "sv_left_free") or join_type == "sv_in_feature") and joins_count == 1')
three_utr_sv_joined_pr_filtered['join_type'].value_counts()

join_type
sv_in_feature    8124
sv_right_free    1924
sv_left_free     1589
Name: count, dtype: int64

In [154]:
three_utr_sv_joined_pr_filtered

Unnamed: 0,Chromosome,Source,Feature,Start,End,Strand,Parent_feature_suffix,feature_len_feature_suffix,ID,Name,...,CN_COUNT,CN_STATUS,CN_FREQ,CN_NONREF_COUNT,CN_NONREF_FREQ,sv_len,sv_id,overlap_len,join_type,joins_count
71,1,ensembl_havana,three_prime_UTR,1233268,1235041,+,transcript:ENST00000379198,1773,transcript:ENST00000379198,B3GALT6-201,...,,,,,,212,1:1234689-1234901,212,sv_in_feature,1
105,1,ensembl_havana,three_prime_UTR,1385710,1387230,-,transcript:ENST00000400809,1520,transcript:ENST00000400809,CCNL2-201,...,,,,,,1469,1:1384462-1385931,221,sv_left_free,1
106,1,ensembl_havana,three_prime_UTR,1385710,1387230,-,transcript:ENST00000400809,1520,transcript:ENST00000400809,CCNL2-201,...,,,,,,198,1:1385663-1385861,151,sv_left_free,1
108,1,ensembl_havana,three_prime_UTR,1427787,1430255,+,transcript:ENST00000378821,2468,transcript:ENST00000378821,TMEM88B-201,...,,,,,,96,1:1428033-1428129,96,sv_in_feature,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35627,X,ensembl_havana,three_prime_UTR,155054672,155060304,+,transcript:ENST00000369498,5632,transcript:ENST00000369498,FUNDC2-201,...,,,,,,70,X:155057590-155057660,70,sv_in_feature,1
35643,X,ensembl_havana,three_prime_UTR,155258234,155260802,-,transcript:ENST00000369454,2568,transcript:ENST00000369454,RAB39B-201,...,,,,,,602,X:155257764-155258366,132,sv_left_free,1
35658,X,havana,three_prime_UTR,155774738,155782459,+,transcript:ENST00000695325,7721,transcript:ENST00000695325,SPRY3-204,...,,,,,,11001,X:155777999-155789000,4460,sv_right_free,1
35659,X,havana,three_prime_UTR,155774738,155782459,+,transcript:ENST00000695325,7721,transcript:ENST00000695325,SPRY3-204,...,,,,,,131,X:155780046-155780177,131,sv_in_feature,1
