In [163]:
import os, subprocess
from pathlib import Path, PurePosixPath
import gzip
import json
import pandas as pd, numpy as np, pyranges as pr
import plotly.express as px

In [900]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.float_format', '{:.10f}'.format)

# UTR3

## Docs / notes

**Docs:**  
* [PyRanges v1.x GitHub](https://github.com/pyranges/pyranges_1.x) / [PyRanges v1.x Docs](https://pyranges1.readthedocs.io/en/latest/index.html)

**Notes:**  
 - !!! Нужно иметь ввиду, что позиции ФИЧ в GFF, например 3'-UTR, могут быть разделены на несколько интервалов, так как разделены, например, интронами.

**Tasks:**  
1. Привести все к одной нумерации позиций: GFF - 1-based, BED - 0-based
2. Проверить правильность offtarget_id. Сейчас это по факту позиции SV.

**Commands:**  
```{bash}
mamba env export -n utr3.venv > environment.yml
```

## Settings

In [3]:
main_path = Path.cwd()

In [353]:
# Create tree
refs_dir = main_path / 'data/refs'
gnomad_dir = main_path / 'data/gnomad'
other_dir = main_path / 'data/other'
output_dir = main_path / 'data/output'

Path(refs_dir).mkdir(parents=True, exist_ok=True)
Path(gnomad_dir).mkdir(parents=True, exist_ok=True)
Path(other_dir).mkdir(parents=True, exist_ok=True)
Path(output_dir).mkdir(parents=True, exist_ok=True)

## Functions

In [649]:
def fetch_file(link, output_dir):
    command = f'wget --no-clobber -P {output_dir} {link}'
    subprocess.run(command, shell=True)
    filename = PurePosixPath(link).name
    return output_dir / filename

def index_gff(input_filepath):
    command = f'igvtools index {input_filepath}'
    subprocess.run(command, shell=True)

def gunzip_file(input_filepath):
    output_filepath = input_filepath.with_suffix('')
    command = f'gunzip -c {input_filepath} > {output_filepath}'
    subprocess.run(command, shell=True)
    return output_filepath

def sort_gff(input_filepath):
    output_filepath = input_filepath.with_stem(input_filepath.stem + ".sorted")
    command = f'igvtools sort {input_filepath} {output_filepath}'
    subprocess.run(command, shell=True)
    return output_filepath

def index_gff(input_filepath):
    command = f'igvtools index {input_filepath}'
    subprocess.run(command, shell=True)

def define_join_type(row):
    feature_start = row['Start']
    feature_end = row['End']
    sv_start = row['Start_sv']
    sv_end = row['End_sv']

    join_type = np.nan
    if feature_start <= sv_start and feature_end >= sv_end: # SV полностью в feature
        join_type = 'sv_in_feature'
    elif sv_start <= feature_start and sv_end >= feature_end: # feature полностью в sv
        join_type = 'feature_in_sv'
    elif sv_start < feature_start and sv_end <= sv_end:
        join_type = 'sv_left_free'
    elif sv_start > feature_start and sv_end >= sv_end:
        join_type = 'sv_right_free'
    elif sv_start == feature_start and sv_end == sv_end:
        join_type = 'full_join'

    return join_type

## Fetch data

In [545]:
gff_filepath = fetch_file('https://ftp.ensembl.org/pub/release-115/gff3/homo_sapiens/Homo_sapiens.GRCh38.115.gff3.gz', refs_dir)
fasta_filepath = fetch_file('https://ftp.ensembl.org/pub/release-115/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz', refs_dir)
gnomad_sv_filepath = fetch_file('https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/genome_sv/gnomad.v4.1.sv.sites.bed.gz', gnomad_dir)

so_terms_filepath = fetch_file('https://raw.githubusercontent.com/The-Sequence-Ontology/SO-Ontologies/refs/heads/master/Ontology_Files/so.json', other_dir)

File '/Users/andrejnekrasov/pro/my/utr3/data/refs/Homo_sapiens.GRCh38.115.gff3.gz' already there; not retrieving.

File '/Users/andrejnekrasov/pro/my/utr3/data/refs/Homo_sapiens.GRCh38.dna.toplevel.fa.gz' already there; not retrieving.

File '/Users/andrejnekrasov/pro/my/utr3/data/gnomad/gnomad.v4.1.sv.sites.bed.gz' already there; not retrieving.

File '/Users/andrejnekrasov/pro/my/utr3/data/other/so.json' already there; not retrieving.



## Prepare data

In [650]:
gff_gunzipped_filepath = gunzip_file(gff_filepath)
gff_sorted_filepath = sort_gff(gff_gunzipped_filepath)
index_gff(gff_sorted_filepath)

Sorting /Users/andrejnekrasov/pro/my/utr3/data/refs/Homo_sapiens.GRCh38.115.gff3  -> /Users/andrejnekrasov/pro/my/utr3/data/refs/Homo_sapiens.GRCh38.115.sorted.gff3
Done
Done


## <span style="color:#00ff00;">Main</span>

### <span style="color:#00ff00;">Create PR/DF<span>

In [None]:
# GFF
gff_pr = pr.read_gff3(str(gff_filepath))

In [None]:
# SV
# Keep only the necessary columns (62 of 600...) in SV
target_columns = ['#chrom', 'start', 'end', 'name', 'svtype', 'samples', 'MULTIALLELIC', 'ALGORITHMS', 'BOTHSIDES_SUPPORT', 'CHR2', 'CPX_INTERVALS', 'CPX_TYPE', 'END', 'END2', 'EVIDENCE', 'LOW_CONFIDENCE_REPETITIVE_LARGE_DUP', 'MEMBERS', 'NCR', 'OUTLIER_SAMPLE_ENRICHED_LENIENT', 'PAR', 'PCRMINUS_NCR', 'PCRPLUS_NCR', 'PESR_GT_OVERDISPERSION', 'POS2', 'PREDICTED_BREAKEND_EXONIC', 'PREDICTED_COPY_GAIN', 'PREDICTED_DUP_PARTIAL', 'PREDICTED_INTERGENIC', 'PREDICTED_INTRAGENIC_EXON_DUP', 'PREDICTED_INTRONIC', 'PREDICTED_INV_SPAN', 'PREDICTED_LOF', 'PREDICTED_MSV_EXON_OVERLAP', 'PREDICTED_NEAREST_TSS', 'PREDICTED_NONCODING_BREAKPOINT', 'PREDICTED_NONCODING_SPAN', 'PREDICTED_PARTIAL_DISPERSED_DUP', 'PREDICTED_PARTIAL_EXON_DUP', 'PREDICTED_PROMOTER', 'PREDICTED_TSS_DUP', 'PREDICTED_UTR', 'RESOLVED_POSTHOC', 'SOURCE', 'SVLEN', 'SVTYPE', 'UNRESOLVED_TYPE', 'AN', 'AC', 'AF', 'N_BI_GENOS', 'N_HOMREF', 'N_HET', 'N_HOMALT', 'FREQ_HOMREF', 'FREQ_HET', 'FREQ_HOMALT', 'CN_NUMBER', 'CN_COUNT', 'CN_STATUS', 'CN_FREQ', 'CN_NONREF_COUNT', 'CN_NONREF_FREQ']
target_columns[0:3] = ['Chromosome', 'Start', 'End']
sv_df = pd.read_csv(str(gnomad_sv_filepath), sep='\t', usecols=list(range(len(target_columns))), names=target_columns, comment='#')
sv_df['Chromosome'] = sv_df['Chromosome'].str.replace('chr', '')

sv_pr = pr.PyRanges(sv_df)

In [911]:
# SO terms
with open(so_terms_filepath, 'r') as so_terms_file:
    so_terms = json.load(so_terms_file)

so_terms_filtered = {
    'lbl': [],
    'definition': []
}
for i in so_terms['graphs'][0]['nodes']:
    lbl = i.get('lbl', np.nan)
    definition = i.get('meta', {}).get('definition', {}).get('val', np.nan)
    so_terms_filtered['lbl'].append(lbl)
    so_terms_filtered['definition'].append(definition)

so_terms_df = pd.DataFrame.from_dict(so_terms_filtered)
so_terms_df['lbl'] = so_terms_df['lbl'].str.capitalize().replace('_', '')

### <span style="color:#00ff00;">EDA (Exploratory Data Analysis)</span>

In [None]:
# GFF features stats
gff_features = gff_pr.Feature.value_counts().to_frame().reset_index()
gff_features['tmp_id'] = gff_features['Feature'].str.capitalize().replace('_', '')
gff_features = gff_features.merge(so_terms_df, how='left', left_on='tmp_id', right_on='lbl')
gff_features = gff_features.drop(['lbl', 'tmp_id'], axis=1)
pd.DataFrame(gff_features)

Unnamed: 0,Feature,count,definition
0,exon,3673949,A region of the transcript sequence within a gene which is not removed from the primary RNA transcript by RNA splicing.
1,CDS,2284258,"A contiguous sequence which begins with, and includes, a start codon and ends with, and includes, a stop codon."
2,five_prime_UTR,426597,A region at the 5' end of a mature transcript (preceding the initiation codon) that is not translated into a protein.
3,three_prime_UTR,340566,A region at the 3' end of a mature transcript (following the stop codon) that is not translated into a protein.
4,mRNA,233574,Messenger RNA is the intermediate molecule between DNA and protein. It includes UTR and coding sequences. It does not contain introns.
5,lnc_RNA,223403,
6,biological_region,180392,A region defined by its disposition to be involved in a biological process.
7,ncRNA_gene,41946,A gene that encodes a non-coding RNA.
8,transcript,28799,An RNA synthesized on a DNA or RNA template by an RNA polymerase.
9,gene,21547,"A region (or regions) that includes all of the sequence elements necessary to encode a functional transcript. A gene may include regulatory regions, transcribed regions and/or other functional sequence regions."


In [None]:
# GFF biotypes stats
gff_biotypes = gff_pr.biotype.value_counts().to_frame().reset_index()
gff_biotypes['tmp_id'] = gff_biotypes['biotype'].str.capitalize().replace('_', '')
gff_biotypes = gff_biotypes.merge(so_terms_df, how='left', left_on='tmp_id', right_on='lbl')
gff_biotypes = gff_biotypes.drop(['lbl', 'tmp_id'], axis=1)
pd.DataFrame(gff_biotypes)

Unnamed: 0,biotype,count,definition
0,protein_coding,231543,"A gene which, when transcribed, can be translated into a protein."
1,lncRNA,224032,"A non-coding RNA generally longer than 200 nucleotides that cannot be classified as any other ncRNA subtype. Similar to mRNAs, lncRNAs are mainly transcribed by RNA polymerase II, are often capped by 7-methyl guanosine at their 5' ends, polyadenylated at their 3' ends and may be spliced."
2,retained_intron,34239,
3,protein_coding_CDS_not_defined,26573,
4,nonsense_mediated_decay,21949,
5,processed_pseudogene,18975,"A pseudogene created via retrotranposition of the mRNA of a functional protein-coding parent gene followed by accumulation of deleterious mutations lacking introns and promoters, often including a polyA tail."
6,misc_RNA,4414,
7,unprocessed_pseudogene,3898,
8,snRNA,3802,A small nuclear RNA molecule involved in pre-mRNA splicing and processing.
9,miRNA,3758,"Small, ~22-nt, RNA molecule that is the endogenous transcript of a miRNA gene (or the product of other non coding RNA genes). Micro RNAs are produced from precursor molecules (SO:0001244) that can form local hairpin structures, which ordinarily are processed (usually via the Dicer pathway) such that a single miRNA molecule accumulates from one arm of a hairpin precursor molecule. Micro RNAs may trigger the cleavage of their target molecules or act as translational repressors."


In [547]:
sv_pr.svtype.value_counts().to_frame().reset_index()

Unnamed: 0,svtype,count
0,DEL,1197080
1,BND,356035
2,DUP,269326
3,INS:ME:ALU,173374
4,INS,83441
5,INS:ME:LINE1,30223
6,INS:ME:SVA,17607
7,CPX,15189
8,DEL:ME:LINE1,8505
9,INV,2193


### <span style="color:#00ff00;">Prepare GFF3 data</span>

In [856]:
# Self annotation of GFF with mRNA (parent) data
target_columns = ['ID', 'Name', 'biotype', 'Parent', 'tag', 'transcript_support_level', 'feature_len', 'feature_id'] # Replace the data in these empty columns with mRNA data
annotation_gff_pr = gff_pr[target_columns].dropna(subset='ID')

annotation_suffix = '_annotation'
annotated_gff_pr = gff_pr.merge(annotation_gff_pr, how='left', left_on='Parent', right_on='ID', suffixes=['', annotation_suffix])

annotated_columns = [f'{column}{annotation_suffix}' for column in target_columns]
fill_dict = dict(zip(target_columns, annotated_columns))
for target, annotation in fill_dict.items():
    annotated_gff_pr[target] = annotated_gff_pr[target].fillna(annotated_gff_pr[annotation])
annotated_gff_pr = annotated_gff_pr.drop(annotated_columns, axis=1)

In [None]:
# Add additional columns
annotated_gff_pr['feature_len'] = (annotated_gff_pr['End'] - annotated_gff_pr['Start']).astype('int')
annotated_gff_pr['feature_id'] = 'feature_' + annotated_gff_pr['Chromosome'].astype('str') + ':' + annotated_gff_pr['Start'].astype('str') + '-' + annotated_gff_pr['End'].astype('str')

In [None]:
# Create specific PRs (mRNAm, 3'-UTR, offtargets)
mrna_pr = annotated_gff_pr.query('Feature == "mRNA"')
three_utrs_pr = annotated_gff_pr.query('Feature == "three_prime_UTR"')

offtarget_features = ['five_prime_UTR',
                               'CDS',
                               #'lnc_RNA',
                               'ncRNA_gene',
                               #'snRNA',
                               #'snoRNA',
                               #'scRNA',
                               'rRNA',
                               'tRNA',
                               'processed_transcript',
                               ]
offtarget_features_pr = gff_pr.query('Feature in @offtarget_features')

In [912]:
# 3'-UTR biotype stats
# Считаем именно уникальные вхождения, так как один 3'-UTR может быть разбить на несколько интервалов в GFF (разделен интронами).
# В особенности это выражено у nonsense_mediated_decay транскриптов (нам они не пригодятся, но тем не менее).
three_utrs_pr.drop_duplicates(subset='ID')['biotype'].value_counts().to_frame().reset_index()


Unnamed: 0,biotype,count
0,protein_coding,190254
1,nonsense_mediated_decay,21947
2,protein_coding_LoF,43
3,IG_C_gene,22
4,TR_C_gene,6
5,IG_V_gene,2


In [913]:
# Keep only MANE_select / MANE_Select|Ensembl_canonical
three_utrs_pr_filtered = three_utrs_pr[three_utrs_pr['tag'].str.contains('MANE_Select', na=False, regex=True)]

# Stats
# MANE_select genes count = 19437 (release_1.5) from summary file: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.5/
three_utrs_pr_filtered['biotype'].value_counts().to_frame().reset_index()

Unnamed: 0,biotype,count
0,protein_coding,19757


### <span style="color:#00ff00;">Prepare SV data</span>

#### <span style="color:green;">gnomAD</span>

In [None]:
# Keep target SVs (DEL)
sv_targets = ['DEL']
target_sv_pr = sv_pr.query('svtype in @sv_targets')

# Add columns
target_sv_pr['sv_len'] = target_sv_pr.End - target_sv_pr.Start
target_sv_pr['sv_id'] = 'sv_' + target_sv_pr['Chromosome'].astype('str') + ':' + target_sv_pr['Start'].astype('str') + '-' + target_sv_pr['End'].astype('str')

In [914]:
# SV percentiles by AF
target_sv_pr['AF'].describe(percentiles=[.1, .25, .5, .75, .9, .95, .99, .999]).to_frame().reset_index()

Unnamed: 0,index,AF
0,count,1197080.0
1,mean,0.0388642708
2,std,0.1348195999
3,min,8e-06
4,10%,8e-06
5,25%,1.6e-05
6,50%,5e-05
7,75%,0.000365
8,90%,0.0076041001
9,95%,0.5


In [None]:
# SV percentiles by SV length
target_sv_pr['sv_len'].describe(percentiles=[.1, .25, .5, .75, .9, .95, .99, .999]).to_frame().reset_index()

Unnamed: 0,index,sv_len
0,count,1197080.0
1,mean,5086.8619674541
2,std,154413.421185328
3,min,51.0
4,10%,99.0
5,25%,324.0
6,50%,609.0
7,75%,964.0
8,90%,5847.1000000001
9,95%,10927.0


In [908]:
'''fig = px.histogram( 
    x=target_sv_pr['sv_len'],
    nbins=300,
    log_y=True,
    title='gnomAD SV DEL lenght distribution (log)'
)
fig.show()'''

"fig = px.histogram( \n    x=target_sv_pr['sv_len'],\n    nbins=300,\n    log_y=True,\n    title='gnomAD SV DEL lenght distribution (log)'\n)\nfig.show()"

### <span style="color:#00ff00;">Join GFF3 x gnomAD SV DEL</span>

In [867]:
# Join GFF x SV
features_sv_joined_pr = three_utrs_pr_filtered.join_overlaps(target_sv_pr, suffix='_sv', report_overlap_column='join_len')

In [None]:
# Add additional columns (join_start, join_end, join_id)
features_sv_joined_pr['join_start'] = np.maximum(features_sv_joined_pr['Start'], features_sv_joined_pr['Start_sv'])
features_sv_joined_pr['join_end'] = np.minimum(features_sv_joined_pr['End'], features_sv_joined_pr['End_sv'])
features_sv_joined_pr['join_id'] = 'join_' + features_sv_joined_pr['Chromosome'].astype('str') + ':' + features_sv_joined_pr['join_start'].astype('str') + '-' + features_sv_joined_pr['join_end'].astype('str')

In [916]:
# Define joint type
features_sv_joined_pr['join_type'] = features_sv_joined_pr.apply(define_join_type, axis=1)
features_sv_joined_pr['join_type'].value_counts().to_frame().reset_index()

Unnamed: 0,join_type,count
0,feature_in_sv,20428
1,sv_in_feature,8318
2,sv_right_free,3701
3,sv_left_free,3455


In [None]:
# Count joints by SV
joints_count_by_sv = features_sv_joined_pr['sv_id'].value_counts().to_frame().reset_index()
joints_count_by_sv = joints_count_by_sv.rename(columns={'count': 'joins_count'})

# Add joints count data
features_sv_joined_pr = features_sv_joined_pr.merge(joints_count_by_sv, how='left')
#len(features_sv_joined_pr)

# Filter joins by join type
targey_joint_types = ['sv_in_feature', 'sv_right_free', 'sv_left_free']
features_sv_joined_pr_filtered = features_sv_joined_pr.query('join_type in @targey_joint_types and ((Strand == "+" and join_type == "sv_right_free") or (Strand == "-" and join_type == "sv_left_free") or join_type == "sv_in_feature") and joins_count == 1')

#### <span style="color:green;">Join type stats</span>

In [None]:
# Stats by join type
features_sv_joined_pr_filtered.groupby('join_type')['sv_len'].describe(percentiles=[.1, .25, .5, .75, .9, .95, .99, .999]).T

join_type,sv_in_feature,sv_left_free,sv_right_free
count,8124.0,1589.0,1924.0
mean,538.8242245199,4629.9559471366,4550.9636174636
std,632.0622349426,14678.7805179974,15540.4454833119
min,51.0,52.0,53.0
10%,77.0,454.6,487.0
25%,175.0,627.0,628.0
50%,515.0,1325.0,1102.0
75%,657.0,5075.0,4697.5
90%,796.0,9015.6,9576.1
95%,1135.85,13867.2,13220.1


In [917]:
features_sv_joined_pr_filtered['biotype'].value_counts().to_frame().reset_index()

Unnamed: 0,biotype,count
0,protein_coding,11637


### <span style="color:#00ff00;">Offtarget features exclusion</span>

In [1142]:
# 
features_sv_joined_pr_filtered_cp = features_sv_joined_pr_filtered.copy()
features_sv_joined_pr_filtered_cp[['Start_tmp', 'End_tmp']] = features_sv_joined_pr_filtered_cp[['Start', 'End']] # Save old positions
features_sv_joined_pr_filtered_cp[['Start', 'End']] = features_sv_joined_pr_filtered_cp[['Start_sv', 'End_sv']] # SV positions to main positions

In [1143]:
# features_sv_offtereget_joined_pr - только для отслеживания пересечений с offtarget. Фильтрованный от offtarget pr - features_sv_no_offtereget_joined_pr (через overlap, потому что join находя пересечения с offtarget (через left) также оставляет оригинальную строку).DS_Storefeatures_sv_offtereget_joined_pr = features_sv_joined_pr_filtered_cp.join_overlaps(offtarget_features_pr, join_type='left', suffix='_offtarget', report_overlap_column='offtarget_join_len')
features_sv_offtereget_joined_pr = features_sv_joined_pr_filtered_cp.join_overlaps(offtarget_features_pr, join_type='left', suffix='_offtarget', report_overlap_column='offtarget_join_len', strand_behavior='ignore')

features_sv_offtereget_joined_pr[['Start', 'End']] = features_sv_offtereget_joined_pr[['Start_tmp', 'End_tmp']]
features_sv_offtereget_joined_pr = features_sv_offtereget_joined_pr.drop(['Start_tmp', 'End_tmp'], axis=1)

features_sv_offtereget_joined_pr['offtarget_id'] = 'offtarget_' + features_sv_offtereget_joined_pr['Chromosome'].astype('str') + ':' + features_sv_offtereget_joined_pr['Start_offtarget'].astype('str') + '-' + features_sv_offtereget_joined_pr['End_offtarget'].astype('str')

In [None]:
# Features offtargets stats
# Суммарные значения данной статистики будут выше чем разница всех джоинов - оффтартет джоинов, так как один и тот же SV (напр. DEL) могут пересекать разные изоформы одного и того же транскрипта.
features_sv_offtereget_joined_pr.groupby(['join_type', 'Feature_offtarget'])['Feature_offtarget'].agg(['count'])
#features_sv_offtereget_joined_pr.drop_duplicates(subset='sv_id').groupby(['join_type', 'Feature_offtarget'])['Feature_offtarget'].agg(['count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,count
join_type,Feature_offtarget,Unnamed: 2_level_1
sv_in_feature,CDS,166
sv_in_feature,five_prime_UTR,6
sv_in_feature,ncRNA_gene,1512
sv_in_feature,processed_transcript,1
sv_left_free,CDS,112
sv_left_free,five_prime_UTR,31
sv_left_free,ncRNA_gene,403
sv_left_free,snoRNA,1
sv_right_free,CDS,91
sv_right_free,five_prime_UTR,57


In [1147]:
# Exclude offtargets (SV x offtarget overlaps)
features_sv_no_offtereget_joined_pr = features_sv_joined_pr_filtered_cp.overlap(offtarget_features_pr, invert=True, strand_behavior='ignore')

features_sv_no_offtereget_joined_pr[['Start', 'End']] = features_sv_no_offtereget_joined_pr[['Start_tmp', 'End_tmp']]
features_sv_no_offtereget_joined_pr = features_sv_no_offtereget_joined_pr.drop(['Start_tmp', 'End_tmp'], axis=1)

In [None]:
# Joint type stats after offtargets exclution
print('Before offtargets filtering')
print(features_sv_joined_pr_filtered['join_type'].value_counts().to_frame().reset_index())
print('\nAfter offtargets filtering')
print(features_sv_no_offtereget_joined_pr['join_type'].value_counts().to_frame().reset_index())

Before offtarget filtering
       join_type  count
0  sv_in_feature   8124
1  sv_right_free   1924
2   sv_left_free   1589

After offtarget filtering
       join_type  count
0  sv_in_feature   6439
1  sv_right_free   1278
2   sv_left_free   1042


### <span style="color:#00ff00;">GFF output for IGV</span>

In [1134]:
# Create universal pr fot igv
pr_for_igv = features_sv_no_offtereget_joined_pr

In [1135]:
# Create empty GFF pr
output_gff_columns = ['Chromosome', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'ID', 'color', 'type', 'Name', 'biotype', 'tag', 'transcript_id']
output_gff_pr = pr.PyRanges(columns=output_gff_columns)

In [1136]:
# Add GFF data
output_features_pr = pr_for_igv[['Chromosome', 'Feature', 'Start', 'End', 'Strand', 'Name', 'biotype', 'tag', 'ID']]
output_features_pr = output_features_pr.drop_duplicates()
output_features_pr['Source'] = 'GFF'
output_features_pr['color'] = 'green'

output_gff_pr = pr.concat([output_gff_pr, output_features_pr])
len(output_gff_pr)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.



5428

In [1137]:
# Add SV data
output_sv_pr = pr_for_igv[['Chromosome', 'Start_sv', 'End_sv', 'svtype']]
output_sv_pr = output_sv_pr.drop_duplicates()
output_sv_pr['Source'] = 'gnomAD_SV'
output_sv_pr['color'] = 'red'
output_sv_pr['ID'] = pr_for_igv['sv_id']

output_sv_pr = pr.PyRanges(output_sv_pr.rename(columns={'Start_sv': 'Start', 'End_sv': 'End', 'svtype': 'Feature'}))
output_gff_pr = pr.concat([output_gff_pr, output_sv_pr])
len(output_gff_pr)

16025

In [1138]:
# Add joins data
output_joins_pr = pr_for_igv[['Chromosome', 'join_start', 'join_end', 'join_type']]
output_joins_pr = output_joins_pr.drop_duplicates()
output_joins_pr['Source'] = 'join'
output_joins_pr['color'] = 'orange'
output_joins_pr['ID'] = pr_for_igv['join_id']

output_joins_pr = pr.PyRanges(output_joins_pr.rename(columns={'join_start': 'Start', 'join_end': 'End', 'join_type': 'Feature'}))
output_gff_pr = pr.concat([output_gff_pr, output_joins_pr])
len(output_gff_pr)

26603

In [1139]:
# Add offtarget GFF (features) data
output_offtargets_pr = features_sv_offtereget_joined_pr[['Chromosome', 'Start_offtarget', 'End_offtarget', 'Feature_offtarget', 'offtarget_id']]
output_offtargets_pr = output_offtargets_pr.query('Start_offtarget == Start_offtarget')
output_offtargets_pr = output_offtargets_pr.drop_duplicates()
output_offtargets_pr['Source'] = 'GFF_offtarget'
output_offtargets_pr['color'] = '#a3adac'

output_offtargets_pr = pr.PyRanges(output_offtargets_pr.rename(columns={'Start_offtarget': 'Start', 'End_offtarget': 'End', 'Feature_offtarget': 'Feature', 'offtarget_id': 'ID'}))
output_gff_pr = pr.concat([output_gff_pr, output_offtargets_pr])
len(output_gff_pr)

30416

In [1140]:
# Stats
output_gff_pr['Source'].value_counts().to_frame().reset_index()

Unnamed: 0,Source,count
0,gnomAD_SV,10597
1,join,10578
2,GFF,5428
3,GFF_offtarget,3813


In [1141]:
# Fill NA and save
output_gff_pr = output_gff_pr.reset_index(drop=True)
output_gff_pr = pr.PyRanges(output_gff_pr.fillna('.'))

output_gff_pr.to_gff3(output_dir / 'regions_for_igv.gff3')