In [11]:
import glob
import tabix
import csv
import numpy as np

from itertools import groupby
from operator import itemgetter

import pyranges as pr
import pandas as pd

In [12]:
# Chromosome name -> RefSeq accession used as the sequence ID in the
# annotation GFF. Defined once at module level so the table is not rebuilt
# on every lookup.
_CHROM_TO_REFSEQ_ACCESSION = {
    '1': 'NC_000001.10',
    '2': 'NC_000002.11',
    '3': 'NC_000003.11',
    '4': 'NC_000004.11',
    '5': 'NC_000005.9',
    '6': 'NC_000006.11',
    '7': 'NC_000007.13',
    '8': 'NC_000008.10',
    '9': 'NC_000009.11',
    '10': 'NC_000010.10',
    '11': 'NC_000011.9',
    '12': 'NC_000012.11',
    '13': 'NC_000013.10',
    '14': 'NC_000014.8',
    '15': 'NC_000015.9',
    '16': 'NC_000016.9',
    '17': 'NC_000017.10',
    '18': 'NC_000018.9',
    '19': 'NC_000019.9',
    '20': 'NC_000020.10',
    '21': 'NC_000021.8',
    '22': 'NC_000022.10',
    'X': 'NC_000023.10',
    'Y': 'NC_000024.9',
    'MT': 'NC_012920.1',
}


def map_chromosome(chrom):
    """Translate a chromosome name into its RefSeq accession.

    Args:
        chrom: Chromosome label such as ``'1'``, ``'X'`` or ``'MT'``.
            Integers (e.g. ``1``, as produced when a CSV column is parsed
            as numeric) and ``chr``-prefixed names (e.g. ``'chr1'``) are
            accepted as well.

    Returns:
        The RefSeq accession string for the chromosome.

    Raises:
        KeyError: If the chromosome is not one of 1-22, X, Y or MT.
    """
    key = str(chrom).strip()

    # Only strip the prefix when the raw value is not itself a known key.
    if key not in _CHROM_TO_REFSEQ_ACCESSION and key.lower().startswith('chr'):
        key = key[3:]

    try:
        return _CHROM_TO_REFSEQ_ACCESSION[key]
    except KeyError:
        raise KeyError(
            f'No RefSeq accession known for chromosome {chrom!r}'
        ) from None



def get_overlapping_genome_features(chrom, pos, end, annotations_pr):
    """Find the annotation records that overlap a single interval.

    Args:
        chrom: Chromosome name; translated with ``map_chromosome`` so it
            matches the sequence IDs used in ``annotations_pr``.
        pos: Interval start, passed through as the PyRanges ``Start``.
        end: Interval end, passed through as the PyRanges ``End``.
        annotations_pr: PyRanges object holding the genome annotations.

    Returns:
        A pandas DataFrame: the one-row query interval joined against
        ``annotations_pr`` and converted with ``as_df()``.
    """
    query_interval = pr.from_dict({
        'Chromosome': [map_chromosome(chrom)],
        'Start': [pos],
        'End': [end],
    })

    return query_interval.join(annotations_pr).as_df()

In [13]:
def parse_genome_features(overlapping_genome_features):
    """Group overlapping annotation rows into genes, transcripts and exons.

    Args:
        overlapping_genome_features: DataFrame with one row per overlapping
            annotation feature. The columns read are ``Feature``, ``ID``,
            ``Name``, ``Start_b``, ``End_b``, ``gene_biotype``, ``Parent``
            and (when present) ``transcript_id``.

    Returns:
        A dict with three entries:

        * ``'genes'``: feature ID -> ``{'name', 'start', 'end', 'biotype'}``.
        * ``'transcripts'``: feature ID -> ``{'name', 'start', 'end',
          'gene'}`` where ``'gene'`` is the parent feature ID. ``'name'``
          falls back to the feature ID when the row has no ``Name``.
        * ``'transcript_exons'``: transcript key -> ``{'exons': [[start,
          end, exon_label], ...], 'gene': <gene record dict>}``. The key is
          the exon's ``transcript_id``, or the parent feature ID when the
          exon has no ``transcript_id`` or hangs directly off a gene-level
          feature.

    Raises:
        KeyError: If an exon's transcript refers to a parent gene that was
            not collected into ``genes``.
    """
    gene_feature_types = {'gene', 'pseudogene', 'miRNA', 'tRNA'}
    transcript_feature_types = {
        'transcript', 'mRNA', 'lnc_RNA', 'primary_transcript', 'snoRNA',
        'antisense_RNA', 'snRNA', 'guide_RNA', 'C_gene_segment',
        'V_gene_segment', 'J_gene_segment', 'D_gene_segment', 'scRNA',
    }

    genes = {}
    transcripts = {}
    transcript_exons = {}

    # Pass 1: collect gene-level and transcript-level features so that exon
    # parents can be resolved regardless of row order.
    for row in overlapping_genome_features.itertuples():

        if row.Feature in gene_feature_types:

            # miRNA / tRNA rows use their feature type as the biotype.
            if row.Feature in ('miRNA', 'tRNA'):
                biotype = row.Feature
            else:
                biotype = row.gene_biotype

            genes[row.ID] = {
                'name': row.Name,
                'start': row.Start_b,
                'end': row.End_b,
                'biotype': biotype,
            }

        if row.Feature in transcript_feature_types:

            # Fall back to the feature ID when the transcript has no Name.
            name = row.ID if pd.isnull(row.Name) else row.Name

            transcripts[row.ID] = {
                'name': name,
                'start': row.Start_b,
                'end': row.End_b,
                'gene': row.Parent,
            }

    # Pass 2: attach each exon to its transcript (or directly to its gene).
    for row in overlapping_genome_features.itertuples():

        if row.Feature != 'exon':
            continue

        parent = row.Parent
        start = row.Start_b
        end = row.End_b

        if parent in transcripts:

            # Exon label is the last dash-separated token of the exon ID.
            exon_label = row.ID.split('-')[-1]

            # Exons without a transcript_id would otherwise all be keyed
            # under NaN; key them by their parent transcript instead.
            exon_key = getattr(row, 'transcript_id', None)
            if pd.isnull(exon_key):
                exon_key = parent

            gene_key = transcripts[parent]['gene']

        elif parent in genes:

            exon_label = 'na'
            exon_key = parent
            gene_key = parent

        else:

            print(
                f'Skipping exon {row.ID!r}: parent {parent!r} is not a '
                f'collected gene or transcript (exon span {start}-{end})'
            )
            continue

        if exon_key not in transcript_exons:
            # The gene record is looked up only on first sight of the key.
            transcript_exons[exon_key] = {'exons': [], 'gene': genes[gene_key]}

        transcript_exons[exon_key]['exons'].append([start, end, exon_label])

    parsed_overlapping_genome_features = {
        'transcripts': transcripts,
        'genes': genes,
        'transcript_exons': transcript_exons,
    }

    return parsed_overlapping_genome_features

In [14]:
# Collect the per-sample region coverage tables. Sorting makes the row order
# (and therefore any later row cap) reproducible, since glob order is
# unspecified.
files_to_merge = sorted(glob.glob('results/coverage/*_region_coverage_data.csv'))

coverage_frames = [pd.read_csv(file) for file in files_to_merge]

# Concatenate once instead of appending inside the loop: DataFrame.append
# copied the whole frame on every iteration and was removed in pandas 2.0.
# ignore_index gives the merged frame a unique 0..n-1 index.
if coverage_frames:
    master_df = pd.concat(coverage_frames, ignore_index=True)
else:
    # No input files: keep the empty frame the loop version produced.
    master_df = pd.DataFrame()

In [15]:
# Row cap for quick iteration: only the first MAX_ROWS regions are annotated.
# Set to None to process every region.
MAX_ROWS = 1000

if MAX_ROWS is not None:
    # .copy() detaches the subset from the full frame so that adding columns
    # later does not trigger pandas' SettingWithCopyWarning.
    master_df = master_df.head(MAX_ROWS).copy()

# Last expression of the cell, so the preview is actually rendered.
master_df.head()

In [16]:
# Cache of parsed annotations keyed by 'chrom:start-end'; filled lazily by
# annotate_rows so that each distinct region is only joined once.
region_annotation_dict = dict()

# Genome annotation (GFF3) loaded as a PyRanges object.
annotations = pr.read_gff3(
    '../../sv_consequences/annotations/GRCh37_latest_genomic.gff.gz'
)

In [17]:
def annotate_rows(df, annotations):
    """Return the parsed genome annotations for one coverage region.

    Intended for ``DataFrame.apply(..., axis=1)``: despite its name, ``df``
    is a single row exposing ``'chrom'``, ``'start'`` and ``'end'``.

    Results are memoised in the module-level ``region_annotation_dict``
    under the key ``'chrom:start-end'``, so a region that appears in several
    rows is looked up only once and every such row receives the same dict
    object.

    Args:
        df: Row with ``'chrom'``, ``'start'`` and ``'end'`` entries.
        annotations: PyRanges annotations passed on to
            ``get_overlapping_genome_features``.

    Returns:
        The dict produced by ``parse_genome_features`` for the region.
    """
    chrom, start, end = df['chrom'], df['start'], df['end']
    region_key = f'{chrom}:{start}-{end}'

    if region_key not in region_annotation_dict:
        # Cache miss: run the overlap join and parse it.
        feature_df = get_overlapping_genome_features(
            chrom, int(start), int(end), annotations
        )
        region_annotation_dict[region_key] = parse_genome_features(feature_df)

    return region_annotation_dict[region_key]

In [18]:
# Annotate every region row; annotate_rows receives each row plus the
# PyRanges annotations as its extra positional argument.
master_df['annotations'] = master_df.apply(
    annotate_rows,
    axis='columns',
    args=(annotations,),
)

In [19]:
# Persist the annotated table without the index column. Note that the
# 'annotations' column holds dicts, which are written as their string form.
master_df.to_csv(path_or_buf='master.csv', index=False)

In [55]:
# Expand each annotated region into one output row per overlapping
# transcript. Regions whose annotations contain no 'transcript_exons'
# entries produce no output rows.
#
# The region values are taken positionally in master_df's own column order,
# so they always line up with the master_df.columns labels used when the
# rows are turned into a DataFrame (a hard-coded attribute list would
# silently mislabel values if the CSV column order ever changed).
annotations_position = master_df.columns.get_loc('annotations')

new_rows = []

for record in master_df.itertuples(index=False, name=None):

    region_values = list(record)
    anno = region_values[annotations_position]

    for ts, ts_info in anno['transcript_exons'].items():

        gene = ts_info['gene']['name']

        # Third element of each exon entry is its exon label.
        exons = [exon[2] for exon in ts_info['exons']]

        new_rows.append(region_values + [gene, exons, ts])
        
        

In [56]:
# Per-transcript table: the original region columns followed by the gene
# name, the list of exon labels, and the transcript key.
new_df = pd.DataFrame(
    data=new_rows,
    columns=[*master_df.columns, 'gene_name', 'exons', 'transcript'],
)

In [57]:
# Show the expanded per-transcript table (the rendered output is truncated
# to its first and last rows).
new_df

Unnamed: 0,mean_coverage,region_length,error,chrom,start,end,gaps,pct_gtr_20x,pct_gtr_160x,sample_id,annotations,gene_name,exons,transcript
0,31.995851,241,none,1,955532,955773,"{20: [[955532, 955617]], 160: [[955532, 955772]]}",0.643154,0.000000,191010_D00501_0366_BH5JWHBCX3_18M01315,{'transcripts': {'rna-NM_001305275.2': {'name'...,AGRN,[1],NM_001305275.2
1,31.995851,241,none,1,955532,955773,"{20: [[955532, 955617]], 160: [[955532, 955772]]}",0.643154,0.000000,191010_D00501_0366_BH5JWHBCX3_18M01315,{'transcripts': {'rna-NM_001305275.2': {'name'...,AGRN,[1],NM_198576.4
2,113.784768,302,none,1,957560,957862,"{20: [], 160: [[957560, 957674], [957726, 9578...",1.000000,0.168874,191010_D00501_0366_BH5JWHBCX3_18M01315,{'transcripts': {'rna-NM_001305275.2': {'name'...,AGRN,[2],NM_001305275.2
3,113.784768,302,none,1,957560,957862,"{20: [], 160: [[957560, 957674], [957726, 9578...",1.000000,0.168874,191010_D00501_0366_BH5JWHBCX3_18M01315,{'transcripts': {'rna-NM_001305275.2': {'name'...,AGRN,[2],NM_198576.4
4,145.988636,88,none,1,970636,970724,"{20: [], 160: [[970636, 970654], [970681, 9707...",1.000000,0.295455,191010_D00501_0366_BH5JWHBCX3_18M01315,{'transcripts': {'rna-NM_001305275.2': {'name'...,AGRN,[3],NM_001305275.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3152,197.676000,250,none,1,26135050,26135300,"{20: [], 160: [[26135050, 26135084], [26135274...",1.000000,0.756000,191010_D00501_0366_BH5JWHBCX3_18M01315,{'transcripts': {'rna-NM_020451.3': {'name': '...,SELENON,[4],NM_206926.2
3153,124.442424,165,none,1,26135496,26135661,"{20: [], 160: [[26135496, 26135660]]}",1.000000,0.000000,191010_D00501_0366_BH5JWHBCX3_18M01315,{'transcripts': {'rna-NM_020451.3': {'name': '...,SELENON,[6],NM_020451.3
3154,124.442424,165,none,1,26135496,26135661,"{20: [], 160: [[26135496, 26135660]]}",1.000000,0.000000,191010_D00501_0366_BH5JWHBCX3_18M01315,{'transcripts': {'rna-NM_020451.3': {'name': '...,SELENON,[5],NM_206926.2
3155,91.865169,178,none,1,26136153,26136331,"{20: [], 160: [[26136153, 26136330]]}",1.000000,0.000000,191010_D00501_0366_BH5JWHBCX3_18M01315,{'transcripts': {'rna-NM_020451.3': {'name': '...,SELENON,[7],NM_020451.3


In [58]:
# Rows annotated with the gene RNU4ATAC. NOTE(review): the recorded result
# is empty; master_df was cut to its first 1000 rows earlier in the
# notebook, so regions beyond that cap never reach new_df. Confirm whether
# this gene is expected within the capped subset.
new_df.loc[new_df['gene_name'].eq('RNU4ATAC')]

Unnamed: 0,mean_coverage,region_length,error,chrom,start,end,gaps,pct_gtr_20x,pct_gtr_160x,sample_id,annotations,gene_name,exons,transcript
