Parsing gencode file takes ~40sec

In [2]:
import pandas as pd
import requests
import numpy as np
from functools import reduce


class ResidueMapper:
    
    def __init__(self, GENCODEFile: str, verbose: bool =False) -> None:
        
        self.verbose = verbose

        # Reading GENCODE data and filter for CDS segments:
        gtf_columns = ['chr', 'source', 'featureType', 'start', 'end', 'score', 'strand', 'phase', 'annotation']
        df = (
            pd.read_csv(GENCODEFile, sep='\t', comment='#', header=0, names=gtf_columns)
            .query('featureType == "CDS"')
            .reset_index(drop=True)
            .drop(['source', 'score', 'phase'], axis=1)
        )

        # Parse annotations for separate columns:
        annotations_df = pd.DataFrame(
            df.annotation
            .apply(lambda features: {feature.split('=')[0]: feature.split('=')[1] for feature in features.split(';')})
            .to_list()
        )

        # Join annotations with coordinates:
        self.full_gencode = (
            df
            .drop('annotation', axis=1)
            .merge(annotations_df, left_index=True, right_index=True)

            # Remove versions:
            .assign(
                gene_id = lambda df: df.gene_id.str.replace(r'\.\d+', '', regex=True),
                transcript_id = lambda df: df.transcript_id.str.replace(r'\.\d+', '', regex=True),
                protein_id = lambda df: df.protein_id.str.replace(r'\.\d+', '', regex=True),
                exon_id = lambda df: df.exon_id.str.replace(r'\.\d+', '', regex=True),
                chr = lambda df: df.chr.str.replace(r'chr', '', regex=False),
                feature_length = lambda df: df.end - df.start
            )

            # Dropping unused columns:
            .drop([
                'ID', 'Parent', 'gene_type', 'transcript_type', 'transcript_support_level',
                'exon_number', 'level', 'hgnc_id', 'tag', 'havana_gene', 'havana_transcript',
                'ccdsid', 'ont', 'featureType', 'transcript_name'
            ], axis=1)
        )
    
    def map_position(self, translation_id: str, aminoacid_position: int) -> list:

        # Filter gencode data for relevant protein only:
        selected = (
            self.full_gencode
            .query('protein_id == @translation_id')
            .sort_values('start')
            .reset_index(drop=True)
        )

        # Extract relevant data:
        row = selected.iloc[0]
        gene_id = row['gene_id']
        chromosome = row['chr']
        transcript_id = row['transcript_id']
        gene_name = row['gene_name']
        strand = row['strand']

        # Magic:
        positions = reduce(lambda x,y: x+y,(selected.apply(lambda row: list(range(row['start'],row['end']+1)), axis = 1)),[])

        # If the gene is on the negative strand we need to flip the list:
        if strand == '-':
            positions = positions[::-1]

        # Extracting positions:
        extracted_positions = positions[(aminoacid_position-1)*3:(aminoacid_position-1)*3+3]
        return [{'chr': chromosome, 'pos': position, 'strand': strand} for position in extracted_positions]


# Source file:
gencodeFile = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_40/gencode.v40.annotation.gff3.gz'

# Map an amino acid position of a given protein:
translation_id = 'ENSP00000327251'
aminoacid_position = 115

mapper = ResidueMapper(gencodeFile)
mapper.map_position(translation_id, aminoacid_position)




[{'chr': '17', 'pos': 27787802, 'strand': '-'},
 {'chr': '17', 'pos': 27787801, 'strand': '-'},
 {'chr': '17', 'pos': 27787800, 'strand': '-'}]

In [3]:
%%bash

chrom=17
start=27787800
end=27787802

echo https://rest.ensembl.org/sequence/region/human/${chrom}:${start}..${end}:-1?content-type=text/plain

https://rest.ensembl.org/sequence/region/human/17:27787800..27787802:-1?content-type=text/plain


In [4]:
# An other example:
translation_id = 'ENSP00000267101'
aminoacid_position = 420

mapper.map_position(translation_id, aminoacid_position)



[{'chr': '12', 'pos': 56093060, 'strand': '+'},
 {'chr': '12', 'pos': 56093061, 'strand': '+'},
 {'chr': '12', 'pos': 56093062, 'strand': '+'}]

In [5]:
# An other re-run:
translation_id = 'ENSP00000327251'
aminoacid_position = 115

mapper.map_position(translation_id, aminoacid_position)



[{'chr': '17', 'pos': 27787802, 'strand': '-'},
 {'chr': '17', 'pos': 27787801, 'strand': '-'},
 {'chr': '17', 'pos': 27787800, 'strand': '-'}]

One query takes ~100ms. So 10 lookup per second. 250000 lookups ~7 hours without parallelization.

In [7]:
full_gencode = mapper.full_gencode.copy()
translation_id = 'ENSP00000327251'

selected = full_gencode.query('protein_id == @translation_id')
len(selected)

26

In [139]:
row = selected.iloc[0]

gene_id = row['gene_id']
chromosome = row['chr']
transcript_id = row['transcript_id']
gene_name = row['gene_name']
strand = row['strand']

# Magic:
positions = reduce(lambda x,y: x+y,(selected.apply(lambda row: list(range(row['start'],row['end']+1)), axis = 1)),[])

# If the gene is on the negative strand we need to flip the list:
if strand == '-':
    positions = positions[::-1]

positions


[27757353,
 27757352,
 27757351,
 27757350,
 27757349,
 27757348,
 27757347,
 27757346,
 27757345,
 27757344,
 27757343,
 27757342,
 27757341,
 27757340,
 27757339,
 27757338,
 27757337,
 27757336,
 27757335,
 27757334,
 27757333,
 27757332,
 27757331,
 27757330,
 27757329,
 27757328,
 27757327,
 27757326,
 27757325,
 27757324,
 27757323,
 27757322,
 27757321,
 27757320,
 27757319,
 27757318,
 27757317,
 27757316,
 27757315,
 27757314,
 27757313,
 27757312,
 27757311,
 27757310,
 27757309,
 27757308,
 27757307,
 27757306,
 27757305,
 27757304,
 27757303,
 27757302,
 27757301,
 27757300,
 27757299,
 27757298,
 27757297,
 27757296,
 27757295,
 27757294,
 27757293,
 27757292,
 27757291,
 27757290,
 27757289,
 27757288,
 27757287,
 27757286,
 27757285,
 27757284,
 27757283,
 27757282,
 27757281,
 27757280,
 27757279,
 27757278,
 27757277,
 27757276,
 27757275,
 27757274,
 27757273,
 27757272,
 27757271,
 27757270,
 27757269,
 27757268,
 27757267,
 27757266,
 27757265,
 27757264,
 27757263,

In [14]:
import numpy as np
from pandarallel import pandarallel


def calculate_all_residues(selected: pd.DataFrame) -> pd.DataFrame:
    row = selected.iloc[0]

    gene_id = row['gene_id']
    chromosome = row['chr']
    transcript_id = row['transcript_id']
    gene_name = row['gene_name']
    strand = row['strand']
    translation_id = row['protein_id']

    # Magic:
    positions = reduce(lambda x,y: x+y,(selected.apply(lambda row: list(range(row['start'],row['end']+1)), axis = 1)),[])

    # If the gene is on the negative strand we need to flip the list:
    if strand == '-':
        positions = positions[::-1]

    # Sanitizing early termination:
    positions = positions[ : -(len(positions)%3) or None]
    
    try:
        return (
            pd.DataFrame(np.reshape(positions, (-1, 3)), columns=['pos1', 'pos2', 'pos3'])
            .assign(
                protein_id=translation_id,
                gene_id=gene_id,
                chr=chromosome,
                strand=strand,
                amino_acid_position=lambda df: list(range(1,len(df)+1))
            )
            .reindex()
        )
    except:
        print(f'Failed: {translation_id}')

        
        
pandarallel.initialize(progress_bar=True)

generated = full_gencode.groupby('protein_id').parallel_apply(calculate_all_residues)


INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=13573), Label(value='0 / 13573')))…

In [9]:
import numpy as np
import requests

test_df = (
    full_gencode.query('protein_id == "ENSP00000388892"')
    .drop(['feature_length', 'protein_id'], axis=1)
    .reset_index(drop=True)
    .assign(
        dna = lambda df: df.apply(lambda row: requests.get(f'https://rest.ensembl.org/sequence/region/human/{row["chr"]}:{row["start"]}..{row["end"]}:1?content-type=text/plain').text, axis=1)
    )
)

test_df.head()

Unnamed: 0,chr,start,end,strand,gene_id,transcript_id,gene_name,exon_id,dna
0,13,24804367,24804429,+,ENSG00000132972,ENST00000418120,RNF17,ENSE00003553374,ATGACAGATGTTTCAGTAACGGTTTGTCATATAAATAGTCCTGGAG...
1,13,24825619,24825772,+,ENSG00000132972,ENST00000418120,RNF17,ENSE00001706074,ATAGAGGGCCTGGATATTTTATTTCTATTAAAGACAATCGAGGAAT...
2,13,24830484,24830599,+,ENSG00000132972,ENST00000418120,RNF17,ENSE00001671772,GATTGCCTGGACATCAGGAAGTTGAAGTTAAATATGTGGACTTTGG...
3,13,24831858,24831978,+,ENSG00000132972,ENST00000418120,RNF17,ENSE00001786297,GCAATTAAATGTAAGTTGGCCTATATTGAACCATATAAAAGGACAA...
4,13,24842041,24842161,+,ENSG00000132972,ENST00000418120,RNF17,ENSE00001729598,AAATTCTGGAAGATAATGTGCTCTTAGTTGAGCTTTTCGATTCTCT...


In [219]:
test_df

Unnamed: 0,chr,start,end,strand,gene_id,transcript_id,gene_name,exon_id,dna
0,13,24804367,24804429,+,ENSG00000132972,ENST00000418120,RNF17,ENSE00003553374,ATGACAGATGTTTCAGTAACGGTTTGTCATATAAATAGTCCTGGAG...
1,13,24825619,24825772,+,ENSG00000132972,ENST00000418120,RNF17,ENSE00001706074,ATAGAGGGCCTGGATATTTTATTTCTATTAAAGACAATCGAGGAAT...
2,13,24830484,24830599,+,ENSG00000132972,ENST00000418120,RNF17,ENSE00001671772,GATTGCCTGGACATCAGGAAGTTGAAGTTAAATATGTGGACTTTGG...
3,13,24831858,24831978,+,ENSG00000132972,ENST00000418120,RNF17,ENSE00001786297,GCAATTAAATGTAAGTTGGCCTATATTGAACCATATAAAAGGACAA...
4,13,24842041,24842161,+,ENSG00000132972,ENST00000418120,RNF17,ENSE00001729598,AAATTCTGGAAGATAATGTGCTCTTAGTTGAGCTTTTCGATTCTCT...
5,13,24843744,24843971,+,ENSG00000132972,ENST00000418120,RNF17,ENSE00001731008,ATACATCCTCAAAGATAATTCTCAAAAGCATATTGAAGTTTGGGAT...
6,13,24844652,24844802,+,ENSG00000132972,ENST00000418120,RNF17,ENSE00003641245,TTTAGAAGAAAAGATGATAGCTGCTTATGAAAACTCAAAATGGGAA...
7,13,24844961,24845079,+,ENSG00000132972,ENST00000418120,RNF17,ENSE00001144924,GTCTTGCTGTATGATGTGGGTGTTGAACTAGTAGTGAATGTTGACT...
8,13,24850341,24850443,+,ENSG00000132972,ENST00000418120,RNF17,ENSE00001144918,ACCAGCTGGTGGGAGTGACAAGTGGACAGCAACAGCTTGTGACTGT...
9,13,24851456,24851571,+,ENSG00000132972,ENST00000418120,RNF17,ENSE00001144911,GAAAACAACACAACATGGCCATTACCTGTGAAAATTTTCTGCAGAG...


In [10]:
reduce(lambda x,y: x+y, test_df.dna)

'ATGACAGATGTTTCAGTAACGGTTTGTCATATAAATAGTCCTGGAGATTTCTATCTTCAGTTGATAGAGGGCCTGGATATTTTATTTCTATTAAAGACAATCGAGGAATTCTATAAAAGTGAAGATGGAGAAAATCTGGAAATCCTCTGTCCAGTTCAAGATCAAGCCTGTGTAGCTAAATTTGAAGATGGAATTTGGTACCGAGCAAAAGTTATCGGATTGCCTGGACATCAGGAAGTTGAAGTTAAATATGTGGACTTTGGTAATACTGCAAAAATAACAATCAAAGACGTGCGTAAAATAAAGGATGAGTTTCTGAATGCCCCAGAGAAGGCAATTAAATGTAAGTTGGCCTATATTGAACCATATAAAAGGACAATGCAGTGGTCCAAAGAAGCTAAAGAAAAATTTGAAGAAAAGGCTCAAGATAAATTTATGACATGTTCAGTTATCAAAATTCTGGAAGATAATGTGCTCTTAGTTGAGCTTTTCGATTCTCTTGGTGCTCCTGAAATGACTACTACTAGTATTAATGACCAGCTAGTTAAAGAGGGCCTAGCATCTTATGAAATAGGATACATCCTCAAAGATAATTCTCAAAAGCATATTGAAGTTTGGGATCCTTCTCCAGAAGAAATTATTTCAAATGAAGTACACAACTTAAATCCTGTGTCTGCAAAATCTCTACCTAATGAGAATTTTCAGTCACTTTATAATAAGGAATTGCCTGTGCATATCTGTAATGTAATATCTCCTGAGAAGATTTATGTTCAGTGGTTGTTAACTGAAAACTTACTTAATAGTTTAGAAGAAAAGATGATAGCTGCTTATGAAAACTCAAAATGGGAACCTGTTAAATGGGAAAATGATATGCACTGTGCTGTTAAGATCCAAGATAAAAATCAGTGGCGAAGAGGCCAGATCATCAGAATGGTTACAGACACATTGGTAGAGGTCTTGCTGTATGATGTGGGTGTTGAACTAGTAGTGAATGTTGAC

In [18]:
# full_gencode.query('protein_id == "ENSP00000388892"').feature_length.sum()
generated.to_csv('generated_mappings.tsv.gz', sep='\t', compression='infer', index=False)

In [17]:
# generated.head()
# generated.to_json('generated_mappings.json.gz', compression='infer',orient='records', lines=True)
generated.loc['ENSP00000369816', 401]

pos1                           7633347
pos2                           7633348
pos3                           7633349
protein_id             ENSP00000369816
gene_id                ENSG00000129214
chr                                 17
strand                               +
amino_acid_position                402
Name: (ENSP00000369816, 401), dtype: object

In [1]:
%%bash 

ls -lah

total 472
drwxrwxr-x  11 dsuveges  384566875   352B  1 May 01:07 .
drwxrwxr-x  80 dsuveges  384566875   2.5K 25 Apr 15:13 ..
drwxrwxr-x   7 dsuveges  384566875   224B 30 Apr 01:30 .ipynb_checkpoints
-rw-r--r--   1 dsuveges  384566875    53K 28 Apr 23:31 MapAminoAcid2Genome.ipynb
-rw-r--r--   1 dsuveges  384566875   671B 29 Apr 09:03 Mapping aminioacid to genome.ipynb
-rw-r--r--   1 dsuveges  384566875    16K 22 Apr 16:57 Mapping drugs to pdb.ipynb
-rw-r--r--   1 dsuveges  384566875    56K  1 May 01:07 ResidueMapper.ipynb
drwxr-xr-x   6 dsuveges  384566875   192B 22 Apr 00:30 data
-rw-rw-r--   1 dsuveges  384566875    88K 21 Apr 21:59 drug2pdb.ipynb
-rw-r--r--   1 dsuveges  384566875   141B 30 Apr 20:10 lajhar.txt
-rw-r--r--   1 dsuveges  384566875   139B 30 Apr 20:11 lajharok2.txt


In [37]:
import pandas as pd
from lxml import etree
from collections import defaultdict

with open('/Users/dsuveges/Downloads/1dqa.xml', 'rb') as fobj:
    xml = fobj.read()

root = etree.fromstring(xml)

In [28]:
['pocok' for x in root.findall('entity')]

[]

In [36]:
data = []

for entry in root.findall('{http://www.ebi.ac.uk/pdbe/docs/sifts/eFamily.xsd}entity'):
    data.update({
        'chain': entry.get('entityId'),
        'type': entry.get('type')
    })

['A', 'B', 'C', 'D']

In [60]:
pdb_id = '6rlb'
data = requests.get(f'https://www.ebi.ac.uk/pdbe/api/mappings/ensembl/{pdb_id}').json()
# pd.DataFrame(data['6rlb']['Ensembl'])
df = (
    pd.DataFrame(data['6rlb']['Ensembl'].values())
    .explode('mappings')
    .set_index('identifier')
)

df = (
    pd.DataFrame(df['mappings'].values.tolist(), index=df.index)
    .assign(
        pdb_id = pdb_id,
        pdb_start = lambda df: df.start.apply(lambda start: start['author_residue_number']),
        pdb_end = lambda df: df.end.apply(lambda start: start['author_residue_number'])
    )
    .drop(['ordinal', 'entity_id', 'end', 'genome_start', 'start', 'exon_id', 'genome_end'], axis=1)
)

df.head()

Unnamed: 0_level_0,accession,chain_id,unp_end,transcript_id,coverage,unp_start,translation_id,struct_asym_id,pdb_id,pdb_start,pdb_end
identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ENSG00000138036,Q8TCX1,E,3,ENST00000260605,1.0,1,ENSP00000260605,E,6rlb,,
ENSG00000138036,Q8TCX1-5,E,3,ENST00000398823,1.0,1,ENSP00000381804,E,6rlb,,
ENSG00000138036,Q8TCX1-4,E,3,ENST00000406852,1.0,1,ENSP00000385738,E,6rlb,,
ENSG00000138036,Q8TCX1-2,E,3,ENST00000605786,1.0,1,ENSP00000474032,E,6rlb,,
ENSG00000138036,Q8TCX1,E,42,ENST00000260605,1.0,3,ENSP00000260605,E,6rlb,,42.0


In [57]:
df

Unnamed: 0_level_0,ordinal,entity_id,end,accession,genome_start,chain_id,start,unp_end,transcript_id,exon_id,coverage,unp_start,translation_id,genome_end,struct_asym_id,pdb_id
identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
ENSG00000138036,1,4,"{'author_residue_number': None, 'author_insert...",Q8TCX1,43774139,E,"{'author_residue_number': None, 'author_insert...",3,ENST00000260605,ENSE00001555602,1.000,1,ENSP00000260605,43774145,E,6rlb
ENSG00000138036,1,4,"{'author_residue_number': None, 'author_insert...",Q8TCX1-5,43774139,E,"{'author_residue_number': None, 'author_insert...",3,ENST00000398823,ENSE00003270714,1.000,1,ENSP00000381804,43774145,E,6rlb
ENSG00000138036,1,4,"{'author_residue_number': None, 'author_insert...",Q8TCX1-4,43774139,E,"{'author_residue_number': None, 'author_insert...",3,ENST00000406852,ENSE00003593496,1.000,1,ENSP00000385738,43774145,E,6rlb
ENSG00000138036,1,4,"{'author_residue_number': None, 'author_insert...",Q8TCX1-2,43774139,E,"{'author_residue_number': None, 'author_insert...",3,ENST00000605786,ENSE00003578395,1.000,1,ENSP00000474032,43774145,E,6rlb
ENSG00000138036,1,4,"{'author_residue_number': 42, 'author_insertio...",Q8TCX1,43776782,E,"{'author_residue_number': None, 'author_insert...",42,ENST00000260605,ENSE00003549380,1.000,3,ENSP00000260605,43776899,E,6rlb
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000125971,1,5,"{'author_residue_number': 535, 'author_inserti...",Q9NP97-2,34526268,H,"{'author_residue_number': None, 'author_insert...",27,ENST00000300469,ENSE00003592194,1.000,2,ENSP00000300469,34526343,H,6rlb
ENSG00000125971,1,5,"{'author_residue_number': 535, 'author_inserti...",Q9NP97,34526268,H,"{'author_residue_number': None, 'author_insert...",27,ENST00000357156,ENSE00003592194,1.000,2,ENSP00000349679,34526343,H,6rlb
ENSG00000125971,1,5,"{'author_residue_number': 535, 'author_inserti...",Q9NP97-2,34529852,H,"{'author_residue_number': 535, 'author_inserti...",27,ENST00000300469,ENSE00001909747,0.048,27,ENSP00000300469,34529852,H,6rlb
ENSG00000125971,1,5,"{'author_residue_number': 591, 'author_inserti...",Q9NP97,34534628,H,"{'author_residue_number': 535, 'author_inserti...",83,ENST00000357156,ENSE00003606807,1.000,27,ENSP00000349679,34534796,H,6rlb
