Parsing gencode file takes ~40sec

In [24]:
import pandas as pd
import requests
import numpy as np
from functools import reduce
import numpy as np
from pandarallel import pandarallel

class ResidueMapper:
    
    def __init__(self, GENCODEFile: str, verbose: bool =False) -> None:
        
        self.verbose = verbose

        # Reading GENCODE data and filter for CDS segments:
        gtf_columns = ['chr', 'source', 'featureType', 'start', 'end', 'score', 'strand', 'phase', 'annotation']
        df = (
            pd.read_csv(GENCODEFile, sep='\t', comment='#', header=0, names=gtf_columns)
            .query('featureType == "CDS"')
            .reset_index(drop=True)
            .drop(['source', 'score', 'phase'], axis=1)
        )

        # Parse annotations for separate columns:
        annotations_df = pd.DataFrame(
            df.annotation
            .apply(lambda features: {feature.split('=')[0]: feature.split('=')[1] for feature in features.split(';')})
            .to_list()
        )

        # Join annotations with coordinates:
        self.full_gencode = (
            df
            .drop('annotation', axis=1)
            .merge(annotations_df, left_index=True, right_index=True)

            # Remove versions:
            .assign(
                gene_id = lambda df: df.gene_id.str.replace(r'\.\d+', '', regex=True),
                transcript_id = lambda df: df.transcript_id.str.replace(r'\.\d+', '', regex=True),
                protein_id = lambda df: df.protein_id.str.replace(r'\.\d+', '', regex=True),
                exon_id = lambda df: df.exon_id.str.replace(r'\.\d+', '', regex=True),
                chr = lambda df: df.chr.str.replace(r'chr', '', regex=False),
                feature_length = lambda df: df.end - df.start
            )

            # Dropping unused columns:
            .drop([
                'ID', 'Parent', 'gene_type', 'transcript_type', 'transcript_support_level',
                'exon_number', 'level', 'hgnc_id', 'tag', 'havana_gene', 'havana_transcript',
                'ccdsid', 'ont', 'featureType', 'transcript_name'
            ], axis=1)
        )
    
    def map_position(self, translation_id: str, aminoacid_position: int) -> list:

        # Filter gencode data for relevant protein only:
        selected = (
            self.full_gencode
            .query('protein_id == @translation_id')
            .sort_values('start')
            .reset_index(drop=True)
        )

        # Extract relevant data:
        row = selected.iloc[0]
        gene_id = row['gene_id']
        chromosome = row['chr']
        transcript_id = row['transcript_id']
        gene_name = row['gene_name']
        strand = row['strand']

        # Magic:
        positions = reduce(lambda x,y: x+y,(selected.apply(lambda row: list(range(row['start'],row['end']+1)), axis = 1)),[])

        # If the gene is on the negative strand we need to flip the list:
        if strand == '-':
            positions = positions[::-1]

        # Extracting positions:
        extracted_positions = positions[(aminoacid_position-1)*3:(aminoacid_position-1)*3+3]
        return [{'chr': chromosome, 'pos': position, 'strand': strand} for position in extracted_positions]


# Source file:
gencodeFile = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_40/gencode.v40.annotation.gff3.gz'

# Map an amino acid position of a given protein:
translation_id = 'ENSP00000327251'
aminoacid_position = 115

mapper = ResidueMapper(gencodeFile)
mapper.map_position(translation_id, aminoacid_position)




KeyboardInterrupt: 

In [3]:
%%bash

chrom=17
start=27787800
end=27787802

echo https://rest.ensembl.org/sequence/region/human/${chrom}:${start}..${end}:-1?content-type=text/plain

https://rest.ensembl.org/sequence/region/human/17:27787800..27787802:-1?content-type=text/plain


In [None]:
# An other example:
translation_id = 'ENSP00000267101'
aminoacid_position = 420

mapper.map_position(translation_id, aminoacid_position)



In [21]:
# An other re-run:
translation_id = 'ENSP00000327251'
aminoacid_position = 115

mapper.map_position(translation_id, aminoacid_position)



NameError: name 'mapper' is not defined

One query takes ~100ms. So 10 lookup per second. 250000 lookups ~7 hours without parallelization.

In [20]:
full_gencode = mapper.full_gencode.copy()
translation_id = 'ENSP00000327251'

selected = full_gencode.query('protein_id == @translation_id')
len(selected)

NameError: name 'mapper' is not defined

In [11]:
row = selected.iloc[0]

gene_id = row['gene_id']
chromosome = row['chr']
transcript_id = row['transcript_id']
gene_name = row['gene_name']
strand = row['strand']

# if strand is negative, we need to reverse order the CDSs:
if strand == '-':
    selected = selected.sort_values('start')

# Magic:
positions = reduce(lambda x,y: x+y,(selected.apply(lambda row: list(range(row['start'],row['end']+1)), axis = 1)),[])

# If the gene is on the negative strand we need to flip the list:
if strand == '-':
    positions = positions[::-1]

print(positions[:10])
print(selected)
print(
    pd.DataFrame(np.reshape(positions, (-1, 3)), columns=['pos1', 'pos2', 'pos3'])
    .assign(
        protein_id=translation_id,
        gene_id=gene_id,
        chr=chromosome,
        strand=strand,
        amino_acid_position=lambda df: list(range(1,len(df)+1))
    )
    .query('amino_acid_position == 115')
    .reindex()
)

[27798809, 27798808, 27798807, 27798806, 27798805, 27798804, 27798803, 27798802, 27798801, 27798800]
       chr     start       end strand          gene_id    transcript_id  \
675197  17  27757246  27757353      -  ENSG00000007171  ENST00000313735   
675196  17  27758881  27759075      -  ENSG00000007171  ENST00000313735   
675195  17  27760030  27760178      -  ENSG00000007171  ENST00000313735   
675194  17  27760623  27760744      -  ENSG00000007171  ENST00000313735   
675193  17  27761144  27761231      -  ENSG00000007171  ENST00000313735   
675192  17  27762798  27763005      -  ENSG00000007171  ENST00000313735   
675191  17  27763981  27764144      -  ENSG00000007171  ENST00000313735   
675190  17  27765535  27765716      -  ENSG00000007171  ENST00000313735   
675189  17  27766510  27766588      -  ENSG00000007171  ENST00000313735   
675188  17  27767705  27767837      -  ENSG00000007171  ENST00000313735   
675187  17  27768977  27769151      -  ENSG00000007171  ENST00000313735   

In [9]:
import numpy as np
import pandas as pd
from pandarallel import pandarallel


def calculate_all_residues(selected: pd.DataFrame) -> pd.DataFrame:
    row = selected.iloc[0]

    gene_id = row['gene_id']
    chromosome = row['chr']
    transcript_id = row['transcript_id']
    gene_name = row['gene_name']
    strand = row['strand']
    translation_id = row['protein_id']
    
    # if strand is negative, we need to reverse order the CDSs:
    if strand == '-':
        selected = selected.sort_values('start')


    # Magic:
    positions = reduce(lambda x,y: x+y,(selected.apply(lambda row: list(range(row['start'],row['end']+1)), axis = 1)),[])

    # If the gene is on the negative strand we need to flip the list:
    if strand == '-':
        positions = positions[::-1]

    # Sanitizing early termination:
    positions = positions[ : -(len(positions)%3) or None]
    
    try:
        return (
            pd.DataFrame(np.reshape(positions, (-1, 3)), columns=['pos1', 'pos2', 'pos3'])
            .assign(
                protein_id=translation_id,
                gene_id=gene_id,
                chr=chromosome,
                strand=strand,
                amino_acid_position=lambda df: list(range(1,len(df)+1))
            )
            .reindex()
        )
    except:
        print(f'Failed: {translation_id}')

        
        
pandarallel.initialize(progress_bar=True)

# Extracting processed data from mapper object:
full_gencode = mapper.full_gencode.copy()

# Generate all possible protein mappings:
generated = full_gencode.groupby('protein_id').parallel_apply(calculate_all_residues)


INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=13573), Label(value='0 / 13573')))…

KeyboardInterrupt: 

In [9]:
import numpy as np
import requests

test_df = (
    full_gencode.query('protein_id == "ENSP00000388892"')
    .drop(['feature_length', 'protein_id'], axis=1)
    .reset_index(drop=True)
    .assign(
        dna = lambda df: df.apply(lambda row: requests.get(f'https://rest.ensembl.org/sequence/region/human/{row["chr"]}:{row["start"]}..{row["end"]}:1?content-type=text/plain').text, axis=1)
    )
)

test_df.head()

Unnamed: 0,chr,start,end,strand,gene_id,transcript_id,gene_name,exon_id,dna
0,13,24804367,24804429,+,ENSG00000132972,ENST00000418120,RNF17,ENSE00003553374,ATGACAGATGTTTCAGTAACGGTTTGTCATATAAATAGTCCTGGAG...
1,13,24825619,24825772,+,ENSG00000132972,ENST00000418120,RNF17,ENSE00001706074,ATAGAGGGCCTGGATATTTTATTTCTATTAAAGACAATCGAGGAAT...
2,13,24830484,24830599,+,ENSG00000132972,ENST00000418120,RNF17,ENSE00001671772,GATTGCCTGGACATCAGGAAGTTGAAGTTAAATATGTGGACTTTGG...
3,13,24831858,24831978,+,ENSG00000132972,ENST00000418120,RNF17,ENSE00001786297,GCAATTAAATGTAAGTTGGCCTATATTGAACCATATAAAAGGACAA...
4,13,24842041,24842161,+,ENSG00000132972,ENST00000418120,RNF17,ENSE00001729598,AAATTCTGGAAGATAATGTGCTCTTAGTTGAGCTTTTCGATTCTCT...


In [219]:
test_df

Unnamed: 0,chr,start,end,strand,gene_id,transcript_id,gene_name,exon_id,dna
0,13,24804367,24804429,+,ENSG00000132972,ENST00000418120,RNF17,ENSE00003553374,ATGACAGATGTTTCAGTAACGGTTTGTCATATAAATAGTCCTGGAG...
1,13,24825619,24825772,+,ENSG00000132972,ENST00000418120,RNF17,ENSE00001706074,ATAGAGGGCCTGGATATTTTATTTCTATTAAAGACAATCGAGGAAT...
2,13,24830484,24830599,+,ENSG00000132972,ENST00000418120,RNF17,ENSE00001671772,GATTGCCTGGACATCAGGAAGTTGAAGTTAAATATGTGGACTTTGG...
3,13,24831858,24831978,+,ENSG00000132972,ENST00000418120,RNF17,ENSE00001786297,GCAATTAAATGTAAGTTGGCCTATATTGAACCATATAAAAGGACAA...
4,13,24842041,24842161,+,ENSG00000132972,ENST00000418120,RNF17,ENSE00001729598,AAATTCTGGAAGATAATGTGCTCTTAGTTGAGCTTTTCGATTCTCT...
5,13,24843744,24843971,+,ENSG00000132972,ENST00000418120,RNF17,ENSE00001731008,ATACATCCTCAAAGATAATTCTCAAAAGCATATTGAAGTTTGGGAT...
6,13,24844652,24844802,+,ENSG00000132972,ENST00000418120,RNF17,ENSE00003641245,TTTAGAAGAAAAGATGATAGCTGCTTATGAAAACTCAAAATGGGAA...
7,13,24844961,24845079,+,ENSG00000132972,ENST00000418120,RNF17,ENSE00001144924,GTCTTGCTGTATGATGTGGGTGTTGAACTAGTAGTGAATGTTGACT...
8,13,24850341,24850443,+,ENSG00000132972,ENST00000418120,RNF17,ENSE00001144918,ACCAGCTGGTGGGAGTGACAAGTGGACAGCAACAGCTTGTGACTGT...
9,13,24851456,24851571,+,ENSG00000132972,ENST00000418120,RNF17,ENSE00001144911,GAAAACAACACAACATGGCCATTACCTGTGAAAATTTTCTGCAGAG...


In [10]:
reduce(lambda x,y: x+y, test_df.dna)

'ATGACAGATGTTTCAGTAACGGTTTGTCATATAAATAGTCCTGGAGATTTCTATCTTCAGTTGATAGAGGGCCTGGATATTTTATTTCTATTAAAGACAATCGAGGAATTCTATAAAAGTGAAGATGGAGAAAATCTGGAAATCCTCTGTCCAGTTCAAGATCAAGCCTGTGTAGCTAAATTTGAAGATGGAATTTGGTACCGAGCAAAAGTTATCGGATTGCCTGGACATCAGGAAGTTGAAGTTAAATATGTGGACTTTGGTAATACTGCAAAAATAACAATCAAAGACGTGCGTAAAATAAAGGATGAGTTTCTGAATGCCCCAGAGAAGGCAATTAAATGTAAGTTGGCCTATATTGAACCATATAAAAGGACAATGCAGTGGTCCAAAGAAGCTAAAGAAAAATTTGAAGAAAAGGCTCAAGATAAATTTATGACATGTTCAGTTATCAAAATTCTGGAAGATAATGTGCTCTTAGTTGAGCTTTTCGATTCTCTTGGTGCTCCTGAAATGACTACTACTAGTATTAATGACCAGCTAGTTAAAGAGGGCCTAGCATCTTATGAAATAGGATACATCCTCAAAGATAATTCTCAAAAGCATATTGAAGTTTGGGATCCTTCTCCAGAAGAAATTATTTCAAATGAAGTACACAACTTAAATCCTGTGTCTGCAAAATCTCTACCTAATGAGAATTTTCAGTCACTTTATAATAAGGAATTGCCTGTGCATATCTGTAATGTAATATCTCCTGAGAAGATTTATGTTCAGTGGTTGTTAACTGAAAACTTACTTAATAGTTTAGAAGAAAAGATGATAGCTGCTTATGAAAACTCAAAATGGGAACCTGTTAAATGGGAAAATGATATGCACTGTGCTGTTAAGATCCAAGATAAAAATCAGTGGCGAAGAGGCCAGATCATCAGAATGGTTACAGACACATTGGTAGAGGTCTTGCTGTATGATGTGGGTGTTGAACTAGTAGTGAATGTTGAC

In [18]:
# full_gencode.query('protein_id == "ENSP00000388892"').feature_length.sum()
generated.to_csv('generated_mappings.tsv.gz', sep='\t', compression='infer', index=False)

In [17]:
# generated.head()
# generated.to_json('generated_mappings.json.gz', compression='infer',orient='records', lines=True)
generated.loc['ENSP00000369816', 401]

pos1                           7633347
pos2                           7633348
pos3                           7633349
protein_id             ENSP00000369816
gene_id                ENSG00000129214
chr                                 17
strand                               +
amino_acid_position                402
Name: (ENSP00000369816, 401), dtype: object

In [1]:
%%bash 

ls -lah

total 472
drwxrwxr-x  11 dsuveges  384566875   352B  1 May 01:07 .
drwxrwxr-x  80 dsuveges  384566875   2.5K 25 Apr 15:13 ..
drwxrwxr-x   7 dsuveges  384566875   224B 30 Apr 01:30 .ipynb_checkpoints
-rw-r--r--   1 dsuveges  384566875    53K 28 Apr 23:31 MapAminoAcid2Genome.ipynb
-rw-r--r--   1 dsuveges  384566875   671B 29 Apr 09:03 Mapping aminioacid to genome.ipynb
-rw-r--r--   1 dsuveges  384566875    16K 22 Apr 16:57 Mapping drugs to pdb.ipynb
-rw-r--r--   1 dsuveges  384566875    56K  1 May 01:07 ResidueMapper.ipynb
drwxr-xr-x   6 dsuveges  384566875   192B 22 Apr 00:30 data
-rw-rw-r--   1 dsuveges  384566875    88K 21 Apr 21:59 drug2pdb.ipynb
-rw-r--r--   1 dsuveges  384566875   141B 30 Apr 20:10 lajhar.txt
-rw-r--r--   1 dsuveges  384566875   139B 30 Apr 20:11 lajharok2.txt


In [37]:
import pandas as pd
from lxml import etree
from collections import defaultdict

with open('/Users/dsuveges/Downloads/1dqa.xml', 'rb') as fobj:
    xml = fobj.read()

root = etree.fromstring(xml)

In [28]:
['pocok' for x in root.findall('entity')]

[]

In [36]:
data = []

for entry in root.findall('{http://www.ebi.ac.uk/pdbe/docs/sifts/eFamily.xsd}entity'):
    data.update({
        'chain': entry.get('entityId'),
        'type': entry.get('type')
    })

['A', 'B', 'C', 'D']

In [60]:
pdb_id = '6rlb'
data = requests.get(f'https://www.ebi.ac.uk/pdbe/api/mappings/ensembl/{pdb_id}').json()
# pd.DataFrame(data['6rlb']['Ensembl'])
df = (
    pd.DataFrame(data['6rlb']['Ensembl'].values())
    .explode('mappings')
    .set_index('identifier')
)

df = (
    pd.DataFrame(df['mappings'].values.tolist(), index=df.index)
    .assign(
        pdb_id = pdb_id,
        pdb_start = lambda df: df.start.apply(lambda start: start['author_residue_number']),
        pdb_end = lambda df: df.end.apply(lambda start: start['author_residue_number'])
    )
    .drop(['ordinal', 'entity_id', 'end', 'genome_start', 'start', 'exon_id', 'genome_end'], axis=1)
)

df.head()

Unnamed: 0_level_0,accession,chain_id,unp_end,transcript_id,coverage,unp_start,translation_id,struct_asym_id,pdb_id,pdb_start,pdb_end
identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ENSG00000138036,Q8TCX1,E,3,ENST00000260605,1.0,1,ENSP00000260605,E,6rlb,,
ENSG00000138036,Q8TCX1-5,E,3,ENST00000398823,1.0,1,ENSP00000381804,E,6rlb,,
ENSG00000138036,Q8TCX1-4,E,3,ENST00000406852,1.0,1,ENSP00000385738,E,6rlb,,
ENSG00000138036,Q8TCX1-2,E,3,ENST00000605786,1.0,1,ENSP00000474032,E,6rlb,,
ENSG00000138036,Q8TCX1,E,42,ENST00000260605,1.0,3,ENSP00000260605,E,6rlb,,42.0


In [12]:
from pyspark.sql.types import ArrayType, StringType, IntegerType, StructType, StructField
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

# establish spark connection
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

In [55]:



gff3_columns = ['strand', 'phase', 'annotation']

gff3_schema = StructType([
    StructField("chr", StringType(), False),
    StructField("source", StringType(), False),
    StructField("featureType", StringType(), False),
    StructField("start", IntegerType(), False),
    StructField("end", IntegerType(), False),
    StructField("score", StringType(), False),
    StructField("strand", StringType(), False),
    StructField("phase", StringType(), False),
    StructField("annotation", StringType(), False),
])

CDS = (
    spark.read.option("comment", "#")
    .csv('/Users/dsuveges/Downloads/gencode.v40.annotation.gff3.gz', sep='\t', schema=gff3_schema)
    
    # Filter for coding sequences only:
    .filter(f.col('featureType') == 'CDS')
    
    # Parsing GFF3 annotation:
    .withColumn('parsed_annotation', parse_annotation(f.col('annotation')))
    
    # Selecting columns + extract fields:
    .select('chr', 'start', 'end', 'strand', 'parsed_annotation.*')
    .persist()
)

In [155]:

test_df = (
    CDS
    
    # Cleaning identifiers from version information:
    .withColumn('gene_id', remove_version(f.col('gene_id')))
    .withColumn('transcript_id', remove_version(f.col('transcript_id')))
    .withColumn('protein_id', remove_version(f.col('protein_id')))
    
    # Generate test dataset:
    .filter(f.col('chr') == 'chr12')
    
    # Order dataframe:
    .orderBy(['chr', 'start'])
#     .filter(f.col('strand') == '-')
    .persist()
)

print(test_df.count())
test_df.show()

49113
+-----+------+------+------+---------------+---------------+---------+---------------+
|  chr| start|   end|strand|        gene_id|  transcript_id|gene_name|     protein_id|
+-----+------+------+------+---------------+---------------+---------+---------------+
|chr12| 66883| 67436|     +|ENSG00000120645|ENST00000538872|   IQSEC3|ENSP00000437554|
|chr12| 99146| 99214|     +|ENSG00000120645|ENST00000538872|   IQSEC3|ENSP00000437554|
|chr12|125633|125912|     +|ENSG00000120645|ENST00000538872|   IQSEC3|ENSP00000437554|
|chr12|138267|139354|     +|ENSG00000120645|ENST00000538872|   IQSEC3|ENSP00000437554|
|chr12|138273|139354|     +|ENSG00000120645|ENST00000382841|   IQSEC3|ENSP00000372292|
|chr12|141124|141285|     +|ENSG00000120645|ENST00000538872|   IQSEC3|ENSP00000437554|
|chr12|141124|141285|     +|ENSG00000120645|ENST00000382841|   IQSEC3|ENSP00000372292|
|chr12|157025|157147|     +|ENSG00000120645|ENST00000538872|   IQSEC3|ENSP00000437554|
|chr12|157025|157147|     +|ENSG00000

In [130]:
@f.udf(ArrayType(IntegerType()))
def generate_numbers(start, end):
    """Generating numbers between a lower and upper boundary"""
    return np.arange(start, end + 1).tolist()

@f.udf(ArrayType(ArrayType(IntegerType())))
def reshape_list(a: list) -> list:
    """Reshaping list into a 3 by n array of array"""
    a = a[: - (len(a) % 3)] if len(a) % 3 else a
    return np.reshape(a, (int(len(a)/3), 3)).tolist()

processed = (
    test_df

    # Generating positions for all CDS fragments:
    .withColumn('positions', generate_numbers(f.col('start'), f.col('end')))
    
    # Grouping by protein_id
    .groupBy('protein_id')
    .agg(
        f.first('chr').alias('chr'),
        f.first('strand').alias('strand'),
        f.first('gene_id').alias('gene_id'),
        f.first('transcript_id').alias('transcript_id'),
        f.first('gene_name').alias('gene_name'),
        f.flatten(f.collect_set('positions')).alias('positions')
    )
    .withColumn(
        'positions',
        f.when(f.col('strand') == '+', f.col('positions'))
        .otherwise(f.reverse(f.col('positions')))
    )
    .withColumn('positions', reshape_list(f.col('positions')))
    .withColumn('bases', f.explode(f.expr("""transform(positions,(x,i)-> struct(x as position,(i+1) as position_number))""")))
    .select('*', 'bases.*')
    .drop('positions', 'bases')
)


processed.write.mode('overwrite').parquet('protein2genome_map')


In [131]:
mappings = (
    spark.read.parquet('protein2genome_map')
)

In [132]:
mappings.select('protein_id').distinct().count()

108583

In [136]:
mappings.select('gene_id').distinct().count()

20429

In [137]:
mappings.printSchema()

root
 |-- protein_id: string (nullable = true)
 |-- chr: string (nullable = true)
 |-- strand: string (nullable = true)
 |-- gene_id: string (nullable = true)
 |-- transcript_id: string (nullable = true)
 |-- gene_name: string (nullable = true)
 |-- position: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- position_number: integer (nullable = true)



In [143]:
tests = {
    'first_exon_positive_strand': {
        'filters': [
            ('gene_name', 'DYNLL1'),
            ('position_number', 2)
        ],
        'expected': [120496425, 120496426, 120496427]
    },
    'second_exon_positive_strand': {
        'filters': [
            ('protein_id', 'ENSP00000376297'),
            ('position_number', 70)
        ],
        'expected': [120498148, 120498149, 120498150]
    },
    'first_exon_negative_strand': {'gene_name': '', 'position': 12},
    'second_exon_negative_strand': {'gene_name': '', 'position': 12},
    'test_translation_id': {'protein_id': '', 'position': 12}
}

(
    mappings
    .filter(
        (f.col('gene_name') == 'DYNLL1') & 
        (f.col('position_number') == 2)
    )
    .show(1, False, True)
)

-RECORD 0--------------------------------------------
 protein_id      | ENSP00000449088                   
 chr             | chr12                             
 strand          | +                                 
 gene_id         | ENSG00000088986                   
 transcript_id   | ENST00000552870                   
 gene_name       | DYNLL1                            
 position        | [120496425, 120496426, 120496427] 
 position_number | 2                                 
only showing top 1 row



In [147]:
(
    mappings
    .filter(
        (f.col('protein_id') == 'ENSP00000376297') & 
        (f.col('position_number') == 70)
    )
    .show(1, False, True)
)

-RECORD 0--------------------------------------------
 protein_id      | ENSP00000376297                   
 chr             | chr12                             
 strand          | +                                 
 gene_id         | ENSG00000088986                   
 transcript_id   | ENST00000392509                   
 gene_name       | DYNLL1                            
 position        | [120496491, 120496492, 120496493] 
 position_number | 70                                



In [156]:
(
    test_df
    .filter(f.col('protein_id') == 'ENSP00000376297')
#     .select('protein_id', 'strand', 'position', 'position_number')
    .show(100, truncate=False)
)

+-----+---------+---------+------+---------------+---------------+---------+---------------+
|chr  |start    |end      |strand|gene_id        |transcript_id  |gene_name|protein_id     |
+-----+---------+---------+------+---------------+---------------+---------+---------------+
|chr12|120496422|120496553|+     |ENSG00000088986|ENST00000392509|DYNLL1   |ENSP00000376297|
|chr12|120498073|120498210|+     |ENSG00000088986|ENST00000392509|DYNLL1   |ENSP00000376297|
+-----+---------+---------+------+---------------+---------------+---------+---------------+



In [154]:
test_df.filter(f.col('transcript_id') == 'ENST00000392509').show()

+-----+---------+---------+------+---------------+---------------+---------+---------------+
|  chr|    start|      end|strand|        gene_id|  transcript_id|gene_name|     protein_id|
+-----+---------+---------+------+---------------+---------------+---------+---------------+
|chr12|120496422|120496553|     +|ENSG00000088986|ENST00000392509|   DYNLL1|ENSP00000376297|
|chr12|120498073|120498210|     +|ENSG00000088986|ENST00000392509|   DYNLL1|ENSP00000376297|
+-----+---------+---------+------+---------------+---------------+---------+---------------+



In [2]:
import pandas as pd

df = pd.read_csv('https://www.ebi.ac.uk/gwas/api/search/downloads/full', sep='\t')
df.head()

  df = pd.read_csv('https://www.ebi.ac.uk/gwas/api/search/downloads/full', sep='\t')


Unnamed: 0,DATE ADDED TO CATALOG,PUBMEDID,FIRST AUTHOR,DATE,JOURNAL,LINK,STUDY,DISEASE/TRAIT,INITIAL SAMPLE SIZE,REPLICATION SAMPLE SIZE,...,CONTEXT,INTERGENIC,RISK ALLELE FREQUENCY,P-VALUE,PVALUE_MLOG,P-VALUE (TEXT),OR or BETA,95% CI (TEXT),PLATFORM [SNPS PASSING QC],CNV
0,2017-08-10,28443625,Justice AE,2017-04-26,Nat Commun,www.ncbi.nlm.nih.gov/pubmed/28443625,"Genome-wide meta-analysis of 241,258 adults ac...",Waist circumference adjusted for BMI (adjusted...,"97,400 European ancestry women, 63,892 Europea...","21,496 European ancestry women, 24,385 Europea...",...,intergenic_variant,1.0,0.7215,1e-06,6.0,(men),0.0293,[0.017-0.041] unit decrease,"Affymetrix, Illumina, Perlegen [up to 2800000]...",N
1,2017-08-10,28443625,Justice AE,2017-04-26,Nat Commun,www.ncbi.nlm.nih.gov/pubmed/28443625,"Genome-wide meta-analysis of 241,258 adults ac...",Waist circumference adjusted for BMI (adjusted...,"97,400 European ancestry women, 63,892 Europea...","21,496 European ancestry women, 24,385 Europea...",...,intron_variant,0.0,0.4712,3e-08,7.522879,(men),0.0275,[0.018-0.037] unit decrease,"Affymetrix, Illumina, Perlegen [up to 2800000]...",N
2,2017-08-10,28443625,Justice AE,2017-04-26,Nat Commun,www.ncbi.nlm.nih.gov/pubmed/28443625,"Genome-wide meta-analysis of 241,258 adults ac...",Waist circumference adjusted for BMI (adjusted...,"97,400 European ancestry women, 63,892 Europea...","21,496 European ancestry women, 24,385 Europea...",...,intron_variant,0.0,0.3801,1e-07,7.0,(men),0.0268,[0.017-0.037] unit decrease,"Affymetrix, Illumina, Perlegen [up to 2800000]...",N
3,2017-08-10,28443625,Justice AE,2017-04-26,Nat Commun,www.ncbi.nlm.nih.gov/pubmed/28443625,"Genome-wide meta-analysis of 241,258 adults ac...",Waist circumference adjusted for BMI (adjusted...,"97,400 European ancestry women, 63,892 Europea...","21,496 European ancestry women, 24,385 Europea...",...,regulatory_region_variant,1.0,0.4322,7e-08,7.154902,(men),0.0264,[0.017-0.036] unit decrease,"Affymetrix, Illumina, Perlegen [up to 2800000]...",N
4,2017-08-10,28443625,Justice AE,2017-04-26,Nat Commun,www.ncbi.nlm.nih.gov/pubmed/28443625,"Genome-wide meta-analysis of 241,258 adults ac...",Waist circumference adjusted for BMI (adjusted...,"97,400 European ancestry women, 63,892 Europea...","21,496 European ancestry women, 24,385 Europea...",...,TF_binding_site_variant,1.0,0.6556,6e-07,6.221849,(men),0.026,[0.016-0.036] unit decrease,"Affymetrix, Illumina, Perlegen [up to 2800000]...",N


In [3]:
len(df)

398342

In [8]:
df = pd.read_csv('https://www.ebi.ac.uk/gwas/api/search/downloads/full', sep='\t')
len(df.loc[df['STRONGEST SNP-RISK ALLELE'].str.endswith('?')])/len(df)

0.22570052869142596

In [18]:
df = pd.read_csv('https://www.ebi.ac.uk/gwas/api/search/downloads/full', sep='\t')
print(f'Number of associations without chromosome/position: {len(df.loc[df.CHR_POS.isna()])}')
print(f'Number of associations without chromosome/position but with rsid: {len(df.loc[(df.CHR_POS.isna())&(df.SNPS.str.startswith("rs"))])}')


Number of associations without chromosome/position: 18246
Number of associations without chromosome/position but with rsid: 2636


In [15]:
df.columns

Index(['DATE ADDED TO CATALOG', 'PUBMEDID', 'FIRST AUTHOR', 'DATE', 'JOURNAL',
       'LINK', 'STUDY', 'DISEASE/TRAIT', 'INITIAL SAMPLE SIZE',
       'REPLICATION SAMPLE SIZE', 'REGION', 'CHR_ID', 'CHR_POS',
       'REPORTED GENE(S)', 'MAPPED_GENE', 'UPSTREAM_GENE_ID',
       'DOWNSTREAM_GENE_ID', 'SNP_GENE_IDS', 'UPSTREAM_GENE_DISTANCE',
       'DOWNSTREAM_GENE_DISTANCE', 'STRONGEST SNP-RISK ALLELE', 'SNPS',
       'MERGED', 'SNP_ID_CURRENT', 'CONTEXT', 'INTERGENIC',
       'RISK ALLELE FREQUENCY', 'P-VALUE', 'PVALUE_MLOG', 'P-VALUE (TEXT)',
       'OR or BETA', '95% CI (TEXT)', 'PLATFORM [SNPS PASSING QC]', 'CNV'],
      dtype='object')