In [None]:
%%bash

# Fetch the evidence rejected by the ETL of the 22.04 pre-release.
# The destination directory is created up front: gsutil names recursively copied objects
# differently depending on whether the destination already exists, and the notebook expects
# the files to end up under ~/project_data/evidence.
mkdir -p ~/project_data
gsutil cp -r gs://open-targets-pre-data-releases/22.04/output/etl/parquet/errors/evidence  ~/project_data/

In [1]:
import json
import os

import pandas as pd
import pyspark.sql.functions as f
import requests
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, IntegerType, StringType

# Create (or reuse) a Spark session with master 'local[*]'.
local_builder = SparkSession.builder.master('local[*]')
spark = local_builder.getOrCreate()

In [2]:
# Inspect the schema of the failed-evidence dataset.
# The location is resolved from the home directory, matching the `~/project_data/` target of
# the download cell, instead of relying on a user-specific absolute path.
(
    spark.read.parquet(os.path.expanduser('~/project_data/evidence'))
    .printSchema()
)

root
 |-- datasourceId: string (nullable = true)
 |-- targetId: string (nullable = true)
 |-- alleleOrigins: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- allelicRequirements: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ancestry: string (nullable = true)
 |-- ancestryId: string (nullable = true)
 |-- beta: double (nullable = true)
 |-- betaConfidenceIntervalLower: double (nullable = true)
 |-- betaConfidenceIntervalUpper: double (nullable = true)
 |-- biologicalModelAllelicComposition: string (nullable = true)
 |-- biologicalModelGeneticBackground: string (nullable = true)
 |-- biologicalModelId: string (nullable = true)
 |-- biomarkerName: string (nullable = true)
 |-- biomarkers: struct (nullable = true)
 |    |-- geneExpression: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |-- variant

In [4]:
# Load the evidence rejected by the ETL and mark it for caching, as later cells aggregate this
# frame repeatedly. The location is resolved from the home directory, matching the
# `~/project_data/` target of the download cell, instead of a user-specific absolute path.
failed_evidence = (
    spark.read.parquet(os.path.expanduser('~/project_data/evidence'))
    .persist()
)

# Counting is an action, so it also materialises the persisted frame.
print(f'Number of failed evidence: {failed_evidence.count()}')

Number of failed evidence: 354943


In [9]:
# Count the failed evidence for every data source and every combination of the validation
# flags. The already persisted `failed_evidence` frame is reused rather than reading the
# parquet files from disk a second time.
resolved_reason = (
    failed_evidence
    .groupby('datasourceId', 'resolvedTarget', 'resolvedDisease', 'nullifiedScore', 'markedDuplicate')
    .count()
    .orderBy('datasourceId')
    .persist()
)

resolved_reason.show(250)

+------------------+--------------+---------------+--------------+---------------+------+
|      datasourceId|resolvedTarget|resolvedDisease|nullifiedScore|markedDuplicate| count|
+------------------+--------------+---------------+--------------+---------------+------+
| cancer_biomarkers|          true|          false|         false|          false|    19|
| cancer_biomarkers|          true|           true|         false|           true|    29|
|cancer_gene_census|          true|          false|         false|           true|  7018|
|cancer_gene_census|          true|          false|         false|          false|   897|
|            chembl|         false|           true|         false|          false|     9|
|            chembl|          true|           true|         false|           true| 67656|
|            chembl|          true|          false|         false|           true|   265|
|            chembl|          true|          false|         false|          false|  3983|
|         

In [11]:
(
    resolved_reason
    # Keep only the summary rows where the target was not resolved.
    .where(f.col('resolvedTarget') == False)
    .show(n=250)
)

+------------------+--------------+---------------+--------------+---------------+-----+
|      datasourceId|resolvedTarget|resolvedDisease|nullifiedScore|markedDuplicate|count|
+------------------+--------------+---------------+--------------+---------------+-----+
|            chembl|         false|           true|         false|          false|    9|
|  expression_atlas|         false|           true|         false|          false|   15|
|    gene2phenotype|         false|          false|         false|          false|    2|
|    gene2phenotype|         false|           true|         false|          false|    4|
|    gene2phenotype|         false|           true|          true|          false|    2|
|       gene_burden|         false|          false|         false|          false|   13|
|       gene_burden|         false|           true|         false|          false|   83|
|  genomics_england|         false|           true|         false|          false|  534|
|  genomics_england| 

In [17]:
# List the source-provided target labels/identifiers that were not resolved, counted per
# data source and sorted descending by data source and then by count.
(
    failed_evidence
    .where(f.col('resolvedTarget') == False)
    .groupBy('datasourceId', 'targetFromSource', 'targetFromSourceId')
    .count()
    .orderBy(f.col('datasourceId').desc(), f.col('count').desc())
    .show(n=300)
)

+------------------+----------------+------------------+-----+
|      datasourceId|targetFromSource|targetFromSourceId|count|
+------------------+----------------+------------------+-----+
|uniprot_literature|            null|            Q156A1|    1|
|            sysbio|            null|               FYB|    2|
|            sysbio|            null|              WARS|    1|
|            sysbio|            null|          PPAPDC1B|    1|
|            sysbio|            null|           STATIP1|    1|
|            sysbio|            null|          C1orf143|    1|
|            sysbio|            null|           MOBKL2B|    1|
|            sysbio|            null|            TMEPAI|    1|
|            sysbio|            null|         C20orf133|    1|
|            sysbio|            null|            FAM49A|    1|
|            sysbio|            null|             LARGE|    1|
|            sysbio|            null|              CTGF|    1|
|            sysbio|            null|             MLL3S

In [22]:
# Aggregate the evidence whose disease was not resolved: one row per data source and
# source-provided disease label / identifier / mapped identifier, sorted descending by
# data source and then by count. The result is persisted because it is displayed repeatedly.
failed_disease = (
    failed_evidence
    .where(f.col('resolvedDisease') == False)
    .groupBy('datasourceId', 'diseaseFromSource', 'diseaseFromSourceId', 'diseaseFromSourceMappedId')
    .count()
    .orderBy(f.col('datasourceId').desc(), f.col('count').desc())
    .persist()
)

print(f'Number of disease failed: {failed_disease.count()}')
failed_disease.show(n=300)

Number of disease failed: 15860
+------------------+--------------------+-------------------+-------------------------+-----+
|      datasourceId|   diseaseFromSource|diseaseFromSourceId|diseaseFromSourceMappedId|count|
+------------------+--------------------+-------------------+-------------------------+-----+
|  uniprot_variants|Polycystic kidney...|        OMIM:173900|                     null|   60|
|  uniprot_variants|Ataxia telangiect...|        OMIM:208900|              EFO_0004922|   41|
|  uniprot_variants|              Glioma|        OMIM:137800|            MONDO_0100242|   20|
|  uniprot_variants|              Glioma|        OMIM:137800|            MONDO_0024498|   20|
|  uniprot_variants| Neural tube defects|        OMIM:182940|            MONDO_0020705|   20|
|  uniprot_variants|Glanzmann thromba...|        OMIM:619267|                     null|   19|
|  uniprot_variants|Stuttering, famil...|        OMIM:184450|                     null|   15|
|  uniprot_variants|     Neu

In [23]:
# Show the first 20 rows (the default) without truncating long disease labels.
failed_disease.show(n=20, truncate=False)

+----------------+-------------------------------------------------------------------------------------+-------------------+-------------------------+-----+
|datasourceId    |diseaseFromSource                                                                    |diseaseFromSourceId|diseaseFromSourceMappedId|count|
+----------------+-------------------------------------------------------------------------------------+-------------------+-------------------------+-----+
|uniprot_variants|Polycystic kidney disease 1 with or without polycystic liver disease                 |OMIM:173900        |null                     |60   |
|uniprot_variants|Ataxia telangiectasia                                                                |OMIM:208900        |EFO_0004922              |41   |
|uniprot_variants|Glioma                                                                               |OMIM:137800        |MONDO_0100242            |20   |
|uniprot_variants|Glioma                                  