In [21]:
from pyspark.sql import functions as f, SparkSession, types as t
from pyspark.sql.window import Window

# Reuse the active Spark session if one exists, otherwise create it.
spark = SparkSession.builder.getOrCreate()

# Disease id -> disease name lookup (Open Targets 22.11 release).
diseases = (
    spark.read.parquet('gs://open-targets-data-releases/22.11/output/etl/parquet/diseases/')
    .select(
        f.col('id').alias('diseaseId'),
        f.col('name').alias('diseaseName'),
    )
)

# One row per (drug, target) pair, carrying the set of indication names.
# NOTE: `explode` emits no row for a null or empty array, so molecules without
# linked targets or without linked diseases are absent from `drugs`.
drugs = (
    spark.read.parquet('gs://open-targets-data-releases/22.11/output/etl/parquet/molecule/')
    .select(
        f.col('id').alias('drugId'),
        f.col('name').alias('drugName'),
        f.col('linkedDiseases.rows').alias('indications'),
        # One row per linked target.
        f.explode('linkedTargets.rows').alias('targetId'),
    )
    .select(
        '*',
        # One row per linked disease id, so it can be resolved to a name below.
        f.explode('indications').alias('diseaseId'),
    )
    .join(
        diseases, on='diseaseId', how='left'
    )
    # Collapse back to one row per (drug, target); disease ids become names.
    .groupBy('drugId', 'drugName', 'targetId')
    .agg(f.collect_set(f.col('diseaseName')).alias('indications'))
)

# Protein-coding targets from the 22.11 release.
protein_coding_targets = (
    spark.read.parquet('gs://open-targets-data-releases/22.11/output/etl/parquet/targets/')
    .where(f.col('biotype') == 'protein_coding')
)

# Genomic coordinates are pulled out of the nested `genomicLocation` struct.
location_columns = [
    f.col(f'genomicLocation.{field}').alias(field)
    for field in ('chromosome', 'start', 'end')
]

# Identifier and naming columns, renamed to the names used downstream.
annotation_columns = [
    f.col(source).alias(renamed)
    for source, renamed in (
        ('id', 'targetId'),
        ('approvedName', 'targetName'),
        ('approvedSymbol', 'targetSymbol'),
    )
]

# Left join keeps targets without any drug; their drug columns are null.
targets = (
    protein_coding_targets
    .select(*location_columns, *annotation_columns)
    .join(drugs, on='targetId', how='left')
    .persist()
)

targets.show()





+---------------+----------+---------+---------+--------------------+------------+-------------+----------------+--------------------+
|       targetId|chromosome|    start|      end|          targetName|targetSymbol|       drugId|        drugName|         indications|
+---------------+----------+---------+---------+--------------------+------------+-------------+----------------+--------------------+
|ENSG00000059588|         1|234391313|234479179|TAR (HIV-1) RNA b...|      TARBP1|         null|            null|                null|
|ENSG00000070182|        14| 64746283| 64879907|spectrin beta, er...|        SPTB|         null|            null|                null|
|ENSG00000070366|        17|  2059839|  2303785|SMG6 nonsense med...|        SMG6|         null|            null|                null|
|ENSG00000072071|        19| 14147743| 14206187|adhesion G protei...|      ADGRL1|         null|            null|                null|
|ENSG00000073536|        17| 35128730| 35142304| notchl

                                                                                

In [22]:
# Restrict to chromosome 22 once and derive both gene counts from that frame.
chr22_targets = targets.where(f.col('chromosome') == '22')

# Number of distinct genes located on chromosome 22.
on_chr22 = chr22_targets.select('targetId').dropDuplicates().count()

# Number of those genes that have at least one drug row with indications.
has_drug_with_indications = (
    f.col('drugId').isNotNull() & f.col('indications').isNotNull()
)
on_chr22_w_drugs = (
    chr22_targets
    .where(has_drug_with_indications)
    .select('targetId')
    .dropDuplicates()
    .count()
)

print(f'Number of genes on chr22: {on_chr22}')
print(f'Number of genes on chr22 with drugs: {on_chr22_w_drugs}')

                                                                                

Number of genes on chr22: 447
Number of genes on chr22 with drugs: 28


In [23]:
# Chromosome 22 target/drug rows that carry both a drug and its indications.
# Persisted because later cells reuse this frame.
on_chr22_w_drugs = (
    targets
    .filter(
        (f.col('chromosome') == '22') &
        (f.col('drugId').isNotNull()) &
        (f.col('indications').isNotNull())
    )
    .distinct()
    .persist()
)

# Number of drugs per target, most-drugged targets first.
(
    on_chr22_w_drugs
    .groupBy('targetId', 'targetSymbol', 'targetName')
    .agg(
        # Count distinct drug ids rather than rows, so the figure stays a drug
        # count even if a (target, drug) pair ever appears on several rows.
        f.countDistinct('drugId').alias('drugCount')
    )
    # Secondary key makes the order of tied counts reproducible across runs.
    .orderBy(f.col('drugCount').desc(), f.col('targetSymbol').asc())
    .show(30, truncate=False)
)


+---------------+------------+-------------------------------------------------------+---------+
|targetId       |targetSymbol|targetName                                             |drugCount|
+---------------+------------+-------------------------------------------------------+---------+
|ENSG00000186951|PPARA       |peroxisome proliferator activated receptor alpha       |24       |
|ENSG00000128271|ADORA2A     |adenosine A2a receptor                                 |22       |
|ENSG00000100346|CACNA1I     |calcium voltage-gated channel subunit alpha1 I         |16       |
|ENSG00000186716|BCR         |BCR activator of RhoGEF and GTPase                     |11       |
|ENSG00000100429|HDAC10      |histone deacetylase 10                                 |9        |
|ENSG00000100385|IL2RB       |interleukin 2 receptor subunit beta                    |8        |
|ENSG00000166862|CACNG2      |calcium voltage-gated channel auxiliary subunit gamma 2|8        |
|ENSG00000278195|SSTR3       |

                                                                                

In [30]:

# Disease index of the 22.11 release; read twice below, once per join side.
diseases_path = 'gs://open-targets-data-releases/22.11/output/etl/parquet/diseases/'

# (diseaseId, id) pairs, where `id` is one entry of the disease's therapeuticAreas.
disease_to_area_id = (
    spark.read.parquet(diseases_path)
    .select(
        f.col('id').alias('diseaseId'),
        f.explode('therapeuticAreas').alias('id'),
    )
)

# id -> name lookup used to turn therapeutic area ids into readable names.
area_names = (
    spark.read.parquet(diseases_path)
    .select('id', f.col('name').alias('diseaseName'))
)

# One row per disease with the set of its therapeutic area names.
diseases = (
    disease_to_area_id
    .join(area_names, on='id', how='left')
    .groupBy('diseaseId')
    .agg(f.collect_set('diseaseName').alias('therapeuticAreas'))
)

# Genetics portal evidence with resourceScore >= 0.5, reduced to (target, disease).
scored_evidence = (
    spark.read.json('gs://open-targets-data-releases/22.11/input/evidence-files/genetics-portal-evidences.json.gz')
    .where(f.col('resourceScore') >= 0.5)
    .select(
        f.col('targetFromSourceId').alias('targetId'),
        f.col('diseaseFromSourceMappedId').alias('diseaseId'),
    )
)

# Number of evidence rows per (target, therapeutic area).
# The inner join discards evidence whose disease id is missing from `diseases`.
loci_per_area = (
    scored_evidence
    .join(diseases, on='diseaseId')
    .select('targetId', f.explode('therapeuticAreas').alias('therapeuticArea'))
    .groupBy('targetId', 'therapeuticArea')
    .agg(f.count('*').alias('lociCount'))
)

# One row per target with a list of {therapeuticArea, lociCount} structs.
gwas_loci = (
    loci_per_area
    .groupBy('targetId')
    .agg(
        f.collect_list(f.struct('therapeuticArea', 'lociCount')).alias('therapeuticAreas')
    )
    .persist()
)

gwas_loci.show()



+---------------+--------------------+
|       targetId|    therapeuticAreas|
+---------------+--------------------+
|ENSG00000198870|[{urinary system ...|
|ENSG00000173926|  [{measurement, 5}]|
|ENSG00000182158|[{measurement, 35...|
|ENSG00000188938|[{biological proc...|
|ENSG00000070182|[{measurement, 59...|
|ENSG00000070366|[{cancer or benig...|
|ENSG00000153885|[{biological proc...|
|ENSG00000224578|[{cardiovascular ...|
|ENSG00000162377|  [{measurement, 1}]|
|ENSG00000183032|[{measurement, 26...|
|ENSG00000262655|[{measurement, 16...|
|ENSG00000106012|[{musculoskeletal...|
|ENSG00000087087|  [{measurement, 1}]|
|ENSG00000143013|[{measurement, 22...|
|ENSG00000177150|[{musculoskeletal...|
|ENSG00000166046|[{measurement, 2}...|
|ENSG00000163406|[{measurement, 37...|
|ENSG00000120341|[{nervous system ...|
|ENSG00000143198|  [{measurement, 5}]|
|ENSG00000213079|[{respiratory or ...|
+---------------+--------------------+
only showing top 20 rows



                                                                                

In [33]:
# Attach GWAS therapeutic-area counts to the chr22 target/drug rows.
# Left join: targets without GWAS evidence keep a null `therapeuticAreas`.
chr22_w_gwas = (
    on_chr22_w_drugs
    .join(gwas_loci, on='targetId', how='left')
    .persist()
)

# Spot-check roughly 20% of the rows; the fixed seed keeps the sampled records
# stable between runs instead of drawing a new random subset each time.
(
    chr22_w_gwas
    .sample(fraction=0.2, seed=42)
    .select('targetName', 'indications', 'therapeuticAreas')
    .show(vertical=True, truncate=False)
)

22/12/24 00:54:16 WARN org.apache.spark.sql.execution.CacheManager: Asked to cache already cached data.


-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [41]:
# One row per (target row, therapeutic-area struct); rows whose
# `therapeuticAreas` is null or empty produce no output from `explode`.
exploded_areas = chr22_w_gwas.select(
    'targetId',
    f.explode('therapeuticAreas').alias('ta'),
)

# Count the distinct (lociCount, therapeuticArea) pairs per target.
(
    exploded_areas
    .select(
        'targetId',
        f.col('ta.lociCount').alias('count'),
        f.col('ta.therapeuticArea').alias('therapeuticArea'),
    )
    .dropDuplicates()
    .groupBy('targetId')
    .count()
    .orderBy(f.desc('count'))
    .show(39)
)

+---------------+-----+
|       targetId|count|
+---------------+-----+
|ENSG00000183765|   11|
|ENSG00000100385|    7|
|ENSG00000100368|    7|
|ENSG00000100311|    7|
|ENSG00000100346|    6|
|ENSG00000186951|    5|
|ENSG00000100170|    5|
|ENSG00000198355|    5|
|ENSG00000186716|    5|
|ENSG00000100030|    3|
|ENSG00000093010|    2|
|ENSG00000100300|    1|
|ENSG00000177663|    1|
|ENSG00000128271|    1|
|ENSG00000159958|    1|
|ENSG00000196236|    1|
|ENSG00000240972|    1|
+---------------+-----+



In [42]:
# Inspect the raw genetics evidence (resourceScore >= 0.5) for a single target.
target_evidence = (
    spark.read.json('gs://open-targets-data-releases/22.11/input/evidence-files/genetics-portal-evidences.json.gz')
    .where(f.col('resourceScore') >= 0.5)
    .where(f.col('targetFromSourceId') == 'ENSG00000196236')
)
target_evidence.show()
  

[Stage 298:>                                                        (0 + 1) / 1]

+-------------------+---------------------------+---------------------------+------------------+-------------------+--------------------+-------------------------+----------+---------+--------------------------------+--------------------------------+--------------+--------------+---------+----------------------+---------------+------------------+------------+---------------+------------------+------------------------------+---------------+-----------+
|               beta|betaConfidenceIntervalLower|betaConfidenceIntervalUpper|      datasourceId|         datatypeId|   diseaseFromSource|diseaseFromSourceMappedId|literature|oddsRatio|oddsRatioConfidenceIntervalLower|oddsRatioConfidenceIntervalUpper|pValueExponent|pValueMantissa|projectId|publicationFirstAuthor|publicationYear|     resourceScore|     studyId|studySampleSize|targetFromSourceId|variantFunctionalConsequenceId|      variantId|variantRsId|
+-------------------+---------------------------+---------------------------+-----------

                                                                                