In [11]:
from pyspark.sql import SparkSession, functions as f, types as t

# Reuse the active Spark session if one exists, otherwise create it.
spark = SparkSession.builder.getOrCreate()

# Location of the evidence parquet files (expression_atlas partition, as the path indicates).
EVIDENCE_PATH = 'gs://open-targets-data-releases/23.02/output/etl/parquet/evidence/sourceId=expression_atlas/'

# Load the evidence and mark it for caching: every cell below reads from this dataframe.
evidence_output = spark.read.parquet(EVIDENCE_PATH).persist()

# Columns shown in the preview record.
PREVIEW_COLUMNS = [
    'datasourceId',
    'targetId',
    'biosamplesFromSource',
    'confidence',
    'contrast',
    'datatypeId',
    'literature',
    'log2FoldChangePercentileRank',
    'log2FoldChangeValue',
    'resourceScore',
    'studyId',
    'studyOverview',
    'diseaseId',
]

# Print a single record, one field per line, without truncating long values.
evidence_output.select(*PREVIEW_COLUMNS).show(n=1, truncate=False, vertical=True)

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------
 datasourceId                 | expression_atlas                                                                                                 
 targetId                     | ENSG00000051341                                                                                                  
 biosamplesFromSource         | [UBERON_0002369]                                                                                                 
 confidence                   | high                                                                                                             
 contrast                     | 'adrenocortical carcinoma' vs 'normal'                                                                           
 datatypeId                   | rna_expression                                                                              

23/02/23 11:04:25 WARN org.apache.spark.sql.execution.CacheManager: Asked to cache already cached data.


## General stats on the evidence set

In [15]:
def _distinct_count(column):
    """Return how many distinct values `column` takes in `evidence_output`."""
    return evidence_output.select(column).distinct().count()


# Gather the three headline numbers first, then report them together.
evidence_count = evidence_output.select('diseaseId', 'studyId').count()
disease_count = _distinct_count('diseaseId')
study_count = _distinct_count('studyId')

print(f'Number of evidence: {evidence_count}')
print(f'Number of unique diseases: {disease_count}')
print(f'Number of unique studies: {study_count}')



Number of evidence: 230903
Number of unique diseases: 204
Number of unique studies: 289


## Distribution of studies across diseases

- **Q**: How many diseases are represented by multiple studies?
- **A**: Roughly 30% of the diseases are represented by multiple studies. For these diseases, it is possible to select the best-powered study.

In [10]:
# Distinct studies reported per disease; written once and reused for both aggregate columns.
studies_of_disease = f.collect_set(f.col('studyId'))

# One row per disease backed by more than one study, largest study count first.
diseases_w_multiple_studies = (
    evidence_output
    .groupBy('diseaseId')
    .agg(
        studies_of_disease.alias('studies'),
        f.size(studies_of_disease).alias('study_count'),
    )
    .where(f.col('study_count') > 1)
    .orderBy(f.col('study_count').desc())
    .persist()
)

number_of_diseases_w_multiple_studies = diseases_w_multiple_studies.count()
print(f'Number of diseases with multiple studies: {number_of_diseases_w_multiple_studies}')

# Show the top rows with the full, untruncated list of studies.
diseases_w_multiple_studies.show(truncate=False)

23/02/23 11:01:28 WARN org.apache.spark.sql.execution.CacheManager: Asked to cache already cached data.


Number of diseases with multiple studies: 63
+-------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+
|diseaseId    |studies                                                                                                                                                                            |study_count|
+-------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+
|EFO_0000384  |[E-MTAB-2967, E-GEOD-66207, E-MTAB-5464, E-GEOD-69762, E-MEXP-2083, E-MTAB-5783, E-GEOD-83687, E-GEOD-93624, E-GEOD-101794, E-GEOD-57945, E-GEOD-112057, E-GEOD-59071, E-GEOD-6731]|13         |
|EFO_0000729  |[E-MTAB-2967, E-MTAB-7915, E-MTAB-5464, E-MEXP-2083, E-TABM-734, E-GEOD-65114, E-GEOD-83687, E-MTAB-7860, E-

## Distribution of diseases across studies

- **Q**: Is it possible that some studies are mapped to multiple diseases? 
- **A**: Yes. Some studies are mapped to multiple diseases; these are complex studies comprising several different contrasts.

In [14]:
# Distinct diseases annotated per study; written once and reused for both aggregate columns.
diseases_of_study = f.collect_set(f.col('diseaseId'))

# One row per study linked to more than one disease, largest disease count first.
studies_w_multiple_diseases = (
    evidence_output
    .groupBy('studyId')
    .agg(
        diseases_of_study.alias('diseaseIds'),
        f.size(diseases_of_study).alias('disease_count'),
    )
    .where(f.col('disease_count') > 1)
    .orderBy(f.col('disease_count').desc())
    .persist()
)

number_of_studies_w_multiple_diseases = studies_w_multiple_diseases.count()
print(f'Number of studies with multiple diseases: {number_of_studies_w_multiple_diseases}')

# Show the top rows with the full, untruncated list of diseases.
studies_w_multiple_diseases.show(truncate=False)

# Drill into a single study to see which of its annotations differ between evidence rows.
study_example = 'E-GEOD-50161'

# Study-level annotation columns to compare.
study_annotation_columns = [
    'datasourceId',
    'biosamplesFromSource',
    'contrast',
    'studyId',
    'studyOverview',
    'diseaseId',
]

example_annotations = (
    evidence_output
    .where(f.col('studyId') == study_example)
    .select(*study_annotation_columns)
    .distinct()
)
# Print every distinct combination with untruncated values.
example_annotations.show(truncate=False)

23/02/23 11:11:19 WARN org.apache.spark.sql.execution.CacheManager: Asked to cache already cached data.


Number of studies with multiple diseases: 53
+-------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------+
|studyId      |diseaseIds                                                                                                                                                                     |disease_count|
+-------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------+
|E-GEOD-3307  |[MONDO_0010680, MONDO_0019064, EFO_0000225, MONDO_0020336, MONDO_0010679, MONDO_0009675, MONDO_0010311, Orphanet_269, EFO_0000557, MONDO_0004976, MONDO_0011787, MONDO_0009676]|12           |
|E-GEOD-68086 |[EFO_1000044, EFO_0000365, MONDO_0018531, EFO_0000305, EFO_0003060, EFO_0000519]                                    