# Bulk data process

1. Read associations
2. Drop all mouse evidence
3. Group by associations
4. Calculate harmonic sum to get association score
5. Annotate disease if haematologically relevant or not


## For stats

1. Get a relevant set of genes from reactome
2. Generate an equivalent set of genes by random sampling

## Stats

1. Apply score threshold for association -> Get average count of haematological diseases
2. Apply score threshold for association -> for each gene, compare the median association score across all traits vs. haematological traits -> calculate average
3. 

In [3]:
%%bash 


# Open Targets release location on Google Cloud and the local download folder:
RELEASE_PARQUET='gs://open-targets-data-releases/22.06/output/etl/parquet'
DATA_DIR='/Users/dsuveges/project_data/'

# Fetching the association and target datasets from google.
# The same data can be found on ftp: ftp://ftp.ebi.ac.uk/pub/databases/opentargets/platform/22.06/output/etl/parquet
for dataset in associationByDatatypeDirect targets; do
    gsutil cp -r "${RELEASE_PARQUET}/${dataset}" "${DATA_DIR}"
done

# Fetching efo dataset from the platform (the .jsonl file is saved locally as .json):
wget -O diseases_efo.json https://platform.opentargets.org/data/ontology/efo_json/diseases_efo.jsonl

# Listing the downloaded json file(s):
ls -lah *json

-rw-r--r--  1 dsuveges  384566875   2.4M 24 Jun 11:17 diseases_efo.json


--2022-07-05 07:39:11--  https://platform.opentargets.org/data/ontology/efo_json/diseases_efo.jsonl
Resolving platform.opentargets.org (platform.opentargets.org)... 35.201.119.159
Connecting to platform.opentargets.org (platform.opentargets.org)|35.201.119.159|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2541582 (2.4M) [application/octet-stream]
Saving to: ‘diseases_efo.json’

     0K .......... .......... .......... .......... ..........  2% 4.86M 0s
    50K .......... .......... .......... .......... ..........  4% 3.50M 1s
   100K .......... .......... .......... .......... ..........  6% 1.18M 1s
   150K .......... .......... .......... .......... ..........  8% 38.7M 1s
   200K .......... .......... .......... .......... .......... 10% 43.2M 1s
   250K .......... .......... .......... .......... .......... 12% 7.16M 1s
   300K .......... .......... .......... .......... .......... 14% 10.1M 0s
   350K .......... .......... .......... .......... .......

In [125]:
from statistics import median

import pyspark.sql
import pyspark.sql.types as t
import pyspark.sql.functions as f
from pyspark.sql.window import Window


# Get (or create) the Spark session used by every cell below.
# "local[*]" runs Spark locally, with the worker thread count chosen by Spark.
spark = (
    pyspark.sql.SparkSession
    .builder
    .master("local[*]")
    .getOrCreate()
)
   
@f.udf(t.FloatType())
def harmonic_sum(data: list, scale_factor: float = 1, cap: float = None) -> float:
    """Return the harmonic sum of the scores in ``data``.

    The scores are ranked in descending order and the score at rank ``i``
    (1-based) is divided by ``i ** scale_factor`` before everything is summed.

    Args:
        data (list): list of floats to compute the harmonic sum from.
        scale_factor (float): exponent applied to the rank in the denominator. Defaults to 1.
        cap (float): if not None, never return a harmonic sum higher than the cap value.

    Returns:
        float: the (optionally capped) harmonic sum of the data passed.
    """
    # Rank a sorted copy, so the list handed over by the caller is not reordered in place:
    ranked_scores = sorted(data, reverse=True)
    total = sum(
        score / (rank ** scale_factor)
        for rank, score in enumerate(ranked_scores, start=1)
    )

    # Applying cap:
    if cap is not None and total > cap:
        total = cap

    # Always hand back a Python float: an int (the `0` summed from an empty list,
    # or an integer cap) does not match the FloatType declared for this UDF and
    # is expected to surface as null in Spark.
    return float(total)

# UDF wrapping `statistics.median`; the result is returned as a float column:
median_udf = f.udf(median, returnType=t.FloatType())

# Locations of the input datasets:
association_dataset = '/Users/dsuveges/project_data/associationByDatatypeDirect'
disease_dataset = '/Users/dsuveges/project/random_notebooks/2022.06.08-disease_expansion/diseases_efo.json'

### Processing disease table

1. Reading json into spark table.
2. Exploding the parent IDs column.
3. Annotate each disease if the disease or the parent is haematologically relevant
4. Aggregate annotation for each disease.

In [128]:
# Substrings that flag a disease label as relevant:
relevant_disease_pattern = ['immuno', 'hemato', 'hemo', 'blood', 'bleed']

# The patterns joined into one alternation regex, built once and reused below.
# NOTE(review): `rlike` matching is case-sensitive, so capitalised labels
# would not be picked up — confirm this is intended.
relevance_regex = '|'.join(relevant_disease_pattern)

# Disease table with the id and name columns renamed:
disease_raw = (
    spark.read.json(disease_dataset)
    .withColumnRenamed('id', 'diseaseId')
    .withColumnRenamed('name', 'diseaseLabel')
    .persist()
)

# Lookup table to resolve a parent id to the parent label:
parent_labels = disease_raw.select(
    f.col('diseaseId').alias('parentId'),
    f.col('diseaseLabel').alias('parentLabel'),
)

# True when the disease label OR the parent label matches. A missing label
# makes the comparison null, so the result is coalesced to False further down:
label_matches = (
    f.col('diseaseLabel').rlike(relevance_regex)
    | f.col('parentLabel').rlike(relevance_regex)
)

# One row for each disease/parent pair, flagged by relevance:
disease_parent_pairs = (
    disease_raw
    .select('diseaseId', 'diseaseLabel', f.explode_outer('parentIds').alias('parentId'))
    .join(parent_labels, on='parentId', how='left')
    .withColumn('isRelevant', f.coalesce(label_matches, f.lit(False)))
)

# A disease is relevant if at least one of its disease/parent rows is relevant;
# the boolean flag is then turned into a 'relevant'/'not_relevant' label:
annotated_diseases = (
    disease_parent_pairs
    .groupBy('diseaseId', 'diseaseLabel')
    .agg(f.expr("any(isRelevant)").alias('isRelevant'))
    .withColumn(
        'isRelevant',
        f.when(f.col('isRelevant'), 'relevant').otherwise('not_relevant')
    )
    .persist()
)

annotated_diseases.filter(f.col('isRelevant') == 'relevant').count()


1411

### Reading targets

1. Read parquet and select columns

In [58]:
# Source column -> name used in this notebook:
target_columns = {
    'id': 'targetId',
    'approvedSymbol': 'targetSymbol',
    'approvedName': 'targetName',
}

# Target table reduced to the identifier, symbol and name columns:
targets = (
    spark.read.parquet('/Users/dsuveges/project_data/targets')
    .select(*[f.col(source).alias(renamed) for source, renamed in target_columns.items()])
    .persist()
)

### Processing associations

1. Read association file
2. Drop animal model data
3. Aggregate associations
4. Calculate harmonic sum
5. Join disease info
6. Join target info

In [54]:
# The association dataset is read once; both counts below start from it:
all_evidence = spark.read.parquet(association_dataset)
association_keys = ["diseaseId", "targetId"]

all_association_count = all_evidence.select(*association_keys).distinct().count()
print(f'Number of all associations: {all_association_count}')

# Distinct disease/target pairs left after removing the animal model datatype:
no_animal_associations = (
    all_evidence
    .filter(f.col('datatypeId') != 'animal_model')
    .select(*association_keys)
    .distinct()
)

no_animal_count = no_animal_associations.count()
print(f'Number of associations supported by sources not including mouse: {no_animal_count}')


Number of all associations: 2120908
Number of associations supported by sources not including mouse: 1614569


In [130]:
# Association rows without the animal model datatype:
non_animal_scores = (
    spark.read.parquet(association_dataset)
    .filter(f.col('datatypeId') != 'animal_model')
)

# One row per disease/target pair; the overall score is the harmonic sum
# of the remaining scores of the pair:
overall_scores = (
    non_animal_scores
    .groupBy('diseaseId', 'targetId')
    .agg(harmonic_sum(f.collect_list(f.col('score'))).alias('overall_score'))
)

# Adding the disease annotation (inner join) and the target annotation (left join):
assoc_df = (
    overall_scores
    .join(annotated_diseases, on='diseaseId')
    .join(targets, on='targetId', how='left')
    .persist()
)

# Preview of the annotated associations:
assoc_df.show()

# Saving data in a partitioned parquet:
assoc_df.write.mode('overwrite').parquet('Associations_w_disease_annot')

# Saving data in a single tsv file (compression is inferred from the file name):
assoc_pdf = assoc_df.toPandas()
assoc_pdf.to_csv('annotated_associations.tsv.gz', sep='\t', compression='infer', index=False)

+---------------+----------+-------------+--------------------+------------+------------+--------------------+
|       targetId| diseaseId|overall_score|        diseaseLabel|  isRelevant|targetSymbol|          targetName|
+---------------+----------+-------------+--------------------+------------+------------+--------------------+
|ENSG00000113749|DOID_10718| 0.0121586155|          giardiasis|not_relevant|        HRH2|histamine recepto...|
|ENSG00000120937|DOID_13406|  0.018237924|pulmonary sarcoid...|not_relevant|        NPPB|natriuretic pepti...|
|ENSG00000066427| DOID_7551| 0.0121586155|           gonorrhea|not_relevant|       ATXN3|            ataxin 3|
|ENSG00000095739| DOID_7551|   0.03039654|           gonorrhea|not_relevant|       BAMBI|BMP and activin m...|
|ENSG00000102755| DOID_7551|   0.18691845|           gonorrhea|not_relevant|        FLT1|fms related recep...|
|ENSG00000103335| DOID_7551|    0.5912127|           gonorrhea|not_relevant|      PIEZO1|piezo type mechan...|
|

## Analysis

In [131]:
# Minimum overall score an association needs to be kept;
# iteratively we can adjust this to make sure we are not too stringent.
score_threshold = 0.1
score = f.col('overall_score')

# Summary statistics calculated for each target within each relevance flag value:
score_summaries = [
    f.max(score).alias('max'),
    f.mean(score).alias('mean_score'),
    f.count(score).alias('disease_count'),
    median_udf(f.collect_list(score)).alias('median_score'),
]

aggregated_associations = (
    assoc_df
    # Applying the score filter:
    .filter(score >= score_threshold)
    # Grouping by target, the relevance flag values are pivoted into columns:
    .groupby('targetId', 'targetName', 'targetSymbol')
    .pivot('isRelevant')
    .agg(*score_summaries)
    .persist()
)

aggregated_associations.count()

23738

In [132]:
# Collect the aggregated table into pandas and save it as a gzipped tsv file:
aggregated_pdf = aggregated_associations.toPandas()
aggregated_pdf.to_csv('aggregated_associations.tsv.gz', sep='\t', index=False)