# Reviewed process

Based on the meeting we had on the 7th with the IMPC team, we agreed on the following:

1. Let's keep minimal score cutoff for removing low-confidence associations.
2. It makes sense to remove IMPC from the dataset.
3. Scoring is important so we should properly calculate the overall association score.

To properly calculate the overall association score, I need to follow this procedure:

1. Use the downloadable association data with datasource scores.
2. Apply a weight to each datasource.
3. Aggregate by disease/target pairs.
4. Apply the harmonic sum to the weighted datasource scores, then scale by the maximal value.

In [13]:
from statistics import median
from functools import reduce 

from pyspark.sql import dataframe
import pyspark.sql
import pyspark.sql.types as t
import pyspark.sql.functions as f
from pyspark.sql.window import Window

# Start (or re-use) a local Spark session running on all available cores:
spark = pyspark.sql.SparkSession.builder.master("local[*]").getOrCreate()
   
@f.udf(t.FloatType())
def harmonic_sum(data: list, scale_factor: float = 1, cap: float = None) -> float:
    """
    Returns the harmonic sum of the data passed.

    The values are ranked in descending order and the value at rank ``i``
    (1-based) contributes ``value / i ** scale_factor`` to the sum.

    Args:
        data (list): list of floats to compute the harmonic sum from. The list is not modified.
        scale_factor (float): exponent applied to the rank in the denominator. Defaults to 1
        cap (float): if not None, never return an harmonic sum higher than the cap value.
    Returns:
        harmonic_sum (float): the harmonic sum of the data passed (0.0 for an empty list)
    """

    # sorted() works on a copy, so the caller's list keeps its original order:
    ranked_scores = sorted(data, reverse=True)
    total = sum(score / (rank ** scale_factor) for rank, score in enumerate(ranked_scores, start=1))

    # Applying cap:
    if cap is not None and total > cap:
        total = cap

    # Always hand back a float so the value matches the declared FloatType:
    return float(total)

# An UDF to calculate median:
# Wraps statistics.median so it can be applied to an array column; the result is declared as a Spark float.
median_udf = f.udf(lambda l: median(l), t.FloatType())

# Input locations (absolute paths on the author's machine):
# - association scores broken down by datasource,
# - disease index, read further down with the JSON reader.
association_dataset = '/Users/dsuveges/project_data/associationByDatasourceDirect'
disease_dataset = '/Users/dsuveges/project/random_notebooks/2022.06.08-disease_expansion/diseases_efo.json'

### Processing disease as usual

In [90]:
# Name fragments that mark a disease as relevant:
relevant_disease_pattern = ['immuno', 'hemato', 'hemo', 'blood', 'bleed']

# Disease index with the identifier and name columns renamed:
disease_raw = (
    spark.read.json(disease_dataset)
    .withColumnRenamed('name', 'diseaseLabel')
    .withColumnRenamed('id', 'diseaseId')
    .persist()
)

# Single regular expression matching any of the fragments:
relevance_regex = '|'.join(relevant_disease_pattern)

annotated_diseases = (
    disease_raw
    # One row per disease/parent pair (explode_outer keeps diseases without parents):
    .select('diseaseId', 'diseaseLabel', f.explode_outer('parentIds').alias('parentId'))

    # Attach the label of each parent:
    .join(
        disease_raw
        .drop('parentIds')
        .withColumnRenamed('diseaseLabel', 'parentLabel')
        .withColumnRenamed('diseaseId', 'parentId'),
        on='parentId', how='left'
    )

    # A pair is relevant when the disease label OR the parent label matches:
    .withColumn(
        'isRelevant',
        f.when(
            f.col('diseaseLabel').rlike(relevance_regex) | f.col('parentLabel').rlike(relevance_regex),
            True
        ).otherwise(False)
    )

    # Collapse to one row per disease: relevant if any of its pairs is relevant:
    .groupBy('diseaseId', 'diseaseLabel')
    .agg(f.expr("any(isRelevant)").alias('isRelevant'))

    # Replace the boolean with a readable label:
    .withColumn(
        'isRelevant',
        f.when(f.col('isRelevant'), 'relevant').otherwise('not_relevant')
    )
    .persist()
)

relevant_count = annotated_diseases.filter(f.col('isRelevant') == 'relevant').count()

print(f'Number of all diseases: {annotated_diseases.count()}')
print(f'Number of relevant diseases: {relevant_count}')

annotated_diseases.show()


Number of all diseases: 23074
Number of relevant diseases: 1411
+-------------+--------------------+------------+
|    diseaseId|        diseaseLabel|  isRelevant|
+-------------+--------------------+------------+
|   GO_0046483|heterocycle metab...|not_relevant|
|   HP_0000112|         Nephropathy|not_relevant|
|   HP_0000405|Conductive hearin...|not_relevant|
|   HP_0000712|  Emotional lability|not_relevant|
|   HP_0001392|Abnormality of th...|not_relevant|
|   HP_0002376|Developmental reg...|not_relevant|
|   HP_0009487|Ulnar deviation o...|not_relevant|
|   HP_0012868|Abnormal sperm ta...|not_relevant|
|MONDO_0002185|        hyperostosis|not_relevant|
|MONDO_0003453|conjunctival intr...|not_relevant|
|MONDO_0004900|  peripheral vertigo|not_relevant|
|MONDO_0007244|      Caffey disease|not_relevant|
|MONDO_0007321|autosomal dominan...|not_relevant|
|MONDO_0007949|   Marshall syndrome|not_relevant|
|MONDO_0008016|trismus-pseudocam...|not_relevant|
|MONDO_0008682|Denys-Drash syndrome|

### Processing targets as usual

In [3]:
# Target index reduced to identifier, approved symbol and approved name:
targets = (
    spark.read.parquet('/Users/dsuveges/project_data/targets')
    .select([
        f.col(source_name).alias(new_name)
        for source_name, new_name in (
            ('id', 'targetId'),
            ('approvedSymbol', 'targetSymbol'),
            ('approvedName', 'targetName'),
        )
    ])
    .persist()
)

### Read and explore associations

In [4]:
# Preview of the datasource-level association table:
spark.read.parquet(association_dataset).show()

+-------------------+------------+-----------+---------------+-------------------+-------------+
|         datatypeId|datasourceId|  diseaseId|       targetId|              score|evidenceCount|
+-------------------+------------+-----------+---------------+-------------------+-------------+
|genetic_association| gene_burden|EFO_0000589|ENSG00000008710| 0.5543493036511893|           10|
|genetic_association| gene_burden|EFO_0000589|ENSG00000084674| 0.9160297417262854|           12|
|genetic_association| gene_burden|EFO_0000589|ENSG00000110245| 0.5422007545467868|            4|
|genetic_association| gene_burden|EFO_0000589|ENSG00000119772|0.20834690189720867|            1|
|genetic_association| gene_burden|EFO_0000589|ENSG00000124827| 0.1877214660192899|            1|
|genetic_association| gene_burden|EFO_0000589|ENSG00000130164| 0.9799287909752019|           35|
|genetic_association| gene_burden|EFO_0000589|ENSG00000132855| 0.5118943392430458|            4|
|genetic_association| gene_bur

In [25]:
# Target/disease pair used as a worked example:
target = 'ENSG00000065361' # HER3
disease = 'EFO_0000571' # Lung adenocarcinoma
# The association score is 0.37

# Overall association scores, renamed so they do not clash with the datasource scores:
overall_score = (
    spark.read.parquet('/Users/dsuveges/project_data/associationByOverallDirect/')
    .withColumnRenamed('score', 'overallScore')
    .drop('evidenceCount')
)

example_assoc = (
    spark.read.parquet(association_dataset)
    # All associations of the example target; the additional
    # `f.col('diseaseId') == disease` restriction is switched off:
    .filter(f.col('targetId') == target)
    .join(overall_score, on=['targetId', 'diseaseId'], how='inner')
    .persist()
)

example_assoc.show()

+---------------+--------------+-------------------+------------------+--------------------+-------------+--------------------+
|       targetId|     diseaseId|         datatypeId|      datasourceId|               score|evidenceCount|        overallScore|
+---------------+--------------+-------------------+------------------+--------------------+-------------+--------------------+
|ENSG00000065361| MONDO_0003036|         literature|         europepmc| 0.01823792392834863|            1|0.002217479128108...|
|ENSG00000065361|Orphanet_44890|         literature|         europepmc|0.015198269940290528|            2| 0.07437794575529624|
|ENSG00000065361|Orphanet_44890|         known_drug|            chembl| 0.12158615952232422|            1| 0.07437794575529624|
|ENSG00000065361|   EFO_1000391|genetic_association|ot_genetics_portal| 0.10756044474610776|            2| 0.06538930696596199|
|ENSG00000065361| MONDO_0017574|genetic_association|uniprot_literature|   0.607930797611621|            

In [30]:
from itertools import chain

# Weight of each datasource, keyed by datasourceId.
# The datasource score is multiplied by this weight before the harmonic sum is calculated.
weights = {
    "cancer_gene_census": 1,
    "cancer_biomarkers": 0.5,
    "chembl": 1,
    "crispr": 1,
    "europepmc": 0.2,
    "eva": 1,
    "eva_somatic": 1,
    "expression_atlas": 0.2,
    "gene2phenotype": 1,
    "genomics_england": 1,
    "ot_genetics_portal": 1,
    "intogen": 1,
    "impc": 0.2,
    "phewas_catalog": 1,
    "progeny": 0.5,
    "reactome": 1,
    "slapenrich": 0.5,
    "sysbio": 0.5,
    "uniprot": 1,
    "uniprot_literature": 1,
    "uniprot_somatic": 1,
}

@f.udf(t.FloatType())
def harmonic_sum(data: list, scale_factor: float = 1, cap: float = None) -> float:
    """
    Returns the harmonic sum of the data passed.

    The values are ranked in descending order and the value at rank ``i``
    (1-based) contributes ``value / i ** scale_factor`` to the sum.

    Args:
        data (list): list of floats to compute the harmonic sum from. The list is not modified.
        scale_factor (float): exponent applied to the rank in the denominator. Defaults to 1
        cap (float): if not None, never return an harmonic sum higher than the cap value.
    Returns:
        harmonic_sum (float): the harmonic sum of the data passed (0.0 for an empty list)
    """

    # sorted() works on a copy, so the caller's list keeps its original order:
    ranked_scores = sorted(data, reverse=True)
    total = sum(score / (rank ** scale_factor) for rank, score in enumerate(ranked_scores, start=1))

    # Applying cap:
    if cap is not None and total > cap:
        total = cap

    # Always hand back a float so the value matches the declared FloatType:
    return float(total)



# The theoretical maximum of the scores:
max_score = 2.0490903410431183 # 3.377194851502127

# Spark map literal built from the alternating key/value items of `weights`:
mapping_expr = f.create_map([f.lit(item) for pair in weights.items() for item in pair])

(
    example_assoc
    # Weight every datasource score by the weight of its datasource:
    .withColumn('weighted_score', f.col('score') * mapping_expr[f.col("datasourceId")])
    # One row per association:
    .groupBy('diseaseId', 'targetId')
    .agg(
        f.sum('evidenceCount').alias('evidence_count'),
        harmonic_sum(f.collect_list('weighted_score')).alias('harmonic_score'),
        f.first('overallScore').alias('originalScore')
    )
    # Scale the harmonic sum by the maximum:
    .withColumn('calculated_overallScore', f.col('harmonic_score') / f.lit(max_score))
    .show()
)

+--------------+---------------+--------------+--------------+--------------------+-----------------------+
|     diseaseId|       targetId|evidence_count|harmonic_score|       originalScore|calculated_overallScore|
+--------------+---------------+--------------+--------------+--------------------+-----------------------+
| MONDO_0003036|ENSG00000065361|             1|  0.0036475847|0.002217479128108...|   0.001780099504592...|
|Orphanet_44890|ENSG00000065361|             3|    0.12310599| 0.07437794575529624|    0.06007836035369098|
|   EFO_1000391|ENSG00000065361|             2|    0.10756045| 0.06538930696596199|    0.05249180389663397|
| MONDO_0017574|ENSG00000065361|             1|     0.6079308|  0.3695798546847018|     0.2966832487959592|
| MONDO_0009491|ENSG00000065361|             1|  0.0060793078|0.003695798546847...|   0.002966832469779377|
| MONDO_0016748|ENSG00000065361|             1|     0.6079308|  0.3695798546847018|     0.2966832487959592|
| MONDO_0021118|ENSG00000065

In [14]:
# Plain-Python version (the `@f.udf(t.FloatType())` decorator is left off), so the
# function can be called directly on a Python list:
def harmonic_sum(data: list, scale_factor: float = 1, cap: float = None) -> float:
    """
    Returns the harmonic sum of the data passed.

    The values are ranked in descending order and the value at rank ``i``
    (1-based) contributes ``value / i ** scale_factor`` to the sum.

    Args:
        data (list): list of floats to compute the harmonic sum from. The list is not modified.
        scale_factor (float): exponent applied to the rank in the denominator. Defaults to 1
        cap (float): if not None, never return an harmonic sum higher than the cap value.
    Returns:
        harmonic_sum (float): the harmonic sum of the data passed (0.0 for an empty list)
    """

    # sorted() works on a copy, so the caller's list keeps its original order:
    ranked_scores = sorted(data, reverse=True)
    total = sum(score / (rank ** scale_factor) for rank, score in enumerate(ranked_scores, start=1))

    # Applying cap:
    if cap is not None and total > cap:
        total = cap

    # Same return type on every path (empty input and capped values included):
    return float(total)

# Harmonic sum of the datasource weights themselves:
harmonic_sum(list(weights.values()))

3.377194851502127

In [29]:
# Datasource-level rows of the example target for a single disease:
example_assoc.filter(f.col('diseaseId') == 'EFO_0004587').show()

+---------------+-----------+-------------------+------------------+-------------------+-------------+------------------+
|       targetId|  diseaseId|         datatypeId|      datasourceId|              score|evidenceCount|      overallScore|
+---------------+-----------+-------------------+------------------+-------------------+-------------+------------------+
|ENSG00000065361|EFO_0004587|genetic_association|ot_genetics_portal|0.30630293777671963|            3|0.1862109892733839|
+---------------+-----------+-------------------+------------------+-------------------+-------------+------------------+



In [32]:
# Values to split and the indices at which a new chunk starts:
l = [10, 20, 6, 4323, 12, 32, 12, 342]
bp = [2, 4, 7]

# Pair every chunk start with the matching chunk end, then slice:
[l[start:end] for start, end in zip([0, *bp], [*bp, len(l)])]

[[10, 20], [6, 4323], [12, 32, 12], [342]]

## Problem with disease classification

**Date**: 2022.08.22

As Violeta reported, my script wrongly classified some of the diseases. Three examples:
- EFO_0000095 - chronic lymphocytic leukemia => not_relevant
- EFO_0004587 - lymphocyte count => not_relevant
- EFO_0007993 - lymphocyte percentage of leukocytes => not_relevant


### Steps:

1. Increase the scope of words that mark a disease as relevant from a hematological point of view.
2. Get a list of all diseases that are relevant in hematology.
3. Re-organize the dataset from a term -> parent into a term -> children format.
4. For each relevant disease term, recursively fetch all children.
5. From the combined set of all descendants, generate a unique set of diseases that are considered relevant.
6. Annotate all diseases with a boolean flag indicating whether they are relevant or not.

### Follow up:

1. Re-run the entire analysis.
2. Save data.
3. Submit to Google drive.

In [2]:
# Inputs: datatype-level associations and the disease index
association_dataset = '/Users/dsuveges/project_data/associationByDatatypeDirect'
disease_dataset = '/Users/dsuveges/project/random_notebooks/2022.06.08-disease_expansion/diseases_efo.json'

# A disease whose label contains any of these fragments is a starting point for collecting child terms:
relevant_disease_pattern = ['immuno', 'hemato', 'hemo', 'blood', 'bleed', 'lympho']

# Load the disease index and rename the identifier and name columns:
disease_raw = (
    spark.read.json(disease_dataset)
    .withColumnRenamed('name', 'diseaseLabel')
    .withColumnRenamed('id', 'diseaseId')
    .persist()
)

disease_raw.show()
print(f'Number of diseases: {disease_raw.count()}')

+------------+--------------------+--------------------+
|   diseaseId|        diseaseLabel|           parentIds|
+------------+--------------------+--------------------+
|DOID_0050890|     synucleinopathy|[MONDO_0019052, M...|
|  DOID_10113|     trypanosomiasis|     [MONDO_0002428]|
|  DOID_10718|          giardiasis|[MONDO_0002428, E...|
|  DOID_13406|pulmonary sarcoid...|[Orphanet_797, MO...|
|   DOID_1947|      trichomoniasis|     [MONDO_0002428]|
|   DOID_7551|           gonorrhea|[EFO_0003955, MON...|
| EFO_0011021|BRCA1 mutation ca...|       [EFO_0007658]|
| EFO_0011022|BRCA2 mutation ca...|       [EFO_0007658]|
|  GO_0000003|reproductive process|        [GO_0008150]|
|  GO_0000226|microtubule cytos...|        [GO_0006996]|
|  GO_0000278|  mitotic cell cycle|        [GO_0007049]|
|  GO_0001775|     cell activation|        [GO_0044763]|
|  GO_0002445|type II hypersens...|        [GO_0002524]|
|  GO_0002524|    hypersensitivity|        [GO_0002526]|
|  GO_0002526|acute inflammator

In [22]:
# Function to retrieve all descendants of a disease term:
def get_children(did: str, children: list = None) -> list:
    """
    Collect a disease term together with all of its descendants.

    The ontology is walked depth-first using the module-level
    `descendant_mapping` dictionary (parent identifier -> child identifiers).
    Terms missing from the dictionary have no children. Every term is
    reported once, even when it can be reached through several parents.

    Args:
        did (str): disease identifier to start from
        children (list): optional list to which the collected terms are appended;
            a new list is created when omitted
    Return:
        list with `did` followed by its descendants
    """
    # A fresh list per call: a mutable default would be shared by every call.
    if children is None:
        children = []

    seen = set(children)
    pending = [did]
    while pending:
        term = pending.pop()

        # Already collected through another parent (or already present in the list passed in):
        if term in seen:
            continue

        seen.add(term)
        children.append(term)

        # Reversed so that the children are visited in their stored order:
        pending.extend(reversed(list(descendant_mapping.get(term, ()))))

    return children


# Get all disease values with relevant names:
relevant_parent_terms = [
    row['diseaseId'] for row in (
        disease_raw
        .filter(f.col('diseaseLabel').rlike('|'.join(relevant_disease_pattern)))
        .select('diseaseId')
        .collect()
    )
]

# Reconstruct data so instead of parent, it will contain children terms:
childrend_rows = (
    disease_raw
    .select('diseaseId', f.explode('parentIds').alias('parentId'))
    .groupBy('parentId')
    .agg(f.collect_set(f.col('diseaseId')).alias('childIds'))
    .collect()
)

# Creating a mapping dictionary: keys are parent terms, values are lists of children.
descendant_mapping = {row['parentId']: row['childIds'] for row in childrend_rows}

# Fetching all the descendant of relevant diseases:
all_children = [get_children(term, []) for term in relevant_parent_terms]

# Merging the lists into one collection of unique diseases. set().union() also copes
# with an empty `all_children` (reduce() without an initial value raises on an empty
# sequence) and avoids building ever longer intermediate lists:
all_relevant = list(set().union(*all_children))

print(f'Number of relevant diseases: {len(all_relevant)}')

Number of relevant diseases: 2419


In [32]:
# Diseases whose own label matches one of the relevant name fragments:
relevant_parents = (
    disease_raw
    .filter(f.col('diseaseLabel').rlike('|'.join(relevant_disease_pattern)))
    .select('diseaseId', 'diseaseLabel')
    .persist()
)

# Export the list as a tab-separated file, then preview it:
relevant_parents.toPandas().to_csv('relevant_parent_terms.tsv', sep='\t', index=False)

relevant_parents.show()


+----------+--------------------+
| diseaseId|        diseaseLabel|
+----------+--------------------+
|GO_0007596|   blood coagulation|
|GO_1900133|regulation of ren...|
|GO_1900134|negative regulati...|
|GO_1900135|positive regulati...|
|HP_0000225|   Gingival bleeding|
|HP_0000573|  Retinal hemorrhage|
|HP_0001871|Abnormality of bl...|
|HP_0001892|   Abnormal bleeding|
|HP_0001898|Increased red blo...|
|HP_0001933|Subcutaneous hemo...|
|HP_0002239|Gastrointestinal ...|
|HP_0003010|Prolonged bleedin...|
|HP_0003111|Abnormal blood io...|
|HP_0004421|Elevated systolic...|
|HP_0004804|Congenital hemoly...|
|HP_0005117|Elevated diastoli...|
|HP_0005387|Combined immunode...|
|HP_0008277|Abnormal blood zi...|
|HP_0010931|Abnormal blood so...|
|HP_0011015|Abnormal blood gl...|
+----------+--------------------+
only showing top 20 rows



In [33]:
# Show the diseases that are flagged as relevant although they are absent from
# `relevant_parents`, i.e. their own label does not match the name pattern.
# NOTE(review): `relevant_diseases` is not assigned by any cell above this one; it is
# defined in a cell further down, so a top-to-bottom run stops here — confirm the cell order.
(
    relevant_diseases
    # Left join: diseases missing from `relevant_parents` get a null `isParent`...
    .join(relevant_parents.select('diseaseId', f.lit(True).alias('isParent')), on='diseaseId', how='left')
    # ...which is turned into an explicit False here:
    .withColumn('isParent', f.when(f.col('isParent') == True, True).otherwise(False))
    .filter((f.col('isParent')==False) & (f.col('isRelevant')==True))
    .show()
)

+----------+--------------------+--------------------+----------+--------+
| diseaseId|        diseaseLabel|           parentIds|isRelevant|isParent|
+----------+--------------------+--------------------+----------+--------+
|GO_0072718|response to cispl...|       [EFO_0004647]|      true|   false|
|GO_0097328|response to carbo...|       [EFO_0004647]|      true|   false|
|GO_1902520|response to doxor...|       [EFO_0005257]|      true|   false|
|GO_1902522|response to epiru...|       [EFO_0005257]|      true|   false|
|HP_0000132|         Menorrhagia|[HP_0001892, HP_0...|      true|   false|
|HP_0000978|Bruising suscepti...|        [HP_0001933]|      true|   false|
|HP_0000979|             Purpura|        [HP_0001933]|      true|   false|
|HP_0001693|       Cardiac shunt|        [HP_0011028]|      true|   false|
|HP_0001873|    Thrombocytopenia|        [HP_0001871]|      true|   false|
|HP_0001876|        Pancytopenia|        [HP_0001871]|      true|   false|
|HP_0001877|Abnormal eryt

In [37]:
# Annotating diseases if they are hematologically relevant:
relevant_diseases = (
    disease_raw
    .withColumn('isRelevant', f.when(f.col('diseaseId').isin(all_relevant), True).otherwise(False))
    .persist()
)

relevant_diseases.show()
# # Saving relevant diseases as a tsv:
# (
#     relevant_diseases
#     .filter(f.col('isRelevant') == True)
#     .select('diseaseId', 'diseaseLabel')
#     .toPandas()
#     .to_csv('/Users/dsuveges/Downloads/all_relevant_diseases.tsv', sep='\t', index=False)
# )

# Saving all diseases with annotation:
# (
#     relevant_diseases
#     .withColumn('parentIds', f.concat_ws('|', f.col('parentIds')))
#     .toPandas()
#     .to_csv('all_diseases_w_flag.tsv', sep='\t', index=False)
# )

+------------+--------------------+--------------------+----------+
|   diseaseId|        diseaseLabel|           parentIds|isRelevant|
+------------+--------------------+--------------------+----------+
|DOID_0050890|     synucleinopathy|[MONDO_0019052, M...|     false|
|  DOID_10113|     trypanosomiasis|     [MONDO_0002428]|     false|
|  DOID_10718|          giardiasis|[MONDO_0002428, E...|     false|
|  DOID_13406|pulmonary sarcoid...|[Orphanet_797, MO...|     false|
|   DOID_1947|      trichomoniasis|     [MONDO_0002428]|     false|
|   DOID_7551|           gonorrhea|[EFO_0003955, MON...|     false|
| EFO_0011021|BRCA1 mutation ca...|       [EFO_0007658]|     false|
| EFO_0011022|BRCA2 mutation ca...|       [EFO_0007658]|     false|
|  GO_0000003|reproductive process|        [GO_0008150]|     false|
|  GO_0000226|microtubule cytos...|        [GO_0006996]|     false|
|  GO_0000278|  mitotic cell cycle|        [GO_0007049]|     false|
|  GO_0001775|     cell activation|        [GO_0

### Processing associations

1. Read association file
2. Drop animal model data
3. Aggregate associations
4. Calculate harmonic sum
5. Join disease info
6. Join target info


In [6]:
# Distinct disease/target pairs over every datatype:
all_association_count = (
    spark.read.parquet(association_dataset)
    .select("diseaseId", "targetId")
    .distinct()
    .count()
)
print(f'Number of all associations: {all_association_count}')

# Distinct disease/target pairs that keep support once the animal model rows are removed:
no_animal_associations = (
    spark.read.parquet(association_dataset)
    .filter(f.col('datatypeId') != 'animal_model')
    .select("diseaseId", "targetId")
    .distinct()
)

print(f'Number of associations supported by sources not including mouse: {no_animal_associations.count()}')


Number of all associations: 2120908
Number of associations supported by sources not including mouse: 1614569


In [7]:
@f.udf(t.FloatType())
def harmonic_sum(data: list, scale_factor: float = 1, cap: float = None) -> float:
    """
    Returns the harmonic sum of the data passed.

    The values are ranked in descending order and the value at rank ``i``
    (1-based) contributes ``value / i ** scale_factor`` to the sum.

    Args:
        data (list): list of floats to compute the harmonic sum from. The list is not modified.
        scale_factor (float): exponent applied to the rank in the denominator. Defaults to 1
        cap (float): if not None, never return an harmonic sum higher than the cap value.
    Returns:
        harmonic_sum (float): the harmonic sum of the data passed (0.0 for an empty list)
    """

    # sorted() works on a copy, so the caller's list keeps its original order:
    ranked_scores = sorted(data, reverse=True)
    total = sum(score / (rank ** scale_factor) for rank, score in enumerate(ranked_scores, start=1))

    # Applying cap:
    if cap is not None and total > cap:
        total = cap

    # Always hand back a float so the value matches the declared FloatType:
    return float(total)

# An UDF to calculate median:
def _median_of_scores(scores: list):
    """
    Median of the scores that are at least 1.

    Returns None (null in Spark) when no score passes the threshold, because
    statistics.median raises StatisticsError on an empty list.

    NOTE(review): the `>= 1` threshold is carried over unchanged from the
    original lambda; confirm it is the intended cutoff.

    Args:
        scores (list): list of numeric scores
    Returns:
        the median as a float, or None if nothing is left after filtering
    """
    kept = [x for x in scores if x >= 1]
    return float(median(kept)) if kept else None


median_udf = f.udf(_median_of_scores, t.FloatType())


# Processing targets: keep the identifier, approved symbol and approved name under new column names.
targets = (
    spark.read.parquet('/Users/dsuveges/project_data/targets')
    .select([
        f.col(source_name).alias(new_name)
        for source_name, new_name in (
            ('id', 'targetId'),
            ('approvedSymbol', 'targetSymbol'),
            ('approvedName', 'targetName'),
        )
    ])
    .persist()
)


In [8]:
assoc_df = (
    # Associations, without the animal model rows:
    spark.read.parquet(association_dataset)
    .filter(f.col('datatypeId') != 'animal_model')

    # One row per association, scored with the harmonic sum of its remaining scores:
    .groupBy('diseaseId', 'targetId')
    .agg(harmonic_sum(f.collect_list('score')).alias('overall_score'))

    # Disease label and relevance flag (associations of unknown diseases are dropped):
    .join(relevant_diseases.drop('parentIds'), on='diseaseId', how='inner')

    # Target symbol and name (associations are kept even if the target is unknown):
    .join(targets, on='targetId', how='left')
    .persist()
)

# How does it look like:
assoc_df.show()


+---------------+----------+-------------+--------------------+----------+------------+--------------------+
|       targetId| diseaseId|overall_score|        diseaseLabel|isRelevant|targetSymbol|          targetName|
+---------------+----------+-------------+--------------------+----------+------------+--------------------+
|ENSG00000113749|DOID_10718| 0.0121586155|          giardiasis|     false|        HRH2|histamine recepto...|
|ENSG00000120937|DOID_13406|  0.018237924|pulmonary sarcoid...|     false|        NPPB|natriuretic pepti...|
|ENSG00000066427| DOID_7551| 0.0121586155|           gonorrhea|     false|       ATXN3|            ataxin 3|
|ENSG00000095739| DOID_7551|   0.03039654|           gonorrhea|     false|       BAMBI|BMP and activin m...|
|ENSG00000102755| DOID_7551|   0.18691845|           gonorrhea|     false|        FLT1|fms related recep...|
|ENSG00000103335| DOID_7551|    0.5912127|           gonorrhea|     false|      PIEZO1|piezo type mechan...|
|ENSG00000106004| D

In [107]:
# Columns, in output order, shared by both exports:
export_columns = [
    'targetId', 'targetSymbol', 'targetName',
    'diseaseId', 'diseaseLabel', 'isRelevant',
    'overall_score',
]

# Saving data in a partitioned parquet:
assoc_df.select(export_columns).write.mode('overwrite').parquet('Associations_w_disease_annot')

# Saving data in a single tsv file:
assoc_df.select(export_columns).toPandas().to_csv(
    'annotated_associations.tsv.gz', sep='\t', compression='infer', index=False
)

In [14]:
aggregated_associations = (
    assoc_df

    # Score cutoff; can be adjusted iteratively to make sure it is not too stringent:
    .filter(f.col('overall_score') >= 0.1)

    # One row per target, with one set of summary columns per value of the relevance flag:
    .groupby('targetId', 'targetName', 'targetSymbol')
    .pivot('isRelevant')
    .agg(
        f.max('overall_score').alias('max'),
        f.mean('overall_score').alias('mean_score'),
        f.count('overall_score').alias('disease_count'),
        median_udf(f.collect_list('overall_score')).alias('median_score'),
    )
    .persist()
)

# count() is an action; its result is not used here:
aggregated_associations.count()

# Saving data in in tsv:
aggregated_associations.toPandas().to_csv('aggregated_associations.tsv.gz', sep='\t', index=False)

In [15]:
%%bash


# List the working directory, hidden entries included, with human-readable sizes:
ls -lah

total 97200
drwxr-xr-x   12 dsuveges  EBI\Domain Users   384B 23 Aug 09:45 .
drwxrwxr-x   87 dsuveges  EBI\Domain Users   2.7K 16 Aug 14:38 ..
drwxr-xr-x    5 dsuveges  EBI\Domain Users   160B  7 Jul 21:14 .ipynb_checkpoints
drwxr-xr-x  404 dsuveges  EBI\Domain Users    13K 22 Aug 22:31 Associations_w_disease_annot
-rw-r--r--    1 dsuveges  EBI\Domain Users   132K 22 Aug 22:19 Bulk_analysis.ipynb
-rw-r--r--    1 dsuveges  EBI\Domain Users    44K 23 Aug 09:45 Correct process.ipynb
-rw-r--r--    1 dsuveges  EBI\Domain Users    38K 12 Jul 19:07 Finding disease parents.ipynb
-rw-r--r--    1 dsuveges  EBI\Domain Users   1.1M 23 Aug 09:45 aggregated_associations.tsv.gz
-rw-r--r--    1 dsuveges  EBI\Domain Users   118K 22 Aug 16:53 all_relevant_diseases.tsv
-rw-r--r--    1 dsuveges  EBI\Domain Users    43M 22 Aug 22:31 annotated_associations.tsv.gz
-rw-r--r--    1 dsuveges  EBI\Domain Users   2.4M 24 Jun 11:17 diseases_efo.json
drwxr-xr-x    3 dsuveges  EBI\Domain Users    96B  6 Jul 08:09 pu

In [34]:
disease_dataset = '/Users/dsuveges/project_data/diseases_22.06'

# Root terms of the categories of interest, each with a one-letter category label:
category_of_interest = spark.createDataFrame([
    {'id': 'EFO_0005803', 'category': 'D'}, # hematologic diseases
    {'id': 'EFO_0004503', 'category': 'M'}, # hematological measurement
    {'id': 'HP_0001871',  'category': 'P'}  # Abnormality of the blood and blood-forming tissues
])

# Relevant diseases are all descendants of the above terms:
relevant_diseases = (
    spark.read.parquet(disease_dataset)

    # Right join keeps only the rows of the three root terms:
    .join(category_of_interest, on='id', how='right')

    # One row per (category, descendant) pair:
    .select('category', f.explode('descendants').alias('id'))

    # A disease can sit under several roots: collect the set of its categories.
    .groupby('id')
    .agg(f.collect_set('category').alias('category'))
    .persist()
)

# Disease index annotated with the categories; a disease is relevant when it has at least one:
annotated_diseases = (
    spark.read.parquet(disease_dataset)
    .join(relevant_diseases, on='id', how='left')
    .select(
        f.col('id').alias('diseaseId'),
        f.col('name').alias('diseaseName'),
        'category',
        f.col('category').isNotNull().alias('isRelevant'),
    )
    .persist()
)

annotated_diseases.show()

+-----------+--------------------+--------+----------+
|  diseaseId|         diseaseName|category|isRelevant|
+-----------+--------------------+--------+----------+
|  DOID_7551|           gonorrhea|    null|     false|
|EFO_0004254|membranous glomer...|    null|     false|
|EFO_0005189|respiratory quotient|    null|     false|
|EFO_0005853|response to silic...|    null|     false|
|EFO_0006317|response to thiop...|    null|     false|
|EFO_0007229|      cryptococcosis|    null|     false|
|EFO_0007391|Nematoda infectio...|    null|     false|
|EFO_0008080|cerebrospinal flu...|    null|     false|
|EFO_0008167|interleukin 1 Rec...|    null|     false|
|EFO_0008181|interleukin 23 re...|    null|     false|
|EFO_0009960|atypical femoral ...|    null|     false|
|EFO_0010586|    CD40 measurement|    null|     false|
|EFO_0010717|            afebrile|    null|     false|
|EFO_0010977|macrovascular com...|    null|     false|
|EFO_0011044|BMI-adjusted neck...|    null|     false|
|EFO_00200

In [55]:
# Compare the category-based classification (`annotated_diseases`, "new") with the
# name-based one (`relevant_diseases`, "old") and save the result as a tsv.
# NOTE(review): this expects `relevant_diseases` to carry `diseaseId`, `diseaseLabel`
# and `isRelevant` columns, whereas the preceding cell binds that name to a frame with
# only `id` and `category` — confirm which cell has to run right before this one.
(
    annotated_diseases
    .withColumnRenamed('isRelevant', 'isRelevantNew')
    .drop('diseaseName')
    .join(
        (
            relevant_diseases
            .withColumnRenamed('isRelevant', 'isRelevantOld')
            .drop('parentIds')
        )
        , on='diseaseId', how='inner'
    )
    # Label each disease by which of the two classifications flags it:
    .withColumn(
        'source',
        f.when(
            (
                (f.col('isRelevantOld') == True) &
                (f.col('isRelevantNew') == True)
            ), f.lit('both'))
        .when(
            (
                (f.col('isRelevantOld') == True) &
                (f.col('isRelevantNew') == False)
            ), f.lit('old_only'))
        # Relevant only in the new classification. This branch used to repeat the
        # 'both' condition, so it could never be reached:
        .when(
            (
                (f.col('isRelevantOld') == False) &
                (f.col('isRelevantNew') == True)
            ), f.lit('new_only'))
        .otherwise(f.lit('none'))
    )
    .select('diseaseLabel', 'diseaseId', 'isRelevantOld', 'isRelevantNew', 'source')
    .toPandas()
    .to_csv('comparing_disease_classification.tsv', sep='\t', index=False)
)

In [59]:
# NOTE(review): the path ends in `.json` but it is read with the parquet reader —
# confirm the file format (spark.read.json may be what is needed here).
(
    spark.read.parquet('/Users/dsuveges/Downloads/ENSG00000130203-colocalising-studies.json')
    # Print a single row, without truncating the values, in vertical layout:
    .show(1, False, True)
#     .count()
)

Py4JJavaError: An error occurred while calling o15445.parquet.
: org.apache.hadoop.fs.UnsupportedFileSystemException: No FileSystem for scheme "gs"
	at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:3281)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3301)
	at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:124)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3352)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3320)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:479)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:361)
	at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:46)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:377)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:325)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$3(DataFrameReader.scala:307)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:307)
	at org.apache.spark.sql.DataFrameReader.parquet(DataFrameReader.scala:833)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:829)
