In [24]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col, udf, lit
from pyspark.sql.types import StringType, IntegerType, TimestampType, StructType

# Start (or reuse) a Spark session that runs locally on all available cores:
session_builder = SparkSession.builder.master('local[*]')
spark = session_builder.getOrCreate()

# Lookup table: Gene2Phenotype `mutation_consequence` label -> Sequence Ontology (SO) accession.
# The trailing comment of each entry is the SO term name. An empty label maps to None.
G2P_mutationCsq2functionalCsq = dict([
    ('loss of function', 'SO_0002054'),  # loss_of_function_variant
    ('all missense/in frame', 'SO_0001650'),  # inframe_variant
    ('uncertain', 'SO_0002220'),  # function_uncertain_variant
    ('activating', 'SO_0002053'),  # gain_of_function_variant
    ('dominant negative', 'SO_0002052'),  # dominant_negative_variant
    ('', None),
    ('gain of function', 'SO_0002053'),  # gain_of_function_variant
    ('cis-regulatory or promotor mutation', 'SO_0001566'),  # regulatory_region_variant
    ('5_prime or 3_prime UTR mutation', 'SO_0001622'),  # UTR_variant
    ('increased gene dosage', 'SO_0001911'),  # copy_number_increase
    ('part of contiguous gene duplication', 'SO_1000173'),  # tandem_duplication
])

def translate(mapping):
    """Build a Spark UDF that replaces a value with its entry in `mapping`.

    Args:
        mapping: dictionary keyed by the raw column values.

    Returns:
        A string-typed UDF. Values that are not keys of `mapping` yield None,
        because the lookup is done with `dict.get`.
    """
    def translate_(raw_value):
        return mapping.get(raw_value)

    mapping_udf = udf(translate_, StringType())
    return mapping_udf



# Gene2Phenotype (G2P) panel files, one gzipped CSV per panel. The `intogen_` prefix of the
# variable name is kept because the loading code refers to it.
g2p_resource_dir = '/Users/dsuveges/repositories/evidence_datasource_parsers/resources'
intogen_files = [
    f'{g2p_resource_dir}/{panel}G2P.csv.gz'
    for panel in ('Cancer', 'DD', 'Eye', 'Skin')
]


# Explicit schema that is applied to each of the input files when they are read.
# `disease_mim` is kept as a string: the data contains the placeholder 'No disease mim'.
intogen_schema = StructType()
for column_name, column_type in [
    ('gene_symbol', StringType()),
    ('gene_mim', IntegerType()),
    ('disease_name', StringType()),
    ('disease_mim', StringType()),
    ('DDD_category', StringType()),
    ('allelic_requirement_list', StringType()),
    ('mutation_consequence', StringType()),
    ('phenotype_list', StringType()),
    ('organ_specificity_list', StringType()),
    ('pmid_list', StringType()),
    ('panel', StringType()),
    ('prev_symbol_list', StringType()),
    ('hgnc_id', IntegerType()),
    ('gene_disease_entry_date', TimestampType()),
]:
    intogen_schema = intogen_schema.add(column_name, column_type)

# Read every panel file in one pass with the explicit schema, then append one array column per
# semicolon-delimited text column. Pairs are (new array column, source text column):
semicolon_lists = [
    ('literature', 'pmid_list'),
    ('phenotypes', 'phenotype_list'),
    ('organ_specificities', 'organ_specificity_list'),
    ('allelicRequirements', 'allelic_requirement_list'),
]

intogen_data = (
    spark.read.csv(intogen_files, schema=intogen_schema, enforceSchema=True, header=True)

    # Keep all parsed columns and add the split versions after them, in the order listed above:
    .select(
        '*',
        *[split(col(source), ';').alias(target) for target, source in semicolon_lists]
    )
)

# Sequence Ontology term derived from the free-text `mutation_consequence` label:
functional_consequence = translate(G2P_mutationCsq2functionalCsq)("mutation_consequence")

# Build the evidence table in a single projection: constant source labels first, then the
# renamed input columns, the split list columns and the mapped consequence term.
evidence_df = intogen_data.select(
    lit('gene2phenotype').alias('datasourceId'),
    lit('genetic_literature').alias('datatypeId'),
    col('gene_symbol').alias('targetFromSourceId'),
    col('disease_name').alias('diseaseFromSource'),
    col('disease_mim').alias('diseaseFromSourceId'),
    col('DDD_category').alias('confidence'),
    col('panel').alias('studyId'),
    col('literature'),
    col('allelicRequirements'),
    functional_consequence.alias('variantFunctionalConsequenceId')
)

# Preview the first two evidence records, one field per line:
evidence_df.show(2, vertical=True, truncate=False)



-RECORD 0----------------------------------------------------------------
 datasourceId                   | gene2phenotype                         
 datatypeId                     | genetic_literature                     
 targetFromSourceId             | HMX1                                   
 diseaseFromSource              | OCULOAURICULAR SYNDROME                
 diseaseFromSourceId            | 612109                                 
 confidence                     | probable                               
 studyId                        | DD                                     
 literature                     | [18423520]                             
 allelicRequirements            | [biallelic]                            
 variantFunctionalConsequenceId | SO_0002054                             
-RECORD 1----------------------------------------------------------------
 datasourceId                   | gene2phenotype                         
 datatypeId                     | gene

In [28]:
# Sanity check: list the rows whose `mutation_consequence` label has no Sequence Ontology mapping.
# The mapped column is derived here because `intogen_data` itself does not carry it, and the
# already imported `col` is used because no `F` alias is imported in this notebook.
(
    intogen_data
    .withColumn(
        'variantFunctionalConsequenceId',
        translate(G2P_mutationCsq2functionalCsq)('mutation_consequence')
    )
    .filter(
        col('mutation_consequence').isNotNull()
        & col('variantFunctionalConsequenceId').isNull()
    )
    .show(20, vertical=True, truncate=False)
)

(0 rows)



In [34]:
# Inspect the record(s) of a single gene, selected by its HGNC id.
# `hgnc_id` is declared as IntegerType in the schema, hence the integer literal; the already
# imported `col` is used because no `F` alias is imported in this notebook.
(
    intogen_data
    .filter(col('hgnc_id') == 3431)
    .show(vertical=True, truncate=False)
)

-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 gene_symbol                    | ERBB3                                                                                                                                                                    
 gene_mim                       | 190151                                                                                                                                                                   
 disease_name                   | LETHAL CONGENITAL CONTRACTURE SYNDROME TYPE 2                                                                                                                            
 disease_mim                    | 607598                                                                                                                                                

In [49]:
import ontoma

In [50]:
# Create the OnToma client that is used for the disease-label lookups:
ont_obj = ontoma.interface.OnToma()

# Example disease label to look up:
disease_name = "OCULOAURICULAR SYNDROME"

# Look the label up; as the last expression of the cell its result is displayed:
ont_obj.find_term(disease_name, verbose=True)

INFO     - ontoma.downloaders - ZOOMA to EFO mappings - Parsed 3663 rows
INFO     - ontoma.downloaders - OMIM to EFO mappings - Parsed 8561 rows
INFO     - ontoma.interface - EFO OBO parsed. Size: 26947 nodes
INFO     - ontoma.interface - Parsed 125463 Name to EFO mapping 
INFO     - ontoma.interface - Found http://www.orpha.net/ORDO/Orphanet_157962 for OCULOAURICULAR SYNDROME from EFO OBO - match - None


{'term': 'http://www.orpha.net/ORDO/Orphanet_157962',
 'label': 'Oculoauricular syndrome, Schorderet type',
 'source': 'EFO OBO',
 'quality': 'match',
 'action': None}

In [None]:
# NOTE(review): draft of the disease-mapping logic. As written it cannot run as a notebook cell:
# it uses `return` outside of a function and calls `self.search_mondo` although no `self` exists
# at this level. `logging` is also not imported anywhere above this cell.
disease_term = None


# Get ontoma mapping:
ontoma_mapping = ont_obj.find_term(disease_name, verbose=True)

# If something returned:
if ontoma_mapping:
    
    # No action is flagged on the mapping: keep the label of the mapped term.
    if ontoma_mapping['action'] is None:
        disease_term = ontoma_mapping['label']
    
    # Match found:
    elif ontoma_mapping['quality'] == 'match':
        
        # Match in HP or ORDO, check if there is a match in MONDO too. If so, give preference to MONDO hit
        mondo_mapping = self.search_mondo(disease_name)
        if mondo_mapping:
            if mondo_mapping['exact']:
                logging.info(f"Using MONDO match")
                return mondo_mapping
            else:
                # Non-exact MONDO hit: fall back to the OnToma term and label
                logging.info(f"No exact matches in MONDO, using OnToma results")
                return {'id': ontoma_mapping['term'], 'name': ontoma_mapping['label']}
        else:
            # Nothing found in MONDO: fall back to the OnToma term and label
            logging.info(f"No match in MONDO, using OnToma results")
            return {'id': ontoma_mapping['term'], 'name': ontoma_mapping['label']} 

In [111]:
import logging

class disease_map(object):
    """Map a disease label (plus its OMIM id) to an ontology term.

    OnToma is queried first; MONDO (local lookup, then OLS) is used as a fallback.
    Every mapping returned by this class exposes the term identifier under the
    'term' key. MONDO mappings additionally carry 'id' (same value as 'term'),
    'name' and the boolean 'exact'.
    """

    def __init__(self):
        self.ontoma = ontoma.interface.OnToma()

    def map_disease(self, disease_name, omim_id):
        """Return the best mapping for `disease_name`, or None if nothing acceptable is found.

        Args:
            disease_name: disease label to map; None or an empty string is never mapped.
            omim_id: OMIM identifier of the disease, used to confirm fuzzy OnToma matches.

        Returns:
            The OnToma mapping dictionary, an exact MONDO mapping (see `search_mondo`), or None.
        """
        # Missing labels (e.g. nulls coming from Spark) cannot be mapped
        if not disease_name:
            return None

        logging.info(f"Mapping '{disease_name}'")

        # Search disease name using OnToma
        ontoma_mapping = self.ontoma.find_term(disease_name, verbose=True)

        if not ontoma_mapping:
            # No match in EFO, HP or ORDO: only an exact MONDO hit is acceptable
            return self._exact_mondo_or_none(disease_name)

        # Mapping can be used as it is when no action is required
        if ontoma_mapping['action'] is None:
            return ontoma_mapping

        if ontoma_mapping['quality'] == "match":
            # Match in HP or ORDO: give preference to an exact MONDO hit, otherwise keep OnToma
            return self._exact_mondo_or_none(disease_name) or ontoma_mapping

        # OnToma fuzzy match: accept it only if the mapped term is cross-referenced by the OMIM id.
        # The xref lookup and the id conversion are done once, not per loop iteration.
        efo_xrefs = self.ontoma.get_efo_from_xref(f"OMIM:{omim_id}")
        if efo_xrefs:
            # Extract EFO id from OnToma results
            efo_id = ontoma_mapping['term'].split('/')[-1].replace('_', ':')
            if any(efo_id == efo_xref['id'] for efo_xref in efo_xrefs):
                return ontoma_mapping

        # xref search didn't work, try MONDO as the last resort
        return self._exact_mondo_or_none(disease_name)

    def _exact_mondo_or_none(self, disease_name):
        """Return the MONDO mapping of `disease_name` only when it is an exact hit, else None."""
        mondo_mapping = self.search_mondo(disease_name)
        if mondo_mapping and mondo_mapping['exact']:
            return mondo_mapping
        return None

    def search_mondo(self, disease_name):
        """Search MONDO for `disease_name` (case-insensitive).

        Returns:
            A dictionary with 'term' and 'id' (both the term identifier), 'name' and 'exact',
            or None when neither the local lookup nor OLS returns a hit.
        """
        disease_name = disease_name.lower()

        # mondo_lookup works like a dictionary lookup, so if the disease is not in there it raises
        # an error instead of returning `None`
        try:
            mondo_term = self.ontoma.mondo_lookup(disease_name)
            return {
                'term': mondo_term,
                'id': mondo_term,
                'name': self.ontoma.get_mondo_label(mondo_term),
                'exact': True
            }
        except KeyError:
            # Label is not in the local MONDO lookup: fall back to searching OLS
            return self._search_mondo_ols(disease_name)

    def _search_mondo_ols(self, disease_name):
        """Query OLS for a MONDO term: exact search first, then the best non-exact class hit."""
        ols = self.ontoma._ols

        exact_ols_mondo = ols.besthit(disease_name, ontology=['mondo'], field_list=['iri', 'label'], exact=True)
        if exact_ols_mondo:
            return {
                'term': exact_ols_mondo['iri'],
                'id': exact_ols_mondo['iri'],
                'name': exact_ols_mondo['label'],
                'exact': True
            }

        ols_mondo = ols.besthit(disease_name,
                                ontology=['mondo'],
                                field_list=['iri', 'label'],
                                bytype='class')
        if ols_mondo:
            return {
                'term': ols_mondo['iri'],
                'id': ols_mondo['iri'],
                'name': ols_mondo['label'],
                'exact': False
            }

        return None


# Single mapper instance that the UDF below refers to:
dm_obj = disease_map()


# UDF to look up EFO mappings:
@udf(StringType())
def map_disease(label, disease_id):
    """Return the short identifier of the term mapped to a disease, or None if unmapped.

    Args:
        label: disease label to map.
        disease_id: OMIM identifier that belongs to the label.

    Returns:
        The last path segment of the mapped term identifier, e.g. 'Orphanet_710'.
    """
    lookup = dm_obj.map_disease(label, disease_id)
    if not lookup:
        return None

    # OnToma and OLS results carry the identifier under 'term', while local MONDO lookups
    # carry it under 'id'; accept either instead of failing with a KeyError.
    term = lookup.get('term') or lookup.get('id')
    return term.split('/')[-1] if term else None
    
# Map a sample of ten distinct diseases and keep the result cached:
diseases = (
    evidence_df
    .select('diseaseFromSource', 'diseaseFromSourceId')
    .distinct()
    .limit(10)
    .withColumn('diseaseFromSourceMappedId', map_disease(col('diseaseFromSource'), col('diseaseFromSourceId')))
    .persist()
)

# Display the mapped sample (the original line was the truncated attribute access `diseases.s`):
diseases.show()

INFO     - ontoma.downloaders - ZOOMA to EFO mappings - Parsed 3663 rows
INFO:ontoma.downloaders:ZOOMA to EFO mappings - Parsed 3663 rows
INFO     - ontoma.downloaders - OMIM to EFO mappings - Parsed 8561 rows
INFO:ontoma.downloaders:OMIM to EFO mappings - Parsed 8561 rows


+--------------------+-------------------+-------------------------+
|   diseaseFromSource|diseaseFromSourceId|diseaseFromSourceMappedId|
+--------------------+-------------------+-------------------------+
|SPLIT HAND AND FO...|             220600|                     null|
|SEVERE COBBLESTON...|             615041|                     null|
|   PFEIFFER SYNDROME|             101600|             Orphanet_710|
|CONGENITAL ICHTHY...|     No disease mim|          Orphanet_183435|
|MENTAL RETARDATIO...|             613192|                     null|
|MENTAL RETARDATIO...|             612621|                     null|
|MUCOPOLYSACCHARID...|             253000|          Orphanet_309297|
|Severe Infantile ...|     No disease mim|                     null|
|Retinitis pigment...|             613767|             Orphanet_791|
|    Oguchi disease-2|             613411|           Orphanet_75382|
+--------------------+-------------------+-------------------------+



In [82]:
# Distinct (label, source id) pairs of the evidence table:
distinct_diseases = evidence_df.select('diseaseFromSource', 'diseaseFromSourceId').distinct()

# Mapped term id, computed by the UDF from the label and the source id:
mapped_id = map_disease(col('diseaseFromSource'), col('diseaseFromSourceId'))

# Map a sample of ten diseases and cache the result:
diseases = (
    distinct_diseases
    .limit(10)
    .withColumn('diseaseFromSourceMappedId', mapped_id)
    .persist()
)

diseases.show()

+--------------------+-------------------+-------------------------+
|   diseaseFromSource|diseaseFromSourceId|diseaseFromSourceMappedId|
+--------------------+-------------------+-------------------------+
|SPLIT HAND AND FO...|             220600|                     null|
|SEVERE COBBLESTON...|             615041|                     null|
|   PFEIFFER SYNDROME|             101600|                     null|
|CONGENITAL ICHTHY...|     No disease mim|                     null|
|MENTAL RETARDATIO...|             613192|                     null|
|MENTAL RETARDATIO...|             612621|                     null|
|MUCOPOLYSACCHARID...|             253000|                     null|
|Severe Infantile ...|     No disease mim|                     null|
|Retinitis pigment...|             613767|                     null|
|    Oguchi disease-2|             613411|                     null|
+--------------------+-------------------+-------------------------+



In [48]:
@udf(StringType())
def map_disease(label, disease_id):
    """Return the short identifier of the term mapped to a disease, or None if unmapped.

    NOTE(review): this is a second definition of `map_disease`; once this cell has run it
    replaces any `map_disease` defined earlier in the session.

    Args:
        label: disease label to map.
        disease_id: OMIM identifier that belongs to the label.

    Returns:
        The last path segment of the mapped term identifier.
    """
    lookup = dm_obj.map_disease(label, disease_id)
    if not lookup:
        return None

    # Local MONDO lookups carry the identifier under 'id', while OnToma and OLS results
    # carry it under 'term'; accept either instead of failing with a KeyError.
    term = lookup.get('id') or lookup.get('term')
    return term.split('/')[-1] if term else None
    


In [113]:
# Re-run the mapping on (at most) ten rows of the sample; the existing
# `diseaseFromSourceMappedId` column is replaced by the freshly computed one:
mapping_column = map_disease(col('diseaseFromSource'), col('diseaseFromSourceId'))
mapped_diseases = diseases.limit(10).withColumn('diseaseFromSourceMappedId', mapping_column).persist()

# Print the source identifier of every mapped row:
for source_id in mapped_diseases.toPandas()['diseaseFromSourceId']:
    print(source_id)

220600
615041
101600
No disease mim
613192
612621
253000
No disease mim
613767
613411


In [114]:
# Print the source identifier of every mapped row, one per line:
for record in mapped_diseases.toPandas().itertuples(index=False):
    print(record.diseaseFromSourceId)

220600
615041
101600
No disease mim
613192
612621
253000
No disease mim
613767
613411
