# Fetch data

In [1]:
%%bash

wget https://archive.monarchinitiative.org/latest/tsv/gene_associations/gene_disease.9606.tsv.gz 

--2021-05-18 17:57:19--  https://archive.monarchinitiative.org/latest/tsv/gene_associations/gene_disease.9606.tsv.gz
Resolving archive.monarchinitiative.org (archive.monarchinitiative.org)... 128.193.83.101
Connecting to archive.monarchinitiative.org (archive.monarchinitiative.org)|128.193.83.101|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 311041 (304K) [application/octet-stream]
Saving to: ‘gene_disease.9606.tsv.gz’

     0K .......... .......... .......... .......... .......... 16%  293K 1s
    50K .......... .......... .......... .......... .......... 32%  320K 1s
   100K .......... .......... .......... .......... .......... 49% 3.60M 0s
   150K .......... .......... .......... .......... .......... 65%  549K 0s
   200K .......... .......... .......... .......... .......... 82%  724K 0s
   250K .......... .......... .......... .......... .......... 98% 5.68M 0s
   300K ...                                                   100% 19.6M=0.5s

2021-05-18 17

In [44]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, IntegerType, TimestampType, StructType, ArrayType

# establish spark connection
spark = (
    SparkSession.builder
    .getOrCreate()
)

orphanet_file = 'gene_disease.9606.tsv.gz'


orphanet_df = (
    spark.read.csv(orphanet_file,sep='\t', header=True)
    
    .withColumnRenamed('object', 'diseaseFromSourceId')
    .withColumnRenamed('object_label', 'diseaseFromSource')
    
    .withColumn('source', F.explode(F.split(F.col('is_defined_by'), '\|')))
    .withColumn('targetIdSource', F.split(F.col('subject'), ':').getItem(0))
    .withColumn('targetFromSourceId', F.split(F.col('subject'), ':').getItem(1))
    .withColumn('sourceId', F.lit('orphanet'))
    .withColumn('test_sources', get_sources(F.col('is_defined_by')))
    .filter(
        (F.col('source') == 'https://archive.monarchinitiative.org/#orphanet')
        & (F.col('subject_taxon_label') == 'Homo sapiens')
    )
    
    .drop(*['subject_taxon_label', 'subject', 'subject_taxon'])
    .persist()
)

orphanet_df.show(2, truncate=False, vertical=True)

-RECORD 0-----------------------------------------------------------------------------------------------------------
 subject_label       | LOC111365204                                                                                 
 diseaseFromSourceId | MONDO:0007630                                                                                
 diseaseFromSource   | North Carolina macular dystrophy                                                             
 relation            | RO:0003303                                                                                   
 relation_label      | causes condition                                                                             
 evidence            | ECO:0000322|ECO:0000220                                                                      
 evidence_label      | imported manually asserted information used in automatic assertion|sequencing assay evidence 
 source              | https://archive.monarchinitiative.org/#or

In [22]:
orphanet_df.select('targetIdSource').distinct().show(30, truncate=False)

+--------------+
|targetIdSource|
+--------------+
|NCBIGene      |
|HGNC          |
+--------------+



In [43]:
string = 'https://archive.monarchinitiative.org/#omim|https://archive.monarchinitiative.org/#orphanet'

get_sources = F.udf(
    lambda source: [ url.split('#')[1] if len(url.split('#')) > 1 else url for url in source.split('|')], 
    ArrayType(StringType())
)


In [34]:
get_link = F.udf(
    lambda x: eco_dicts.value[1][x],
    StringType()
)

# Changing strategy - get file from Orphanet directly

In [45]:
%%bash

wget http://www.orphadata.org/data/xml/en_product6.xml 


--2021-05-19 18:46:35--  http://www.orphadata.org/data/xml/en_product6.xml
Resolving www.orphadata.org (www.orphadata.org)... 194.167.41.5
Connecting to www.orphadata.org (www.orphadata.org)|194.167.41.5|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19436974 (19M) [text/xml]
Saving to: ‘en_product6.xml’

     0K .......... .......... .......... .......... ..........  0%  172K 1m50s
    50K .......... .......... .......... .......... ..........  0%  280K 89s
   100K .......... .......... .......... .......... ..........  0%  281K 81s
   150K .......... .......... .......... .......... ..........  1%  332K 75s
   200K .......... .......... .......... .......... ..........  1% 1.30M 63s
   250K .......... .......... .......... .......... ..........  1%  355K 61s
   300K .......... .......... .......... .......... ..........  1% 1.23M 54s
   350K .......... .......... .......... .......... ..........  2%  363K 54s
   400K .......... .......... .......... ........

In [48]:
import xml.etree.ElementTree as ET
tree = ET.parse('en_product6.xml')
root = tree.getroot()

In [118]:
orphanet_disorders = []

for disorder in root.find('DisorderList').findall('Disorder'):
    value = disorder.find('OrphaCode')
    
    parsed_disorder = {
        "diseaseFromSource": disorder.find('Name').text,
        "diseaseFromSourceId": disorder.find('OrphaCode').text,
        "type": disorder.find('DisorderType/Name').text,
    }
    
    for association in disorder.find('DisorderGeneAssociationList'):
        
        evidence = parsed_disorder.copy()
        try:
            evidence['literature'] = [pmid.replace('[PMID]','') for pmid in association.find('SourceOfValidation').text.split('_') if '[PMID]' in pmid]
        except AttributeError:
            evidence['literature'] = []
            
        evidence['associationType'] = association.find('DisorderGeneAssociationType/Name').text
        evidence['associationStatus'] = association.find('DisorderGeneAssociationStatus/Name').text
        
        # Parse gene name and id:
        gene = association.find('Gene')
        evidence['targetFromSource'] = gene.find('Name').text
        evidence['targetFromSourceIds'] = [xref.find('Reference').text for xref in gene.find('ExternalReferenceList') if 'ENSG' in xref.find('Reference').text]
        
        orphanet_disorders.append(evidence)
    
orphanet_df = pd.DataFrame(orphanet_disorders)
orphanet_df.head()


Unnamed: 0,diseaseFromSource,diseaseFromSourceId,type,literature,associationType,associationStatus,targetFromSource,targetFromSourceIds
0,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,Disease,[22587682],Disease-causing germline mutation(s) in,Assessed,kinesin family member 7,[ENSG00000166813]
1,Aspartylglucosaminuria,93,Disease,[11309371],Disease-causing germline mutation(s) in,Assessed,aspartylglucosaminidase,[ENSG00000038002]
2,Brachydactyly-short stature-retinitis pigmento...,166035,Malformation syndrome,[28285769],Disease-causing germline mutation(s) in,Assessed,CWC27 spliceosome associated protein homolog,[]
3,Multiple sulfatase deficiency,585,Disease,[17657823],Disease-causing germline mutation(s) in,Assessed,sulfatase modifying factor 1,[ENSG00000144455]
4,Beta-mannosidosis,118,Disease,[18980795],Disease-causing germline mutation(s) in,Assessed,mannosidase beta,[ENSG00000109323]


In [121]:
orphanet_df['type'].unique()

array(['Disease', 'Malformation syndrome', 'Clinical subtype',
       'Etiological subtype', 'Morphological anomaly',
       'Biological anomaly', 'Clinical syndrome',
       'Particular clinical situation in a disease or syndrome',
       'Histopathological subtype', 'Clinical group', 'Category'],
      dtype=object)

In [122]:
orphanet_df.associationType.unique()

array(['Disease-causing germline mutation(s) in',
       'Modifying germline mutation in', 'Major susceptibility factor in',
       'Candidate gene tested in',
       'Disease-causing germline mutation(s) (loss of function) in',
       'Disease-causing somatic mutation(s) in',
       'Disease-causing germline mutation(s) (gain of function) in',
       'Role in the phenotype of', 'Part of a fusion gene in',
       'Biomarker tested in'], dtype=object)

In [124]:
orphanet_df.associationStatus.value_counts()

Assessed            7377
Not yet assessed     479
Name: associationStatus, dtype: int64

In [130]:
orphanet_df.targetFromSourceIds.apply(lambda x: len(x)).value_counts()

1    7803
0      53
Name: targetFromSourceIds, dtype: int64

In [129]:
orphanet_df.literature.apply(lambda x: len(x)).value_counts()

1    5199
2    1412
0     821
3     301
4      71
5      28
6      16
7       8
Name: literature, dtype: int64

In [142]:
assert isinstance(root, ET.Element)

In [145]:
root.find('DisorderList').get('count')

'3845'

In [149]:
len(orphanet_df.diseaseFromSourceId.unique())

3845

In [157]:
len(orphanet_df.loc[lambda df: df['targetFromSourceIds'].apply(lambda x: len(x)==0)])

53

In [155]:
orphanet_df['targetFromSourceIds']

0       [ENSG00000166813]
1       [ENSG00000038002]
2                      []
3       [ENSG00000144455]
4       [ENSG00000109323]
              ...        
7851    [ENSG00000152591]
7852    [ENSG00000158055]
7853    [ENSG00000170315]
7854    [ENSG00000170315]
7855    [ENSG00000158055]
Name: targetFromSourceIds, Length: 7856, dtype: object

In [158]:
spark

In [162]:
from pyspark import SparkContext

orphanet_json = SparkContext.parallelize(orphanet_disorders)
# orphanet_df = sqlContext.read.json(orphanet_json)


TypeError: parallelize() missing 1 required positional argument: 'c'

In [164]:
from pyspark.sql import Row
spark.createDataFrame(Row(**x) for x in orphanet_disorders).show(2, truncate=False, vertical=True)



-RECORD 0------------------------------------------------------------
 diseaseFromSource   | Multiple epiphyseal dysplasia, Al-Gazali type 
 diseaseFromSourceId | 166024                                        
 type                | Disease                                       
 literature          | [22587682]                                    
 associationType     | Disease-causing germline mutation(s) in       
 associationStatus   | Assessed                                      
 targetFromSource    | kinesin family member 7                       
 targetFromSourceIds | [ENSG00000166813]                             
-RECORD 1------------------------------------------------------------
 diseaseFromSource   | Aspartylglucosaminuria                        
 diseaseFromSourceId | 93                                            
 type                | Disease                                       
 literature          | [11309371]                                    
 associationType    

In [170]:
cicaful = [
    {
        "pocok": 31,
        "kutya": "csicseriborso",
        "bablencse": ['ingyom', 'bingyom']
    },
    {
        "pocok": 2324,
        "kutya": "fsdklmsdfkl",
        "bablencse": []
    }
]

(
    spark.createDataFrame(Row(**x) for x in cicaful)
#     .withColumn('cicaful', F.explode(F.col('bablencse')))
    .select('pocok','kutya', F.explode_outer(F.col('bablencse')))
    .show()
)

+-----+-------------+-------+
|pocok|        kutya|    col|
+-----+-------------+-------+
|   31|csicseriborso| ingyom|
|   31|csicseriborso|bingyom|
| 2324|  fsdklmsdfkl|   null|
+-----+-------------+-------+



In [2]:
import logging 

def parserOrphanetXml(orphanet_file: str) -> list:
    '''
    Function to parse Orphanet xml dump and return the parsed
    data as a pandas dataframe.

    Args:
        orphanet_file (str): Orphanet XML filename

    Returns:
        parsed data as a list of dictionary
    '''

    # Reading + validating xml:
    tree = ET.parse(orphanet_file)
    assert isinstance(tree, ET.ElementTree)

    root = tree.getroot()
    assert isinstance(root, ET.Element)

    # Checking if the basic nodes are in the xml structure:

    logging.info(f"There are {root.find('DisorderList').get('count')} disease in the Orphanet xml file.")

    orphanet_disorders = []

    for disorder in root.find('DisorderList').findall('Disorder'):

        # Extracting disease information:
        parsed_disorder = {
            "diseaseFromSource": disorder.find('Name').text,
            "diseaseFromSourceId": 'Orphanet_' + disorder.find('OrphaCode').text,
            "type": disorder.find('DisorderType/Name').text,
        }

        # One disease might be mapped to multiple genes:
        for association in disorder.find('DisorderGeneAssociationList'):

            # For each mapped genes, an evidence is created:
            evidence = parsed_disorder.copy()

            # Not all gene/disease association is backed up by publication:
            try:
                evidence['literature'] = [pmid.replace('[PMID]', '') for pmid in association.find('SourceOfValidation').text.split('_') if '[PMID]' in pmid]
            except AttributeError:
                evidence['literature'] = None

            evidence['associationType'] = association.find('DisorderGeneAssociationType/Name').text
            evidence['associationStatus'] = association.find('DisorderGeneAssociationStatus/Name').text

            # Parse gene name and id - going for Ensembl gene id only:
            gene = association.find('Gene')
            evidence['targetFromSource'] = gene.find('Name').text
            
            # Extracting ensembl gene id from cross references:
            ensembl_gene_id = [xref.find('Reference').text for xref in gene.find('ExternalReferenceList') if 'ENSG' in xref.find('Reference').text]
            evidence['targetFromSourceId'] = ensembl_gene_id[0] if len(ensembl_gene_id) > 0 else None

            # Collect evidence:
            orphanet_disorders.append(evidence)

    return orphanet_disorders


pod = parserOrphanetXml('en_product6.xml')
len(pod)

NameError: name 'ET' is not defined

In [5]:
import argparse
import logging
import sys
import time

import xml.etree.ElementTree as ET
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StringType
import pyspark.sql.functions as F

from ontoma import OnToma

# establish spark connection
spark = (
    SparkSession.builder
    .getOrCreate()
)


input_file = 'en_product6.xml'
orphanet_disorders = parserOrphanetXml(input_file)
orphanet_df = (
    spark.createDataFrame(Row(**x) for x in orphanet_disorders)
    # Exploding
    .withColumn('dataSourceId', F.lit('orphanet'))
    .withColumn('datatypeId', F.lit('genetic_association'))
    .persist()
)



In [6]:
class ontoma_efo_lookup():
    '''
    Simple class to map orphanet ids to efo
    '''
    def __init__(self):
        self.otmap = OnToma()

    def get_mapping(self, disease_lable, disease_id):

        mappings = self.query_ontoma(disease_id)

        if mappings and 'EFO' in mappings['source']:
            return mappings['term'].split('/')[-1]
        else:
            mappings = self.query_ontoma(disease_id)

        if mappings and 'EFO' in mappings['source']:
            return mappings['term'].split('/')[-1]
        else:
            return None

    def query_ontoma(self, term):

        try:
            mappings = self.otmap.find_term(term, verbose=True)
        except Exception:
            time.sleep(3)
            mappings = self.otmap.find_term(term, verbose=True)

        return mappings

ol = ontoma_efo_lookup()
ont_udf = F.udf(ol.get_mapping, StringType())


INFO     - ontoma.downloaders - ZOOMA to EFO mappings - Parsed 3663 rows
INFO:ontoma.downloaders:ZOOMA to EFO mappings - Parsed 3663 rows
INFO     - ontoma.downloaders - OMIM to EFO mappings - Parsed 8561 rows
INFO:ontoma.downloaders:OMIM to EFO mappings - Parsed 8561 rows


In [7]:
# (
#     orphanet_df
#     .select('diseaseFromSourceId', 'diseaseFromSource')
#     .limit(100)
#     .withColumn('test_mapping', ont_udf(F.col('diseaseFromSourceId')))
#     .show(truncate=False)
# )

ol.get_mapping('Multiple epiphyseal dysplasia, Al-Gazali type', 'Orphanet_166024')

INFO     - ontoma.interface - EFO OBO parsed. Size: 26965 nodes
INFO:ontoma.interface:EFO OBO parsed. Size: 26965 nodes
INFO     - ontoma.interface - Parsed 125228 Name to EFO mapping 
INFO:ontoma.interface:Parsed 125228 Name to EFO mapping 
INFO     - ontoma.interface - Found http://www.orpha.net/ORDO/Orphanet_166024 for Orphanet_166024 from OLS API EFO lookup - match - None
INFO:ontoma.interface:Found http://www.orpha.net/ORDO/Orphanet_166024 for Orphanet_166024 from OLS API EFO lookup - match - None


'Orphanet_166024'

In [280]:
(
    orphanet_df
    .select('diseaseFromSourceId')
    .distinct()
    .limit(10)
    .withColumn('test_mapping', ont_udf(F.col('diseaseFromSourceId')))
    .show(truncate=False)
)

+-------------------+---------------+
|diseaseFromSourceId|test_mapping   |
+-------------------+---------------+
|Orphanet_640       |Orphanet_640   |
|Orphanet_157716    |Orphanet_157716|
|Orphanet_1766      |Orphanet_1766  |
|Orphanet_2067      |Orphanet_2067  |
|Orphanet_251663    |null           |
|Orphanet_250977    |Orphanet_250977|
|Orphanet_221046    |Orphanet_221046|
|Orphanet_320401    |Orphanet_320401|
|Orphanet_369852    |Orphanet_369852|
|Orphanet_363540    |Orphanet_363540|
+-------------------+---------------+



In [296]:
orphanet_df.groupby('type').count().sort('count').show(truncate=False)

+------------------------------------------------------+-----+
|type                                                  |count|
+------------------------------------------------------+-----+
|Category                                              |1    |
|Particular clinical situation in a disease or syndrome|4    |
|Biological anomaly                                    |11   |
|Clinical group                                        |12   |
|Clinical syndrome                                     |39   |
|Histopathological subtype                             |69   |
|Morphological anomaly                                 |233  |
|Etiological subtype                                   |504  |
|Clinical subtype                                      |1058 |
|Malformation syndrome                                 |1286 |
|Disease                                               |4639 |
+------------------------------------------------------+-----+



In [297]:
orphanet_df.groupby('associationType').count().sort('count').show(truncate=False)

+----------------------------------------------------------+-----+
|associationType                                           |count|
+----------------------------------------------------------+-----+
|Modifying germline mutation in                            |44   |
|Biomarker tested in                                       |47   |
|Disease-causing somatic mutation(s) in                    |197  |
|Disease-causing germline mutation(s) (gain of function) in|210  |
|Part of a fusion gene in                                  |232  |
|Role in the phenotype of                                  |233  |
|Candidate gene tested in                                  |372  |
|Major susceptibility factor in                            |572  |
|Disease-causing germline mutation(s) (loss of function) in|1141 |
|Disease-causing germline mutation(s) in                   |4808 |
+----------------------------------------------------------+-----+



## Cicaful


|types                                    |
|:------------------------------------------------------|
|Biological anomaly                                    |
|Particular clinical situation in a disease or syndrome|
|Disease                                               |
|Category                                              |
|Malformation syndrome                                 |
|Clinical group                                        |
|Clinical syndrome                                     |
|Histopathological subtype                             |
|Clinical subtype                                      |
|Etiological subtype                                   |
|Morphological anomaly                                 |


In [316]:
(
    orphanet_df
    .groupby('targetFromSourceId', 'diseaseFromSourceId')
    .agg(
        F.count('targetFromSourceId').alias('count'),
        F.collect_set(F.col('associationType')).alias('associationType'),
        F.collect_set(F.col('associationStatus')).alias('associationType'),
        F.collect_set(F.col('type')).alias('type'),
        F.collect_set(F.col('diseaseFromSource')).alias('diseaseFromSource'),
    )
    .filter(F.col('count') > 1)
    .show(40, vertical=True, truncate=False)
)

-RECORD 0---------------------------------------------------------------------------------------------------------------------------------------
 targetFromSourceId  | ENSG00000170175                                                                                                          
 diseaseFromSourceId | Orphanet_98913                                                                                                           
 count               | 2                                                                                                                        
 associationType     | [Disease-causing germline mutation(s) (gain of function) in, Disease-causing germline mutation(s) (loss of function) in] 
 associationType     | [Assessed]                                                                                                               
 type                | [Etiological subtype]                                                                                      

In [312]:
orphanet_df.printSchema()

root
 |-- diseaseFromSource: string (nullable = true)
 |-- diseaseFromSourceId: string (nullable = true)
 |-- type: string (nullable = true)
 |-- literature: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- associationType: string (nullable = true)
 |-- associationStatus: string (nullable = true)
 |-- targetFromSource: string (nullable = true)
 |-- targetFromSourceId: string (nullable = true)
 |-- dataSourceId: string (nullable = false)
 |-- datatypeId: string (nullable = false)



In [319]:
orphanet_df.select('associationType').distinct().show(truncate=False)

+----------------------------------------------------------+
|associationType                                           |
+----------------------------------------------------------+
|Major susceptibility factor in                            |
|Disease-causing germline mutation(s) (loss of function) in|
|Part of a fusion gene in                                  |
|Candidate gene tested in                                  |
|Role in the phenotype of                                  |
|Biomarker tested in                                       |
|Disease-causing germline mutation(s) in                   |
|Modifying germline mutation in                            |
|Disease-causing somatic mutation(s) in                    |
|Disease-causing germline mutation(s) (gain of function) in|
+----------------------------------------------------------+

