In [55]:
%%bash


old='/Users/dsuveges/repositories/evidence_datasource_parsers/gene2phenotype-2021-06-08.json.gz'
new='/Users/dsuveges/repositories/evidence_datasource_parsers/output/gene2phenotype-2021-06-15.json.gz/*json.gz'


cat <( echo -e "diseaseFromSource\tdiseaseFromSourceId\tdiseaseFromSourceMappedId") \
    <( gzcat $old | jq -r '"\(.diseaseFromSource)\t\(.diseaseFromSourceId)\t\(.diseaseFromSourceMappedId)"') > old.tsv
    
cat <( echo -e "diseaseFromSource\tdiseaseFromSourceMappedId") \
    <( gzcat $new | jq -r '"\(.diseaseFromSource)\t\(.diseaseFromSourceMappedId)"') > new.tsv
    

grep -v Orphanet_314381 old.tsv | sponge old.tsv
grep -v Orphanet_314381 new.tsv | sponge new.tsv

head old.tsv

diseaseFromSource	diseaseFromSourceId	diseaseFromSourceMappedId
JOUBERT SYNDROME TYPE 5	610188	null
MICROPHTHALMIA SYNDROMIC TYPE 9	601186	Orphanet_2470
CONGENITAL DISORDERS OF GLYCOSYLATION	612379	Orphanet_137
HYPOMYELINATION WITH ATROPHY OF THE BASAL GANGLIA AND CEREBELLUM	No disease mim	null
FRUCTOSE 1,6 BISPHOSPHATASE DEFICIENCY	229700	Orphanet_348
MEIER-GORLIN SYNDROME 4	613804	Orphanet_2554
SHWACHMAN-DIAMOND SYNDROME	260400	Orphanet_811
FANCONI ANEMIA COMPLEMENTATION GROUP P	613951	MONDO_0013499
FANCONI ANEMIA, COMPLEMENTATION GROUP F	603467	Orphanet_121719


In [56]:
import pandas as pd


old_df = pd.read_csv('old.tsv', sep='\t').drop_duplicates()
new_df = pd.read_csv('new.tsv', sep='\t').drop_duplicates()
merged = old_df.merge(new_df, on='diseaseFromSource', how='outer')

In [57]:
unmapped = merged.loc[merged.diseaseFromSourceMappedId_x.notna() & merged.diseaseFromSourceMappedId_y.isna()]
len(unmapped)

60

In [39]:
import ontoma

class disease_map(object):

    def __init__(self):
        self.ontoma = ontoma.interface.OnToma()

    def map_disease(self, disease_name, omim_id):
        logging.info(f"Mapping '{disease_name}'")

        # Search disease name using OnToma and accept perfect matches
        ontoma_mapping = self.ontoma.find_term(disease_name, verbose=True)

        # If there's some mapping available:
        if ontoma_mapping:

            # Extracting term if no action is required:
            if ontoma_mapping['action'] is None:
                return ontoma_mapping

            # When there is an exact match, but action is required:
            elif ontoma_mapping['quality'] == "match":

                # Match in HP or ORDO, check if there is a match in MONDO too. If so, give preference to MONDO hit
                mondo_mapping = self.search_mondo(disease_name)

                if mondo_mapping:
                    # Mondo mapping good - return
                    if mondo_mapping['exact']:
                        return mondo_mapping
                    # Mondo mapping bad - return ontoma
                    else:
                        return ontoma_mapping
                else:
                    # Mondo mapping bad - return ontoma
                    return ontoma_mapping

            else:
                # OnToma fuzzy match. First check if the mapping term has a xref to the OMIM id. 
                # If not, check in MONDO and if there is not match ignore evidence and report disease
                if self.ontoma.get_efo_from_xref(f"OMIM:{omim_id}"):
                    for efo_xref in self.ontoma.get_efo_from_xref(f"OMIM:{omim_id}"):
                        # Extract EFO id from OnToma results
                        efo_id = ontoma_mapping['term'].split('/')[-1].replace('_', ':')

                        if efo_id == efo_xref['id']:
                            return ontoma_mapping

                # xref search didn't work, try MONDO as the last resort
                mondo_mapping = self.search_mondo(disease_name)
                if mondo_mapping:
                    if mondo_mapping['exact']:
                        return mondo_mapping
                    else:
                        return None
                else:
                    # Record the unmapped disease
                    return None

        else:
            # No match in EFO, HP or ORDO
            mondo_mapping = self.search_mondo(disease_name)
            if mondo_mapping:
                if mondo_mapping['exact']:
                    return mondo_mapping
                else:
                    return None
            else:
                return None

    def search_mondo(self, disease_name):

        disease_name = disease_name.lower()

        # mondo_lookup works like a dictionary lookup so if disease is not in there it raises and error instead of returning `None`
        try:
            mondo_term = self.ontoma.mondo_lookup(disease_name)
            return {
                'id': mondo_term, 
                'name': self.ontoma.get_mondo_label(mondo_term),
                'exact': True
            }
        except KeyError as e:
            exact_ols_mondo = self.ontoma._ols.besthit(disease_name,
                                                       ontology=['mondo'], field_list=['iri', 'label'], exact=True)

            if exact_ols_mondo:
                return {'term': exact_ols_mondo['iri'], 'name': exact_ols_mondo['label'], 'exact':True}

            else:
                ols_mondo = self.ontoma._ols.besthit(disease_name,
                                                     ontology=['mondo'],
                                                     field_list=['iri', 'label'],
                                                     bytype='class')
                if ols_mondo:
                    return {'term': ols_mondo['iri'], 'name': ols_mondo['label'], 'exact': False}
                else:
                    return None
                
                
dm = disease_map()

INFO     - ontoma.downloaders - ZOOMA to EFO mappings - Parsed 3663 rows
INFO     - ontoma.downloaders - OMIM to EFO mappings - Parsed 8561 rows


In [43]:
dm.map_disease('MICROPHTHALMIA SYNDROMIC TYPE 9', '601186')

INFO     - ontoma.interface - EFO OBO parsed. Size: 26999 nodes
INFO:ontoma.interface:EFO OBO parsed. Size: 26999 nodes
INFO     - ontoma.interface - Parsed 125341 Name to EFO mapping 
INFO:ontoma.interface:Parsed 125341 Name to EFO mapping 
INFO     - ontoma.interface - Found http://www.orpha.net/ORDO/Orphanet_2470 for MICROPHTHALMIA SYNDROMIC TYPE 9 from EFO OBO - match - None
INFO:ontoma.interface:Found http://www.orpha.net/ORDO/Orphanet_2470 for MICROPHTHALMIA SYNDROMIC TYPE 9 from EFO OBO - match - None


{'term': 'http://www.orpha.net/ORDO/Orphanet_2470',
 'label': 'Matthew-Wood syndrome',
 'source': 'EFO OBO',
 'quality': 'match',
 'action': None}

In [42]:
import logging

In [52]:
def map_disease(row):
    label = row['diseaseFromSource']
    disease_id = row['diseaseFromSourceId']
    lookup = dm.map_disease(label, disease_id)
    if lookup:
        try:
            return lookup['term'].split('/')[-1]
        except Exception as e:
            print(lookup)
    else:
        return None

In [53]:
map_disease('MICROPHTHALMIA SYNDROMIC TYPE 9', '601186')

TypeError: map_disease() takes 1 positional argument but 2 were given

In [54]:
(
    unmapped
    .head()
    .apply(map_disease,axis=1)
)

INFO     - ontoma.interface - Found http://www.orpha.net/ORDO/Orphanet_2470 for MICROPHTHALMIA SYNDROMIC TYPE 9 from EFO OBO - match - None
INFO:ontoma.interface:Found http://www.orpha.net/ORDO/Orphanet_2470 for MICROPHTHALMIA SYNDROMIC TYPE 9 from EFO OBO - match - None
INFO     - ontoma.interface - Found http://www.orpha.net/ORDO/Orphanet_137 for CONGENITAL DISORDERS OF GLYCOSYLATION from EFO OBO - match - None
INFO:ontoma.interface:Found http://www.orpha.net/ORDO/Orphanet_137 for CONGENITAL DISORDERS OF GLYCOSYLATION from EFO OBO - match - None
INFO     - ontoma.interface - Found http://www.orpha.net/ORDO/Orphanet_2554 for MEIER-GORLIN SYNDROME 4 from OT Zooma Mappings - match - None
INFO:ontoma.interface:Found http://www.orpha.net/ORDO/Orphanet_2554 for MEIER-GORLIN SYNDROME 4 from OT Zooma Mappings - match - None
INFO     - ontoma.interface - Found http://www.orpha.net/ORDO/Orphanet_811 for SHWACHMAN-DIAMOND SYNDROME from OT Zooma Mappings - match - None
INFO:ontoma.interface:Foun

1    Orphanet_2470
2     Orphanet_137
5    Orphanet_2554
6     Orphanet_811
7    MONDO_0013499
dtype: object

In [51]:
unmapped.head()

Unnamed: 0,diseaseFromSource,diseaseFromSourceId,diseaseFromSourceMappedId_x,diseaseFromSourceMappedId_y
1,MICROPHTHALMIA SYNDROMIC TYPE 9,601186,Orphanet_2470,
2,CONGENITAL DISORDERS OF GLYCOSYLATION,612379,Orphanet_137,
5,MEIER-GORLIN SYNDROME 4,613804,Orphanet_2554,
6,SHWACHMAN-DIAMOND SYNDROME,260400,Orphanet_811,
7,FANCONI ANEMIA COMPLEMENTATION GROUP P,613951,MONDO_0013499,


In [75]:
import pandas as pd 

class FetchFromFtp(object):

    """
    This class is to retrieve the association table from the most recent GWAS Catalog release.
    Expects the ftp host address.

    It also returns the release date.
    """

    def __init__(self, url):
        self.FTP_HOST = url

        # Initialize connection and go to folder:
        self.ftps = ftplib.FTP_TLS(self.FTP_HOST)
        self.ftps.login()

    def fetch_file_list(self, path):
        # Get list of files and the date of modification:
        files = []
        self.ftp.cwd(path)
        self.ftp.dir(files.append)

        files = [' '.join(x.split()[8:]) for x in files]

        return files

    def fetch_last_update_date(self, path):
        """
        This function returns the date of the most recently modified file.
        """

        # Get list of files and the date of modification:
        files = []
        self.ftp.cwd(path)
        self.ftp.dir(files.append)

        # Get all dates:
        dates = [' '.join(x.split()[5:8]) for x in files]
        dates_parsed = [parser.parse(x) for x in dates]

        release_date = max(dates_parsed)
        return release_date.strftime('%Y-%m-%d')

    def fetch_file(self, path, file):
        sio = io.BytesIO()

        def handle_binary(more_data):
            sio.write(more_data)

        self.ftp.retrbinary(f"RETR {path}/{file}", handle_binary)
        sio.seek(0)  # Go back to the start
        zippy = gzip.GzipFile(fileobj=sio)
        return zippy

    def fetch_tsv(self, path, file, skiprows=None, header='infer'):
        self.tsv_data = pd.read_csv(
            f'sftp://{self.FTP_HOST}/{path}/{file}',
            sep='\t', dtype=str, skiprows=skiprows, header=header
        )

    def close_connection(self):
        self.ftp.close()


In [78]:
f = FetchFromFtp('ftp.ebi.ac.uk')

TypeError: __init__() got an unexpected keyword argument 'user'

In [74]:
f.fetch_file_list('/pub/databases/gencode/Gencode_human')

error_perm: 530 Please login with USER and PASS.

In [80]:
ftps = ftplib.FTP_TLS('ftp.ebi.ac.uk', user=None, passwd=None)
ftps.