# Imports

In [1]:
"""WeirdTargets Module"""
##########################################
#      Future Statement Definitions      #
##########################################
from __future__ import division
from __future__ import print_function
from __future__ import generator_stop
from __future__ import unicode_literals
from __future__ import absolute_import
##########################################
#        Python Standard Library         #
##########################################
import argparse
import re 
import math
import json 
import itertools
import os
import sys
import time
import struct
import datetime
import inspect
import collections
import multiprocess
import multiprocessing
import operator as op
from functools import reduce
import requests
##########################################
#   3rd Party Data Loading & Analysis    #
##########################################
import ujson
import numpy as np
import pandas as pd
from tqdm import tqdm
from gzip import GzipFile
from toolz import partition_all
##########################################
#      3rd Party Parallel Computing      #
##########################################
import pathos
import psutil
import dask.dataframe as dd
import pathos.multiprocessing as pmp
from dask.threaded import get as ddscheduler
from opentargets import OpenTargetsClient
##########################################
#       Module Level Dunder Names        #
##########################################
__copyright__ = "Copyrights © 2019 Aly shmahell."
__credits__   = ["Aly Shmahell"]
__version__   = "0.1.1"
__maintainer__= "Aly Shmahell"
__email__     = ["aly.shmahell@gmail.com"]
__status__    = "Alpha"
##########################################
##########################################
##########################################

## Exception Class

In [2]:
class WeirdTargetsException(Exception):
    """
    Fatal error type of the WeirdTargets module.

    Building an instance formats a one-line message holding the class name,
    a line number and the given error text, and then immediately hands the
    instance to ``sys.exit``. Construction therefore raises ``SystemExit``
    and never returns to the caller.
    """
    # Present the class as living in the same module as ``Exception``.
    __module__ = Exception.__module__

    def __init__(self, error):
        """
        Format the message and terminate through ``sys.exit``.

        Args:
            error: text appended after the "(line N): " prefix.

        Raises:
            SystemExit: always, carrying this instance as its code.
        """
        active_traceback = sys.exc_info()[-1]
        if active_traceback is not None:
            # An exception is being handled: report its traceback's line.
            line = active_traceback.tb_lineno
        else:
            # Otherwise report the line of the code constructing this object.
            line = inspect.currentframe().f_back.f_lineno
        self.args = (f"{type(self).__name__} (line {line}): {error}",)
        sys.exit(self)

## Printer Class

In [3]:
class WeirdTargetsPrinter(object):
    """
    Whitespace helpers for text written as indented triple-quoted strings.
    """

    def pretty(self, x):
        """
        Return *x* with every newline plus the whitespace run that follows it
        reduced to a single newline (indentation and blank lines are dropped).
        """
        newline_then_blank = re.compile(r"\n\s+")
        return newline_then_blank.sub("\n", x)

    def oneliner(self, x):
        """
        Return *x* with every newline, and any whitespace right after it,
        replaced by one space so the text fits on a single line.
        """
        newline_and_indent = re.compile(r"\n\s*")
        return newline_and_indent.sub(" ", x)

## Argument Parser Classes

In [4]:
class WeirdTargetsArgParse(WeirdTargetsPrinter):
    """
    Base class for the argument parsers: keeps the parsed arguments in
    ``self.args`` and exposes them through ``parser["name"]`` lookups.
    """

    def __init__(self):
        # Starts out empty; nothing in this class fills it in.
        self.args = None

    def __getitem__(self, key):
        """Return the attribute called *key* of the stored ``args`` object."""
        parsed = self.args
        return getattr(parsed, key)

In [5]:
class SmallTargetsArgParse(WeirdTargetsArgParse):
    """
    Command-line parser for a single-entity query.

    Accepts two required options, ``--type`` (``disease`` or ``target``) and
    ``--id``, and checks that the id prefix agrees with the chosen type.
    """

    def __id_regex(self, id):
        """
        argparse ``type=`` callback for ``--id``.

        Returns *id* unchanged when it starts with ``EFO_`` or ``ENSG``;
        otherwise raises WeirdTargetsException.
        """
        regex = re.compile(r'^EFO_|^ENSG')
        if not regex.match(id):
            raise WeirdTargetsException(self.oneliner("""--id should
                                                          start with: 
                                                          <EFO_> or 
                                                          <ENSG>"""))
        return id

    def __check_args_compatibility(self, type, id):
        """
        Raise WeirdTargetsException unless an ``EFO_`` id is paired with
        ``disease`` or an ``ENSG`` id is paired with ``target``.
        """
        # Both values are glued into "<id> <type>" so one regex per valid
        # pairing can verify the prefix and the type together.
        if not((re.match(r'EFO_.+\sdisease', f"{id} {type}"))
                or (re.match(r'ENSG.+\starget', f"{id} {type}"))):
            raise WeirdTargetsException(self.oneliner(f"""type: {type} 
                                                           and id: {id} 
                                                           are incompatible."""))

    def __init__(self, argv=None):
        """
        Build the parser, parse the arguments and validate them.

        Args:
            argv: optional list of argument strings to parse. ``None`` (the
                default) is passed straight to ``parse_args``, which then
                reads the process command line as before.
        """
        super(SmallTargetsArgParse, self).__init__()
        parser = argparse.ArgumentParser()
        parser.add_argument("--type",
                            type=str,
                            help=self.oneliner("""specify 
                                                   a type to 
                                                   look for, 
                                                   a target or 
                                                   a disease."""),
                            required=True,
                            choices=['disease', 'target'])
        parser.add_argument("--id",
                            type=self.__id_regex,
                            help="specify an id to look for.",
                            required=True)
        self.args = parser.parse_args(argv)
        self.__check_args_compatibility(self.args.type,
                                        self.args.id)

In [6]:
class BigTargetsArgParse(WeirdTargetsArgParse):
    """
    Command-line parser for the bulk-file workflow.

    Accepts two required options: ``--filename`` (a file holding a collection
    of JSON objects) and ``--tmp_dir`` (a folder for extracted features).
    """

    def __init__(self, argv=None):
        """
        Build the parser and parse the arguments into ``self.args``.

        Args:
            argv: optional list of argument strings to parse. ``None`` (the
                default) is passed straight to ``parse_args``, which then
                reads the process command line as before.
        """
        super(BigTargetsArgParse, self).__init__()
        parser = argparse.ArgumentParser()
        parser.add_argument("--filename",
                            type=str,
                            help=self.oneliner("""specify a
                                                   file that 
                                                   holds a 
                                                   collection 
                                                   of json 
                                                   objects."""),
                            required=True)
        parser.add_argument("--tmp_dir",
                            type=str,
                            help=self.oneliner("""specify a
                                                   folder that 
                                                   will hold
                                                   lazily extracted
                                                   features."""),
                            required=True)
        self.args = parser.parse_args(argv)

## WeirdTargets

In [7]:
class WeirdTargets(WeirdTargetsPrinter):
    """
    Shared base of the target-analysis classes; also offers a quick check of
    how many worker processes a pathos pool really uses.
    """

    def __init__(self):
        # Zero-length filler string kept on the instance.
        self.empty_string = ""

    def testParallelism(self):
        """
        Print the reported core count and the number of distinct worker
        process ids seen when mapping one more task than there are cores.
        """
        core_count = pmp.cpu_count()
        print("Testing Parallelism:")
        print(f"\t No. of Cores Available: {core_count}")
        with pmp.Pool(core_count) as pool:
            # Each task answers with the pid of the worker that executed it.
            worker_pids = pool.map(lambda _: f"{os.getpid()}", range(core_count+1))
            distinct_workers = np.unique(worker_pids).size
            print(f"\t No. of Cores Utilized:  {distinct_workers}")

In [8]:
# Smoke test: build the base object and print how many distinct worker
# processes the pool used compared with the reported core count.
weird_targets = WeirdTargets()
weird_targets.testParallelism()

Testing Parallelism:
	 No. of Cores Available: 8
	 No. of Cores Utilized:  8


## SmallTargets

In [9]:
class SmallTargets(WeirdTargets):
    """
    Summary statistics over the overall association scores of one entity.

    Calling the instance queries ``OpenTargetsClient`` for the associations
    of the given disease or target and computes the maximum, minimum, average
    and standard deviation of ``entry['association_score']['overall']``.
    ``str(instance)`` renders the report once the instance has been called.
    """

    def __init__(self, type, id):
        """
        Args:
            type: ``"disease"`` or ``"target"``; selects which client query
                is issued.
            id: identifier passed to that query.
        """
        self.type         = type
        self.id           = id
        self.inputs       = None
        self.outputs      = []
        self.elapsed_time = None
        super(SmallTargets, self).__init__()

    def __call__(self):
        """
        Run the query and fill ``self.outputs`` and ``self.elapsed_time``.

        Raises:
            WeirdTargetsException: when the client call fails or the query
                returns nothing.
        """
        # ``except Exception`` (not a bare ``except``) lets Ctrl-C and
        # SystemExit propagate instead of being reported as a bad id.
        if self.type == "disease":
            try:
                self.inputs = OpenTargetsClient().get_associations_for_disease(self.id)
            except Exception:
                raise WeirdTargetsException(f"Incorrect Disease ID: {self.id}")
        elif self.type == "target":
            try:
                self.inputs = OpenTargetsClient().get_associations_for_target(self.id)
            except Exception:
                raise WeirdTargetsException(f"Incorrect Target ID: {self.id}")
        if not self.inputs:
            raise WeirdTargetsException(self.oneliner("""The query did not
                                                          return any usefull
                                                          information."""))
        # Only the statistics computation is timed, not the query above.
        self.elapsed_time = time.time()
        with pmp.Pool(pmp.cpu_count()) as pool:
            overalls           = pool.map(lambda entry: entry['association_score']['overall'], self.inputs)
            squared_overalls   = pool.map(lambda overall: overall**2,                          overalls)
            minimum            = min(overalls)
            maximum            = max(overalls)
            average            = sum(overalls)/len(self.inputs)
            # Variance as E[x^2] - mean^2. Rounding can push it a hair below
            # zero when all scores are (nearly) equal, so clamp it before the
            # square root to avoid a NaN result.
            variance           = sum(squared_overalls)/len(self.inputs) - average**2
            standard_deviation = np.sqrt(max(variance, 0.0))
            self.outputs = {
                "Maximum"           : maximum,
                "Minimum"           : minimum,
                "Average"           : average,
                "Standard Deviation": standard_deviation
            }
        self.elapsed_time = time.time() - self.elapsed_time

    def __str__(self):
        """
        Return the multi-line report.

        Raises:
            WeirdTargetsException: when the instance has not been called yet
                (``self.outputs`` is still empty).
        """
        if not self.outputs:
            raise WeirdTargetsException(self.oneliner("""you need to call 
                                                          the SmallTargets 
                                                          object first."""))
        return self.pretty(f"""Number of Entries :       {len(self.inputs)}\n
                                Elapsed Time      :       {self.elapsed_time} sec\n
                                Maximum           :       {self.outputs['Maximum']}\n
                                Minumum           :       {self.outputs['Minimum']}\n
                                Average           :       {self.outputs['Average']}\n
                                Standard Deviation:       {self.outputs['Standard Deviation']}""")

In [10]:
# Example: compute and print the score statistics for target ENSG00000157764.
small_targets = SmallTargets('target', 'ENSG00000157764')
small_targets()
print(small_targets)

  self.api_specs = yaml.load(self.swagger_yaml)


Number of Entries :       1343
Elapsed Time      :       2.2975881099700928 sec
Maximum           :       1.0
Minumum           :       0.004
Average           :       0.28802892735518787
Standard Deviation:       0.31332124396113537


## BigTargets

In [11]:
def ncr(n, r):
    """
    Return the binomial coefficient "n choose r" as an exact integer.

    Args:
        n: size of the set.
        r: number of elements chosen.

    Returns:
        The number of r-element subsets of an n-element set; 0 when ``r`` is
        negative or larger than ``n``.
    """
    # No subset exists outside 0 <= r <= n.
    if r < 0 or r > n:
        return 0
    # C(n, r) == C(n, n - r); use the smaller one to shorten both products.
    r = min(r, n - r)
    numer = reduce(op.mul, range(n, n - r, -1), 1)
    denom = reduce(op.mul, range(1, r + 1), 1)
    # The quotient is always whole; floor division keeps it an exact int
    # instead of a float that loses precision or overflows for large inputs.
    return numer // denom

def downloader(url, filepath):
    """
    Stream *url* into the file at *filepath* while showing a progress bar.

    Args:
        url: address to fetch with ``requests``.
        filepath: destination path; opened in binary write mode.

    Raises:
        requests.HTTPError: when the server answers with an error status, so
            an error page is never saved as if it were the requested file.
    """
    block_size = 1024**2
    # The context managers close the connection, the progress bar and the
    # file even when the transfer is interrupted by an exception.
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        # A missing content-length header yields 0, which disables the
        # size check below.
        total_size = int(r.headers.get('content-length', 0))
        with tqdm(total=total_size, unit='iB', unit_scale=True) as t, open(filepath, 'wb') as f:
            for data in r.iter_content(block_size):
                t.update(len(data))
                f.write(data)
            received = t.n
    if total_size != 0 and received != total_size:
        print("ERROR, something went wrong")

class BigTargets(WeirdTargets):
    """
    Count pairs of groups that share at least two members in a bulk dump.

    ``__call__`` drives the pipeline: download the gzipped file when it is
    not on disk, extract the fields named in ``keysdict`` from each line into
    an HDF5 table when that table does not exist yet, then group one column
    by another and compare every pair of groups.
    """

    def __init__(self, rawdir, rawurl, keysdict, outdir, outfile, groupnames,
                 resultdir, resultfile, save_association_cardinatily=False):
        """
        Args:
            rawdir: directory holding the downloaded file.
            rawurl: download URL; its last path segment is the file name.
            keysdict: maps a field name to the list of nested keys to follow
                inside each parsed JSON object.
            outdir: directory of the HDF5 table.
            outfile: file name of the HDF5 table; the part before the first
                "." is used as the table key.
            groupnames: two column names, ``[group-by column, column whose
                values are collected into a set per group]``.
            resultdir: directory of the result memmap.
            resultfile: file name of the result memmap.
            save_association_cardinatily: when true, matching pairs are also
                written to the result memmap.
        """
        # 0o755 is the octal rwxr-xr-x mode; makedirs also creates missing
        # parent directories and tolerates directories that already exist.
        for directory in (outdir, resultdir, rawdir):
            os.makedirs(directory, mode=0o755, exist_ok=True)
        super(BigTargets, self).__init__()
        self.rawdir     = rawdir
        self.rawfile    = rawurl.split('/')[-1]
        self.rawurl     = rawurl
        self.infile     = os.path.join(self.rawdir, self.rawfile)
        self.keysdict   = keysdict
        self.outdir     = outdir
        self.outfile    = outfile
        self.groupnames = groupnames
        self.resultdir  = resultdir
        self.resultfile = resultfile
        self.save_association_cardinatily = save_association_cardinatily

    def __download__(self):
        """Fetch ``self.rawurl`` into ``self.infile``."""
        downloader(self.rawurl, self.infile)

    def __traverse__(self, parsed, keys):
        """Follow *keys* one after another into the nested mapping *parsed*."""
        if len(keys) > 1:
            return self.__traverse__(parsed[keys[0]], keys[1:])
        return parsed[keys[0]]

    def __parsejson__(self, pyObject, tqdmObject):
        """
        Parse one JSON document and pick out the fields listed in
        ``self.keysdict``; advances the progress bar by one.
        """
        tqdmObject.update(1)
        parsed = ujson.loads(pyObject)
        return {
            key: self.__traverse__(parsed, path)
            for key, path in self.keysdict.items()
        }

    def __json2panda__(self, batch, tqdmObject):
        """Turn a batch of JSON lines into a DataFrame of extracted fields."""
        parsedJSON = map(lambda b: self.__parsejson__(b, tqdmObject), batch)
        # NOTE(review): the column names are fixed here rather than taken
        # from keysdict; a keysdict with other field names would not match.
        return pd.DataFrame.from_records(parsedJSON,
                                         columns=["target",
                                                  "disease",
                                                  "score"])

    def __peek__(self, iterable):
        """
        Return ``(first, iterator)`` where *iterator* yields *first* again
        followed by the rest, or ``(None, None)`` when *iterable* is empty.
        """
        try:
            first = next(iterable)
        except StopIteration:
            return None, None
        return first, itertools.chain([first], iterable)

    def __persist__(self):
        """
        Stream the gzipped input in batches and append each batch to the
        HDF5 table, so the whole file never has to fit in memory.
        """
        # 20 000 lines per whole GiB reported at index 1 of
        # psutil.virtual_memory(); at least one GiB is assumed so the batch
        # size can never drop to zero.
        gibibytes = max(1, math.floor(psutil.virtual_memory()[1]/(1024**3)))
        batch_size = gibibytes * int(2e+4)
        table_path = os.path.join(self.outdir, self.outfile)
        table_key = f'{self.outfile.split(".")[0]}'
        with tqdm(desc=f"Feature Extraction{self.empty_string:>18}") as tqdmObject:
            with GzipFile(self.infile) as f:
                for batch in partition_all(batch_size, f):
                    df = self.__json2panda__(batch, tqdmObject)
                    df.to_hdf(
                        table_path,
                        key=table_key,
                        mode='a',
                        format='table',
                        append=True
                    )

    def __process__(self):
        """
        Group ``self.df`` and count the pairs of groups sharing two or more
        members.

        Returns:
            ``(results, counter)``: *counter* is the number of matching
            pairs; *results* is the memmap of matching pairs when
            ``save_association_cardinatily`` is true, otherwise ``None``.
        """
        with tqdm(desc=f"Mapping{self.empty_string:>18}") as tqdmo:
            def setter(x):
                tqdmo.update(1)
                return set(x.to_numpy())
            groups = self.df.groupby(self.groupnames[0])[self.groupnames[1]].apply(setter)
        array = groups.to_numpy()
        names = list(groups.keys())
        total_pairs = int(ncr(array.shape[0], 2))
        saving = self.save_association_cardinatily
        temp_path = os.path.join(self.resultdir, 'temp.memmap')
        if saving:
            # Scratch file sized for the worst case (every pair matches).
            # NOTE(review): '|U30' keeps at most 30 characters per cell, so a
            # longer "name, name" label is cut short on assignment - confirm
            # the width suits the ids in use.
            temp = np.memmap(
                temp_path,
                dtype='|U30',
                mode='w+',
                shape=(total_pairs, 2)
            )
        counter = 0
        trueindex = 0
        with tqdm(desc=f"Reducing{self.empty_string:>18}", total=total_pairs) as tqdmo:
            for first, second in itertools.combinations(range(len(groups)), 2):
                commons = array[first] & array[second]
                if len(commons) >= 2:
                    counter += 1
                    if saving:
                        # The label is only built for pairs that are stored.
                        temp[trueindex][0] = ", ".join([names[first], names[second]])
                        # The second column always holds the match flag "1".
                        temp[trueindex][1] = str(1)
                        trueindex += 1
                tqdmo.update(1)
        if saving:
            results = np.memmap(
                os.path.join(self.resultdir, self.resultfile),
                dtype='|U30',
                mode='w+',
                shape=(trueindex, 2)
            )
            results[:] = temp[:trueindex]
            results.flush()
            # Drop the mapping before deleting the file backing it.
            del temp
            os.remove(temp_path)
        else:
            results = None
        return results, counter

    def __call__(self):
        """
        Run download (if needed), extraction (if needed) and the pairwise
        comparison, then print the number of matching pairs.
        """
        try:
            if not os.path.exists(self.infile):
                self.__download__()
            if not os.path.exists(os.path.join(self.outdir, self.outfile)):
                self.__persist__()
            self.df = pd.read_hdf(
                os.path.join(self.outdir, self.outfile),
                key=f'{self.outfile.split(".")[0]}'
            )
            results, counter = self.__process__()
            print(f"No. Associations: {counter}")
        except KeyboardInterrupt:
            # Only report the interruption; module globals are left intact.
            print("Keyboard Interrupt")

In [12]:
# Example run over the 17.12 evidence dump: group diseases by target and
# count the pairs of targets that share at least two diseases.
big_targets = BigTargets(
    # rawdir: where the downloaded file is kept
    './content/datasets',
    # rawurl: source of the gzipped evidence file
    'https://storage.googleapis.com/open-targets-data-releases/17.12/17.12_evidence_data.json.gz',
    # keysdict: field name -> nested keys to follow in each JSON object
    {
        "target" : ["target", "id"],
        "disease": ["disease", "id"],
        "score":   ["scores", "association_score"]
    }, 
    # outdir / outfile: HDF5 table of the extracted fields
    './content/results',
    'tds.h5',
    # groupnames: group by 'target', collect the 'disease' values per group
    ['target', 'disease'],
    # resultdir / resultfile: memmap used only when pairs are saved
    './content/results',
    'result.memmap'
)
big_targets()

Mapping                  : 33109it [00:06, 4928.75it/s] 
Reducing                  : 100%|██████████| 548086386/548086386 [42:57<00:00, 212615.28it/s]  

No. Associations: 121114622



