<a href="https://colab.research.google.com/github/AlyShmahell/WeirdTargets/blob/master/weirdtargets2020.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
! pip install ujson



In [17]:
import os
import gc
import sys
import ujson
import psutil
import requests
import itertools
import numpy              as     np
import pandas             as     pd
import operator           as     op
from   copy               import copy
from   tqdm               import tqdm
from   functools          import reduce
from   gzip               import GzipFile
from   toolz              import partition_all
from   pathlib            import Path
from   concurrent.futures import ProcessPoolExecutor

# nCr

In [3]:
def ncr(n, r):
    r     = min(r, n-r)
    numer = reduce(op.mul, range(n, n-r, -1), 1)
    denom = reduce(op.mul, range(1, r+1), 1)
    return  numer / denom

# msuba

In [4]:
def msuba(S, t_size):
    numbers     = S.copy()
    mean        = np.mean(numbers)
    std         = np.std(numbers)
    numbers     = (numbers - mean)/std
    best_sum    = 0
    best_start  = best_end = 0
    current_sum = 0
    for current_end, x in enumerate(numbers):
        if current_sum <= 0:
            current_start = current_end
            current_sum   = x
        else:
            current_sum  += x
        if current_sum > best_sum and current_sum*std+((current_end-current_start)*mean) < t_size:
            best_sum   = current_sum
            best_start = current_start
            best_end   = current_end + 1
    return best_end - best_start

# Downloader

In [5]:
def downloader(url):
    def func(r, path, filename, total_size, block_size):
        t          = tqdm(
            desc       = f"downloading {filename}",
            total      = total_size, 
            unit       = 'iB', 
            unit_scale = True
        )
        with open(path/filename, 'wb') as f:
            for data in r.iter_content(block_size):
                t.update(len(data))
                f.write(data)
        t.close()
        try:
            assert not(total_size != 0 and t.n != total_size)
        except:
            sys.exit(f"downloaded {t.n}/{total_size}", flush=True)
    r          = requests.get(url, stream=True)
    total_size = int(r.headers.get('content-length', 0))
    block_size = 1024**2
    filename   = url.split('/')[-1]
    path       = Path(Path.home())/Path('workspace')
    path.mkdir(parents=True, exist_ok=True)
    try:
        assert Path(path/filename).stat().st_size == total_size
        print(f"{filename} is already downloaded", flush=True)
    except:
        func(r, path, filename, total_size, block_size)
    filename    = filename.split('.')
    (filename, 
     extension) = ".".join(filename[:-1]), filename[-1]
    return path, filename, extension

# Preprocessor

In [10]:
class Prerocessor:
    def __traverse__(self, parsed, keys):
        if len(keys)>1:
            return self.__traverse__(parsed[keys[0]], keys[1:])
        return parsed[keys[0]]
    def __json__(self, string):
        parsed = ujson.loads(string)
        obj = {
            key: self.__traverse__(parsed, self.keygroup[key])
            for key in self.keygroup.keys()
        }
        return obj
    def __df__(self, batch):
        json = map(self.__json__, batch)
        df   = pd.DataFrame.from_records(json, 
                                         columns=["target",
                                                  "disease",
                                                  "score"]
        )
        return df
    def __init__(self, url, keygroup):
        (path, 
         filename, 
         extension)   = downloader(url)
        self.keygroup = keygroup
        S = []
        with tqdm(desc=f"Feature Probing") as tqdmo:
            with GzipFile(f"{path}/{filename}.{extension}") as f:
                for b in f:
                    S += [33+len(b)]
                    tqdmo.update(1)
        n = msuba(S, int(psutil.virtual_memory().available/5))
        with tqdm(desc=f"Feature Extraction, n={n}") as tqdmo:
            with GzipFile(f"{path}/{filename}.{extension}") as f:
                batches = partition_all(n, f)
                for batch in batches:
                    df    = self.__df__(batch)
                    df.to_hdf(
                        f"{path}/{filename}.hdf",
                        key='features',
                        mode='a',
                        format='table',
                        append=True
                    )
                    tqdmo.update(len(df.index))
        self.df = pd.read_hdf(
                f"{path}/{filename}.hdf",
                key='features'
            )


In [11]:
%timeit
data = Prerocessor(
    'https://storage.googleapis.com/open-targets-data-releases/17.12/17.12_evidence_data.json.gz',
    {
        "target" : ["target", "id"],
        "disease": ["disease", "id"],
        "score":   ["scores", "association_score"]
    }
)

17.12_evidence_data.json.gz is already downloaded


Feature Probing: 5784597it [03:10, 30314.68it/s]
Feature Extraction, n=492474: 5909688it [07:29, 13140.03it/s]


In [27]:
class CycleEnumeration:
    def hash(self):
        with tqdm(desc=f"Hashing") as tqdmo:
            def setter(x):
                tqdmo.update(1)
                return set(x.to_numpy())
            self.table = self.df.groupby('target')['disease'].apply(setter).to_numpy()
    def map(self):
        self.hash()
        combinations = itertools.combinations(range(self.table.shape[0]), 2)
        nck          = int(ncr(self.table.shape[0], 2))
        base         = int(nck/os.cpu_count())
        self.chunks  = []
        cc           = 0
        partition    = 0
        with tqdm(desc=f"Mapping") as tqdmo:
            for successive in range(base, nck, base):
                self.chunks.append(
                                {
                                    "num": cc,
                                    "len": base,
                                    "val": itertools.islice(
                                                combinations,
                                                partition,
                                                successive,
                                                1
                                            ) 
                                }
                            )
                partition  = successive
                cc        += 1
                tqdmo.update(base)
            if successive < nck:
                self.chunks.append(
                                {
                                    "num": cc,
                                    "len": nck,
                                    "val": itertools.islice(
                                                combinations,
                                                successive,
                                                nck,
                                                1
                                            ) 
                                }
                            )
                tqdmo.update(base)
    def worker(self, chunk):
        local = 0
        num = chunk["num"]
        with tqdm(desc=f'Reducing - pool #{num}', total=chunk['len'], leave=True, file=sys.stdout, position=0) as tqdmo:
            for combination in chunk['val']:
                common = self.table[combination[0]] & self.table[combination[1]]
                if len(common) >= 2:
                    local += 1
                tqdmo.update(1)
        return local
    def reduce(self):
        with ProcessPoolExecutor() as executor:
            running_tasks = executor.map(self.worker, self.chunks)
            for running_task in running_tasks:
                self.counter += running_task
    def __init__(self, df):
        self.counter = 0
        self.df = df
        self.map()
        self.reduce()
        print(self.counter)

In [28]:
CycleEnumeration(data.df)

Hashing: 33109it [00:14, 2350.36it/s] 
Mapping: 548086386it [00:00, 649023410825.90it/s]


Reducing - pool #1:  50%|█████     | 274043193/548086386 [11:34<11:34, 394552.96it/s]
Reducing - pool #0: 100%|██████████| 274043193/274043193 [17:17<00:00, 264181.29it/s]
121114622


<__main__.CycleEnumeration at 0x7f79d6060978>