# 0) Import, load, etc.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import time
import threading
import datetime
import multiprocessing as mp
from multiprocessing import Process, Manager
from itertools import repeat
import multiprocessing as mp
import os
import pprint
from magictree import * #tree layout
import pickle
import sys

In [2]:
# Load the input sequences (sorted into a list) and the two precomputed matrices
# that later cells index row-wise as distance lookups.
# NOTE(review): rows of p2p_* are addressed with positions in the *sorted* `mimic`
# list -- confirm the matrices were computed on that same ordering.
# NOTE(review): if mimic_sequences.npy stores Python objects, recent NumPy releases
# require np.load(..., allow_pickle=True) -- confirm against the installed version.
mimic = sorted(np.load('mimic_sequences.npy'))
p2p_jaccard = np.load('p2p_jaccard.npy')
p2p_wuplus = np.load('p2p_wuplus.npy')

In [3]:
# Seed NumPy's global RNG; perturb() and unflat_cloud() both draw from it.
np.random.seed(100)

# 1) Select datapoint

In [None]:
#2519

In [None]:
# Pick one datapoint to work on. The index is hard-coded; the random choice is
# left commented out on the same line.
# NOTE(review): `i2e` presumably stands for "instance to explain" -- confirm.
i2e_index = 300#np.random.choice(len(mimic))
print(i2e_index)
i2e = mimic[i2e_index]
print(i2e)

# 2) Get k REAL neighbours

In [4]:
# Number of real neighbours fetched per datapoint (passed as `k` to
# get_closest_datapoints_indexes).
k1 = 50

In [5]:
def get_closest_datapoints_indexes(start_index,dist_matrix,k):
    """Return the k entries of row `start_index` with the smallest values.

    Parameters
    ----------
    start_index : index of the row of `dist_matrix` to inspect.
    dist_matrix : indexable collection of rows of comparable values.
    k : maximum number of entries to return.

    Returns
    -------
    list of (column_index, value) tuples ordered by ascending value; ties keep
    ascending column order because the sort is stable. Nothing is filtered
    out, so the entry for `start_index` itself is a candidate like any other.
    """
    row = dist_matrix[start_index]
    ranked = list(enumerate(row))
    ranked.sort(key=lambda pair: pair[1])
    return ranked[:k]

In [None]:
# Indexes of the k1 closest datapoints to the selected one, under each matrix.
# The helper returns (index, value) pairs; only the index is kept here.
wup_nbh_indx = [el[0] for el in get_closest_datapoints_indexes(i2e_index,p2p_wuplus,k1)] 
jac_nbh_indx = [el[0] for el in get_closest_datapoints_indexes(i2e_index,p2p_jaccard,k1)] 

# 3) Flatten

In [6]:
def flat_dp(dp_index, sequences=None):
    """Flatten one sequence into a {code: time-decayed weight} dictionary.

    The last element of the sequence is ignored. Among the remaining visits,
    the most recent one has weight 1/2, the one before 1/4, and so on; a code's
    weight is the sum of the weights of the visits that contain it.

    Parameters
    ----------
    dp_index : position of the sequence inside `sequences`.
    sequences : optional collection of sequences (each a list of visits, each
        visit a collection of codes). Defaults to the notebook-level `mimic`.

    Returns
    -------
    dict mapping every code seen in the kept visits to its weight. The dict is
    empty when the sequence has no kept visits.
    """
    if sequences is None:
        sequences = mimic
    sequence = sequences[dp_index]
    # Reverse once so position 0 is the most recent kept visit.
    history = sequence[:-1][::-1]
    # The decay weights do not depend on the code, so they are built only once.
    decay = [1/2**i for i in range(1,len(sequence))]
    codes = {code for visit in history for code in visit}
    fdp = {}
    for c in codes:
        seq = [1 if c in visit else 0 for visit in history]
        fdp[c] = sum(np.multiply(seq,decay))
    return fdp

def flat_cloud(kdp, sequences=None):
    """Flatten several sequences into one dense feature matrix.

    Parameters
    ----------
    kdp : iterable of indexes into `sequences`.
    sequences : optional collection of sequences, forwarded to flat_dp.
        Defaults to the notebook-level `mimic`.

    Returns
    -------
    (feat_mx, all_codes) where `all_codes` is the sorted list of every code
    found in any of the selected sequences, and `feat_mx` is a float array with
    one row per entry of `kdp` and one column per code (0. where absent).
    """
    local_flat = [flat_dp(n, sequences) for n in kdp]
    all_codes = sorted({code for lf in local_flat for code in lf})
    # Constant-time column lookup instead of list.index() for every cell.
    column_of = {code: col for col, code in enumerate(all_codes)}
    feat_mx = np.zeros((len(local_flat),len(all_codes)))
    for row,lf in enumerate(local_flat):
        for code,weight in lf.items():
            feat_mx[row,column_of[code]] = weight
    return feat_mx,all_codes
    

In [None]:
# Flatten both neighbourhoods: each call yields a feature matrix (one row per
# neighbour) together with the sorted list of codes naming its columns.
wup_nbh_flat,wup_codes = flat_cloud(wup_nbh_indx)
jac_nbh_flat,jac_codes = flat_cloud(jac_nbh_indx)

# 4) Generate k1*k2 synthetic datapoints

In [7]:
# Multiplier for the synthetic sample size: perturb() generates k1*k2 rows.
k2 = 20

In [8]:
def perturb(flat_k,points_per_dp=None):
    """Sample synthetic feature rows around a flattened neighbourhood.

    Every column is sampled independently from a normal distribution with the
    mean and standard deviation of that column in `flat_k`; the samples are
    then clipped to the interval [0, 1].

    Parameters
    ----------
    flat_k : 2-D array, one row per real datapoint and one column per code.
    points_per_dp : number of synthetic rows generated per row of `flat_k`.
        Defaults to the notebook-level `k2`, read at call time.

    Returns
    -------
    float array of shape (flat_k.shape[0] * points_per_dp, flat_k.shape[1]).

    Notes
    -----
    The row count previously came from the globals `k1*k2` and ignored both the
    argument and the actual number of input rows. For a k1-row input with the
    default `points_per_dp` the output is unchanged.
    """
    if points_per_dp is None:
        points_per_dp = k2
    flat_k = np.asarray(flat_k,dtype=float)
    n_real,num_codes = flat_k.shape
    n_syn = n_real*points_per_dp
    feat_mx = np.zeros((n_syn,num_codes))
    # Draw one column at a time so that, for a fixed seed, the global RNG is
    # consumed in the same order as before.
    for f in range(num_codes):
        column = flat_k[:,f]
        feat_mx[:,f] = np.random.normal(np.mean(column),np.std(column),n_syn)
    np.clip(feat_mx,0.,1.,out=feat_mx)
    return feat_mx

In [None]:
# Sample synthetic feature rows around each flattened neighbourhood.
wup_syn_flat = perturb(wup_nbh_flat)
jac_syn_flat = perturb(jac_nbh_flat)

# 5) Unflatten

In [9]:
# Sequence-length statistics over the whole dataset. unflat_cloud() uses
# max_past as the width of its code-by-visit matrix and past_mean/past_std to
# sample how many visits each synthetic sequence keeps.
max_past = max([len(x) for x in mimic])
past_mean = np.mean([len(x) for x in mimic])
past_std = np.std([len(x) for x in mimic])

In [10]:
def unflat_cloud(cloud,code_list,max_visits=None,visits_mean=None,visits_std=None):
    """Turn flattened feature rows back into sequences of visits.

    Each feature value is expanded greedily into halving weights: the last
    (most recent) visit is worth 1/2, the one before 1/4, and so on. A code is
    placed in a visit whenever the value still left covers that visit's weight.
    The number of trailing visits kept for each row is then drawn as
    max(1, ceil(N(visits_mean, visits_std))) from NumPy's global RNG.

    Parameters
    ----------
    cloud : iterable of 1-D feature rows, each with len(code_list) entries.
    code_list : codes naming the feature columns, in column order.
    max_visits : number of visit slots to expand into. Defaults to the
        notebook-level `max_past`.
    visits_mean, visits_std : parameters of the normal distribution for the
        sequence length. Default to the notebook-level `past_mean`/`past_std`.

    Returns
    -------
    list with one sequence per row of `cloud`; a sequence is a list of visits
    (oldest first) and a visit is a list of codes in `code_list` order.
    """
    if max_visits is None:
        max_visits = max_past
    if visits_mean is None:
        visits_mean = past_mean
    if visits_std is None:
        visits_std = past_std
    n_codes = len(code_list)
    res = []
    for dp in cloud: #iterate over synthetic datapoints
        remaining = np.array(dp,dtype=float) #private copy, consumed below
        present = np.zeros((n_codes,max_visits),dtype=bool) #codes * visits
        check = .5
        # Same greedy expansion as a per-code scalar loop, applied to every
        # code at once; the arithmetic on each element is unchanged.
        for visit_index in range(max_visits-1,-1,-1):
            hit = remaining>=check
            present[:,visit_index] = hit
            remaining[hit] -= check
            check /= 2
        raw_seq = [[code_list[c] for c in np.flatnonzero(present[:,v])] for v in range(max_visits)]
        # Exactly one RNG draw per datapoint, made after its expansion.
        num_visit = max(1,int(np.ceil(np.random.normal(visits_mean,visits_std))))
        res.append(raw_seq[-num_visit:])
    return res

In [None]:
# Convert the synthetic feature rows back into sequences of visits.
wup_syn_seq = unflat_cloud(wup_syn_flat,wup_codes)
jac_syn_seq = unflat_cloud(jac_syn_flat,jac_codes)

# 6) [toy] caching

In [None]:
# Toy cache: store the artefacts of the selected datapoint under an
# (index, measure) key and pickle the dictionary to disk.
mega_dict = {}
mega_dict[(i2e_index,'jac')] = (jac_nbh_indx,jac_nbh_flat,jac_syn_flat,jac_syn_seq)
mega_dict[(i2e_index,'wup')] = (wup_nbh_indx,wup_nbh_flat,wup_syn_flat,wup_syn_seq)
# The context manager flushes and closes the file even if pickling fails.
with open('mimic_cache.pkl','wb') as cache_file:
    pickle.dump(mega_dict,cache_file,protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Read the toy cache back to check the round trip. The file is produced by this
# notebook; pickle must not be used to load files from untrusted sources.
with open('mimic_cache.pkl','rb') as cache_file:
    test = pickle.load(cache_file)
print(test.keys())

# 7) Main loop: run steps 2-5 for every datapoint

In [11]:
# Number of datapoints the loop in the next cell iterates over.
len(mimic)

7499

In [12]:
# Run neighbour search, flattening, perturbation and unflattening for every
# datapoint and keep the artefacts per index, one dictionary per measure.
# The recorded timestamps show roughly 16 minutes per 100 datapoints, so a full
# pass takes many hours.
jac_dict = {}
wup_dict = {}
failed_indexes = {}  # indx -> repr of the exception raised for that datapoint

for indx in range(len(mimic)):
    if indx%100==0:
        print(indx,time.ctime())
    try:
        i2e = mimic[indx]
        wup_nbh_indx = [el[0] for el in get_closest_datapoints_indexes(indx,p2p_wuplus,k1)]
        jac_nbh_indx = [el[0] for el in get_closest_datapoints_indexes(indx,p2p_jaccard,k1)]
        wup_nbh_flat,wup_codes = flat_cloud(wup_nbh_indx)
        jac_nbh_flat,jac_codes = flat_cloud(jac_nbh_indx)
        # The wup/jac calls stay interleaved: perturb() and unflat_cloud() draw
        # from the global RNG, so reordering them would change the samples.
        wup_syn_flat = perturb(wup_nbh_flat)
        jac_syn_flat = perturb(jac_nbh_flat)
        wup_syn_seq = unflat_cloud(wup_syn_flat,wup_codes)
        jac_syn_seq = unflat_cloud(jac_syn_flat,jac_codes)
        wup_dict[indx] = (wup_nbh_indx,wup_nbh_flat,wup_syn_flat,wup_syn_seq)
        jac_dict[indx] = (jac_nbh_indx,jac_nbh_flat,jac_syn_flat,jac_syn_seq)
    except Exception as ex:
        # Keep going, but record and show what went wrong so that a failure
        # can be diagnosed without rerunning the whole loop.
        failed_indexes[indx] = repr(ex)
        print('FUCKUP',indx,repr(ex))

0 Thu Jul 18 12:38:04 2019
100 Thu Jul 18 12:54:36 2019
200 Thu Jul 18 13:10:22 2019
300 Thu Jul 18 13:26:21 2019
400 Thu Jul 18 13:41:59 2019
500 Thu Jul 18 13:56:54 2019
600 Thu Jul 18 14:10:01 2019
700 Thu Jul 18 14:22:25 2019
800 Thu Jul 18 14:35:53 2019
900 Thu Jul 18 14:48:00 2019
1000 Thu Jul 18 14:59:48 2019
1100 Thu Jul 18 15:14:52 2019
1200 Thu Jul 18 15:28:13 2019
1300 Thu Jul 18 15:41:29 2019
1400 Thu Jul 18 15:56:00 2019
1500 Thu Jul 18 16:09:47 2019
1600 Thu Jul 18 16:22:47 2019
1700 Thu Jul 18 16:34:49 2019
1800 Thu Jul 18 16:48:14 2019
1900 Thu Jul 18 16:57:44 2019
2000 Thu Jul 18 17:07:35 2019
2100 Thu Jul 18 17:16:56 2019
2200 Thu Jul 18 17:28:53 2019
2300 Thu Jul 18 17:40:00 2019
2400 Thu Jul 18 17:48:44 2019
2500 Thu Jul 18 17:57:36 2019
2600 Thu Jul 18 18:07:42 2019
2700 Thu Jul 18 18:19:54 2019
2800 Thu Jul 18 18:31:54 2019
2900 Thu Jul 18 18:44:36 2019
3000 Thu Jul 18 18:55:14 2019
3100 Thu Jul 18 19:06:45 2019
3200 Thu Jul 18 19:19:05 2019
3300 Thu Jul 18 19:31:

In [13]:
# Indexes for which at least one of the two result dictionaries has no entry.
# An explicit membership test replaces the bare `except:`, which would also have
# swallowed unrelated errors such as KeyboardInterrupt.
missed = [i for i in range(len(mimic)) if i not in wup_dict or i not in jac_dict]

In [14]:
# Display the missed indexes; an empty array means every datapoint was processed.
np.array(missed)

array([], dtype=float64)

In [15]:
# Persist the Jaccard results. The context manager closes the file, so the data
# is fully flushed before the reload check further down.
# NOTE(review): absolute, user-specific path -- consider a configurable data dir.
with open('/data/big/perotti/jac_cache_2.pkl','wb') as cache_file:
    pickle.dump(jac_dict,cache_file,protocol=pickle.HIGHEST_PROTOCOL)

In [16]:
# Persist the wup results. The context manager closes the file, so the data is
# fully flushed before the reload check further down.
# NOTE(review): absolute, user-specific path -- consider a configurable data dir.
with open('/data/big/perotti/wup_cache_2.pkl','wb') as cache_file:
    pickle.dump(wup_dict,cache_file,protocol=pickle.HIGHEST_PROTOCOL)

In [17]:
# Sanity check: reload the Jaccard cache and display how many entries it holds.
# The file is closed as soon as it has been read.
with open('/data/big/perotti/jac_cache_2.pkl','rb') as cache_file:
    jac_cache_size = len(pickle.load(cache_file))
jac_cache_size
#pickle.load(open('/data/big/perotti/wup_cache.pkl','rb'))[0]

7499

In [18]:
# Sanity check: reload the wup cache and display how many entries it holds.
# The file is closed as soon as it has been read.
with open('/data/big/perotti/wup_cache_2.pkl','rb') as cache_file:
    wup_cache_size = len(pickle.load(cache_file))
wup_cache_size

7499