In [5]:
#%%
import multiprocessing
import pandas as pd
import pickle
import argparse
from pathlib import Path
import os, sys
from itertools import repeat

from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.ModelInspector import ModelInspector

from pathos.pools import _ProcessPool, ProcessPool

# Hard-coded project root, used here to make the project's src/ directory importable.
# PROJ_PATH is assigned again from the loaded config further down.
PROJ_PATH = Path('/home/dogu86/colon_synthesis_2')
# Must run before `from MyModule.utils import *` below (MyModule presumably lives under src/ — TODO confirm).
sys.path.append(PROJ_PATH.joinpath('src').as_posix())

In [6]:
# NOTE(review): the star import hides where names come from; `load_config` and `read_file`
# are used in this file without any other visible definition, so they presumably come from here.
from MyModule.utils import *
import random
# Project configuration; indexed below with keys such as 'path_config', 'bayesian_config',
# 'epsilon' and 'random_seed'.
config = load_config()

#%%

# Project root as configured, plus the folders this step reads from and writes to.
PROJ_PATH = Path(config['path_config']['project_path'])
INPUT_PATH = PROJ_PATH / 'data/processed/1_apply_bayesian/preprocess_data/D1'
OUTPUT_PATH = PROJ_PATH / 'data/processed/1_apply_bayesian/apply_bayesian/D1'

#%%

# Create the output directory together with any missing parents.
# exist_ok=True makes this a no-op when the directory is already there and avoids the
# check-then-create race of a separate exists() test.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

#%%

def load_categorical():
    '''
    Unpickle and return the content of `categorical_columns.pkl` stored in INPUT_PATH.
    '''
    pickle_path = INPUT_PATH.joinpath('categorical_columns.pkl')
    with open(pickle_path, 'rb') as handle:
        categorical_columns = pickle.load(handle)
    return categorical_columns

#%%
def load_data(name):
    '''
    Read `pt_{name}.csv` from INPUT_PATH and flag its categorical columns.

    Returns a tuple `(df, cats)`: the loaded DataFrame and a dict that maps every
    column of `df` which is also listed by `load_categorical()` to True.
    '''
    csv_path = INPUT_PATH.joinpath(f'pt_{name}.csv')
    df = pd.read_csv(csv_path)
    known_categoricals = set(load_categorical())
    # only the categorical names that actually occur in this file
    present = set(df.columns.tolist()) & known_categoricals
    return df, dict.fromkeys(present, True)

In [None]:
def create_bayesian(name, epsilon):
    '''
    Fit a DataSynthesizer description in correlated-attribute mode for one input file.

    name    : identifier used in the input file name `pt_{name}.csv` (read from INPUT_PATH)
    epsilon : forwarded as `epsilon` to the describer; also part of the output folder name

    The description is saved as `OUTPUT_PATH/out/epsilon{epsilon}/description_{name}.json`.
    No synthetic data is generated here and nothing is returned.
    '''
    # Only the categorical flags are needed; the describer is handed the CSV path itself.
    _, cats = load_data(name)
    bayesian_config = config['bayesian_config']
    thresholds = bayesian_config['threshold_value']
    degree_of_network = bayesian_config['degree_of_network']

    epsilon_dir = OUTPUT_PATH.joinpath(f'out/epsilon{epsilon}')
    # This function runs in several pool workers at once: a separate exists() check followed
    # by mkdir() can raise FileExistsError when two workers race, exist_ok=True cannot.
    epsilon_dir.mkdir(parents=True, exist_ok=True)
    description_file = epsilon_dir.joinpath(f'description_{name}.json')

    candidate_keys = {"PT_SBST_NO": True}

    describer = DataDescriber(category_threshold=thresholds)
    describer.describe_dataset_in_correlated_attribute_mode(
        dataset_file=INPUT_PATH.joinpath(f'pt_{name}.csv'),
        epsilon=epsilon,
        k=degree_of_network,
        attribute_to_is_categorical=cats,
        attribute_to_is_candidate_key=candidate_keys)
    describer.save_dataset_description_to_file(description_file)


def main():
    '''
    Build one description per patient id found in D1, in parallel.

    Reads `data/processed/0_preprocess/D1.pkl` through `read_file`, collects the unique
    PT_SBST_NO values and calls `create_bayesian(patient, epsilon)` for each of them in a
    pool of 8 worker processes, with a fixed epsilon of 1.
    '''
    data = read_file(PROJ_PATH, 'data/processed/0_preprocess/D1.pkl')
    patients = data['PT_SBST_NO'].unique().tolist()

    # Seed the stdlib RNG from the config.
    # NOTE(review): every patient is processed. The former `random.sample(patients, 100)`
    # result was never used (and raised ValueError for fewer than 100 patients), so it was
    # dropped together with the discarded ArgumentParser and the unused epsilon list.
    random.seed(config['random_seed'])

    epsilon = 1
    with _ProcessPool(8) as p:
        p.starmap(create_bayesian, zip(patients, repeat(epsilon)))
            
# Entry point: run the description step when this code is executed as the main module.
if __name__ == "__main__":
    main()

Adding ROOT EX_DIAG_Monocyte(#)

Adding ROOT TRTM_CASB_CSTR_NT
Adding ROOT PTH_SRGC_SGPT_PATL_T_STAG_VL
Adding ROOT PTH_SRGC_SGPT_MTST_LN_CNT
Adding ROOT PTH_SRGC_SGPT_CELL_DIFF_CD
Adding ROOT PTH_SRGC_SGPT_PATL_N_STAG_VL
Adding ROOT PT_BSNF_BSPT_SEX_CD
Adding attribute EX_DIAG_Monocyte(%)
Adding attribute EX_DIAG_RBC COUNT
Adding attribute EX_DIAG_RBC COUNT
Adding attribute EX_DIAG_Neutrophil(#)
Adding attribute EX_DIAG_Monocyte(#)
Adding attribute EX_DIAG_RBC COUNT
Adding attribute EX_DIAG_RDW
Adding attribute EX_DIAG_PDW(fL)
Adding attribute EX_DIAG_T. Bilirubin
Adding attribute PTH_SRGC_SGPT_PATL_M_STAG_VL
Adding attribute PTH_MLCR_MLPT_MSIE_RSLT_CD
Adding attribute EX_DIAG_T. Bilirubin
Adding attribute EX_DIAG_RBC COUNT
Adding attribute PTH_MLCR_MLPT_NREX_RSLT_CD
Adding attribute PTH_BPSY_BPTH_CELL_DIFF_CD
Adding attribute EX_DIAG_Neutrophil(%)
Adding attribute EX_DIAG_Total Protein
Adding attribute EX_DIAG_T. Bilirubin
Adding attribute EX_DIAG_Total Protein
Adding attribute EX_DI

Adding attribute EX_DIAG_AST(GOT)
Adding attribute EX_DIAG_Hemoglobin
Adding attribute EX_DIAG_Basophil(#)
Adding attribute EX_DIAG_RBC COUNT
Adding attribute EX_DIAG_RBC COUNT
Adding ROOT PTH_MLCR_MLPT_KE2E_RSLT_CD
Adding ROOT PTH_SRGC_SGPT_CELL_DIFF_CD
Adding attribute EX_DIAG_Total Protein
Adding attribute EX_DIAG_Creatinine
Adding attribute EX_DIAG_Delta neutrophil 1
Adding attribute EX_DIAG_RBC COUNT
Adding attribute EX_DIAG_Lymphocyte(%)
Adding attribute PTH_SRGC_SGPT_CELL_DIFF_CD
Adding attribute EX_DIAG_Monocyte(%)
Adding attribute PTH_BPSY_BPTH_CELL_DIFF_CD
Adding attribute EX_DIAG_nRBC(%)
Adding attribute EX_DIAG_Thrombotic microangiopathy score
Adding attribute EX_DIAG_Calcium
Adding attribute EX_DIAG_Cholesterol
Adding attribute EX_DIAG_Thrombotic microangiopathy score
Adding attribute EX_DIAG_PLT Count
Adding attribute EX_DIAG_RDW
Adding attribute EX_DIAG_Lymphocyte(#)
Adding attribute EX_DIAG_MCHC
Adding attribute EX_DIAG_Basophil(%)
Adding attribute EX_DIAG_Monocyte(#)
A

In [None]:
#%%
import pandas as pd
from DataSynthesizer.DataGenerator import DataGenerator
import os, sys
import argparse
from itertools import repeat
from pathlib import Path
from pathos.multiprocessing import _ProcessPool
import random

# The project's src/ directory has to be on sys.path *before* MyModule is imported,
# otherwise the import only works when some earlier code has already extended sys.path.
_PROJECT_ROOT = Path('/home/dogu86/colon_synthesis_2')
sys.path.append(_PROJECT_ROOT.joinpath('src').as_posix())

from MyModule.utils import *
config = load_config()

# Assigned after the star import so the import cannot overwrite it.
PROJ_PATH = _PROJECT_ROOT

# NOTE(review): the describe step in this file writes its descriptions below
# 'apply_bayesian/D1/out/epsilon{epsilon}', while this folder is 'apply_bayesian/out/D1' —
# confirm which layout is the intended one.
INPUT_PATH = PROJ_PATH.joinpath('data/processed/1_apply_bayesian/apply_bayesian/out/D1')
OUTPUT_PATH = PROJ_PATH.joinpath('data/processed/1_apply_bayesian/produce_data/D1')

In [None]:
# Create the output directory together with any missing parents.
# exist_ok=True makes this a no-op when it already exists and avoids a check-then-create race.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)


#%%
# Module-level settings read from the loaded config (`.get` returns None for a missing key,
# assuming `config` behaves like a dict).
epsilons = config.get('epsilon')
# Mean and standard deviation handed to np.random.normal in generate_data, where the draw
# decides how many rows are requested from the generator.
mean_observation_days = config['bayesian_config'].get('mean_observation_days')
sd_observation_days = config['bayesian_config'].get('sd_observation_days')

def generate_data(epsilon, description_idx, sample_number):
    '''
    Generate one synthetic record set from a fitted description and pickle it.

    epsilon : 0.1 ~ 10000; selects the `epsilon{epsilon}` input and output folders
    description_idx : patient id (1,2,..); selects `description_{id}.json` and `pt_{id}.csv`
    sample_number : pseudo sample ID, used in the output file name

    The number of requested rows is drawn from a normal distribution
    (mean_observation_days, sd_observation_days) and redrawn until it is positive.
    Generated rows are kept only when their TIME occurs in the original data. Rows at the
    original relapse / death times are flagged, and the frame is cut after the first death row.
    Writes `synthetic_data_{sample_number}.pkl` to `OUTPUT_PATH/epsilon{epsilon}`; returns None.
    '''
    # Imported locally because no numpy import is visible at file level.
    import numpy as np

    output_path = OUTPUT_PATH.joinpath(f'epsilon{epsilon}')
    # Path has no `.exist()` method. mkdir with exist_ok=True replaces the check and also
    # keeps concurrent pool workers from failing when they create the folder simultaneously.
    output_path.mkdir(parents=True, exist_ok=True)

    # Draw a strictly positive number of rows to generate.
    num_tuples = -1
    while num_tuples <= 0:
        num_tuples = round(np.random.normal(mean_observation_days, sd_observation_days))

    # the outcome : Recur (DG_RCNF), DEATH (DEAD) should be the same
    # NOTE(review): the describe step in this file reads 'preprocess_data/D1/pt_*.csv';
    # this path has no 'D1' component — confirm which one is correct.
    original_data_path = PROJ_PATH.joinpath(f'data/processed/1_apply_bayesian/preprocess_data/pt_{description_idx}.csv')
    original_data = pd.read_csv(original_data_path)
    time_idx = original_data['TIME']

    # TIME values of the original death / relapse rows; None when the event never occurs.
    death_time = None
    if original_data['DEAD_NFRM_DEAD'].sum() > 0:
        death_time = original_data.loc[original_data['DEAD_NFRM_DEAD'] == 1, 'TIME']

    relapse_time = None
    if original_data['DG_RCNF_RLPS'].sum() > 0:
        relapse_time = original_data.loc[original_data['DG_RCNF_RLPS'] == 1, 'TIME']

    path = INPUT_PATH.joinpath(f'epsilon{epsilon}').joinpath(f'description_{description_idx}.json')

    # Create the BN generator
    # NOTE(review): depending on the DataSynthesizer version this call may reseed the global
    # RNGs with a default seed, which would make repeated samples alike — verify.
    generator = DataGenerator()
    generator.generate_dataset_in_correlated_attribute_mode(num_tuples, path)
    df = generator.synthetic_dataset

    # Keep only rows whose TIME occurs in the original data
    df = df[df.TIME.isin(time_idx)].copy()

    # Match on the TIME column: the event times are TIME values, not row labels of `df`,
    # so using them directly as `.loc` labels marks unrelated rows or raises KeyError.
    if relapse_time is not None:
        df.loc[df.TIME.isin(relapse_time), "DG_RCNF_RLPS"] = 1

    if death_time is not None:
        df.loc[df.TIME.isin(death_time), "DEAD_NFRM_DEAD"] = 1

        # Drop everything after the first row flagged as dead (if there is one).
        dead_labels = df.index[df.DEAD_NFRM_DEAD == 1]
        if len(dead_labels) > 0:
            df = df.loc[0:dead_labels.min()]

    df.to_pickle(output_path.joinpath(f'synthetic_data_{sample_number}.pkl'))


#%%
def return_description_files(path : Path):
    '''
    Return the sorted names of all entries in `path` whose name contains 'description'.
    '''
    import os
    return sorted(entry for entry in os.listdir(path) if 'description' in entry)


#%%
def _parse_bool(text):
    '''Convert a command-line token such as "true", "False", "1" or "0" into a bool.'''
    lowered = str(text).strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
        return True
    if lowered in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(f'expected a boolean value, got {text!r}')


def _patient_id_from_description(file_name):
    '''Extract the id part of a file name shaped like `description_{id}.json`.'''
    stem = Path(file_name).stem
    prefix = 'description_'
    return stem[len(prefix):] if stem.startswith(prefix) else stem


def main():
    '''
    Command-line entry point of the generation step.

    Lists the description files of the chosen epsilon, optionally samples `--sample_number`
    of them, repeats the list `--multiplier` times and calls
    `generate_data(epsilon, patient_id, pseudo_id)` for every entry in a pool of 16 processes.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('--epsilon','-e', type=float, help='choose the epsilon')
    # default=1: an omitted multiplier used to be None and crashed in the arithmetic below
    parser.add_argument('--multiplier', '-m', type=int, default=1, help='how much time to resample the data')
    # type=bool would turn every non-empty string, including "False", into True
    parser.add_argument('--sample', '-s', type=_parse_bool, default=False, help='whether you are going to sample')
    parser.add_argument('--sample_number', '-sn', type=int, default=0, help='how much you are going to sample')

    args = parser.parse_args()

    # NOTE(review): args.epsilon is a float, so `-e 1` looks for the folder 'epsilon1.0',
    # whereas the describe step in this file formats the integer 1 as 'epsilon1' — confirm.
    files = return_description_files(INPUT_PATH.joinpath(f'epsilon{args.epsilon}'))
    if args.sample:
        files = random.sample(files, args.sample_number)
    # Applied with or without sampling; previously the multiplier was silently ignored
    # unless --sample was set.
    files = files * args.multiplier

    # generate_data expects the patient id, not the file name, to build its paths.
    patient_ids = [_patient_id_from_description(file_name) for file_name in files]
    pseudo_patient_id = range(len(patient_ids))

    # One pass for the requested epsilon. Looping over every configured epsilon while always
    # passing args.epsilon only repeated the same work.
    with _ProcessPool(16) as p:
        p.starmap(generate_data, zip(repeat(args.epsilon), patient_ids, pseudo_patient_id))

# Entry point: run the generation step when this code is executed as the main module.
if __name__ == "__main__":
    main()
    

In [6]:
import pandas as pd
# Comparison table from the evaluation step (the file name suggests epsilon 0.1 — TODO confirm).
a = pd.read_csv('/mnt/synthetic_data/data/processed/4_evaluate/make_whole_data/D1/comparsion_data_0.1.csv')
# Frequency of each BSPT_IDGN_AGE value in that table.
a['BSPT_IDGN_AGE'].value_counts()

49.0    147
48.0    131
46.0    106
47.0     98
45.0     95
44.0     89
40.0     72
43.0     72
39.0     62
41.0     56
42.0     53
38.0     52
37.0     46
34.0     37
36.0     33
33.0     28
35.0     27
32.0     25
31.0     19
30.0     16
29.0     12
28.0     10
27.0      6
26.0      6
20.0      5
21.0      4
24.0      4
23.0      3
22.0      3
16.0      2
19.0      1
14.0      1
Name: BSPT_IDGN_AGE, dtype: int64

In [5]:
# Original table stored in the same evaluation folder.
b = pd.read_csv('/mnt/synthetic_data/data/processed/4_evaluate/make_whole_data/D1/original.csv')
# Same frequency table, restricted to rows with BSPT_IDGN_AGE below 50.
b[b.BSPT_IDGN_AGE < 50]['BSPT_IDGN_AGE'].value_counts()

49    164
48    142
46    121
47    111
45    105
44    101
43     84
40     79
42     70
39     70
41     67
38     60
37     51
34     41
33     37
36     34
32     30
35     30
31     21
30     17
29     13
28     12
26      8
27      7
24      6
20      5
21      4
22      4
23      3
16      2
14      1
19      1
Name: BSPT_IDGN_AGE, dtype: int64

In [10]:
# Number of distinct PT_SBST_NO values in `a`.
a['PT_SBST_NO'].nunique()

1321

In [9]:
# Rows of `b` with BSPT_IDGN_AGE below 50.
b[b.BSPT_IDGN_AGE < 50]

Unnamed: 0,PT_SBST_NO,BSPT_SEX_CD,BSPT_FRST_DIAG_CD,BSPT_IDGN_AGE,BSPT_STAG_VL,BSPT_T_STAG_VL,BSPT_N_STAG_VL,BSPT_M_STAG_VL,BSPT_OPRT,TRTM_RD_RDT,OVRL_DAYS,DEAD
0,RN00000006,M,C20,42,1,1,0,0,1,0,656,0
9,RN00000023,M,C18,44,2,3,0,0,1,0,1746,0
10,RN00000024,M,C18,37,1,1,0,0,1,0,1787,0
12,RN00000026,M,C18,41,1,1,0,0,1,0,1726,0
15,RN00000030,M,C18,42,1,1,0,0,1,0,3937,0
...,...,...,...,...,...,...,...,...,...,...,...,...
10297,RN00018792,M,C20,49,3,3,2,0,1,0,1818,0
10310,RN00018816,F,C20,21,3,3,2,0,1,1,1666,1
10315,RN00018824,M,C18,36,1,2,0,0,1,0,946,0
10319,RN00018832,M,C18,39,2,3,0,0,1,0,392,0


In [12]:
# Number of distinct PT_SBST_NO values among the rows of `b` with BSPT_IDGN_AGE below 50.
b[b.BSPT_IDGN_AGE < 50]['PT_SBST_NO'].nunique()

1501

In [13]:
# Display the comparison table `a`.
a

Unnamed: 0,PT_SBST_NO,BSPT_SEX_CD,BSPT_FRST_DIAG_CD,BSPT_IDGN_AGE,BSPT_STAG_VL,BSPT_T_STAG_VL,BSPT_N_STAG_VL,BSPT_M_STAG_VL,BSPT_OPRT,TRTM_RD_RDT,OVRL_DAYS,DEAD
0,0,F,C19,41.0,4.0,3.0,0.0,1.0,1,1,3534.0,0
1,0,,,,,,,,1,1,3534.0,0
2,1,M,C20,39.0,1.0,1.0,0.0,0.0,1,0,0.0,0
3,10,F,C20,33.0,1.0,1.0,0.0,0.0,0,0,0.0,0
4,100,F,C18,47.0,3.0,4.0,1.0,0.0,1,0,1177.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1556,995,M,C18,43.0,3.0,3.0,1.0,0.0,1,0,1671.0,0
1557,996,M,C18,38.0,,,,,1,0,524.0,0
1558,997,F,C20,46.0,3.0,3.0,1.0,0.0,0,0,672.0,0
1559,998,M,C18,37.0,1.0,2.0,0.0,0.0,1,0,906.0,0


In [16]:
import os
# Number of entries in the preprocessed D1 folder.
len(os.listdir('/mnt/synthetic_data/data/processed/1_apply_bayesian/preprocess_data/D1'))

1322

In [44]:
# Pickled frames compared in the cells below: `s1` from the restore step (file name S1_0.1)
# and `d1` from the preprocessing step.
s1 = pd.read_pickle('/mnt/synthetic_data/data/processed/2_restore/restore_to_s1/D1/S1_0.1.pkl')
d1 = pd.read_pickle('/mnt/synthetic_data/data/processed/0_preprocess/D1.pkl')


In [49]:
# Frequency of each PT_BSNF_BSPT_IDGN_AGE value in `s1`.
s1.PT_BSNF_BSPT_IDGN_AGE.value_counts()

49.0    99825
48.0    96595
46.0    88215
45.0    75860
47.0    70865
40.0    60290
44.0    59090
43.0    48375
39.0    47150
41.0    44885
42.0    40540
38.0    37780
37.0    32535
36.0    30720
33.0    21950
34.0    21725
31.0    21060
30.0    11385
35.0    11110
22.0    10380
23.0     6185
29.0     5690
32.0     5375
21.0     3540
27.0     3030
16.0     2405
20.0     2325
26.0     2260
24.0     1880
28.0     1335
19.0       85
14.0       60
Name: PT_BSNF_BSPT_IDGN_AGE, dtype: int64

In [50]:
# Frequency of each PT_BSNF_BSPT_IDGN_AGE value in `d1`.
d1.PT_BSNF_BSPT_IDGN_AGE.value_counts()

49.0    20112
48.0    19450
46.0    17750
45.0    15270
47.0    14271
40.0    12130
44.0    11907
43.0     9752
39.0     9492
41.0     9033
42.0     8161
38.0     7608
37.0     6553
36.0     6178
33.0     4418
34.0     4382
31.0     4231
30.0     2293
35.0     2249
22.0     2079
23.0     1240
29.0     1150
32.0     1100
21.0      712
27.0      612
16.0      483
20.0      470
26.0      458
24.0      380
28.0      277
19.0       18
14.0       13
Name: PT_BSNF_BSPT_IDGN_AGE, dtype: int64

In [54]:
# Number of distinct PT_SBST_NO values in `d1`.
d1.PT_SBST_NO.nunique()

1321

In [23]:
# Distinct-value count of every column among the rows of `d1` with age below 50.
# (The stray empty `[]` subscript was a SyntaxError and has been removed.)
d1[d1.PT_BSNF_BSPT_IDGN_AGE < 50].nunique()

PT_SBST_NO                          1321
TIME                                3602
DG_RCNF_RLPS                           2
EX_DIAG_CEA                         1980
OPRT_NFRM_OPRT_CLCN_OPRT_KIND_CD       5
OPRT_NFRM_OPRT_CURA_RSCT_CD            2
PTH_BPSY_BPTH_BPSY_RSLT_CONT           2
PTH_BPSY_BPTH_CELL_DIFF_CD             6
PTH_MLCR_MLPT_BRME_RSLT_CD             2
PTH_MLCR_MLPT_KE2E_RSLT_CD             2
PTH_MLCR_MLPT_KRES_RSLT_CD             2
PTH_MLCR_MLPT_MSIE_RSLT_CD             3
PTH_MLCR_MLPT_NREX_RSLT_CD             2
PTH_MNTY_IMPT_HM1E_RSLT_CD             2
PTH_MNTY_IMPT_HP2E_RSLT_CD             2
PTH_MNTY_IMPT_HS2E_RSLT_CD             2
PTH_MNTY_IMPT_HS6E_RSLT_CD             2
PTH_SRGC_SGPT_CELL_DIFF_CD             6
PTH_SRGC_SGPT_MTST_LN_CNT             23
PTH_SRGC_SGPT_NERV_PREX_CD             3
PTH_SRGC_SGPT_PATL_M_STAG_VL           2
PTH_SRGC_SGPT_PATL_N_STAG_VL           4
PTH_SRGC_SGPT_PATL_STAG_VL             1
PTH_SRGC_SGPT_PATL_T_STAG_VL           5
PTH_SRGC_SGPT_SR

In [38]:
# Display the preprocessed frame `d1`.
d1

Unnamed: 0,PT_SBST_NO,TIME,DG_RCNF_RLPS,EX_DIAG_CEA,OPRT_NFRM_OPRT_CLCN_OPRT_KIND_CD,OPRT_NFRM_OPRT_CURA_RSCT_CD,PTH_BPSY_BPTH_BPSY_RSLT_CONT,PTH_BPSY_BPTH_CELL_DIFF_CD,PTH_MLCR_MLPT_BRME_RSLT_CD,PTH_MLCR_MLPT_KE2E_RSLT_CD,...,PT_BSNF_BSPT_N_STAG_VL,PT_BSNF_BSPT_SEX_CD,PT_BSNF_BSPT_STAG_VL,PT_BSNF_BSPT_T_STAG_VL,PT_BSNF_BSPT_FRST_DIAG_YMD,TRTM_CASB_CSTR_NT,TRTM_CASB_CSTR_PRPS_CD,TRTM_CASB_CSTR_REGN_CD,TRTM_RD_RDT,DEAD_NFRM_DEAD
288370,0,0,0.0,,,,,,,,...,0.0,1.0,4.0,3.0,1.0,,,,,0.0
288371,0,8,0.0,1.64,,,,,,,...,0.0,1.0,4.0,3.0,,,,,,0.0
288372,0,35,0.0,1.64,11.0,2.0,,,,,...,0.0,1.0,4.0,3.0,,,,,,0.0
288373,0,41,0.0,0.68,,,,,,,...,0.0,1.0,4.0,3.0,,,,,,0.0
288374,0,54,0.0,0.36,,,,,,,...,0.0,1.0,4.0,3.0,,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420428,10205,262,0.0,2.58,,,,,,,...,0.0,1.0,0.0,0.0,,,,,,0.0
800174,10211,0,0.0,,,,,,,,...,0.0,0.0,0.0,0.0,1.0,,,,,0.0
800175,10211,300,0.0,2.25,,,,,,,...,0.0,0.0,0.0,0.0,,,,,,0.0
503541,10212,0,0.0,,,,,,,,...,,0.0,,,1.0,,,,,0.0
