In [1]:
import pandas as pd 

In [2]:
ad_dup = pd.DataFrame(pd.read_csv("ad_dup.csv"))
weight = pd.DataFrame(pd.read_csv("weight.csv"))
birth = pd.DataFrame(pd.read_csv("birth.csv"))
primary = pd.DataFrame(pd.read_csv("primary.csv"))

In [3]:
info = ad_dup
info

Unnamed: 0,subject_id,hadm_id,admittime,diagnosis
0,10059,142582,2150-08-07 21:40:00,VARICEAL BLEED
1,10059,122098,2150-08-22 17:33:00,LOWER GI BLEED
2,10088,169938,2107-01-04 11:59:00,SEPSIS;PNEUMONIA;TELEMETRY
3,10088,168233,2107-01-29 04:00:00,CONGESTIVE HEART FAILURE
4,10088,149044,2107-05-12 18:00:00,UROSEPSIS
5,10094,168074,2180-02-29 18:54:00,HYPOTENSION;TELEMETRY
6,10094,122928,2180-03-15 22:35:00,SEPSIS;TELEMETRY
7,10117,187023,2138-06-05 17:23:00,FEVER
8,10117,105150,2138-11-09 18:08:00,FEVER
9,10119,157466,2117-08-05 18:27:00,ACUTE CHOLECYSTITIS


In [4]:
import time

DT_FORMAT='%Y-%m-%d %H:%M:%S'

def year(time_: str) -> float:
    t = time.strptime(time_, DT_FORMAT)
    return time.mktime(t) / (365 * 86400)

In [5]:
birth_dict = birth.set_index('subject_id').to_dict()['dob']
weight_dict = weight.set_index('hadm_id').to_dict()['patientweight']
primary_dict = primary.set_index('hadm_id').to_dict()['primary']

In [6]:
info["birthtime"] = info["subject_id"].map(birth_dict)
info["age"] = info["admittime"].map(year) - info["birthtime"].map(year)

info["weight"] = info["hadm_id"].map(weight_dict)

info["primary"] = info["hadm_id"].map(primary_dict)

In [7]:
info = info[pd.isna(info["weight"]) == False]
info = info[["subject_id", "age", "weight", "primary"]]
info = info.sort_values(by=["subject_id", "age"])
info 

Unnamed: 0,subject_id,age,weight,primary
13,40124,66.630415,41.5,4
14,40124,67.148609,47.4,4
15,40310,40.629113,81.7,27
16,40310,41.084049,74.5,707
17,41795,48.980972,141.7,27
18,41795,49.148957,139.0,42
19,41976,62.296678,69.8,707
20,41976,62.506075,76.6,38
21,41976,62.555986,70.0,707
22,41976,63.679056,75.8,27


In [14]:
primary_set = set(info["primary"])
primary_set

{'15', '19', '27', '3', '38', '39', '4', '42', '438', '51', '57', '707'}

In [22]:
num_dict = dict(zip(primary_set, range(len(primary_set))))
disease_dict = dict(zip(range(len(primary_set)), primary_set))
num_dict

{'4': 0,
 '27': 1,
 '51': 2,
 '42': 3,
 '19': 4,
 '38': 5,
 '707': 6,
 '438': 7,
 '15': 8,
 '3': 9,
 '39': 10,
 '57': 11}

In [23]:
disease_dict

{0: '4',
 1: '27',
 2: '51',
 3: '42',
 4: '19',
 5: '38',
 6: '707',
 7: '438',
 8: '15',
 9: '3',
 10: '39',
 11: '57'}

In [25]:
import json
import datetime
import numpy as np


class JsonEncoder(json.JSONEncoder):

    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        elif isinstance(obj, datetime):
            return obj.__str__()
        else:
            return super(MyEncoder, self).default(obj)


def save_dict(filename, dic):
    '''save dict into json file'''
    with open(filename, 'w') as json_file:
        json.dump(dic, json_file, ensure_ascii=False, cls=JsonEncoder)


def load_dict(filename):
    '''load dict from json file'''
    with open(filename, "r") as json_file:
        dic = json.load(json_file)
    return dic

In [26]:
save_dict("disease_dict.json", disease_dict)

In [27]:
load_dict("disease_dict.json")

{'0': '4',
 '1': '27',
 '2': '51',
 '3': '42',
 '4': '19',
 '5': '38',
 '6': '707',
 '7': '438',
 '8': '15',
 '9': '3',
 '10': '39',
 '11': '57'}

In [18]:
info["primary"] = info["primary"].map(num_dict)
info 

Unnamed: 0,subject_id,age,weight,primary
13,40124,66.630415,41.5,0
14,40124,67.148609,47.4,0
15,40310,40.629113,81.7,1
16,40310,41.084049,74.5,6
17,41795,48.980972,141.7,1
18,41795,49.148957,139.0,3
19,41976,62.296678,69.8,6
20,41976,62.506075,76.6,5
21,41976,62.555986,70.0,6
22,41976,63.679056,75.8,1


In [28]:
info.to_csv("train_data.csv", index=None)