In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [33]:
# Notebook-wide configuration.
# Both paths keep their trailing slash: later cells build file names with
# plain string concatenation (e.g. MIMIC_PATH + "patients.csv").
MIMIC_PATH = "../mimiciv2.2/hosp/"
POST_PROCESS_PATH = "./post_process/"

# Create the output directory (and any missing parents) up front;
# exist_ok=True makes this a no-op when the directory is already there.
Path(POST_PROCESS_PATH).mkdir(parents=True, exist_ok=True)

In [3]:
# Load patient demographics, keeping only gender and anchor_age and
# using subject_id as the row index.
pat_df = pd.read_csv(
    MIMIC_PATH + "patients.csv",
    index_col="subject_id",
    usecols=["subject_id", "gender", "anchor_age"],
)

In [4]:
# Sanity check: (rows, columns) of the patients table just loaded.
pat_df.shape

(299712, 2)

In [5]:
# Smallest anchor_age present in the patients table.
pat_df["anchor_age"].min()

18

In [6]:
# Largest anchor_age present in the patients table.
pat_df["anchor_age"].max()

91

In [7]:
# Column dtypes and non-null counts for the patients table.
pat_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 299712 entries, 10000032 to 19999987
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   gender      299712 non-null  object
 1   anchor_age  299712 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 6.9+ MB


In [8]:
# Number of distinct subject_id values in the index; compare with the row
# count from pat_df.shape to see whether each patient appears only once.
pat_df.index.nunique()

299712

## Admission File

In [9]:
# Load admissions (hadm_id and admittime only), indexed by subject_id,
# parsing admittime as a timestamp while reading.
adm_df = pd.read_csv(
    MIMIC_PATH + "admissions.csv",
    index_col="subject_id",
    parse_dates=["admittime"],
    usecols=["subject_id", "admittime", "hadm_id"],
)
adm_df.head()

Unnamed: 0_level_0,hadm_id,admittime
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1
10000032,22595853,2180-05-06 22:23:00
10000032,22841357,2180-06-26 18:27:00
10000032,25742920,2180-08-05 23:44:00
10000032,29079034,2180-07-23 12:35:00
10000068,25022803,2160-03-03 23:16:00


In [10]:
# Keep only the calendar date of each admission (drop the time of day).
# TODO: Does datetime need reducing ?
# pd.to_datetime() first makes this cell safe to re-run: after one run the
# column holds plain datetime.date objects (object dtype), on which the
# .dt accessor alone would raise AttributeError.
adm_df["admittime"] = pd.to_datetime(adm_df["admittime"]).dt.date
adm_df.head()

Unnamed: 0_level_0,hadm_id,admittime
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1
10000032,22595853,2180-05-06
10000032,22841357,2180-06-26
10000032,25742920,2180-08-05
10000032,29079034,2180-07-23
10000068,25022803,2160-03-03


In [11]:
# Sanity check: (rows, columns) of the admissions table.
adm_df.shape

(431231, 2)

In [12]:
# Number of distinct subject_id values that appear in the admissions table.
adm_df.index.nunique()

180733

In [13]:
# One row per patient: order admissions by (subject_id, admittime) and keep
# the first row of each subject_id, i.e. the earliest admission date.
# NOTE(review): admittime has already been reduced to a date by an earlier
# cell, so two admissions on the same day tie; which hadm_id is kept then
# depends on the sort's tie order - confirm this is acceptable.
adm_sorted_df = (
    adm_df
    .reset_index()
    .sort_values(by=["subject_id", "admittime"])
    .drop_duplicates(subset="subject_id", keep="first")
)
adm_sorted_df

Unnamed: 0,subject_id,hadm_id,admittime
0,10000032,22595853,2180-05-06
4,10000068,25022803,2160-03-03
5,10000084,23052089,2160-11-21
7,10000108,27250926,2163-09-27
8,10000117,22927623,2181-11-15
...,...,...,...
431207,19999733,27674281,2152-07-08
431217,19999784,26194817,2119-06-18
431227,19999828,29734428,2147-07-18
431229,19999840,26071774,2164-07-25


In [14]:
# Inspect the admission-date column of the one-row-per-patient table.
adm_sorted_df['admittime']

0         2180-05-06
4         2160-03-03
5         2160-11-21
7         2163-09-27
8         2181-11-15
             ...    
431207    2152-07-08
431217    2119-06-18
431227    2147-07-18
431229    2164-07-25
431230    2145-11-02
Name: admittime, Length: 180733, dtype: object

In [15]:
# Inspect the anchor_age column (indexed by subject_id) before joining.
pat_df["anchor_age"]

subject_id
10000032    52
10000048    23
10000068    19
10000084    72
10000102    27
            ..
19999828    46
19999829    28
19999840    58
19999914    49
19999987    57
Name: anchor_age, Length: 299712, dtype: int64

In [16]:
# First row of the admissions side of the upcoming join.
adm_sorted_df.head(1)

Unnamed: 0,subject_id,hadm_id,admittime
0,10000032,22595853,2180-05-06


In [17]:
# First row of the patients side of the upcoming join.
pat_df.head(1)

Unnamed: 0_level_0,gender,anchor_age
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1
10000032,F,52


In [18]:
# Attach gender and anchor_age to each patient's first admission.
# pat_df is keyed by subject_id in its index, so expose it as a column and
# left-join on it; every row of adm_sorted_df is kept, in its original order.
df7 = pd.merge(
    adm_sorted_df,
    pat_df.reset_index(),
    on="subject_id",
    how="left",
)
df7

Unnamed: 0,subject_id,hadm_id,admittime,gender,anchor_age
0,10000032,22595853,2180-05-06,F,52
1,10000068,25022803,2160-03-03,F,19
2,10000084,23052089,2160-11-21,M,72
3,10000108,27250926,2163-09-27,M,25
4,10000117,22927623,2181-11-15,F,48
...,...,...,...,...,...
180728,19999733,27674281,2152-07-08,F,19
180729,19999784,26194817,2119-06-18,M,57
180730,19999828,29734428,2147-07-18,F,46
180731,19999840,26071774,2164-07-25,M,58


# Resetting the index

In [30]:
# Renumber rows 0..n-1, discarding the previous index rather than keeping
# it as a column.
df7 = df7.reset_index(drop=True)

In [31]:
# Display the merged table after the index reset.
df7

Unnamed: 0,subject_id,hadm_id,admittime,gender,anchor_age
0,10000032,22595853,2180-05-06,F,52
1,10000068,25022803,2160-03-03,F,19
2,10000084,23052089,2160-11-21,M,72
3,10000108,27250926,2163-09-27,M,25
4,10000117,22927623,2181-11-15,F,48
...,...,...,...,...,...
180728,19999733,27674281,2152-07-08,F,19
180729,19999784,26194817,2119-06-18,M,57
180730,19999828,29734428,2147-07-18,F,46
180731,19999840,26071774,2164-07-25,M,58


In [34]:
# Write the merged table to the post-processing directory; index=False
# leaves the row index out of the CSV.
df7.to_csv(
    path_or_buf=POST_PROCESS_PATH + "selected_ids.csv",
    index=False,
)