In [19]:
import pandas as pd
import numpy as np

In [20]:
# Read the parquet file produced by the earlier steps into `df`.
df = pd.read_parquet("diabetic_data_cleaned_pca_som.parquet")

# Preview the first two rows transposed, so each column shows up as one line.
df.head(2).transpose()

Unnamed: 0,0,1
encounter_id,149190,64410
patient_nbr,55629189,86047875
race,Caucasian,AfricanAmerican
gender,Female,Female
age,[10-20),[20-30)
admission_type_id,1,1
discharge_disposition_id,1,1
admission_source_id,7,7
time_in_hospital,3,2
num_lab_procedures,59,11


In [21]:
# Diagnosis groups keyed by label. Each value is a list of code collections:
# integer ranges / lists for whole-number codes, plus one float array for the
# decimal 250.xx codes. (The numbering looks like ICD-9 - confirm against the
# data dictionary.)
diagnosis_group = {
    "Circulatory": [range(390, 460), [785]],
    "Respiratory": [range(460, 520), [786]],
    "Digestive": [range(520, 580), [787]],
    "Diabetes": [np.arange(250, 251, .01), [250]],
    "Injury": [range(800, 1000)],
    "Musculoskeletal": [range(710, 740)],
    "Genitourinary": [range(580, 630), [788]],
    "Neoplasms": [range(140, 240)],
    # NOTE: 140-239 must NOT be repeated here. "Other" is processed last, so
    # listing that range again would overwrite every "Neoplasms" entry.
    "Other": [
        [780, 781, 784], range(790, 800),
        range(240, 250), range(251, 280),
        range(680, 710), [782],
        range(0, 140),
        range(290, 320),
        range(280, 290),
        range(320, 360),
        range(630, 680),
        range(360, 390),
        range(740, 760),
    ],
}

# Flatten the table into a {code string -> group label} lookup.
reverse_index = {}
for label, cases in diagnosis_group.items():
    for codes in cases:
        # Float arrays hold decimal codes; everything else holds integers.
        is_decimal = isinstance(codes, np.ndarray)
        for code in codes:
            if is_decimal:
                # Two decimals, then drop trailing zeros and a dangling dot:
                # 250.10 -> "250.1", 250.00 -> "250" (never "250.").
                key = f"{code:3.2f}".rstrip("0").rstrip(".")
            else:
                key = str(code)
            # A code claimed by two different groups would be silently
            # relabelled by whichever group comes last; fail loudly instead.
            previous = reverse_index.get(key, label)
            if previous != label:
                raise ValueError(
                    f"code {key!r} is listed under both {previous!r} and {label!r}"
                )
            reverse_index[key] = label

# Letter-prefixed codes all go to "Other".
# E codes are written without padding ("E0" .. "E999").
reverse_index.update({f"E{number}": "Other" for number in range(1000)})
# V codes are zero-padded to two digits ("V00" .. "V99").
reverse_index.update({f"V{number:02d}": "Other" for number in range(100)})

# Hand-placed exceptions, applied last so they win over anything set before.
reverse_index.update(
    {
        "365.44": "Other",
        "783": "Unmapped",
        "789": "Unmapped",
    }
)

In [22]:
# Pool the distinct codes of the three diagnosis columns, column by column
# (a code present in several columns therefore appears more than once).
lll = [
    code
    for column in ("diag_1", "diag_2", "diag_3")
    for code in df[column].unique().tolist()
]
# Ad-hoc probe kept for reference:
# [k for k in lll if k.startswith("V")]

In [23]:
# For each diagnosis column add a categorical "<col>_group" column holding the
# group label that reverse_index assigns to the code.
for col in ["diag_1", "diag_2", "diag_3"]:
    col_name = f"{col}_group"
    # Series.map performs the dictionary lookup for the whole column at once;
    # codes without an entry come back as NaN instead of raising mid-way.
    groups = df[col].map(reverse_index)
    unmapped = df.loc[groups.isna(), col].unique().tolist()
    if unmapped:
        # Same exception type as a plain dict lookup, but the message names the
        # column and every offending code rather than only the first one hit.
        raise KeyError(f"{col}: no diagnosis group for codes {unmapped}")
    df[col_name] = groups.astype("category")

In [24]:
# Show the final shape and column list as a last check before writing.
display(df.shape, df.columns)

# Persist the augmented frame under one base name in two formats:
# CSV (without the index column) and parquet.
filename = "diabetic_data_cleaned_pca_som_diag_grouped"

df.to_csv(filename + ".csv", index=False)
df.to_parquet(filename + ".parquet")


(100244, 41)

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'metformin', 'repaglinide', 'glimepiride', 'glipizide', 'glyburide',
       'pioglitazone', 'rosiglitazone', 'insulin', 'glyburide-metformin',
       'change', 'diabetesMed', 'readmitted', 'PC1', 'PC2', 'PC3',
       'som_jaccard_0', 'som_jaccard_1', 'som_euc_0', 'som_euc_1',
       'diag_1_group', 'diag_2_group', 'diag_3_group'],
      dtype='object')