# Imports

In [1]:
import json
import pandas as pd
import matplotlib as mpl
from matplotlib import pyplot as plt
from sklearn import preprocessing as preproc
from sklearn.decomposition import PCA
import numpy as np

from sklearn.metrics import mean_squared_error

# import tqdm to notebook
from tqdm.notebook import tqdm

# https://stackoverflow.com/questions/52285104/3d-scatterplots-with-hue-colormap-and-legend
from mpl_toolkits.mplot3d import Axes3D

import seaborn as sns

# Detect whether the notebook runs on Google Colab. When the google.colab
# package is importable, Google Drive is mounted at /content/drive and
# COLAB is set to True; otherwise COLAB is set to False.
try:
  from google.colab import drive
  drive.mount('/content/drive')

  COLAB = True
except ModuleNotFoundError:
  # Raised when a module cannot be imported (presumably google.colab when not on Colab).
  COLAB = False

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from minisom import MiniSom

import tensorflow as tf
from tensorflow import keras


import matplotlib.pyplot as plt
from matplotlib.patches import RegularPolygon, Ellipse
from mpl_toolkits.axes_grid1 import make_axes_locatable
from matplotlib import cm, colorbar
from matplotlib.lines import Line2D

# NOTE: nothing in this notebook forces TensorFlow onto the CPU; it uses the GPU when one is available

from datetime import datetime

%matplotlib inline

2024-03-25 23:42:39.531811: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-25 23:42:39.556241: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Load Data

In [2]:
# Read the cleaned dataset (already enriched with the PCA / SOM features).
df = pd.read_parquet("diabetic_data_cleaned_pca_som.parquet")

# These id columns do not come back from parquet as categoricals, so cast
# them explicitly in a single pass.
_id_columns = ("admission_type_id", "discharge_disposition_id", "admission_source_id")
df = df.astype({col: "category" for col in _id_columns})

# Quick sanity overview: dimensions, dtypes, and the first two rows (transposed).
for _overview in (df.shape, df.dtypes, df.head(2).T):
    display(_overview)

(100244, 36)

race                        category
gender                      category
age                         category
admission_type_id           category
discharge_disposition_id    category
admission_source_id         category
time_in_hospital               int64
num_lab_procedures             int64
num_procedures                 int64
num_medications                int64
number_outpatient              int64
number_emergency               int64
number_inpatient               int64
diag_1                      category
diag_2                      category
diag_3                      category
number_diagnoses               int64
metformin                   category
repaglinide                 category
glimepiride                 category
glipizide                   category
glyburide                   category
pioglitazone                category
rosiglitazone               category
insulin                     category
glyburide-metformin         category
change                          bool
d

Unnamed: 0,0,1
race,Caucasian,AfricanAmerican
gender,Female,Female
age,[10-20),[20-30)
admission_type_id,1,1
discharge_disposition_id,1,1
admission_source_id,7,7
time_in_hospital,3,2
num_lab_procedures,59,11
num_procedures,0,5
num_medications,18,13


In [3]:
# Load the data dictionary that lists the column roles.
# The encoding is given explicitly so the file is decoded the same way on
# every platform (the default text encoding is locale-dependent).
with open("data_dictionary.json", "r", encoding="utf-8") as f:
    data_info = json.load(f)

# NUM_COLUMNS = data_info["Numeric Columns"]
NOM_COLUMNS = data_info["Nominal Columns"]  # nominal categorical column names
ORD_COLUMNS = data_info["Ordinal Columns"]  # ordinal categorical column names
TGT = data_info["Target"]                   # target entry from the data dictionary



# Auto Encoders

In [4]:
CAT_COLUMNS = NOM_COLUMNS + ORD_COLUMNS

# Select only the categorical columns and cast them all to `category` in one
# step. `astype` returns a new frame, so `df` is left untouched, no full copy
# of `df` is needed, and no column is assigned on a sliced frame (which can
# raise SettingWithCopyWarning).
df_temp = df[CAT_COLUMNS].astype("category")

# One indicator column per level of every categorical column (no level dropped).
ohe = OneHotEncoder(drop=None, sparse_output=False)
temp = ohe.fit_transform(df_temp)

temp.shape

(100244, 2355)

In [5]:
# The target is only used to stratify the split; the auto-encoder never sees it.
y = df[TGT]

# A fixed random_state makes the split reproducible under Restart & Run All
# (the global NumPy/TensorFlow seeds are only set in a later cell).
# Only `temp` is split, so IPython's `_` is no longer overwritten.
X_train, X_test = train_test_split(temp, test_size=0.2, stratify=y, random_state=42)

X_train.shape, X_test.shape

((80195, 2355), (20049, 2355))

In [6]:
def train_autoenc(train_data, df_ref, latent_vars, learning_rate, epochs):
    """Build, train and persist a multi-head categorical auto-encoder.

    The network compresses the one-hot encoded input down to ``latent_vars``
    units (layer ``"encoder_out"``) and reconstructs it through one softmax
    head per column of ``df_ref`` (layers ``"out_<column>"``).

    Parameters
    ----------
    train_data : 2-D array
        One-hot encoded training matrix. Its columns must be the
        concatenation, in ``df_ref`` column order, of the one-hot blocks of
        each column of ``df_ref``.
    df_ref : DataFrame
        Frame whose columns define the output heads. The width of each head
        is the number of distinct values in the column, plus one when the
        column contains missing values.
    latent_vars : int
        Number of units in the bottleneck layer.
    learning_rate : float
        Initial learning rate of the exponential-decay schedule.
    epochs : int
        Number of training epochs.

    Returns
    -------
    keras.Model
        The trained auto-encoder.

    Raises
    ------
    ValueError
        If the summed widths of the output heads differ from the number of
        columns in ``train_data``.

    Side effects
    ------------
    Writes TensorBoard logs under ``logs/fit/<run_id>``, the training history
    to ``auto_encoder_<run_id>.json`` and the model to
    ``auto_encoder_<run_id>.h5``.
    """
    # Work out the width of every output head before touching Keras, so an
    # inconsistent input is reported early with a clear message.
    columns = list(df_ref)
    out_sizes = []
    for c in columns:
        out_num = int(df_ref[c].nunique())
        if df_ref[c].isna().sum() > 0:
            # Missing values get their own slot in the head.
            out_num += 1
        out_sizes.append(out_num)

    n_features = train_data.shape[1]
    if sum(out_sizes) != n_features:
        raise ValueError(
            f"train_data has {n_features} columns but the output heads derived "
            f"from df_ref add up to {sum(out_sizes)}"
        )

    # The input width follows the data instead of being hard-coded.
    input_shape = (n_features,)

    inp = keras.Input(shape=input_shape)

    enc = keras.layers.Dense(400, activation="relu")(inp)
    enc = keras.layers.Dense(150, activation="relu")(enc)
    enc = keras.layers.Dense(latent_vars, activation="relu", name="encoder_out")(enc)

    dec = keras.layers.Dense(latent_vars, activation="relu")(enc)
    dec = keras.layers.Dense(150, activation="relu")(dec)
    dec = keras.layers.Dense(400, activation="relu")(dec)

    # One softmax head per categorical column, created in df_ref column order.
    outputs = [
        keras.layers.Dense(out_num, activation="softmax", name=f"out_{c}")(dec)
        for c, out_num in zip(columns, out_sizes)
    ]

    auto_encoder = keras.Model(inp, outputs)
    auto_encoder.summary()

    # Staircase decay: the rate is multiplied by 0.98 every 1500 optimizer
    # steps, so it takes about 114 decays to fall to 10% of the initial value.
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        learning_rate,
        decay_steps=1500,
        decay_rate=0.98,
        staircase=True)

    opt = keras.optimizers.Adam(learning_rate=lr_schedule)

    losses = {f"out_{c}": "categorical_crossentropy" for c in columns}
    metrics = {f"out_{c}": ["accuracy"] for c in columns}

    auto_encoder.compile(optimizer=opt, loss=losses, metrics=metrics)
    run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = "logs/fit/" + run_id

    print(log_dir)
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

    # Cut the one-hot matrix into one target block per head, in head order.
    targets = []
    idx = 0
    for out_num in out_sizes:
        targets.append(train_data[:, idx:idx + out_num])
        idx += out_num

    hist = auto_encoder.fit(train_data, targets,
                            epochs=epochs, batch_size=64, shuffle=True,
                            callbacks=[tensorboard_callback],
                            verbose=2)

    # Copy the history so the run metadata does not leak into the Keras
    # History object.
    d = dict(hist.history)
    d["run_id"] = run_id
    d["latent_variables"] = latent_vars
    d["learning_rate"] = learning_rate
    d["epochs"] = epochs
    with open(f"auto_encoder_{run_id}.json", "w") as f:
        json.dump(d, f, indent=4)

    # Save the model once the JSON file handle has been closed.
    auto_encoder.save(f"auto_encoder_{run_id}.h5")

    return auto_encoder


In [7]:
# Seed TensorFlow and NumPy before building and training the model.
RANDOM_SEED = 42
tf.random.set_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Train on the one-hot training split; df_temp defines the output heads.
ae = train_autoenc(
    train_data=X_train,
    df_ref=df_temp,
    latent_vars=15,
    learning_rate=0.001,
    epochs=150,
)

2024-03-25 23:42:41.251675: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-03-25 23:42:41.263857: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-03-25 23:42:41.264076: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

logs/fit/20240325-234241
Epoch 1/150


I0000 00:00:1711410165.579862  491916 service.cc:145] XLA service 0x7d310c0094f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1711410165.579884  491916 service.cc:153]   StreamExecutor device (0): NVIDIA GeForce RTX 4090, Compute Capability 8.9
2024-03-25 23:42:45.677609: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-03-25 23:42:46.044902: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907















I0000 00:00:1711410170.650188  491916 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.






1254/1254 - 17s - 14ms/step - loss: 16.5669 - out_admission_source_id_accuracy: 0.8481 - out_admission_type_id_accuracy: 0.8252 - out_age_accuracy: 0.7180 - out_change_accuracy: 0.9809 - out_diabetesMed_accuracy: 0.9890 - out_diag_1_accuracy: 0.1889 - out_diag_2_accuracy: 0.1803 - out_diag_3_accuracy: 0.1721 - out_discharge_disposition_id_accuracy: 0.7690 - out_gender_accuracy: 0.9411 - out_glimepiride_accuracy: 0.9466 - out_glipizide_accuracy: 0.9395 - out_glyburide-metformin_accuracy: 0.9931 - out_glyburide_accuracy: 0.9417 - out_insulin_accuracy: 0.8909 - out_metformin_accuracy: 0.9373 - out_pioglitazone_accuracy: 0.9523 - out_race_accuracy: 0.9132 - out_repaglinide_accuracy: 0.9839 - out_rosiglitazone_accuracy: 0.9405
Epoch 2/150
1254/1254 - 4s - 4ms/step - loss: 11.4282 - out_admission_source_id_accuracy: 0.9088 - out_admission_type_id_accuracy: 0.9086 - out_age_accuracy: 0.9372 - out_change_accuracy: 0.9949 - out_diabetesMed_accuracy: 0.9982 - out_diag_1_accuracy: 0.3459 - out_di



In [8]:
# Reconstruct the training split; `preds` holds one array per output head.
preds = ae.predict(X_train)

# Overall reconstruction error over the concatenated head outputs.
concat_preds = np.concatenate(preds, axis=1)
mse = mean_squared_error(X_train, concat_preds)
print(f"MSE Train: {mse:2.4f}")

# Per-column accuracy: compare the arg-max of each one-hot block with the
# arg-max of the matching head. Done for all rows at once instead of a
# Python loop over every row.
lower = 0
outs = [o.shape[1] for o in ae.outputs]
for i, upper in enumerate(outs):
    pred = preds[i]
    print(f"Prediction {ohe.feature_names_in_[i]}: {pred.shape}", end=" ")

    true_idx = np.argmax(X_train[:, lower:lower + upper], axis=1)
    pred_idx = np.argmax(pred[:, 0:upper], axis=1)
    acc = np.mean(true_idx == pred_idx)
    print(f"Accuracy: {acc:2.2%}")
    lower += upper




[1m2507/2507[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 936us/step
MSE Train: 0.0000
Prediction race: (80195, 5) Accuracy: 99.99%
Prediction gender: (80195, 2) Accuracy: 100.00%
Prediction diag_1: (80195, 715) Accuracy: 99.83%
Prediction diag_2: (80195, 743) Accuracy: 99.58%
Prediction diag_3: (80195, 789) Accuracy: 99.64%
Prediction change: (80195, 2) Accuracy: 99.99%
Prediction diabetesMed: (80195, 2) Accuracy: 100.00%
Prediction admission_type_id: (80195, 8) Accuracy: 99.98%
Prediction discharge_disposition_id: (80195, 26) Accuracy: 99.97%
Prediction admission_source_id: (80195, 17) Accuracy: 99.90%
Prediction metformin: (80195, 4) Accuracy: 99.99%
Prediction repaglinide: (80195, 4) Accuracy: 99.91%
Prediction glimepiride: (80195, 4) Accuracy: 99.95%
Prediction glipizide: (80195, 4) Accuracy: 99.98%
Prediction glyburide: (80195, 4) Accuracy: 99.89%
Prediction pioglitazone: (80195, 4) Accuracy: 99.97%
Prediction rosiglitazone: (80195, 4) Accuracy: 99.99%
Prediction insuli

In [9]:
# Reconstruct the held-out split; `preds` holds one array per output head.
preds = ae.predict(X_test)

# Overall reconstruction error over the concatenated head outputs.
concat_preds = np.concatenate(preds, axis=1)
mse = mean_squared_error(X_test, concat_preds)
print(f"MSE Test: {mse:2.4f}")

# Per-column accuracy: compare the arg-max of each one-hot block with the
# arg-max of the matching head. Done for all rows at once instead of a
# Python loop over every row.
lower = 0
outs = [o.shape[1] for o in ae.outputs]
for i, upper in enumerate(outs):
    pred = preds[i]
    print(f"Prediction {ohe.feature_names_in_[i]}: {pred.shape}", end=" ")

    true_idx = np.argmax(X_test[:, lower:lower + upper], axis=1)
    pred_idx = np.argmax(pred[:, 0:upper], axis=1)
    acc = np.mean(true_idx == pred_idx)
    print(f"Accuracy: {acc:2.2%}")
    lower += upper

[1m597/627[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 590us/step




[1m627/627[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
MSE Test: 0.0008
Prediction race: (20049, 5) Accuracy: 99.23%
Prediction gender: (20049, 2) Accuracy: 99.94%
Prediction diag_1: (20049, 715) Accuracy: 73.55%
Prediction diag_2: (20049, 743) Accuracy: 70.73%
Prediction diag_3: (20049, 789) Accuracy: 71.56%
Prediction change: (20049, 2) Accuracy: 99.63%
Prediction diabetesMed: (20049, 2) Accuracy: 99.91%
Prediction admission_type_id: (20049, 8) Accuracy: 98.24%
Prediction discharge_disposition_id: (20049, 26) Accuracy: 97.18%
Prediction admission_source_id: (20049, 17) Accuracy: 96.87%
Prediction metformin: (20049, 4) Accuracy: 99.19%
Prediction repaglinide: (20049, 4) Accuracy: 99.07%
Prediction glimepiride: (20049, 4) Accuracy: 99.18%
Prediction glipizide: (20049, 4) Accuracy: 99.22%
Prediction glyburide: (20049, 4) Accuracy: 98.81%
Prediction pioglitazone: (20049, 4) Accuracy: 98.95%
Prediction rosiglitazone: (20049, 4) Accuracy: 99.60%
Prediction insulin: (200

In [10]:
# Sub-model that maps the auto-encoder input to the "encoder_out" (bottleneck) layer.
latent_layer = ae.get_layer("encoder_out")
encoder = keras.Model(inputs=ae.input, outputs=latent_layer.output)

In [11]:
# Encode every row of the one-hot matrix into the latent space.
ae_vars = encoder.predict(temp)

# Reuse df's index: `temp` was built row-for-row from `df`, and the later
# `df.join(df_ae)` aligns on the index. A default RangeIndex would silently
# misalign (and produce NaNs) if `df` carried any other index.
df_ae = pd.DataFrame(
    ae_vars,
    index=df.index,
    columns=[f"ae_temp_{i}" for i in range(ae_vars.shape[1])],
)

# Keep only latent variables that actually vary; a column whose max equals
# its min is constant over the whole dataset.
value_range = df_ae.max() - df_ae.min()
final_vars = [c for c in df_ae.columns if value_range[c] != 0]

# Renumber the surviving variables consecutively: ae_0, ae_1, ...
df_ae = df_ae[final_vars].rename(columns={c: f"ae_{i}" for i, c in enumerate(final_vars)})

[1m3133/3133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 573us/step


In [12]:
# Append the auto-encoder features to the original frame (index-aligned join)
# and show summary statistics of the numeric columns.
df_final = df.join(df_ae)
display(df_final.describe().T)

# Persist the enriched dataset as CSV (without the index) and as parquet,
# both under the same base name.
filename = "diabetic_data_cleaned_pca_som_ae"
csv_path, parquet_path = f"{filename}.csv", f"{filename}.parquet"

df_final.to_csv(csv_path, index=False)
df_final.to_parquet(parquet_path)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
time_in_hospital,100244.0,4.420275,2.991273,1.0,2.0,4.0,6.0,14.0
num_lab_procedures,100244.0,43.17372,19.691484,1.0,32.0,44.0,57.0,132.0
num_procedures,100244.0,1.350275,1.710913,0.0,0.0,1.0,2.0,6.0
num_medications,100244.0,16.11782,8.116424,1.0,11.0,15.0,20.0,81.0
number_outpatient,100244.0,0.372561,1.274074,0.0,0.0,0.0,0.0,42.0
number_emergency,100244.0,0.1990144,0.933541,0.0,0.0,0.0,0.0,76.0
number_inpatient,100244.0,0.6394697,1.262773,0.0,0.0,0.0,1.0,21.0
number_diagnoses,100244.0,7.497007,1.839537,3.0,6.0,8.0,9.0,16.0
PC1,100244.0,-5.0910510000000005e-17,0.299783,-0.400778,-0.232916,-0.088537,0.155953,1.159058
PC2,100244.0,-6.268567e-17,0.230848,-0.741977,-0.142057,-0.017638,0.132432,0.844482


## Conclusion

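This notebook ends by adding 11 new numeric columns to the dataset (15 latent units were trained; units whose output is constant over the whole dataset are dropped). The auto-encoder replaces the 20 original categorical variables (2,355 one-hot levels in total) with these 11 numerical variables. It is an unsupervised learning technique, since the goal is only to recreate the original data (without the target!) from the 11 latent variables. The result is excellent: in the run recorded above, reconstruction accuracy is close to 100% on the training split; on the held-out split it stays above roughly 96% for the low-cardinality variables shown, while the high-cardinality `diag_*` columns reach about 71–74%. This set of 11 variables can be used in place of the original categorical variables if you want (the same applies to SOM_JAC and SOM_EUC).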