In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)

import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.pipeline import Pipeline, FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_selector

In [2]:
# Read the raw hospital readmissions CSV (path is relative to the notebook's
# working directory) into a DataFrame.
raw_csv_path = '../raw_data/hospital_readmissions.csv'
data = pd.read_csv(raw_csv_path)

# Function for cleaning training data

In [113]:
def clean_training_data(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the raw hospital-readmissions frame for training.

    The following steps are applied, in order:

    1. Rows where ``diag_1``, ``diag_2`` or ``diag_3`` equals ``'Missing'``
       are removed.
    2. ``age`` bracket labels are mapped to floats from 0.0 (youngest
       bracket) to 1.0 (``'[100-110)'``) in steps of 0.1. Labels that are not
       in the mapping become NaN.
    3. ``n_outpatient``, ``n_inpatient`` and ``n_emergency`` are bucketed
       into the integers 0, 1 and 2, where 2 stands for every value other
       than 0 or 1 (missing values included).
    4. ``medical_specialty`` and ``glucose_test`` are dropped.

    Args:
        df: Raw data containing at least the columns named above.

    Returns:
        A new, cleaned DataFrame. The input frame is left untouched and the
        surviving rows keep their original index labels.

    Raises:
        KeyError: If any of the required columns is absent from ``df``.
    """
    age_bracket_to_scale = {
        '[0-10)': 0.0,
        # Legacy spelling with a closing square bracket; kept so inputs that
        # matched the previous mapping keep producing the same value.
        '[0-10]': 0.0,
        '[10-20)': 0.1,
        '[20-30)': 0.2,
        '[30-40)': 0.3,
        '[40-50)': 0.4,
        '[50-60)': 0.5,
        '[60-70)': 0.6,
        '[70-80)': 0.7,
        '[80-90)': 0.8,
        '[90-100)': 0.9,
        '[100-110)': 1.0,
    }
    diagnosis_cols = ['diag_1', 'diag_2', 'diag_3']
    visit_count_cols = ['n_outpatient', 'n_inpatient', 'n_emergency']

    # Keep only rows where none of the three diagnosis columns is 'Missing'.
    has_all_diagnoses = (df[diagnosis_cols] != 'Missing').all(axis=1)

    # Explicit copy: the column assignments below must target an independent
    # frame, otherwise pandas without Copy-on-Write emits
    # SettingWithCopyWarning for writes onto a filtered selection.
    cleaned = df.loc[has_all_diagnoses].copy()

    cleaned['age'] = cleaned['age'].map(age_bracket_to_scale)

    for col in visit_count_cols:
        # 0 and 1 are kept as-is; everything else (which maps to NaN) is
        # collapsed into the single bucket 2.
        cleaned[col] = cleaned[col].map({0: 0, 1: 1}).fillna(2).astype(int)

    return cleaned.drop(columns=['medical_specialty', 'glucose_test'])


# Make training data

In [114]:
# Run the cleaning step on the raw frame.
train_data = clean_training_data(data)

# Plain-text summary: the column index first, then the (rows, columns) shape.
for summary in (train_data.columns, train_data.shape):
    print(summary)

# Rich preview of the first two cleaned rows.
display(train_data.head(n=2))

Index(['age', 'time_in_hospital', 'n_lab_procedures', 'n_procedures',
       'n_medications', 'n_outpatient', 'n_inpatient', 'n_emergency', 'diag_1',
       'diag_2', 'diag_3', 'A1Ctest', 'change', 'diabetes_med', 'readmitted'],
      dtype='object')
(24779, 15)


Unnamed: 0,age,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,diag_1,diag_2,diag_3,A1Ctest,change,diabetes_med,readmitted
0,0.7,8,72,1,18,2,0,0,Circulatory,Respiratory,Other,no,no,yes,no
1,0.7,3,34,2,13,0,0,0,Other,Other,Other,no,no,yes,no


# Pre-Processing Blocks

In [115]:
# Numeric branch: min-max scale every selected column.
num_preproc = Pipeline(steps=[('scaler', MinMaxScaler())])

# Categorical branch: dense one-hot encoding; a two-category feature is
# reduced to a single column (drop="if_binary").
cat_preproc = Pipeline(steps=[
    ('ohe', OneHotEncoder(drop="if_binary", sparse_output=False)),
])

# Columns are routed to a branch by dtype rather than by name.
numeric_columns = make_column_selector(dtype_include='number')
categorical_columns = make_column_selector(dtype_include='object')

preproc = ColumnTransformer(
    transformers=[
        ('num_transf', num_preproc, numeric_columns),
        ('cat_transf', cat_preproc, categorical_columns),
    ],
    verbose_feature_names_out=False,  # no 'num_transf__' / 'cat_transf__' prefixes
)
# Ask the transformer to return pandas DataFrames from transform().
preproc = preproc.set_output(transform='pandas')

# Full Pre-Processing Pipeline

In [116]:
# Wrap the column transformer in a pipeline whose only step is
# 'preprocessor'.
preprocessing_steps = [('preprocessor', preproc)]
pipe_preproc = Pipeline(steps=preprocessing_steps)

pipe_preproc

# Fitting pipeline

In [117]:
# Split the cleaned frame into features and target. The target is selected
# with a list of column names, so y_train is a one-column DataFrame.
target_columns = ['readmitted']
y_train = train_data[target_columns]
X_train = train_data.drop(columns=target_columns)

X_train.shape, y_train.shape

((24779, 14), (24779, 1))

In [118]:
# Fit the preprocessing pipeline on the training features only (no target is passed).
pipe_preproc.fit(X_train)


In [123]:
# Apply the fitted pipeline to the training features. The column transformer
# was configured with set_output(transform='pandas'), so the displayed result
# is a DataFrame of scaled numeric and one-hot encoded columns.
X_preproc = pipe_preproc.transform(X_train)
X_preproc

Unnamed: 0,age,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,diag_1_Circulatory,diag_1_Diabetes,diag_1_Digestive,diag_1_Injury,diag_1_Musculoskeletal,diag_1_Other,diag_1_Respiratory,diag_2_Circulatory,diag_2_Diabetes,diag_2_Digestive,diag_2_Injury,diag_2_Musculoskeletal,diag_2_Other,diag_2_Respiratory,diag_3_Circulatory,diag_3_Diabetes,diag_3_Digestive,diag_3_Injury,diag_3_Musculoskeletal,diag_3_Other,diag_3_Respiratory,A1Ctest_high,A1Ctest_no,A1Ctest_normal,change_yes,diabetes_med_yes
0,0.6,0.538462,0.633929,0.166667,0.217949,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.6,0.153846,0.294643,0.333333,0.153846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.2,0.307692,0.392857,0.000000,0.217949,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
3,0.6,0.076923,0.312500,0.000000,0.141026,0.5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
4,0.4,0.000000,0.366071,0.000000,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0.8,1.000000,0.678571,0.166667,0.371795,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
24996,0.8,0.076923,0.580357,0.000000,0.294872,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0
24997,0.6,0.307692,0.098214,0.000000,0.064103,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
24998,0.6,0.076923,0.535714,0.500000,0.179487,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0


# Pickle fitted Preprocessor

In [125]:
import os
import pickle

# Persist the fitted preprocessing pipeline to disk.
# NOTE: pickle files should only ever be loaded from trusted sources, and
# unpickling presumably needs a scikit-learn version compatible with the one
# used here - confirm before deploying.
preprocessor_path = '../preprocessor/preprocessor.pkl'

# open() does not create missing parent directories, so make sure the target
# folder exists before writing (a no-op when it is already there).
os.makedirs(os.path.dirname(preprocessor_path), exist_ok=True)

with open(preprocessor_path, 'wb') as file:
    pickle.dump(pipe_preproc, file)
