# Machine Learning-Based Prediction of 30-Day Hospital Readmission in Diabetic Patients

## Feature Engineering

In [156]:
import pandas as pd
import numpy as np

from scipy.stats import chi2_contingency

In [157]:
# Load the raw extract; the relative path is resolved against the current working directory.
df = pd.read_csv("./data/raw/day-26.csv")

In [158]:
# (rows, columns) of the freshly loaded frame.
df.shape

(101763, 46)

In [159]:
# Column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101763 entries, 0 to 101762
Data columns (total 46 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   race                      101763 non-null  object
 1   gender                    101763 non-null  object
 2   age                       101763 non-null  object
 3   admission_type            101763 non-null  object
 4   discharge_disposition     101763 non-null  object
 5   admission_source          101763 non-null  object
 6   time_in_hospital          101763 non-null  int64 
 7   payer_code                101763 non-null  object
 8   medical_specialty         101763 non-null  object
 9   num_lab_procedures        101763 non-null  int64 
 10  num_procedures            101763 non-null  int64 
 11  num_medications           101763 non-null  int64 
 12  number_outpatient         101763 non-null  int64 
 13  number_emergency          101763 non-null  int64 
 14  numb

### Convert Target Column

In [160]:
# Class distribution of the raw target column before it is collapsed to a binary label.

print(df['readmitted'].value_counts())

readmitted
NO     54861
>30    35545
<30    11357
Name: count, dtype: int64


In [161]:
# Binary target: 1 when the patient was readmitted within 30 days ('<30'),
# 0 for every other value of `readmitted`.
is_early_readmit = df['readmitted'] == '<30'
df['readmitted_binary'] = is_early_readmit.astype('int64')

In [162]:
# Verify conversion: the count for class 1 should equal the '<30' count of the raw column.
df['readmitted_binary'].value_counts()

readmitted_binary
0    90406
1    11357
Name: count, dtype: int64

In [163]:
# Same distribution as proportions, to quantify the class imbalance of the target.
df['readmitted_binary'].value_counts(normalize=True)

readmitted_binary
0    0.888398
1    0.111602
Name: proportion, dtype: float64

### Convert Drug Columns to Numeric

In [164]:
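# Recode every drug column to a binary "prescribed" flag:
# No      → 0
# Steady  → 1
# Up      → 1
# Down    → 1

# Rationale: the question that matters here is whether the drug was given at all,
# not whether its dose was changed.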

In [165]:
# Positional slice: the 23 columns at positions 20-42 are taken to be the medication columns.
# NOTE(review): this depends on the column order of the CSV — confirm that the slice
# contains exactly the drug columns (later cells require 'insulin' to be among them).
drug_cols = df.columns[20:43]

In [166]:
# Collapse each drug column to "was the drug prescribed?":
# 'No' -> 0, any of 'Steady' / 'Up' / 'Down' -> 1.
# The 0/1 identity entries keep the cell harmless when it is re-run on
# already-encoded columns.
drug_flag_map = {'No': 0, 'Steady': 1, 'Up': 1, 'Down': 1, 0: 0, 1: 1}

# Map column by column and cast explicitly instead of relying on the implicit
# downcasting of DataFrame.replace, which is deprecated (FutureWarning). Without
# an explicit cast the columns can stay `object`, and every sum derived from
# them would then be non-numeric as well.
drug_flags = df[drug_cols].apply(lambda col: col.map(drug_flag_map))

# Fail loudly on any value outside the known levels rather than letting it
# propagate as NaN / text into the features computed from these columns.
unmapped = drug_flags.isna().sum()
assert not unmapped.any(), (
    f"Unmapped drug values in columns: {list(unmapped[unmapped > 0].index)}"
)

df[drug_cols] = drug_flags.astype('int64')

  df[drug_cols] = df[drug_cols].replace({


### Create Powerful Features

#### Feature 1: Total Diabetes Drugs Given

In [167]:
# Number of different diabetes drugs the patient is taking:
# row-wise sum over the 0/1 drug flags.
drug_flags_by_patient = df[drug_cols]
df['total_diabetes_drugs'] = drug_flags_by_patient.sum(axis=1)

#### Feature 2: Insulin Flag

In [168]:
# Keep the insulin column under an explicit flag name; the original drug columns
# (including 'insulin') are dropped later in the notebook.
df['insulin_flag'] = df['insulin']

# Insulin use is treated here as an important indicator of severity.

#### Feature 3: Oral Medication Flag

In [169]:
# Every drug column except insulin counts as an oral medication.
oral_drugs = drug_cols.drop('insulin')

# Flag is 1 when the row-wise sum of oral drug flags is positive, otherwise 0.
oral_drug_count = df[oral_drugs].sum(axis=1)
df['oral_med_flag'] = (oral_drug_count > 0).astype('int64')

### Medication Change Behavior

In [170]:
# Inspect the levels of `change` before converting it from categorical to numeric.

df['change'].value_counts()

change
No    54754
Ch    47009
Name: count, dtype: int64

In [171]:
# 'No' -> 0, 'Ch' -> 1; any other value becomes NaN under Series.map.
change_codes = {'No': 0, 'Ch': 1}
df['change'] = df['change'].map(change_codes)

In [172]:
# Counts after encoding; they should match the 'No' / 'Ch' counts shown before the mapping.
df['change'].value_counts()

change
0    54754
1    47009
Name: count, dtype: int64

In [173]:
# Inspect the levels of `diabetesMed` before converting it from categorical to numeric.

df['diabetesMed'].value_counts()

diabetesMed
Yes    78361
No     23402
Name: count, dtype: int64

In [174]:
# 'Yes' -> 1, 'No' -> 0; any other value becomes NaN under Series.map.
diabetes_med_codes = {"Yes": 1, "No": 0}
df['diabetesMed'] = df['diabetesMed'].map(diabetes_med_codes)

In [175]:
# Counts after encoding; they should match the 'Yes' / 'No' counts shown before the mapping.
df['diabetesMed'].value_counts()

diabetesMed
1    78361
0    23402
Name: count, dtype: int64

### Create new column as is_polypharmacy

In [176]:
# A patient is flagged as polypharmacy when num_medications is at least this threshold.
POLYPHARMACY_MIN_MEDICATIONS = 10

meets_threshold = df['num_medications'] >= POLYPHARMACY_MIN_MEDICATIONS
df['is_polypharmacy'] = meets_threshold.astype('int64')

### Drop the 24 Original Columns (23 drug columns + `readmitted`)

In [177]:
# The individual drug columns are now summarised by the engineered features; remove them.
df = df.drop(columns=drug_cols)

In [178]:
# The raw target is superseded by `readmitted_binary`; remove it.
df = df.drop(columns='readmitted')

In [179]:
# Shape after adding the engineered columns and dropping the drug columns and raw target.
df.shape

(101763, 27)

### Handling age column

In [180]:
# Age is stored as bracket labels; inspect the levels before converting them to numbers.
df['age'].value_counts()

age
[70-80)     26066
[60-70)     22482
[50-60)     17256
[80-90)     17197
[40-50)      9685
[30-40)      3775
[90-100)     2793
[20-30)      1657
[10-20)       691
[0-10)        161
Name: count, dtype: int64

In [181]:
# Replace each age bracket such as '[70-80)' by its midpoint (75): take the
# lower bound of the bracket and add half of the 10-year bracket width.
# The pattern is a raw string so that '\d' is not parsed as an invalid string
# escape (SyntaxWarning), and expand=False yields a Series instead of a
# one-column DataFrame. The dtype is pinned to int64 on every platform.
age_lower_bound = df['age'].str.extract(r'(\d+)', expand=False)
df['age'] = age_lower_bound.astype('int64') + 5

  df['age'] = df['age'].str.extract('(\d+)').astype(int) + 5


### Checking payer_code column

In [182]:
# Apply a chi-square test and compute Cramér's V for payer_code to measure the strength of its association with the target column

In [183]:
# Number of distinct payer codes.
df['payer_code'].nunique()

18

In [184]:
# Contingency table: payer code (rows) against the binary target (columns)
contingency_table = pd.crosstab(
    index=df['payer_code'],
    columns=df['readmitted_binary'],
)

# Chi-square test of independence; the statistic is reused in the next cell
test_result = chi2_contingency(contingency_table)
chi2, p, dof, ex = test_result
print(f"P-value: {p}")

P-value: 2.388095995508881e-13


In [185]:
# Cramér's V = sqrt(chi2 / (n * (min(rows, cols) - 1))),
# using the `chi2` statistic and `contingency_table` from the previous cell.
n = len(df)  # total observations
n_rows, n_cols = contingency_table.shape
min_dim = min(n_rows, n_cols) - 1
cramers_v = np.sqrt(chi2 / (n * min_dim))

print(f"Cramér's V (Strength): {cramers_v}")

Cramér's V (Strength): 0.030982086888879927


In [186]:
# Try grouping the payer_code categories into broader payer types

In [187]:
# Payer groups, written as group -> member codes and then inverted into the
# code -> group lookup that Series.map expects.
payer_groups = {
    'Public': ['MC', 'MD'],
    'Self-Pay': ['SP', 'UN'],
    'Private': [
        'BC', 'HM', 'CP', 'CM',
        'OG', 'PO', 'DM', 'CH',
        'WC', 'OT', 'MP', 'SI',
    ],
    'Unknown': ['Unknown', 'FR'],
}
payer_map = {
    code: group
    for group, codes in payer_groups.items()
    for code in codes
}

# Codes that are absent from payer_map become NaN.
df['payer_grouped'] = df['payer_code'].map(payer_map)

In [188]:
# Re-run the chi-square test and Cramér's V on the grouped column

In [189]:
# Contingency table: grouped payer type (rows) against the binary target (columns)
contingency_table = pd.crosstab(df['payer_grouped'], df['readmitted_binary'])

# Chi-square test of independence
chi2, p, dof, ex = chi2_contingency(contingency_table)
print(f"P-value: {p}")

# n must be the number of observations that actually entered the table.
# crosstab leaves out rows whose payer_grouped is NaN (codes missing from the
# mapping), so df.shape[0] would overstate n and understate Cramér's V.
# When every row is mapped, this equals df.shape[0].
n = contingency_table.to_numpy().sum()
min_dim = min(contingency_table.shape) - 1
cramers_v = np.sqrt(chi2 / (n * min_dim))

print(f"Cramér's V (Strength): {cramers_v}")

P-value: 6.775612174625376e-14
Cramér's V (Strength): 0.02515431148217195


In [190]:
# Grouping did not strengthen the association (Cramér's V stayed at about 0.03),
# so both the original and the grouped payer columns are dropped.
payer_columns = ['payer_code', 'payer_grouped']
df = df.drop(columns=payer_columns)

### Cleaning A1Cresult

In [191]:
# Inspect the levels of A1Cresult before encoding them.
df['A1Cresult'].value_counts()

A1Cresult
NotTested    84745
>8            8216
Norm          4990
>7            3812
Name: count, dtype: int64

In [192]:
# converting it to ordinal

# NotTested → 0
# Norm → 1
# >7 → 2
# >8 → 3

In [193]:
# Ordinal encoding: the position in this list is the code assigned to the level
# (NotTested=0, Norm=1, >7=2, >8=3).
a1c_levels_in_order = ["NotTested", "Norm", ">7", ">8"]
A1Cresult_map = {level: code for code, level in enumerate(a1c_levels_in_order)}

df['A1Cresult'] = df['A1Cresult'].map(A1Cresult_map)

In [194]:
# Counts after encoding; they should match the per-level counts shown before the mapping.
df['A1Cresult'].value_counts()

A1Cresult
0    84745
3     8216
1     4990
2     3812
Name: count, dtype: int64

### Checking Categorical Columns

In [214]:
# Cardinality check for diag_1 ahead of one-hot encoding.
df['diag_1'].nunique()

11

In [215]:
# Cardinality check for diag_2 ahead of one-hot encoding.
df['diag_2'].nunique()

11

In [216]:
# Cardinality check for diag_3 ahead of one-hot encoding.
df['diag_3'].nunique()

11

In [None]:
# Cardinality check for admission_type ahead of one-hot encoding.
df['admission_type'].nunique()

5

In [None]:
# Cardinality check for discharge_disposition ahead of one-hot encoding.
df['discharge_disposition'].nunique()

16

In [None]:
# Cardinality check for admission_source ahead of one-hot encoding.
df['admission_source'].nunique()

9

In [None]:
# Cardinality check for medical_specialty ahead of one-hot encoding.
df['medical_specialty'].nunique()

16

In [None]:
# None of them needs to be grouped; they can easily be encoded with OneHotEncoder now

### Saving clean file before apply encoding

In [218]:
# Persist the engineered, not yet encoded frame; index=False keeps the row index out of the CSV.
df.to_csv("./data/clean/day-28.csv", index=False)

### Making Preprocessor 

In [228]:
from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

import joblib

In [222]:
# Train-Test Split: 80/20, stratified on the target so both splits keep the
# same class proportions; the fixed random_state makes the split reproducible.

target_col = 'readmitted_binary'
y = df[target_col]
X = df.drop(columns=[target_col])

split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [225]:
# Build Preprocessor

# String-valued columns that are one-hot encoded.
categorical_cols = [
    'race', 'gender', 'admission_type',
    'discharge_disposition', 'admission_source',
    'medical_specialty', 'diag_1', 'diag_2', 'diag_3'
]

# Every numeric column is standard-scaled. Selecting on the generic 'number'
# dtype rather than exactly 'int64' also keeps integer columns of another width
# (e.g. int32) and float columns. ColumnTransformer uses remainder='drop' by
# default, so a column that matches neither list is silently discarded.
numeric_cols = X_train.select_dtypes(include='number').columns

# NOTE(review): with drop='first' and handle_unknown='ignore', a category not
# seen during fit is encoded as all zeros, which is the same encoding as the
# dropped first category. Confirm this ambiguity is acceptable downstream.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols)
    ]
)

In [226]:
# Fit on training data only, so that the scaling statistics and the one-hot
# categories are learned without looking at X_test.

preprocessor.fit(X_train)

0,1,2
,"transformers  transformers: list of tuples List of (name, transformer, columns) tuples specifying the transformer objects to be applied to subsets of the data. name : str  Like in Pipeline and FeatureUnion, this allows the transformer and  its parameters to be set using ``set_params`` and searched in grid  search. transformer : {'drop', 'passthrough'} or estimator  Estimator must support :term:`fit` and :term:`transform`.  Special-cased strings 'drop' and 'passthrough' are accepted as  well, to indicate to drop the columns or to pass them through  untransformed, respectively. columns : str, array-like of str, int, array-like of int, array-like of bool, slice or callable  Indexes the data on its second axis. Integers are interpreted as  positional columns, while strings can reference DataFrame columns  by name. A scalar string or int should be used where  ``transformer`` expects X to be a 1d array-like (vector),  otherwise a 2d array will be passed to the transformer.  A callable is passed the input data `X` and can return any of the  above. To select multiple columns by name or dtype, you can use  :obj:`make_column_selector`.","[('num', ...), ('cat', ...)]"
,"remainder  remainder: {'drop', 'passthrough'} or estimator, default='drop' By default, only the specified columns in `transformers` are transformed and combined in the output, and the non-specified columns are dropped. (default of ``'drop'``). By specifying ``remainder='passthrough'``, all remaining columns that were not specified in `transformers`, but present in the data passed to `fit` will be automatically passed through. This subset of columns is concatenated with the output of the transformers. For dataframes, extra columns not seen during `fit` will be excluded from the output of `transform`. By setting ``remainder`` to be an estimator, the remaining non-specified columns will use the ``remainder`` estimator. The estimator must support :term:`fit` and :term:`transform`. Note that using this feature requires that the DataFrame columns input at :term:`fit` and :term:`transform` have identical order.",'drop'
,"sparse_threshold  sparse_threshold: float, default=0.3 If the output of the different transformers contains sparse matrices, these will be stacked as a sparse matrix if the overall density is lower than this value. Use ``sparse_threshold=0`` to always return dense. When the transformed output consists of all dense data, the stacked result will be dense, and this keyword will be ignored.",0.3
,"n_jobs  n_jobs: int, default=None Number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"transformer_weights  transformer_weights: dict, default=None Multiplicative weights for features per transformer. The output of the transformer is multiplied by these weights. Keys are transformer names, values the weights.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each transformer will be printed as it is completed.",False
,"verbose_feature_names_out  verbose_feature_names_out: bool, str or Callable[[str, str], str], default=True - If True, :meth:`ColumnTransformer.get_feature_names_out` will prefix  all feature names with the name of the transformer that generated that  feature. It is equivalent to setting  `verbose_feature_names_out=""{transformer_name}__{feature_name}""`. - If False, :meth:`ColumnTransformer.get_feature_names_out` will not  prefix any feature names and will error if feature names are not  unique. - If ``Callable[[str, str], str]``,  :meth:`ColumnTransformer.get_feature_names_out` will rename all the features  using the name of the transformer. The first argument of the callable is the  transformer name and the second argument is the feature name. The returned  string will be the new feature name. - If ``str``, it must be a string ready for formatting. The given string will  be formatted using two field names: ``transformer_name`` and ``feature_name``.  e.g. ``""{feature_name}__{transformer_name}""``. See :meth:`str.format` method  from the standard library for more info. .. versionadded:: 1.0 .. versionchanged:: 1.6  `verbose_feature_names_out` can be a callable or a string to be formatted.",True
,"force_int_remainder_cols  force_int_remainder_cols: bool, default=False This parameter has no effect. .. note::  If you do not access the list of columns for the remainder columns  in the `transformers_` fitted attribute, you do not need to set  this parameter. .. versionadded:: 1.5 .. versionchanged:: 1.7  The default value for `force_int_remainder_cols` will change from  `True` to `False` in version 1.7. .. deprecated:: 1.7  `force_int_remainder_cols` is deprecated and will be removed in 1.9.",'deprecated'

0,1,2
,"copy  copy: bool, default=True If False, try to avoid a copy and do inplace scaling instead. This is not guaranteed to always work inplace; e.g. if the data is not a NumPy array or scipy.sparse CSR matrix, a copy may still be returned.",True
,"with_mean  with_mean: bool, default=True If True, center the data before scaling. This does not work (and will raise an exception) when attempted on sparse matrices, because centering them entails building a dense matrix which in common use cases is likely to be too large to fit in memory.",True
,"with_std  with_std: bool, default=True If True, scale the data to unit variance (or equivalently, unit standard deviation).",True

0,1,2
,"categories  categories: 'auto' or a list of array-like, default='auto' Categories (unique values) per feature: - 'auto' : Determine categories automatically from the training data. - list : ``categories[i]`` holds the categories expected in the ith  column. The passed categories should not mix strings and numeric  values within a single feature, and should be sorted in case of  numeric values. The used categories can be found in the ``categories_`` attribute. .. versionadded:: 0.20",'auto'
,"drop  drop: {'first', 'if_binary'} or an array-like of shape (n_features,), default=None Specifies a methodology to use to drop one of the categories per feature. This is useful in situations where perfectly collinear features cause problems, such as when feeding the resulting data into an unregularized linear regression model. However, dropping one category breaks the symmetry of the original representation and can therefore induce a bias in downstream models, for instance for penalized linear classification or regression models. - None : retain all features (the default). - 'first' : drop the first category in each feature. If only one  category is present, the feature will be dropped entirely. - 'if_binary' : drop the first category in each feature with two  categories. Features with 1 or more than 2 categories are  left intact. - array : ``drop[i]`` is the category in feature ``X[:, i]`` that  should be dropped. When `max_categories` or `min_frequency` is configured to group infrequent categories, the dropping behavior is handled after the grouping. .. versionadded:: 0.21  The parameter `drop` was added in 0.21. .. versionchanged:: 0.23  The option `drop='if_binary'` was added in 0.23. .. versionchanged:: 1.1  Support for dropping infrequent categories.",'first'
,"sparse_output  sparse_output: bool, default=True When ``True``, it returns a :class:`scipy.sparse.csr_matrix`, i.e. a sparse matrix in ""Compressed Sparse Row"" (CSR) format. .. versionadded:: 1.2  `sparse` was renamed to `sparse_output`",True
,"dtype  dtype: number type, default=np.float64 Desired dtype of output.",<class 'numpy.float64'>
,"handle_unknown  handle_unknown: {'error', 'ignore', 'infrequent_if_exist', 'warn'}, default='error' Specifies the way unknown categories are handled during :meth:`transform`. - 'error' : Raise an error if an unknown category is present during transform. - 'ignore' : When an unknown category is encountered during  transform, the resulting one-hot encoded columns for this feature  will be all zeros. In the inverse transform, an unknown category  will be denoted as None. - 'infrequent_if_exist' : When an unknown category is encountered  during transform, the resulting one-hot encoded columns for this  feature will map to the infrequent category if it exists. The  infrequent category will be mapped to the last position in the  encoding. During inverse transform, an unknown category will be  mapped to the category denoted `'infrequent'` if it exists. If the  `'infrequent'` category does not exist, then :meth:`transform` and  :meth:`inverse_transform` will handle an unknown category as with  `handle_unknown='ignore'`. Infrequent categories exist based on  `min_frequency` and `max_categories`. Read more in the  :ref:`User Guide `. - 'warn' : When an unknown category is encountered during transform  a warning is issued, and the encoding then proceeds as described for  `handle_unknown=""infrequent_if_exist""`. .. versionchanged:: 1.1  `'infrequent_if_exist'` was added to automatically handle unknown  categories and infrequent categories. .. versionadded:: 1.6  The option `""warn""` was added in 1.6.",'ignore'
,"min_frequency  min_frequency: int or float, default=None Specifies the minimum frequency below which a category will be considered infrequent. - If `int`, categories with a smaller cardinality will be considered  infrequent. - If `float`, categories with a smaller cardinality than  `min_frequency * n_samples` will be considered infrequent. .. versionadded:: 1.1  Read more in the :ref:`User Guide `.",
,"max_categories  max_categories: int, default=None Specifies an upper limit to the number of output features for each input feature when considering infrequent categories. If there are infrequent categories, `max_categories` includes the category representing the infrequent categories along with the frequent categories. If `None`, there is no limit to the number of output features. .. versionadded:: 1.1  Read more in the :ref:`User Guide `.",
,"feature_name_combiner  feature_name_combiner: ""concat"" or callable, default=""concat"" Callable with signature `def callable(input_feature, category)` that returns a string. This is used to create feature names to be returned by :meth:`get_feature_names_out`. `""concat""` concatenates encoded feature name and category with `feature + ""_"" + str(category)`.E.g. feature X with values 1, 6, 7 create feature names `X_1, X_6, X_7`. .. versionadded:: 1.3",'concat'


In [230]:
# Save Preprocessor: serialise the fitted ColumnTransformer with joblib next to the clean CSV.

joblib.dump(preprocessor, "./data/clean/preprocessing_pipeline.pkl")

['./data/clean/preprocessing_pipeline.pkl']