In [51]:
import warnings
from sklearn.model_selection import train_test_split
import os
import ast
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport
from scipy.stats import boxcox

sns.set()
sns.set_palette('cividis')

warnings.filterwarnings('ignore')

In [52]:
train_path = os.path.join("..", "data", "input", "train.csv")
test_path = os.path.join("..", "data", "input", "test.csv")

df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

df.set_index("encounter_id", inplace=True)
test_df.set_index("encounter_id", inplace=True)

In [53]:
df.head().T

encounter_id,533253,426224,634063,890610,654194
country,USA,USA,USA,USA,USA
patient_id,70110,29775006,80729253,2919042,84871971
race,Caucasian,AfricanAmerican,Caucasian,AfricanAmerican,Caucasian
gender,Female,Male,Female,Male,Female
age,[70-80),[50-60),[60-70),[60-70),[70-80)
weight,?,?,?,?,?
payer_code,?,?,?,MC,HM
outpatient_visits_in_previous_year,0,0,0,0,1
emergency_visits_in_previous_year,0,0,0,0,0
inpatient_visits_in_previous_year,2,0,1,1,0


## Data exploration


In [54]:
df.describe(include="all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
country,71236.0,1.0,USA,71236.0,,,,,,,
patient_id,71236.0,,,,54302279.330984,38795850.347332,135.0,23396510.25,45305631.0,87558374.25,189502619.0
race,67682.0,6.0,Caucasian,50693.0,,,,,,,
gender,71236.0,3.0,Female,38228.0,,,,,,,
age,67679.0,10.0,[70-80),17359.0,,,,,,,
weight,71236.0,10.0,?,68990.0,,,,,,,
payer_code,71236.0,18.0,?,28201.0,,,,,,,
outpatient_visits_in_previous_year,71236.0,,,,0.369588,1.287469,0.0,0.0,0.0,0.0,42.0
emergency_visits_in_previous_year,71236.0,,,,0.196249,0.910854,0.0,0.0,0.0,0.0,76.0
inpatient_visits_in_previous_year,71236.0,,,,0.640154,1.267271,0.0,0.0,0.0,1.0,21.0


In [55]:
df.duplicated().sum()

0

### Checking NaNs


since we noticed that there are a lot of `?` we are going to replace all with `np.nan`, to see the quantity of missing values


In [56]:
df.replace("?", np.nan, inplace=True)
test_df.replace("?", np.nan, inplace=True)

df.isna().sum()[df.isnull().sum() > 1] / len(df) * 100

race                      7.117188
age                       4.993262
weight                   96.847100
payer_code               39.588130
admission_type            5.202426
medical_specialty        49.022966
discharge_disposition     3.635802
admission_source          6.623056
primary_diagnosis         0.022461
secondary_diagnosis       0.367792
additional_diagnosis      1.415015
glucose_test_result      94.822842
a1c_test_result          83.272503
dtype: float64

In [57]:
print("a1c", df["a1c_test_result"].unique())
print("glucose", df["glucose_test_result"].unique())

a1c [nan 'Norm' '>8' '>7']
glucose [nan '>300' 'Norm' '>200']


according to the documentation provided, the possible values in a1c and glucose tests, contain a "none".

so, the approach that we are taking is to replace the nans of these columns with a "none", since probably both of this columns could have an impact on the target variable


In [58]:
df.loc[:, ["a1c_test_result", "glucose_test_result"]] = df.loc[:, [
    "a1c_test_result", "glucose_test_result"]].replace(np.nan, "none")
test_df.loc[:, ["a1c_test_result", "glucose_test_result"]] = test_df.loc[:, [
    "a1c_test_result", "glucose_test_result"]].replace(np.nan, "none")

print("a1c", df["a1c_test_result"].unique())
print("glucose", df["glucose_test_result"].unique())

a1c ['none' 'Norm' '>8' '>7']
glucose ['none' '>300' 'Norm' '>200']


values that are not common in the feature, we will replace with np.nan


In [59]:
# here not so agree with how things are handled here, maybe do it down?
# !!! indeed handling missing values is after data exploration

# df["admission_type"].replace("Not Available", np.nan, inplace=True)
# df["admission_type"].replace("Not Mapped", np.nan, inplace=True)
# df["discharge_disposition"].replace("Not Mapped", np.nan, inplace=True)
# df["gender"].replace("Unknown/Invalid", np.nan, inplace=True)
# df["admission_source"].replace("Not Available", np.nan, inplace=True)
# df['medical_specialty'].replace('PhysicianNotFound', np.nan, inplace=True)

# test_df["admission_type"].replace("Not Available", np.nan, inplace=True)
# test_df["admission_type"].replace("Not Mapped", np.nan, inplace=True)
# test_df["discharge_disposition"].replace("Not Mapped", np.nan, inplace=True)
# test_df["gender"].replace("Unknown/Invalid", np.nan, inplace=True)
# test_df["admission_source"].replace("Not Available", np.nan, inplace=True)
# test_df['medical_specialty'].replace('PhysicianNotFound', np.nan, inplace=True)

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 71236 entries, 533253 to 459757
Data columns (total 30 columns):
 #   Column                                 Non-Null Count  Dtype 
---  ------                                 --------------  ----- 
 0   country                                71236 non-null  object
 1   patient_id                             71236 non-null  int64 
 2   race                                   66166 non-null  object
 3   gender                                 71236 non-null  object
 4   age                                    67679 non-null  object
 5   weight                                 2246 non-null   object
 6   payer_code                             43035 non-null  object
 7   outpatient_visits_in_previous_year     71236 non-null  int64 
 8   emergency_visits_in_previous_year      71236 non-null  int64 
 9   inpatient_visits_in_previous_year      71236 non-null  int64 
 10  admission_type                         67530 non-null  object
 11  medical_specia

### Partitioning metric and categorical features


In [61]:
metric_features = df.select_dtypes(include='number').columns.tolist()
categorical_features = df.select_dtypes(include='object').columns.tolist()

metric_features.remove("patient_id")

print(metric_features)
print(categorical_features)

['outpatient_visits_in_previous_year', 'emergency_visits_in_previous_year', 'inpatient_visits_in_previous_year', 'average_pulse_bpm', 'length_of_stay_in_hospital', 'number_lab_tests', 'non_lab_procedures', 'number_of_medications', 'number_diagnoses']
['country', 'race', 'gender', 'age', 'weight', 'payer_code', 'admission_type', 'medical_specialty', 'discharge_disposition', 'admission_source', 'primary_diagnosis', 'secondary_diagnosis', 'additional_diagnosis', 'glucose_test_result', 'a1c_test_result', 'change_in_meds_during_hospitalization', 'prescribed_diabetes_meds', 'medication', 'readmitted_binary', 'readmitted_multiclass']


In [62]:
df[categorical_features].info()

<class 'pandas.core.frame.DataFrame'>
Index: 71236 entries, 533253 to 459757
Data columns (total 20 columns):
 #   Column                                 Non-Null Count  Dtype 
---  ------                                 --------------  ----- 
 0   country                                71236 non-null  object
 1   race                                   66166 non-null  object
 2   gender                                 71236 non-null  object
 3   age                                    67679 non-null  object
 4   weight                                 2246 non-null   object
 5   payer_code                             43035 non-null  object
 6   admission_type                         67530 non-null  object
 7   medical_specialty                      36314 non-null  object
 8   discharge_disposition                  68646 non-null  object
 9   admission_source                       66518 non-null  object
 10  primary_diagnosis                      71220 non-null  object
 11  secondary_diag

#### Metric Features Analysis


In [63]:
metric_report = df[metric_features].describe().T
metric_report['cardinality'] = df[metric_features].nunique()

metric_report

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,cardinality
outpatient_visits_in_previous_year,71236.0,0.369588,1.287469,0.0,0.0,0.0,0.0,42.0,38
emergency_visits_in_previous_year,71236.0,0.196249,0.910854,0.0,0.0,0.0,0.0,76.0,30
inpatient_visits_in_previous_year,71236.0,0.640154,1.267271,0.0,0.0,0.0,1.0,21.0,21
average_pulse_bpm,71236.0,99.611222,23.040521,60.0,80.0,100.0,119.0,139.0,80
length_of_stay_in_hospital,71236.0,4.391024,2.988739,1.0,2.0,4.0,6.0,14.0,14
number_lab_tests,71236.0,43.095654,19.642919,1.0,31.0,44.0,57.0,121.0,114
non_lab_procedures,71236.0,1.340923,1.706664,0.0,0.0,1.0,2.0,6.0,7
number_of_medications,71236.0,15.995452,8.122347,1.0,10.0,15.0,20.0,75.0,72
number_diagnoses,71236.0,7.421023,1.937809,1.0,6.0,8.0,9.0,16.0,16


some insights from the data:

- since for `outpatient`, `emergency`, and `inpatient` visits a lot of 0 are concentrated up to 75% of the data, probably we should turn them to a dummy
  - ex: `outpatient_visitis` -> `is_outpatient_visited` [0, 1]

-> should create this vars, but do not delete the categorical ones, since them can also provide valuable information


##### Graph Analysis


In [64]:
# plots_per_row = 3

# num_plots = len(metric_features)
# num_rows = num_plots // plots_per_row + (1 if num_plots % plots_per_row else 0)

# # Creating the plot
# plt.figure(figsize=(15, num_rows * 5))

# for i, feature in enumerate(metric_features, 1):
#     plt.subplot(num_rows, plots_per_row, i)
#     sns.histplot(df[feature], kde=True, bins=30)
#     plt.title(f'Distribution of {feature}')
#     plt.ylabel('Frequency')

# plt.tight_layout()
# plt.show()

some insights given the graphs:

- `length_of_stay`, `number_lab_tests`, and `number of medications` look like skewed normal distributions, probably transfrom them using a log scale or with box-cox
- `number_of_diagnosis` and `non_lab_procedures` maybe can be treated as ordinals? since there are very few categories
- `averge_bpm` looks like invariant, please comment on this
- `x_in_previous_year` as metioned above, a possibility could be to create a new feature that is binomial, of whether is 0 or not.


#### Categorical Features Analysis


In [65]:
def describe_categorical(features, dataframe):
    # Initialize lists to store data for each column in the report
    feature_list = []
    mode_list = []
    mode_freq_list = []
    mode_prop_list = []
    second_mode_list = []
    second_mode_freq_list = []
    second_mode_prop_list = []
    missing_val_percent_list = []
    cardinality_list = []

    for feature in features:
        # Calculate mode, 2nd mode and their frequencies
        mode = dataframe[feature].mode()[0]
        mode_freq = dataframe[feature].value_counts().iloc[0]
        mode_prop = mode_freq / len(dataframe)
        second_mode = dataframe[feature].value_counts().index[1] if len(
            dataframe[feature].value_counts()) > 1 else 'N/A'
        second_mode_freq = dataframe[feature].value_counts().iloc[1] if len(
            dataframe[feature].value_counts()) > 1 else 0
        second_mode_prop = second_mode_freq / len(dataframe)

        # Calculate missing values percentage and cardinality
        missing_val_percent = dataframe[feature].isna().mean() * 100
        cardinality = dataframe[feature].nunique()

        # Append to lists
        feature_list.append(feature)
        mode_list.append(mode)
        mode_freq_list.append(mode_freq)
        mode_prop_list.append(mode_prop)
        second_mode_list.append(second_mode)
        second_mode_freq_list.append(second_mode_freq)
        second_mode_prop_list.append(second_mode_prop)
        missing_val_percent_list.append(missing_val_percent)
        cardinality_list.append(cardinality)

    # Create the DataFrame
    categorical_data_quality_report = pd.DataFrame({
        'Feature': feature_list,
        'Mode': mode_list,
        'Mode Frequency': mode_freq_list,
        'Mode Proportion': mode_prop_list,
        '2nd Mode': second_mode_list,
        '2nd Mode Frequency': second_mode_freq_list,
        '2nd Mode Proportion': second_mode_prop_list,
        'Missing Values %': missing_val_percent_list,
        'Cardinality': cardinality_list
    })

    return categorical_data_quality_report.sort_values(by=['Mode Proportion', 'Missing Values %'], ascending=False)

In [66]:
cat_info = describe_categorical(categorical_features, df).set_index("Feature")
cat_info.sort_values('Cardinality')

Unnamed: 0_level_0,Mode,Mode Frequency,Mode Proportion,2nd Mode,2nd Mode Frequency,2nd Mode Proportion,Missing Values %,Cardinality
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
country,USA,71236,1.0,,0,0.0,0.0,1
prescribed_diabetes_meds,Yes,54890,0.770537,No,16346,0.229463,0.0,2
change_in_meds_during_hospitalization,No,38326,0.538014,Ch,32910,0.461986,0.0,2
readmitted_binary,No,63286,0.888399,Yes,7950,0.111601,0.0,2
gender,Female,38228,0.536639,Male,33005,0.463319,0.0,3
readmitted_multiclass,No,38405,0.539123,>30 days,24881,0.349276,0.0,3
a1c_test_result,none,59320,0.832725,>8,5705,0.080086,0.0,4
glucose_test_result,none,67548,0.948228,Norm,1806,0.025352,0.0,4
race,Caucasian,50693,0.711621,AfricanAmerican,12693,0.178182,7.117188,5
admission_type,Emergency,37742,0.529816,Elective,13211,0.185454,5.202426,7


**insights**:

- `usa` invariant (we should drop)
- `glucose_test_result` and `a1c` (dominated by none)
- top 3 missing values:
  1. `weight` (we should drop)
  2. `medical_specialty`
  3. `payer_code`
- seems that `gender` and `change_in_meds` are the only ones that are boolean


##### Graphical Analysis


In [67]:
# # Filtering out categorical features with low cardinality for visualization
# low_cardinality_features = [
#     feature for feature in categorical_features if cat_info.loc[feature, "Cardinality"] <= 10]

# print(categorical_features[~categorical_features.isin(
#     low_cardinality_features)])

# # Adjusting the layout to display 2 bar plots per row for low cardinality features and increasing plot size
# plots_per_row_low_cardinality = 2
# num_rows_low_cardinality = len(low_cardinality_features) // plots_per_row_low_cardinality + \
#     (1 if len(low_cardinality_features) % plots_per_row_low_cardinality else 0)

# plt.figure(figsize=(20, num_rows_low_cardinality * 8))  # Increased figure size

# for i, feature in enumerate(low_cardinality_features, 1):
#     plt.subplot(num_rows_low_cardinality, plots_per_row_low_cardinality, i)
#     sns.countplot(x=feature, data=df, palette='cividis')
#     plt.title(f' {feature}')
#     plt.ylabel('Frequency')
#     plt.xlabel('')

# plt.tight_layout()
# plt.show()

comments on the graphs:

- `'payer_code', 'medical_specialty', 'discharge_disposition', 'admission_source', 'primary_diagnosis', 'secondary_diagnosis', 'additional_diagnosis', 'medication'` have a lot of categories, re-size them or find a way to avoid having that much categories
- country invariant, so definitely drop it


In [68]:
df["admission_type"].replace("Not Available", np.nan, inplace=True)
df["admission_type"].replace("Not Mapped", np.nan, inplace=True)
df["gender"].replace("Unknown/Invalid", np.nan, inplace=True)
df["admission_source"].replace("Not Available", np.nan, inplace=True)
df["medication"].replace("[]", "none", inplace=True)
df['medical_specialty'].replace('PhysicianNotFound', np.nan, inplace=True)
df['discharge_disposition'].replace('Not Mapped', np.nan, inplace=True)
df['discharge_disposition'].replace('Not Available', np.nan, inplace=True)

test_df["admission_type"].replace("Not Available", np.nan, inplace=True)
test_df["admission_type"].replace("Not Mapped", np.nan, inplace=True)
test_df["gender"].replace("Unknown/Invalid", np.nan, inplace=True)
test_df["admission_source"].replace("Not Available", np.nan, inplace=True)
test_df["medication"].replace("[]", "none", inplace=True)
test_df['medical_specialty'].replace('PhysicianNotFound', np.nan, inplace=True)
test_df['discharge_disposition'].replace('Not Mapped', np.nan, inplace=True)
test_df['discharge_disposition'].replace('Not Available', np.nan, inplace=True)

#### Separation of Train and Validation


In [69]:
y = df["readmitted_binary"]
y_multi = df["readmitted_multiclass"]
df.drop(['readmitted_binary', 'readmitted_multiclass'], axis=1, inplace=True)

In [70]:
X_train, X_val, y_train, y_val = train_test_split(
    df, y, test_size=0.2, stratify=y, random_state=69)

In [71]:
# profile = ProfileReport(
#     df,
#     title='Tugas Customer Data',
#     correlations={
#         "pearson": {"calculate": True},
#         "spearman": {"calculate": False},
#         "kendall": {"calculate": False},
#         "phi_k": {"calculate": False},
#         "cramers": {"calculate": False},
#     },
# )

In [72]:
# profile.to_notebook_iframe()

## Outlier Treatment & Feature Engineering


we will do some feature engineering to get rid of the outliers, it also will help to reduce the number of rows we are removing.


### Metric Features

will try not to delete the features, but just create a new column


In [73]:
def to_mean(x):
    mean_dic = {
        '[70-80)': 75,
        '[50-60)': 55,
        '[60-70)': 65,
        '[80-90)': 85,
        '[40-50)': 45,
        '[20-30)': 25,
        '[30-40)': 35,
        '[90-100)': 95,
        '[10-20)': 15,
        '[0-10)': 5
    }

    return mean_dic.get(x, 0)

In [74]:
X_train['age_mean'] = X_train['age'].apply(to_mean)
X_val['age_mean'] = X_val['age'].apply(to_mean)
test_df['age_mean'] = test_df['age'].apply(to_mean)

In [75]:
metric_features.append("age_mean")
metric_features

['outpatient_visits_in_previous_year',
 'emergency_visits_in_previous_year',
 'inpatient_visits_in_previous_year',
 'average_pulse_bpm',
 'length_of_stay_in_hospital',
 'number_lab_tests',
 'non_lab_procedures',
 'number_of_medications',
 'number_diagnoses',
 'age_mean']

In [76]:
def determine_winsorize_limits(train, limits=0.01):
    """
    Determines the winsorize limits for each numerical column in the training dataset.

    Parameters:
    train (pd.DataFrame): Features of the training dataset.
    limits (float or tuple of floats): The proportion of data to winsorize.

    Returns:
    dict: A dictionary with column names as keys and winsorize limits as values.
    """
    limits_dict = {}
    for column in train.columns:
        data = train[column]
        lower_limit = np.percentile(data, (limits * 100))
        upper_limit = np.percentile(data, (100 - limits * 100))
        limits_dict[column] = (lower_limit, upper_limit)
    return limits_dict


def apply_winsorize(data, limits_dict):
    """
    Applies winsorizing to the given dataset based on the provided limits.

    Parameters:
    data (pd.DataFrame or pd.Series): Dataset to be winsorized.
    limits_dict (dict): A dictionary with winsorize limits.

    Returns:
    pd.DataFrame or pd.Series: The winsorized dataset.
    """
    data_winsorized = pd.DataFrame()
    for column in data.columns:
        if column in limits_dict:
            data_winsorized[column +
                            "_win"] = np.clip(data[column], *limits_dict[column])
    return data_winsorized

In [77]:
train_limits = determine_winsorize_limits(X_train[metric_features])

w_train = apply_winsorize(X_train, train_limits)
X_train_w = pd.concat([X_train, w_train], axis=1)

w_val = apply_winsorize(X_val, train_limits)
X_val_w = pd.concat([X_val, w_val], axis=1)

w_test = apply_winsorize(test_df, train_limits)
test_w = pd.concat([test_df, w_test], axis=1)

In [78]:
# use boxcos transformation rather than the normal logarithmic one
# because it will find a lambda that max the LL function

def boxcox_transform(target: pd.DataFrame, val, test, columns: list, skewness_threshold):
    EPSILON = 1e-5

    train_df = pd.DataFrame()
    val_df = pd.DataFrame()
    test_df = pd.DataFrame()

    for col in columns:
        skew = target[col].skew()
        if abs(skew) > skewness_threshold:
            transformed, lmbda = boxcox(target[col] + EPSILON)
            val_t = boxcox(val[col] + EPSILON, lmbda=lmbda)
            test_t = boxcox(test[col] + EPSILON, lmbda=lmbda)

            train_df[col + "_log"] = transformed
            val_df[col + "_log"] = val_t
            test_df[col + "_log"] = test_t

    train_df.index = target.index
    val_df.index = val.index
    test_df.index = test.index

    return train_df, val_df, test_df

In [79]:
X_train["outpatient_visits_in_previous_year"].describe()

count    56988.000000
mean         0.369411
std          1.286111
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max         42.000000
Name: outpatient_visits_in_previous_year, dtype: float64

In [80]:
metric_features_og = [*metric_features]
metric_features = X_train_w.select_dtypes(include='number').columns

metric_features

Index(['patient_id', 'outpatient_visits_in_previous_year',
       'emergency_visits_in_previous_year',
       'inpatient_visits_in_previous_year', 'average_pulse_bpm',
       'length_of_stay_in_hospital', 'number_lab_tests', 'non_lab_procedures',
       'number_of_medications', 'number_diagnoses', 'age_mean',
       'outpatient_visits_in_previous_year_win',
       'emergency_visits_in_previous_year_win',
       'inpatient_visits_in_previous_year_win', 'average_pulse_bpm_win',
       'length_of_stay_in_hospital_win', 'number_lab_tests_win',
       'non_lab_procedures_win', 'number_of_medications_win',
       'number_diagnoses_win', 'age_mean_win'],
      dtype='object')

In [81]:
for threshold in [0.5, 0.75, 1, 1.25, 2]:
    bct, _, _ = boxcox_transform(
        X_train_w, X_val_w, test_w, metric_features, threshold)
    print(f"there are {len(bct.columns)}: {bct.columns}")

there are 16: Index(['outpatient_visits_in_previous_year_log',
       'emergency_visits_in_previous_year_log',
       'inpatient_visits_in_previous_year_log',
       'length_of_stay_in_hospital_log', 'non_lab_procedures_log',
       'number_of_medications_log', 'number_diagnoses_log', 'age_mean_log',
       'outpatient_visits_in_previous_year_win_log',
       'emergency_visits_in_previous_year_win_log',
       'inpatient_visits_in_previous_year_win_log',
       'length_of_stay_in_hospital_win_log', 'non_lab_procedures_win_log',
       'number_of_medications_win_log', 'number_diagnoses_win_log',
       'age_mean_win_log'],
      dtype='object')
there are 16: Index(['outpatient_visits_in_previous_year_log',
       'emergency_visits_in_previous_year_log',
       'inpatient_visits_in_previous_year_log',
       'length_of_stay_in_hospital_log', 'non_lab_procedures_log',
       'number_of_medications_log', 'number_diagnoses_log', 'age_mean_log',
       'outpatient_visits_in_previous_year_win

In [82]:
# because of the shape, of the distributions, we are going to stay with the 0.75 threshold

log_train, log_val, log_test = boxcox_transform(
    X_train_w, X_val_w, test_w, columns=metric_features, skewness_threshold=0.75)

X_train_bc = pd.concat([X_train, log_train], axis=1)
X_val_bc = pd.concat([X_val, log_val], axis=1)
test_bc = pd.concat([test_df, log_test], axis=1)

X_val_bc.columns

Index(['country', 'patient_id', 'race', 'gender', 'age', 'weight',
       'payer_code', 'outpatient_visits_in_previous_year',
       'emergency_visits_in_previous_year',
       'inpatient_visits_in_previous_year', 'admission_type',
       'medical_specialty', 'average_pulse_bpm', 'discharge_disposition',
       'admission_source', 'length_of_stay_in_hospital', 'number_lab_tests',
       'non_lab_procedures', 'number_of_medications', 'primary_diagnosis',
       'secondary_diagnosis', 'additional_diagnosis', 'number_diagnoses',
       'glucose_test_result', 'a1c_test_result',
       'change_in_meds_during_hospitalization', 'prescribed_diabetes_meds',
       'medication', 'age_mean', 'outpatient_visits_in_previous_year_log',
       'emergency_visits_in_previous_year_log',
       'inpatient_visits_in_previous_year_log',
       'length_of_stay_in_hospital_log', 'non_lab_procedures_log',
       'number_of_medications_log', 'number_diagnoses_log', 'age_mean_log',
       'outpatient_visits_in_

In [83]:
metric_features = X_train_w.select_dtypes(include='number').columns

In [84]:
def metric_to_bool(target, columns, new_names):
    new_df = pd.DataFrame()
    for col, new_name in zip(columns, new_names):
        new_df[new_name] = target[col].astype(bool)
    new_df.index = target.index
    return new_df

In [85]:
cols_to_bool = ["outpatient_visits_in_previous_year",
                "emergency_visits_in_previous_year", "inpatient_visits_in_previous_year"]

new_names = ["is_outpatient_visited",
             "is_emergency_visited", "is_inpatient_visited"]

bool_train = metric_to_bool(X_train_bc, cols_to_bool, new_names)
X_train_bool = pd.concat([X_train_bc, bool_train], axis=1)

bool_val = metric_to_bool(X_val_bc, cols_to_bool, new_names)
X_val_bool = pd.concat([X_val_bc, bool_val], axis=1)

bool_test = metric_to_bool(test_bc, cols_to_bool, new_names)
test_bool = pd.concat([test_bc, bool_test], axis=1)

a normal bpm is between 60-100 bpms [_source:wikipedia/heart_rate_](https://en.wikipedia.org/wiki/Heart_rate)


In [86]:
X_train_bool["is_pulse_normal"] = X_train_bool["average_pulse_bpm"].apply(
    lambda x: 60 <= x <= 100)

X_val_bool["is_pulse_normal"] = X_val_bool["average_pulse_bpm"].apply(
    lambda x: 60 <= x <= 100)

test_bool["is_pulse_normal"] = test_bool["average_pulse_bpm"].apply(
    lambda x: 60 <= x <= 100)

X_train_bool["is_pulse_normal"].describe()

count     56988
unique        2
top        True
freq      29007
Name: is_pulse_normal, dtype: object

### Reducing Dimentionality of Categorical Features


In [87]:
# You can check International Statistical Classification of Diseases and Related Health Problems (https://en.wikipedia.org/wiki/List_of_ICD-9_codes)


def categorize_icd9_code(icd9_code):
    """
    Correctly categorize the given ICD-9 code based on the provided classification,
    including handling for 'E' and 'V' codes and ensuring all codes are treated as strings.
    """
    # Convert to string in case the code is not in string format
    code_str = str(icd9_code).strip()

    try:
        if code_str.startswith('E'):
            return 'External Causes of Injury and Supplemental Classification'
        elif code_str.startswith('V'):
            return 'Supplemental Classification of Factors Influencing Health Status'
        elif '.' in code_str:
            code = int(float(code_str))
        else:
            code = int(code_str)

        # Classify the code
        if 1 <= code <= 139:
            return 'Infectious and Parasitic Diseases'
        elif 140 <= code <= 239:
            return 'Neoplasms'
        elif 240 <= code <= 279:
            return 'Endocrine, Nutritional and Metabolic Diseases, and Immunity Disorders'
        elif 280 <= code <= 289:
            return 'Diseases of the Blood and Blood-forming Organs'
        elif 290 <= code <= 319:
            return 'Mental Disorders'
        elif 320 <= code <= 389:
            return 'Diseases of the Nervous System and Sense Organs'
        elif 390 <= code <= 459:
            return 'Diseases of the Circulatory System'
        elif 460 <= code <= 519:
            return 'Diseases of the Respiratory System'
        elif 520 <= code <= 579:
            return 'Diseases of the Digestive System'
        elif 580 <= code <= 629:
            return 'Diseases of the Genitourinary System'
        elif 630 <= code <= 679:
            return 'Complications of Pregnancy, Childbirth, and the Puerperium'
        elif 680 <= code <= 709:
            return 'Diseases of the Skin and Subcutaneous Tissue'
        elif 710 <= code <= 739:
            return 'Diseases of the Musculoskeletal System and Connective Tissue'
        elif 740 <= code <= 759:
            return 'Congenital Anomalies'
        elif 760 <= code <= 779:
            return 'Certain Conditions Originating in the Perinatal Period'
        elif 780 <= code <= 799:
            return 'Symptoms, Signs, and Ill-defined Conditions'
        elif 800 <= code <= 999:
            return 'Injury and Poisoning'
        else:
            return 'Unknown or Invalid Code'
    except ValueError:
        # In case the code cannot be converted to an integer (e.g., '?')
        return 'Unknown or Invalid Code'

In [88]:
def categorize_discharged(x):
    category_dict = {
        "Discharged to home": "Home",
        "Discharged/transferred to home with home health service": "Home",
        "Hospice / home": "Home",
        "Discharged/transferred to home under care of Home IV provider": "Home",
        "Expired at home. Medicaid only, hospice.": "Home",
        "Discharged/transferred to another short term hospital": "Hospital",
        "Admitted as an inpatient to this hospital": "Hospital",
        "Discharged/transferred/referred to a psychiatric hospital of psychiatric distinct part unit of a hospital": "Hospital",
        "Discharged/transferred within this institution to Medicare approved swing bed": "Hospital",
        "Discharged/transferred/referred to this institution for outpatient services": "Hospital",
        "Neonate discharged to another hospital for neonatal aftercare": "Hospital",
        "Discharged/transferred/referred another institution for outpatient services": "Hospital",
        "Discharged/transferred to SNF": "Facility",
        "Discharged/transferred to ICF": "Facility",
        "Discharged/transferred to another type of inpatient care institution": "Facility",
        "Discharged/transferred to another rehab fac including rehab units of a hospital .": "Facility",
        "Discharged/transferred to a long term care hospital.": "Facility",
        "Discharged/transferred to a nursing facility certified under Medicaid but not certified under Medicare.": "Facility",
        "Discharged/transferred to a federal health care facility.": "Facility",
        "Hospice / medical facility": "Other",
        "Expired": "Other",
        "Left AMA": "Other",
        "Not Mapped": "Other",
        "Expired in a medical facility. Medicaid only, hospice.": "Other",
        "Still patient or expected to return for outpatient services": "Other"
    }

    return category_dict.get(x, x)

In [89]:
def categorize_admission(x):
    trimmed = str(x).strip()
    category_dict = {
        "Emergency Room": "Emergency",
        "Physician Referral": "Referral",
        "Transfer from a hospital": "Transfer",
        "Transfer from another health care facility": "Transfer",
        "Clinic Referral": "Referral",
        "Transfer from a Skilled Nursing Facility (SNF)": "Transfer",
        "HMO Referral": "Referral",
        "Not Mapped": "Not Available",
        "Not Available": "Not Available",
        "Court/Law Enforcement": "Emergency",
        "Transfer from hospital inpt/same fac reslt in a sep claim": "Transfer",
        "Transfer from critial access hospital": "Transfer",
        "Transfer from Ambulatory Surgery Center": "Transfer",
        "Extramural Birth": "Other",
        "Normal Delivery": "Other",
        "Sick Baby": "Other"
    }

    return category_dict.get(trimmed, x)

In [90]:
X_train_reduced_cat = X_train_bool.copy()
X_val_reduced_cat = X_val_bool.copy()
test_reduced_cat = test_bool.copy()

X_train_reduced_cat['primary_diagnosis_cat'] = X_train_reduced_cat['primary_diagnosis'].apply(
    categorize_icd9_code)
X_train_reduced_cat['secondary_diagnosis_cat'] = X_train_reduced_cat['secondary_diagnosis'].apply(
    categorize_icd9_code)
X_train_reduced_cat['additional_diagnosis_cat'] = X_train_reduced_cat['additional_diagnosis'].apply(
    categorize_icd9_code)
X_train_reduced_cat["discharge_disposition_cat"] = X_train_reduced_cat["discharge_disposition"].apply(
    categorize_discharged)
X_train_reduced_cat["admission_source_cat"] = X_train_reduced_cat["admission_source"].apply(
    categorize_admission)

X_val_reduced_cat['primary_diagnosis_cat'] = X_val_reduced_cat['primary_diagnosis'].apply(
    categorize_icd9_code)
X_val_reduced_cat['secondary_diagnosis_cat'] = X_val_reduced_cat['secondary_diagnosis'].apply(
    categorize_icd9_code)
X_val_reduced_cat['additional_diagnosis_cat'] = X_val_reduced_cat['additional_diagnosis'].apply(
    categorize_icd9_code)
X_val_reduced_cat["discharge_disposition_cat"] = X_val_reduced_cat["discharge_disposition"].apply(
    categorize_discharged)
X_val_reduced_cat["admission_source_cat"] = X_val_reduced_cat["admission_source"].apply(
    categorize_admission)

test_reduced_cat['primary_diagnosis_cat'] = test_reduced_cat['primary_diagnosis'].apply(
    categorize_icd9_code)
test_reduced_cat['secondary_diagnosis_cat'] = test_reduced_cat['secondary_diagnosis'].apply(
    categorize_icd9_code)
test_reduced_cat['additional_diagnosis_cat'] = test_reduced_cat['additional_diagnosis'].apply(
    categorize_icd9_code)
test_reduced_cat["discharge_disposition_cat"] = test_reduced_cat["discharge_disposition"].apply(
    categorize_discharged)
test_reduced_cat["admission_source_cat"] = test_reduced_cat["admission_source"].apply(
    categorize_admission)

In [91]:
new_cat_features = X_train_reduced_cat.select_dtypes(exclude='number').columns
cat_info = describe_categorical(
    new_cat_features, X_train_reduced_cat).set_index("Feature")
cat_info.sort_values('Cardinality')

Unnamed: 0_level_0,Mode,Mode Frequency,Mode Proportion,2nd Mode,2nd Mode Frequency,2nd Mode Proportion,Missing Values %,Cardinality
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
country,USA,56988,1.0,,0,0.0,0.0,1
is_pulse_normal,True,29007,0.509002,False,27981,0.490998,0.0,2
change_in_meds_during_hospitalization,No,30570,0.536429,Ch,26418,0.463571,0.0,2
is_inpatient_visited,False,37733,0.662122,True,19255,0.337878,0.0,2
prescribed_diabetes_meds,Yes,43945,0.771127,No,13043,0.228873,0.0,2
gender,Female,30528,0.535692,Male,26458,0.464273,0.00351,2
is_outpatient_visited,False,47663,0.836369,True,9325,0.163631,0.0,2
is_emergency_visited,False,50513,0.88638,True,6475,0.11362,0.0,2
discharge_disposition_cat,Home,41405,0.726556,Facility,10179,0.178617,4.544816,4
a1c_test_result,none,47476,0.833088,>8,4575,0.08028,0.0,4


In [92]:
def safe_convert_to_list(s):
    if s == "[]":
        return []
    try:
        return ast.literal_eval(s)
    except Exception:
        return []


# Create copies of the original dataframes
X_train_med = X_train_reduced_cat.copy()
X_val_med = X_val_reduced_cat.copy()
test_med = test_reduced_cat.copy()

# Apply the conversion to an additional column in all new datasets
X_train_med['medication_list'] = X_train_med['medication'].apply(
    safe_convert_to_list)
X_val_med['medication_list'] = X_val_med['medication'].apply(
    safe_convert_to_list)
test_med['medication_list'] = test_med['medication'].apply(
    safe_convert_to_list)

# Extract unique medications
unique_medications = set()
for dataset in [X_train_med, X_val_med]:
    for meds in dataset['medication_list']:
        unique_medications.update(meds)

# Function to create binary columns


def create_binary_med_columns(df, unique_meds):
    for med in unique_meds:
        df[f'med_{med}'] = df['medication_list'].apply(
            lambda meds: 1 if med in meds else 0)


# Apply the function to all new datasets
create_binary_med_columns(X_train_med, unique_medications)
create_binary_med_columns(X_val_med, unique_medications)
create_binary_med_columns(test_med, unique_medications)

X_train_med.drop(['medication_list', 'medication'], axis=1, inplace=True)
X_val_med.drop(['medication_list', 'medication'], axis=1, inplace=True)
test_med.drop(['medication_list', 'medication'], axis=1, inplace=True)

X_train_med.head().columns.tolist()

['country',
 'patient_id',
 'race',
 'gender',
 'age',
 'weight',
 'payer_code',
 'outpatient_visits_in_previous_year',
 'emergency_visits_in_previous_year',
 'inpatient_visits_in_previous_year',
 'admission_type',
 'medical_specialty',
 'average_pulse_bpm',
 'discharge_disposition',
 'admission_source',
 'length_of_stay_in_hospital',
 'number_lab_tests',
 'non_lab_procedures',
 'number_of_medications',
 'primary_diagnosis',
 'secondary_diagnosis',
 'additional_diagnosis',
 'number_diagnoses',
 'glucose_test_result',
 'a1c_test_result',
 'change_in_meds_during_hospitalization',
 'prescribed_diabetes_meds',
 'age_mean',
 'outpatient_visits_in_previous_year_log',
 'emergency_visits_in_previous_year_log',
 'inpatient_visits_in_previous_year_log',
 'length_of_stay_in_hospital_log',
 'non_lab_procedures_log',
 'number_of_medications_log',
 'number_diagnoses_log',
 'age_mean_log',
 'outpatient_visits_in_previous_year_win_log',
 'emergency_visits_in_previous_year_win_log',
 'inpatient_visits_

## Handling Missing Values


## Missing Values


In [107]:
X_train_filtered = X_train_med.copy()
X_val_filtered = X_val_med.copy()
test_filtered = test_med.copy()


def missing_percentage(df):
    return df.isnull().mean() * 100


missing_percentage(X_train_filtered)

country                        0.000000
patient_id                     0.000000
race                           7.094476
gender                         0.003510
age                            5.023865
                                 ...   
med_metformin-rosiglitazone    0.000000
med_chlorpropamide             0.000000
med_miglitol                   0.000000
med_repaglinide                0.000000
med_tolbutamide                0.000000
Length: 74, dtype: float64

In [108]:
X_train_filtered.columns

Index(['country', 'patient_id', 'race', 'gender', 'age', 'weight',
       'payer_code', 'outpatient_visits_in_previous_year',
       'emergency_visits_in_previous_year',
       'inpatient_visits_in_previous_year', 'admission_type',
       'medical_specialty', 'average_pulse_bpm', 'discharge_disposition',
       'admission_source', 'length_of_stay_in_hospital', 'number_lab_tests',
       'non_lab_procedures', 'number_of_medications', 'primary_diagnosis',
       'secondary_diagnosis', 'additional_diagnosis', 'number_diagnoses',
       'glucose_test_result', 'a1c_test_result',
       'change_in_meds_during_hospitalization', 'prescribed_diabetes_meds',
       'age_mean', 'outpatient_visits_in_previous_year_log',
       'emergency_visits_in_previous_year_log',
       'inpatient_visits_in_previous_year_log',
       'length_of_stay_in_hospital_log', 'non_lab_procedures_log',
       'number_of_medications_log', 'number_diagnoses_log', 'age_mean_log',
       'outpatient_visits_in_previous_year_

In [102]:
# def replace_categorical_values(train_df, val_df, threshold=0.20):
#     """
#     Replace values in categorical features of training and testing datasets without altering the original dataframes.
#     For features with more than 20% missing values, replace with 'Unknown'.
#     For features with 20% or less missing values, replace with the mode of the training data.

#     Parameters:
#     X_train (DataFrame): The training dataset.
#     X_test (DataFrame): The test dataset.
#     threshold (float): The threshold for determining replacement strategy.

#     Returns:
#     DataFrame, DataFrame: New modified training and testing datasets.
#     """
#     # Creating copies of the dataframes to avoid altering the original data
#     X_train_copy = train_df.copy()
#     X_val_copy = val_df.copy()

#     for col in X_train_copy.columns:
#         if (X_train_copy[col].dtype == 'object' or X_train_copy[col].dtype.name == 'category'):
#             missing_percentage = X_train_copy[col].isna(
#             ).sum() / len(X_train_copy)

#             if missing_percentage > threshold:
#                 # Replace with 'Unknown' for columns with more than 20% missing values
#                 X_train_copy[col].fillna('Unknown', inplace=True)
#                 X_val_copy[col].fillna('Unknown', inplace=True)
#             else:
#                 # Replace with mode for columns with 20% or less missing values
#                 mode_value = X_train_copy[col].mode()[0]
#                 X_train_copy[col].fillna(mode_value, inplace=True)
#                 X_val_copy[col].fillna(mode_value, inplace=True)

#     return X_train_copy, X_val_copy


# X_train_treated, X_val_treated = replace_categorical_values(
#     X_train_filtered, X_val_filtered)

In [109]:
mode_value = X_train_filtered["race"].mode()[0]
X_train_filtered["race"].fillna(mode_value, inplace=True)
X_val_filtered["race"].fillna(mode_value, inplace=True)
test_filtered["race"].fillna(mode_value, inplace=True)

mode_gender = X_train_filtered["gender"].mode()[0]
X_train_filtered["gender"].fillna(mode_gender, inplace=True)
X_val_filtered["gender"].fillna(mode_gender, inplace=True)
test_filtered["gender"].fillna(mode_gender, inplace=True)

mode_age = X_train_filtered["age"].mode()[0]
X_train_filtered["age"].fillna(mode_age, inplace=True)
X_val_filtered["age"].fillna(mode_age, inplace=True)
test_filtered["age"].fillna(mode_age, inplace=True)

X_train_filtered["payer_code"].fillna("Unknown", inplace=True)
X_val_filtered["payer_code"].fillna("Unknown", inplace=True)
test_filtered["payer_code"].fillna("Unknown", inplace=True)

X_train_filtered["admission_type"].fillna("Unknown", inplace=True)
X_val_filtered["admission_type"].fillna("Unknown", inplace=True)
test_filtered["admission_type"].fillna("Unknown", inplace=True)

X_train_filtered["discharge_disposition"].fillna("Unknown", inplace=True)
X_val_filtered["discharge_disposition"].fillna("Unknown", inplace=True)
test_filtered["discharge_disposition"].fillna("Unknown", inplace=True)

X_train_filtered["discharge_disposition_cat"].fillna("Unknown", inplace=True)
X_val_filtered["discharge_disposition_cat"].fillna("Unknown", inplace=True)
test_filtered["discharge_disposition_cat"].fillna("Unknown", inplace=True)

X_train_filtered["admission_source"].fillna("Unknown", inplace=True)
X_val_filtered["admission_source"].fillna("Unknown", inplace=True)
test_filtered["admission_source"].fillna("Unknown", inplace=True)

X_train_filtered["admission_source_cat"].fillna("Unknown", inplace=True)
X_val_filtered["admission_source_cat"].fillna("Unknown", inplace=True)
test_filtered["admission_source_cat"].fillna("Unknown", inplace=True)

X_train_filtered["glucose_test_result"].fillna("None", inplace=True)
X_train_filtered["a1c_test_result"].fillna("None", inplace=True)
test_filtered["glucose_test_result"].fillna("None", inplace=True)

X_val_filtered["glucose_test_result"].fillna("None", inplace=True)
X_val_filtered["a1c_test_result"].fillna("None", inplace=True)
test_filtered["a1c_test_result"].fillna("None", inplace=True)

X_train_filtered["primary_diagnosis"].fillna("Unknown", inplace=True)
X_train_filtered["secondary_diagnosis"].fillna("Unknown", inplace=True)
X_train_filtered["additional_diagnosis"].fillna("Unknown", inplace=True)

X_val_filtered["primary_diagnosis"].fillna("Unknown", inplace=True)
X_val_filtered["secondary_diagnosis"].fillna("Unknown", inplace=True)
X_val_filtered["additional_diagnosis"].fillna("Unknown", inplace=True)

test_filtered["primary_diagnosis"].fillna("Unknown", inplace=True)
test_filtered["secondary_diagnosis"].fillna("Unknown", inplace=True)
test_filtered["additional_diagnosis"].fillna("Unknown", inplace=True)

probably in the future a good idea would be to use nn to fill the data and test differences


In [110]:
features_to_drop = ['weight', 'country', 'medical_specialty', 'patient_id',
                    'primary_diagnosis', 'secondary_diagnosis', 'additional_diagnosis']

X_train_filtered = X_train_filtered.drop(features_to_drop, axis=1)
X_val_filtered = X_val_filtered.drop(features_to_drop, axis=1)
test_filtered = test_filtered.drop(features_to_drop, axis=1)

In [111]:
missing_percentage(X_train_filtered)

race                                  0.0
gender                                0.0
age                                   0.0
payer_code                            0.0
outpatient_visits_in_previous_year    0.0
                                     ... 
med_metformin-rosiglitazone           0.0
med_chlorpropamide                    0.0
med_miglitol                          0.0
med_repaglinide                       0.0
med_tolbutamide                       0.0
Length: 67, dtype: float64

In [112]:
X_train_filtered.to_csv("../data/input/train_treated.csv")
X_val_filtered.to_csv("../data/input/val_treated.csv")
test_filtered.to_csv("../data/input/test_treated.csv")

In [113]:
y.to_csv("../data/input/target.csv")