# Objective

To test whether dropping unimportant columns, or columns with a large number of unique values, improves the performance of the Decision Tree model.


In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Locate the cleaned diabetes dataset on disk.
# The project root defaults to the original local checkout; set the
# AIH_BASE_DIR environment variable to run the notebook on another machine
# without editing this cell.
base_dir = os.environ.get("AIH_BASE_DIR", r"D:\DSpit\AIH")

csv_path = os.path.join(
    base_dir,
    "lab",
    "Diabetic-Patients-Readmission-Prediction",
    "CSV Files",
    "Diabetes_cleaned.csv",
)
# Last expression of the cell: displays the resolved path for a quick sanity check.
csv_path

'D:\\DSpit\\AIH/lab/Diabetic-Patients-Readmission-Prediction/CSV Files/Diabetes_cleaned.csv'

'D:\\DSpit\\AIH\\lab\\Diabetic-Patients-Readmission-Prediction\\CSV Files\\Diabetes_cleaned.csv'

In [22]:
# Read the cleaned dataset into memory and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=csv_path)
df.head(n=5)

Unnamed: 0,race,gender,age,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,diag_1,...,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,diabetesMed,readmitted,preceding_year_visits,number_changes,insulin_treatment
0,Caucasian,0,5,Not Available,Referral,1,41,0,1,Diabetes,...,-2,-2,-2,-2,-2,0,0,0,0.0,no_med
1,Caucasian,0,15,Discharged to home,Emergency,3,59,0,18,"Endocrine, Nutritional, Metabolic, Immunity",...,-2,-2,-2,-2,1,1,0,0,1.0,insulin_only
2,AfricanAmerican,0,25,Discharged to home,Emergency,2,11,5,13,"Pregnancy, Childbirth",...,0,-2,-2,-2,-2,1,0,3,0.0,other_meds
3,Caucasian,1,35,Discharged to home,Emergency,2,44,1,16,Infectious and Parasitic,...,-2,-2,-2,-2,1,1,0,0,1.0,insulin_only
4,Caucasian,1,45,Discharged to home,Emergency,1,51,0,8,Neoplasms,...,0,-2,-2,-2,0,1,0,0,0.0,insulin_combo


In [6]:
# Print each column's dtype and non-null count to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97070 entries, 0 to 97069
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   race                      97070 non-null  object 
 1   gender                    97070 non-null  int64  
 2   age                       97070 non-null  int64  
 3   discharge_disposition_id  97070 non-null  object 
 4   admission_source_id       97070 non-null  object 
 5   time_in_hospital          97070 non-null  int64  
 6   num_lab_procedures        97070 non-null  int64  
 7   num_procedures            97070 non-null  int64  
 8   num_medications           97070 non-null  int64  
 9   diag_1                    97070 non-null  object 
 10  diag_2                    97070 non-null  object 
 11  diag_3                    97070 non-null  object 
 12  number_diagnoses          97070 non-null  int64  
 13  max_glu_serum             5092 non-null   object 
 14  A1Cres

In [23]:
# Column groups fed to the preprocessing ColumnTransformer.
# Each list is written as one whitespace-separated block; list order follows
# the order the names appear in.

# Columns routed through the categorical (impute + one-hot) branch.
categorical_columns = """
    race
    discharge_disposition_id
    admission_source_id
    diag_1
    diag_2
    diag_3
    max_glu_serum
    A1Cresult
    insulin_treatment
""".split()

# Columns routed through the numeric (impute + scale) branch.
numerical_columns = """
    gender
    age
    num_lab_procedures
    num_procedures
    num_medications
    number_diagnoses
    metformin
    repaglinide
    glimepiride
    glipizide
    glyburide
    pioglitazone
    rosiglitazone
    insulin
    diabetesMed
    readmitted
    preceding_year_visits
""".split()

In [1]:
# Top 10 Most Important Features:
#                              feature  importance
# 14                           insulin    0.514447
# 102         insulin_treatment_no_med    0.051890
# 103     insulin_treatment_other_meds    0.045054
# 7                          metformin    0.042641
# 15                       diabetesMed    0.038324
# 100  insulin_treatment_insulin_combo    0.034873
# 101   insulin_treatment_insulin_only    0.034027
# 11                         glyburide    0.031616
# 10                         glipizide    0.028577
# 9                        glimepiride    0.011772

# Keep only these columns in the dataset, plus "time_in_hospital", which the
# later cells use as the regression target.
#
# This code must be active: the following cells read `to_drop`, and the
# preprocessor is built from the filtered column lists. With it commented out,
# a fresh "Restart & Run All" stops with NameError on `to_drop`.
#
# NOTE(review): the `insulin_treatment_*` entries look like one-hot-encoded
# feature names. They match no raw column of `df`, so the raw
# `insulin_treatment` column lands in `to_drop` and is not used by the model.
# Confirm whether that is intended.
important_features = [
    "insulin",
    "insulin_treatment_no_med",
    "insulin_treatment_other_meds",
    "metformin",
    "diabetesMed",
    "insulin_treatment_insulin_combo",
    "insulin_treatment_insulin_only",
    "glyburide",
    "glipizide",
    "glimepiride",
    "time_in_hospital",
]

# Narrow the preprocessing column groups to the retained features.
categorical_columns = [col for col in categorical_columns if col in important_features]
numerical_columns = [col for col in numerical_columns if col in important_features]

all_columns = df.columns.tolist()

# Every column of the frame that is not in the keep-list gets removed.
to_drop = [col for col in all_columns if col not in important_features]
to_drop

In [26]:
# Number of columns scheduled for removal.
len(to_drop)

21

In [27]:
# Remove the columns listed in `to_drop` and preview the reduced frame.
# errors="ignore" keeps this cell re-runnable: `df` is overwritten here, so on
# a second execution the columns are already gone and a plain drop would raise
# KeyError.
df = df.drop(columns=to_drop, errors="ignore")
df.head()

Unnamed: 0,time_in_hospital,metformin,glimepiride,glipizide,glyburide,insulin,diabetesMed
0,1,-2,-2,-2,-2,-2,0
1,3,-2,-2,-2,-2,1,1
2,2,-2,-2,0,-2,-2,1
3,2,-2,-2,-2,-2,1,1
4,1,-2,-2,0,-2,0,1


In [28]:
# (rows, columns) of the frame after the drop.
df.shape

(97070, 7)

In [29]:
# Name of the column the regressor is trained to predict; it is split off
# from the feature matrix before fitting.
target = "time_in_hospital"  # regression target (y)

In [30]:
# Preprocessing branches.
# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: replace gaps with the literal "missing", then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(fill_value="missing", strategy="constant")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group through its branch. Columns named in neither list
# are dropped (ColumnTransformer's default `remainder`).
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numerical_columns),
        ("cat", categorical_transformer, categorical_columns),
    ]
)

In [31]:
# End-to-end model: the preprocessing ColumnTransformer followed by a
# decision-tree regressor.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        # max_depth caps tree growth at 70 levels; random_state fixes the
        # estimator's seed so repeated fits give the same tree.
        ("regressor", DecisionTreeRegressor(max_depth=70, random_state=42)),
    ]
)

In [32]:
#
# Top 10 Most Important Features:
#                              feature  importance
# 14                           insulin    0.514447
# 102         insulin_treatment_no_med    0.051890
# 103     insulin_treatment_other_meds    0.045054
# 7                          metformin    0.042641
# 15                       diabetesMed    0.038324
# 100  insulin_treatment_insulin_combo    0.034873
# 101   insulin_treatment_insulin_only    0.034027
# 11                         glyburide    0.031616
# 10                         glipizide    0.028577
# 9                        glimepiride    0.011772

# Keep only these columns and drop others

In [33]:
# Separate the target from the predictors, then hold out 20% of the rows for
# testing (fixed seed so the split is reproducible).
y = df[target]
X = df.drop(target, axis="columns")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [34]:
# Preview the training features (row labels are the original frame's index).
X_train.head()

Unnamed: 0,metformin,glimepiride,glipizide,glyburide,insulin,diabetesMed
51499,-2,-2,-2,-2,-2,0
32493,-2,-2,0,-2,0,1
53949,0,0,-2,-2,0,1
78710,-2,-2,-2,-2,-1,1
78688,-2,-2,-2,-2,-2,0


In [35]:
# Preview the training target values.
y_train.head()

51499    4
32493    7
53949    2
78710    3
78688    5
Name: time_in_hospital, dtype: int64

In [36]:
#

In [37]:
# Fit the preprocessing steps and the regressor on the training split only.
pipeline.fit(X_train, y_train)

In [38]:
# Predict the target for the held-out test features.
y_pred = pipeline.predict(X_test)

In [39]:
# Test-set performance: compare held-out targets with the pipeline's predictions.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as the target
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared Score: {r2}")

Mean Squared Error: 8.749823499872576
Root Mean Squared Error: 2.9580100574326273
Mean Absolute Error: 2.2994767446085795
R-squared Score: 0.026836661483285007


In [40]:
# Training-set performance, for comparison with the test-set scores
# (a large gap between the two would indicate overfitting).
y_preds_train = pipeline.predict(X_train)

# NOTE: reuses the names mse / rmse / mae / r2, so after this cell they hold
# the training-set values, not the test-set ones.
mse = mean_squared_error(y_train, y_preds_train)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_train, y_preds_train)
r2 = r2_score(y_train, y_preds_train)

print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared Score: {r2}")

Mean Squared Error: 8.4871519647262
Root Mean Squared Error: 2.9132716942856876
Mean Absolute Error: 2.275383594728526
R-squared Score: 0.0333734025918373
