In [1]:
#  import necessary modules & libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

In [2]:
# read data from file
df = pd.read_csv("../cleaned_df.csv", index_col=0)


In [3]:
# Test data sample
df.head(2)

Unnamed: 0,available_extra_rooms_in_hospital,bed_grade,severity_of_illness,visitors_with_patient,admission_deposit,hospital_code_1,hospital_code_2,hospital_code_3,hospital_code_4,hospital_code_5,...,age_11-20,age_21-30,age_31-40,age_41-50,age_51-60,age_61-70,age_71-80,age_81-90,age_91-100,stay
0,3,1,0,2,4911.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0-10
1,2,1,0,2,5954.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,41-50


## Imbalanced data

As mentioned in the dataset notebook, the dataset is imbalanced. I will utilize SMOTE to create synthetic samples for the minor class to address the issue. 

In [4]:
# Define X and y
X = df.drop(columns=["stay"])
y = df["stay"]

# Oversampling the data
smote = SMOTE(sampling_strategy="auto")
X, y = smote.fit_resample(X, y)


In [5]:

# split data with standard 80%/10%/10% split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


### Model Development

In [6]:
# Try both Gini impurity index and Entropy index (experiment with different values of max_depth,
# max_features, min_impurity_decrease, min_samples_leaf, min_samples_split)

dt_gini = DecisionTreeClassifier(criterion='gini', random_state=42)
dt_entropy = DecisionTreeClassifier(criterion='entropy', random_state=42)

In [7]:
# Fit the models
dt_gini.fit(X_train, y_train)
dt_entropy.fit(X_train, y_train)

# Evaluate the models
y_pred_gini = dt_gini.predict(X_val)
y_pred_entropy = dt_entropy.predict(X_val)

# Calculate the accuracy of the models
accuracy_gini = accuracy_score(y_val, y_pred_gini)
accuracy_entropy = accuracy_score(y_val, y_pred_entropy)

print(f"Gini model accuracy: {accuracy_gini}")
print(f"Entropy model accuracy: {accuracy_entropy}")


Gini model accuracy: 0.5915004156275977
Entropy model accuracy: 0.5905756442227764


The 2 primitive models return the average accuracy. Next, I will leverage GridSearchCV to experiment with different hyperparameters to obtain the best possible results

In [8]:
# Use classification_report and confusion_matrix to evaluate the performance of the model

In [9]:
# display_tree in utils.py to display each tree

## Conclusion

Discuss the last hyperparameters based on the training & validation metrics.