In [12]:
#  import necessary modules & libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV

In [2]:
# read data from file
df = pd.read_csv("../cleaned_df.csv", index_col=0)


In [3]:
# Test data sample
df.head(2)

Unnamed: 0,available_extra_rooms_in_hospital,bed_grade,severity_of_illness,visitors_with_patient,admission_deposit,hospital_code_1,hospital_code_2,hospital_code_3,hospital_code_4,hospital_code_5,...,age_11-20,age_21-30,age_31-40,age_41-50,age_51-60,age_61-70,age_71-80,age_81-90,age_91-100,stay
0,3,1,0,2,4911.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0-10
1,2,1,0,2,5954.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,41-50


## Imbalanced data

As mentioned in the dataset notebook, the dataset is imbalanced. I will utilize SMOTE to create synthetic samples for the minor class to address the issue. 

In [4]:
# Define X and y
X = df.drop(columns=["stay"])
y = df["stay"]

# Oversampling the data
smote = SMOTE(sampling_strategy="auto")
X, y = smote.fit_resample(X, y)


In [5]:

# split data with standard 80%/10%/10% split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


### Model Development

In [6]:
# Try both Gini impurity index and Entropy index (experiment with different values of max_depth,
# max_features, min_impurity_decrease, min_samples_leaf, min_samples_split)

dt_gini = DecisionTreeClassifier(criterion='gini', random_state=42)
dt_entropy = DecisionTreeClassifier(criterion='entropy', random_state=42)

In [7]:
# Fit the models
dt_gini.fit(X_train, y_train)
dt_entropy.fit(X_train, y_train)

# Evaluate the models
y_pred_gini = dt_gini.predict(X_val)
y_pred_entropy = dt_entropy.predict(X_val)

# Calculate the accuracy of the models
accuracy_gini = accuracy_score(y_val, y_pred_gini)
accuracy_entropy = accuracy_score(y_val, y_pred_entropy)

print(f"Gini model accuracy: {accuracy_gini}")
print(f"Entropy model accuracy: {accuracy_entropy}")


Gini model accuracy: 0.5915004156275977
Entropy model accuracy: 0.5905756442227764


The 2 primitive models return the average accuracy. Next, I will leverage GridSearchCV to experiment with different hyperparameters to obtain the best possible results

In [19]:
param_grid = {
    "max_depth": [80, 90, 100],  # Maximum tree depth
    "max_features": [20, 40, None],  # Number of features to consider at each split
    'min_impurity_decrease': [0.0, 0.1, 0.2],  # Minimum decrease in impurity required for a split
    'min_samples_leaf': [100, 500, 1000],  # Minimum samples required at a leaf node
    'min_samples_split': [100, 500, 1000],  # Minimum samples required to split a node
    'criterion': ['gini', 'entropy']  # Splitting criterion (either Gini or entropy)
}

In [20]:
model = DecisionTreeClassifier(random_state=42)

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5,n_jobs=-1, verbose=2, scoring='accuracy')

In [21]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 486 candidates, totalling 2430 fits


In [22]:
best_model = grid_search.best_estimator_
best_accuracy = grid_search.best_score_
print("Best Model:", best_model)
print("Best Accuracy:", best_accuracy)


Best Model: DecisionTreeClassifier(max_depth=15, min_samples_leaf=100,
                       min_samples_split=100, random_state=42)
Best Accuracy: 0.4275587073981712


The best model I can find at this point is dt_gini.

In [30]:
# Use classification_report and confusion_matrix to evaluate the performance of the model
from sklearn.metrics import classification_report, confusion_matrix

print("classification report on training:")
print(classification_report(y_train, dt_gini.predict(X_train)))
print("Classification Report on validation:")
print(classification_report(y_val, dt_gini.predict(X_val)))




classification report on training:
                    precision    recall  f1-score   support

              0-10       1.00      1.00      1.00     70038
             11-20       1.00      1.00      1.00     69968
             21-30       1.00      1.00      1.00     69844
             31-40       1.00      1.00      1.00     69940
             41-50       1.00      1.00      1.00     69977
             51-60       1.00      1.00      1.00     70165
             61-70       1.00      1.00      1.00     69916
             71-80       1.00      1.00      1.00     70114
             81-90       1.00      1.00      1.00     69988
            91-100       1.00      1.00      1.00     70102
More than 100 Days       1.00      1.00      1.00     69868

          accuracy                           1.00    769920
         macro avg       1.00      1.00      1.00    769920
      weighted avg       1.00      1.00      1.00    769920

Classification Report on validation:
                    preci

In [37]:
            # confusion matrix
cm = confusion_matrix(y_val, dt_gini.predict(X_val))
class_names = ["0-10", "11-20", "21-30","31-40", "41-50", "51-60", "61-70", "71-80", "81-90", "91-100", "More than 100 Days"]

# Create a DataFrame with cm and class_names
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
print(df_cm)

                    0-10  11-20  21-30  31-40  41-50  51-60  61-70  71-80  \
0-10                4999   1031    618    668    724    261    180    154   
11-20               1133   2957   2358   1226    417    409     75    124   
21-30                662   2453   3374   1334    384    354     77     78   
31-40                645   1198   1307   2483    805   1245    220    478   
41-50                682    354    388    665   5355    346    391    249   
51-60                273    421    342   1272    393   3499    164   1066   
61-70                 98     59     82    123    249    107   7801    126   
71-80                129    128     68    407    234    855    162   5376   
81-90                 38     23     44    101     80    366     74    335   
91-100                25     27      9    124     67    243     70    292   
More than 100 Days    53     34     28    137    110    335    103    434   

                    81-90  91-100  More than 100 Days  
0-10               

In [9]:
# display_tree in utils.py to display each tree

## Conclusion

Discuss the last hyperparameters based on the training & validation metrics.