In [24]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error, recall_score,precision_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import AdaBoostRegressor,AdaBoostClassifier
import xgboost as xgb
from sklearn.model_selection import cross_validate,GridSearchCV,RandomizedSearchCV
import joblib

In [2]:
# loading the dataset.
df = pd.read_csv('../data/processed/processed_data.csv')
df

Unnamed: 0,age_group,length_of_stay,type_of_admission,ccs_diagnosis_description,ccs_procedure_description,apr_drg_description,apr_mdc_description,apr_severity_of_illness_description,apr_risk_of_mortality,gender_M,apr_medical_surgical_description_Surgical,emergency_department_indicator_Y
0,3,5,7.29,11.69,6.36,12.58,11.66,1,1,0,0,1
1,1,8,7.29,11.69,6.36,12.58,11.66,2,1,0,0,1
2,2,8,7.29,11.69,6.36,12.58,11.66,1,1,1,0,1
3,3,4,7.29,5.87,6.36,4.89,7.36,1,1,0,0,1
4,3,3,7.29,6.04,6.36,5.74,7.21,2,2,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
698106,3,3,7.29,8.01,6.16,5.38,6.91,3,3,0,1,1
698107,1,1,7.29,4.71,7.35,4.78,5.19,3,3,0,0,1
698108,4,3,7.29,6.85,4.51,6.66,6.91,3,3,1,0,1
698109,4,2,7.29,10.68,6.20,12.65,9.68,4,4,0,1,1


In [3]:
df.length_of_stay = df.length_of_stay.astype(float)

In [4]:
df.length_of_stay

0         5.0
1         8.0
2         8.0
3         4.0
4         3.0
         ... 
698106    3.0
698107    1.0
698108    3.0
698109    2.0
698110    2.0
Name: length_of_stay, Length: 698111, dtype: float64

In [5]:
# divede the data fram to x = features & y = target
x = df.drop("length_of_stay", axis = 1)
y = df["length_of_stay"]

In [6]:
# taking sample of the data using stratify to maintain data characteristics
main_x,sample_x,main_y,sample_y = train_test_split(x,y,test_size=0.03,random_state=42,stratify=y)

In [7]:
sample_y.value_counts()

2.0     2550
3.0     2490
1.0     2274
4.0     2141
5.0     1780
6.0     1508
7.0     1300
20.0    1205
8.0     1043
9.0      837
10.0     689
11.0     574
12.0     487
13.0     441
14.0     408
15.0     329
16.0     274
17.0     236
18.0     204
19.0     174
Name: length_of_stay, dtype: int64

In [8]:
smote = SMOTE(random_state=14)
sample_x,sample_y = smote.fit_resample(sample_x,sample_y)

In [9]:
sample_y.value_counts()

4.0     2550
12.0    2550
16.0    2550
15.0    2550
17.0    2550
14.0    2550
18.0    2550
11.0    2550
20.0    2550
3.0     2550
10.0    2550
7.0     2550
2.0     2550
5.0     2550
8.0     2550
9.0     2550
1.0     2550
6.0     2550
13.0    2550
19.0    2550
Name: length_of_stay, dtype: int64

In [10]:
sample_y.count()

51000

In [11]:
sample_y.isnull().sum()

0

In [12]:
sample_x.isnull().sum()

age_group                                    0
type_of_admission                            0
ccs_diagnosis_description                    0
ccs_procedure_description                    0
apr_drg_description                          0
apr_mdc_description                          0
apr_severity_of_illness_description          0
apr_risk_of_mortality                        0
gender_M                                     0
apr_medical_surgical_description_Surgical    0
emergency_department_indicator_Y             0
dtype: int64

In [13]:
x_train,x_test,y_train,y_test = train_test_split(sample_x,sample_y,test_size=0.2,random_state=42,stratify=sample_y)

In [14]:
# Using Stander Scaler for x_train, x_test
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

### As we are dealing with multi class model we have conseder the folloing:
#### The average parameter in the scikit-learn classification metrics functions controls how the metric is calculated across multiple classes. For multiclass classification tasks, the average parameter can be set to None, micro, macro, or weighted.

1) None: The metric is calculated separately for each class, and the results are not averaged.
2) micro: The metric is calculated over all predictions, regardless of the class.
3) macro: The metric is calculated for each class and then averaged across all classes.
4) weighted: The metric is calculated for each class, weighted by the number of samples in that class.
#### For multiclass classification tasks, it is generally recommended to use the micro or macro averaging setting. The binary averaging setting is only intended for binary classification tasks here we are going to use macro averaging sitting.

In [15]:
# trying three different classification models to compare between them
models = {"knnc": KNeighborsClassifier(),
          "rfc": RandomForestClassifier()}

In [16]:
for name,model in models.items():
        print("----------- ", name, " -------------")
        model.fit(x_train_scaled,y_train)
        y_train_pred = model.predict(x_train_scaled)
        y_test_pred = model.predict(x_test_scaled)
        print("Train Acurracy Score", round(model.score(x_train_scaled,y_train),2), "Test Acurracy Score", round(model.score(x_test_scaled,y_test),2))
        print("Train Precision Score: ", round(precision_score(y_train,y_train_pred,average='macro'),2),"Test Precision Score: ", round(precision_score(y_test,y_test_pred,average='macro'),2))
        print("Train Recall Score: ", round(recall_score(y_train,y_train_pred,average='macro'),2),"Test Recall Score: ", round(recall_score(y_test,y_test_pred,average='macro'),2))
        print("*"*60)

-----------  knnc  -------------
Train Acurracy Score 0.61 Test Acurracy Score 0.4
Train Precision Score:  0.6 Test Precision Score:  0.39
Train Recall Score:  0.61 Test Recall Score:  0.41
************************************************************
-----------  rfc  -------------
Train Acurracy Score 0.98 Test Acurracy Score 0.54
Train Precision Score:  0.98 Test Precision Score:  0.52
Train Recall Score:  0.98 Test Recall Score:  0.54
************************************************************


In [17]:
rfc = RandomForestClassifier()

In [18]:
model.fit(x_train_scaled,y_train)
y_train_pred = model.predict(x_train_scaled)
y_test_pred = model.predict(x_test_scaled)
print("Train Acurracy Score", round(model.score(x_train_scaled,y_train),2), "Test Acurracy Score", round(model.score(x_test_scaled,y_test),2))
print("Train Precision Score: ", round(precision_score(y_train,y_train_pred,average='macro'),2),"Test Precision Score: ", round(precision_score(y_test,y_test_pred,average='macro'),2))
print("Train Recall Score: ", round(recall_score(y_train,y_train_pred,average='macro'),2),"Test Recall Score: ", round(recall_score(y_test,y_test_pred,average='macro'),2))
print("*"*60)

Train Acurracy Score 0.98 Test Acurracy Score 0.54
Train Precision Score:  0.98 Test Precision Score:  0.52
Train Recall Score:  0.98 Test Recall Score:  0.54
************************************************************


## Parameter Tuning

#### Here are some parameters that we can use for a random forest classification model for cross-validation with grid & random search to handle model overfitting:

     * n_estimators: The number of trees in the forest. A higher number of trees will generally improve the accuracy of the model, but it will also increase the training time. we can try values between 100 and 1000.

     * max_depth: The maximum depth of each tree in the forest. A deeper tree will be able to learn more complex relationships between the features and the target variable, but it is also more likely to overfit the training data. we can try values between 3 and 10.

     * min_samples_split: The minimum number of samples required to split a node in the tree. A higher value will make the model more robust to overfitting, but it may also reduce the accuracy of the model. we can try values between 2 and 10.

     * min_samples_leaf: The minimum number of samples required in a leaf node. A higher value will make the model more robust to overfitting, but it may also reduce the accuracy of the model. we can try values between 1 and 5.

     * max_features: The maximum number of features to consider when splitting a node in the tree. A higher value will allow the model to learn more complex relationships between the features and the target variable, but it is also more likely to overfit the training data. we can try values between sqrt(n_features) and n_features.

     * bootstrap: Whether to bootstrap the data when training the trees in the forest. Bootstrapping can help to reduce overfitting, but it may also reduce the accuracy of the model. we can try both True and False.

     * criterion: The criterion used to split nodes in the trees. The two most common criteria are Gini impurity and entropy. we can try both Gini impurity and entropy.

     * oob_score: Whether to calculate the out-of-bag (OOB) score during training. The OOB score is an estimate of the accuracy of the model on unseen data. we can try both True and False.

     * cv: The number of folds to use in cross-validation. we can try values between 3 and 10.


In [19]:
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [1, 3, 5],
    'min_samples_split': [15,20,25],
    'min_samples_leaf': [15,20,25],
    'max_features': ['sqrt', 'log2',],
    'bootstrap': [True],
    'criterion': ['gini', 'entropy'],
}

# Perform grid search
random_search = RandomizedSearchCV(rfc, param_grid, cv=10,scoring='accuracy')
random_search.fit(x_train_scaled,y_train)

RandomizedSearchCV(cv=10, estimator=RandomForestClassifier(),
                   param_distributions={'bootstrap': [True],
                                        'criterion': ['gini', 'entropy'],
                                        'max_depth': [1, 3, 5],
                                        'max_features': ['sqrt', 'log2'],
                                        'min_samples_leaf': [15, 20, 25],
                                        'min_samples_split': [15, 20, 25],
                                        'n_estimators': [100, 200, 300]},
                   scoring='accuracy')

In [20]:
# Get the best parameters
best_params = random_search.best_params_
best_params

{'n_estimators': 200,
 'min_samples_split': 25,
 'min_samples_leaf': 15,
 'max_features': 'sqrt',
 'max_depth': 5,
 'criterion': 'gini',
 'bootstrap': True}

In [21]:
random_search.best_estimator_

RandomForestClassifier(max_depth=5, max_features='sqrt', min_samples_leaf=15,
                       min_samples_split=25, n_estimators=200)

In [23]:
# Train the model with the best parameters
rfc =RandomForestClassifier(criterion='gini', max_depth=5, max_features='sqrt',
                       min_samples_leaf=15, min_samples_split=25,
                       n_estimators=200)
#model.fit(x_train_scaled,y_train)
rfc.fit(x_train_scaled,y_train)
y_train_pred = model.predict(x_train_scaled)
y_test_pred = model.predict(x_test_scaled)
print("Train Acurracy Score", round(model.score(x_train_scaled,y_train),2), "Test Acurracy Score", round(model.score(x_test_scaled,y_test),2))
print("Train Precision Score: ", round(precision_score(y_train,y_train_pred,average='macro'),2),"Test Precision Score: ", round(precision_score(y_test,y_test_pred,average='macro'),2))
print("Train Recall Score: ", round(recall_score(y_train,y_train_pred,average='macro'),2),"Test Recall Score: ", round(recall_score(y_test,y_test_pred,average='macro'),2))
print("*"*60)

Train Acurracy Score 0.98 Test Acurracy Score 0.54
Train Precision Score:  0.98 Test Precision Score:  0.52
Train Recall Score:  0.98 Test Recall Score:  0.54
************************************************************


In [27]:
# list for features names
features = list(x_train.columns)
features

['age_group',
 'type_of_admission',
 'ccs_diagnosis_description',
 'ccs_procedure_description',
 'apr_drg_description',
 'apr_mdc_description',
 'apr_severity_of_illness_description',
 'apr_risk_of_mortality',
 'gender_M',
 'apr_medical_surgical_description_Surgical',
 'emergency_department_indicator_Y']

In [29]:
# saving random search bet model estimator to variables
rfc_model = random_search.best_estimator_

In [30]:
# creating h5 files for randomforest models for length of stay
joblib.dump(rfc_model,"../models/rfc_model.h5")
joblib.dump(features,"../models/features.h5")
joblib.dump(scaler,"../models/scaler.h5")

['../models/scaler.h5']