# Healthcare Analytics
### This project aims to predict each patient's Length of Stay to help hospitals optimize resources and function better.

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
np.set_printoptions(suppress=True)
import warnings
warnings.filterwarnings('ignore') #no warning massages will be printed

In [4]:
# Importing datasets
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

## Data Exploration

### Overview of Data

In [5]:
train.head()

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,1,8,c,3,Z,3,radiotherapy,R,F,2.0,31397,7.0,Emergency,Extreme,2,51-60,4911.0,0-10
1,2,2,c,5,Z,2,radiotherapy,S,F,2.0,31397,7.0,Trauma,Extreme,2,51-60,5954.0,41-50
2,3,10,e,1,X,2,anesthesia,S,E,2.0,31397,7.0,Trauma,Extreme,2,51-60,4745.0,31-40
3,4,26,b,2,Y,2,radiotherapy,R,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,7272.0,41-50
4,5,26,b,2,Y,2,radiotherapy,S,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,5558.0,41-50


In [8]:
train.info()
train.Stay.unique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318438 entries, 0 to 318437
Data columns (total 18 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   case_id                            318438 non-null  int64  
 1   Hospital_code                      318438 non-null  int64  
 2   Hospital_type_code                 318438 non-null  object 
 3   City_Code_Hospital                 318438 non-null  int64  
 4   Hospital_region_code               318438 non-null  object 
 5   Available Extra Rooms in Hospital  318438 non-null  int64  
 6   Department                         318438 non-null  object 
 7   Ward_Type                          318438 non-null  object 
 8   Ward_Facility_Code                 318438 non-null  object 
 9   Bed Grade                          318325 non-null  float64
 10  patientid                          318438 non-null  int64  
 11  City_Code_Patient                  3139

array(['0-10', '41-50', '31-40', '11-20', '51-60', '21-30', '71-80',
       'More than 100 Days', '81-90', '61-70', '91-100'], dtype=object)

In [12]:
# NA values in train dataset
null_cols = [(col, train[col].isnull().sum()) for col in train.columns if train[col].isnull().any()]
sorted(null_cols, key=lambda x: x[1], reverse=True)

[('City_Code_Patient', 4532), ('Bed Grade', 113)]

In [13]:
# NA values in test dataset
null_cols = [(col, test[col].isnull().sum()) for col in test.columns if test[col].isnull().any()]
sorted(null_cols, key=lambda x: x[1], reverse=True)


[('City_Code_Patient', 2157), ('Bed Grade', 35)]

In [7]:
# Dimension of train data
train.shape

(318438, 18)

In [8]:
# Dimension of test data
test.shape

(137057, 17)

In [14]:
# Counting distinct values for each variable in train data.
train.nunique()

case_id                              318438
Hospital_code                            32
Hospital_type_code                        7
City_Code_Hospital                       11
Hospital_region_code                      3
Available Extra Rooms in Hospital        18
Department                                5
Ward_Type                                 6
Ward_Facility_Code                        6
Bed Grade                                 4
patientid                             92017
City_Code_Patient                        37
Type of Admission                         3
Severity of Illness                       3
Visitors with Patient                    28
Age                                      10
Admission_Deposit                      7300
Stay                                     11
dtype: int64

In [16]:
# Counting distinct values for each variable in test data.
test.nunique()

case_id                              137057
Hospital_code                            32
Hospital_type_code                        7
City_Code_Hospital                       11
Hospital_region_code                      3
Available Extra Rooms in Hospital        15
Department                                5
Ward_Type                                 6
Ward_Facility_Code                        6
Bed Grade                                 4
patientid                             39607
City_Code_Patient                        37
Type of Admission                         3
Severity of Illness                       3
Visitors with Patient                    27
Age                                      10
Admission_Deposit                      6609
dtype: int64

### Data Preparation

In [11]:
# Filling missing values in Bed Grade column for train and test data
for data in [train, test]:
    data['Bed Grade'].fillna(data['Bed Grade'].mode()[0], inplace=True)


In [12]:
# Filling missing values in City_Code_Patient column for train and test data
for data in [train, test]:
    data['City_Code_Patient'].fillna(data['City_Code_Patient'].mode()[0], inplace=True)

In [17]:
# Label encoding Stay column for train data
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for data in [train]:
    data['Stay'] = le.fit_transform(data['Stay'].astype('str'))

In [18]:
train.head()

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,1,8,c,3,Z,3,radiotherapy,R,F,2.0,31397,7.0,Emergency,Extreme,2,51-60,4911.0,0
1,2,2,c,5,Z,2,radiotherapy,S,F,2.0,31397,7.0,Trauma,Extreme,2,51-60,5954.0,4
2,3,10,e,1,X,2,anesthesia,S,E,2.0,31397,7.0,Trauma,Extreme,2,51-60,4745.0,3
3,4,26,b,2,Y,2,radiotherapy,R,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,7272.0,4
4,5,26,b,2,Y,2,radiotherapy,S,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,5558.0,4


In [19]:
test.head()

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit
0,318439,21,c,3,Z,3,gynecology,S,A,2.0,17006,2.0,Emergency,Moderate,2,71-80,3095.0
1,318440,29,a,4,X,2,gynecology,S,F,2.0,17006,2.0,Trauma,Moderate,4,71-80,4018.0
2,318441,26,b,2,Y,3,gynecology,Q,D,4.0,17006,2.0,Emergency,Moderate,3,71-80,4492.0
3,318442,6,a,6,X,3,gynecology,Q,F,2.0,17006,2.0,Trauma,Moderate,3,71-80,4173.0
4,318443,28,b,11,X,2,gynecology,R,F,2.0,17006,2.0,Trauma,Moderate,4,71-80,4161.0


In [22]:
# Adding dummy Stay column to test data and concatenating with train data 
# I assign dummy value of -1 to the Stay column in the test data, because it is needed for the label encoding step
test['Stay'] = -1 
df = pd.concat([train, test])
df.shape

(455495, 18)

In [23]:

# Label encoding categorical columns for df data
le = LabelEncoder()
for col in ['Hospital_type_code', 'Hospital_region_code', 'Department',
          'Ward_Type', 'Ward_Facility_Code', 'Type of Admission', 'Severity of Illness', 'Age']:
    df[col] = le.fit_transform(df[col].astype(str))

In [24]:
# Splitting df data into train and test data
train = df[df['Stay']!=-1]
test = df.drop('Stay', axis=1)

### Feature Engineering

In [59]:
def get_countid_encode(train, test, cols, name):
  # Use the .agg() method to count the number of cases for each group of values
  temp = train.groupby(cols).agg({'case_id': 'count'}).reset_index().rename(columns = {'case_id': name})
  temp2 = test.groupby(cols).agg({'case_id': 'count'}).reset_index().rename(columns = {'case_id': name})
  # Merge the train and test dataframes with the aggregated values
  train = pd.merge(train, temp, how='left', on= cols)
  test = pd.merge(test,temp2, how='left', on= cols)
  # Convert the column to float type
  train[name] = train[name].astype('float')
  test[name] = test[name].astype('float')
  # Use the same median value from the train dataframe to fill in the missing values for both dataframes
  median_value = np.median(temp[name])
  train[name].fillna(median_value, inplace = True)
  test[name].fillna(median_value, inplace = True)
  # Return the train and test dataframes and the median value
  return train, test, median_value


In [60]:
train.info()
test.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 318438 entries, 0 to 318437
Data columns (total 21 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   case_id                            318438 non-null  int64  
 1   Hospital_code                      318438 non-null  int64  
 2   Hospital_type_code                 318438 non-null  int32  
 3   City_Code_Hospital                 318438 non-null  int64  
 4   Hospital_region_code               318438 non-null  int32  
 5   Available Extra Rooms in Hospital  318438 non-null  int64  
 6   Department                         318438 non-null  int32  
 7   Ward_Type                          318438 non-null  int32  
 8   Ward_Facility_Code                 318438 non-null  int32  
 9   Bed Grade                          318325 non-null  float64
 10  patientid                          318438 non-null  int64  
 11  City_Code_Patient                  3139

In [None]:
train, test = get_countid_encode(train, test, ['patientid'], name = 'patientid')
train, test = get_countid_encode(train, test, 
                                 ['patientid', 'Hospital_region_code'], name = 'count_id_patient_hospitalCode')
train, test = get_countid_encode(train, test, 
                                 ['patientid', 'Ward_Facility_Code'], name = 'count_id_patient_wardfacilityCode')

In [None]:
# Droping duplicate columns
test1 = test.drop(['Stay', 'patientid', 'Hospital_region_code', 'Ward_Facility_Code'], axis =1)
train1 = train.drop(['case_id', 'patientid', 'Hospital_region_code', 'Ward_Facility_Code'], axis =1)

In [None]:

# Splitting train data for Naive Bayes and XGBoost
X1 = train1.drop('Stay', axis =1)
y1 = train1['Stay']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size =0.20, random_state =100)

## Trying the models

### Naive Bayes Model

In [None]:
from sklearn.naive_bayes import GaussianNB
target = y_train.values
features = X_train.values
classifier_nb = GaussianNB()
model_nb = classifier_nb.fit(features, target)

In [23]:
prediction_nb = model_nb.predict(X_test)
from sklearn.metrics import accuracy_score
acc_score_nb = accuracy_score(prediction_nb,y_test)
print("Acurracy:", acc_score_nb*100)

Acurracy: 34.55439015199096


### XGBoost Model

In [24]:
import xgboost
classifier_xgb = xgboost.XGBClassifier(max_depth=4, learning_rate=0.1, n_estimators=800,
                                  objective='multi:softmax', reg_alpha=0.5, reg_lambda=1.5,
                                  booster='gbtree', n_jobs=4, min_child_weight=2, base_score= 0.75)

In [25]:
model_xgb = classifier_xgb.fit(X_train, y_train)

In [26]:
prediction_xgb = model_xgb.predict(X_test)
acc_score_xgb = accuracy_score(prediction_xgb,y_test)
print("Accuracy:", acc_score_xgb*100)

Accuracy: 43.047355859816605


### Neural Network

In [27]:
# Segregation of features and target variable
X = train.drop('Stay', axis =1)
y = train['Stay']
print(X.columns)
z = test.drop('Stay', axis = 1)
print(z.columns)

# Data Scaling
from sklearn import preprocessing
X_scale = preprocessing.scale(X)
X_scale.shape

Index(['case_id', 'Hospital_code', 'Hospital_type_code', 'City_Code_Hospital',
       'Hospital_region_code', 'Available Extra Rooms in Hospital',
       'Department', 'Ward_Type', 'Ward_Facility_Code', 'Bed Grade',
       'patientid', 'City_Code_Patient', 'Type of Admission',
       'Severity of Illness', 'Visitors with Patient', 'Age',
       'Admission_Deposit', 'count_id_patient',
       'count_id_patient_hospitalCode', 'count_id_patient_wardfacilityCode'],
      dtype='object')
Index(['case_id', 'Hospital_code', 'Hospital_type_code', 'City_Code_Hospital',
       'Hospital_region_code', 'Available Extra Rooms in Hospital',
       'Department', 'Ward_Type', 'Ward_Facility_Code', 'Bed Grade',
       'patientid', 'City_Code_Patient', 'Type of Admission',
       'Severity of Illness', 'Visitors with Patient', 'Age',
       'Admission_Deposit', 'count_id_patient',
       'count_id_patient_hospitalCode', 'count_id_patient_wardfacilityCode'],
      dtype='object')


(318438, 20)

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X_scale, y, test_size =0.20, random_state =100)

In [29]:
import keras
from keras.models import Sequential
from keras.layers import Dense
import tensorflow as tf

In [30]:
from keras.utils import to_categorical
#Sparse Matrix
a = to_categorical(y_train)
b = to_categorical(y_test)

In [31]:
model = Sequential()
model.add(Dense(64, activation='relu', input_shape = (254750, 20))) 
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(512, activation='relu'))
model.add(Dense(512, activation='relu'))
model.add(Dense(11, activation='softmax'))

In [32]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 254750, 64)        1344      
_________________________________________________________________
dense_1 (Dense)              (None, 254750, 128)       8320      
_________________________________________________________________
dense_2 (Dense)              (None, 254750, 256)       33024     
_________________________________________________________________
dense_3 (Dense)              (None, 254750, 512)       131584    
_________________________________________________________________
dense_4 (Dense)              (None, 254750, 512)       262656    
_________________________________________________________________
dense_5 (Dense)              (None, 254750, 11)        5643      
Total params: 442,571
Trainable params: 442,571
Non-trainable params: 0
__________________________________________________

In [33]:
model.compile(optimizer= 'SGD', 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])

In [34]:
callbacks = [tf.keras.callbacks.TensorBoard("logs_keras")]
model.fit(X_train, a, epochs=20, callbacks=callbacks, validation_split = 0.2)

Epoch 1/20
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7ff88fd7b9a0>

In [None]:
# Genrating tensorboard
!tensorboard --logdir logs_keras

In [36]:
# Retraining the model with 4 epochs
model.fit(X_train, a, epochs=4, validation_split = 0.2)
print("\n Model Evaluation")
model.evaluate(X_test,b)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4

 Model Evaluation


[1.5047825574874878, 0.42199471592903137]

# Predictions

In [37]:
# Naive Bayes
pred_nb = classifier_nb.predict(test1.iloc[:,1:])
result_nb = pd.DataFrame(pred_nb, columns=['Stay'])
result_nb['case_id'] = test1['case_id']
result_nb = result_nb[['case_id', 'Stay']]

In [38]:
result_nb['Stay'] = result_nb['Stay'].replace({0:'0-10', 1: '11-20', 2: '21-30', 3:'31-40', 4: '41-50', 5: '51-60', 6: '61-70', 7: '71-80', 8: '81-90', 9: '91-100', 10: 'More than 100 Days'})
result_nb.head()

Unnamed: 0,case_id,Stay
0,318439,21-30
1,318440,51-60
2,318441,21-30
3,318442,21-30
4,318443,31-40


In [39]:
# XGBoost
pred_xgb = classifier_xgb.predict(test1.iloc[:,1:])
result_xgb = pd.DataFrame(pred_xgb, columns=['Stay'])
result_xgb['case_id'] = test1['case_id']
result_xgb = result_xgb[['case_id', 'Stay']]

In [40]:
result_xgb['Stay'] = result_xgb['Stay'].replace({0:'0-10', 1: '11-20', 2: '21-30', 3:'31-40', 4: '41-50', 5: '51-60', 6: '61-70', 7: '71-80', 8: '81-90', 9: '91-100', 10: 'More than 100 Days'})
result_xgb.head()

Unnamed: 0,case_id,Stay
0,318439,0-10
1,318440,51-60
2,318441,21-30
3,318442,21-30
4,318443,51-60


In [41]:
# Neural Network
test_scale = preprocessing.scale(z)
test_scale.shape

(137057, 20)

In [42]:
pred = model.predict_classes(test_scale)
pred

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


array([0, 5, 2, ..., 1, 1, 5])

In [43]:
result_nn = pd.DataFrame(pred, columns=['Stay'])
result_nn['case_id'] = test['case_id']
result_nn = result_nn[['case_id', 'Stay']]

In [44]:
result_nn['Stay'] = result_nn['Stay'].replace({0:'0-10', 1: '11-20', 2: '21-30', 3:'31-40', 4: '41-50', 5: '51-60', 6: '61-70', 7: '71-80', 8: '81-90', 9: '91-100', 10: 'More than 100 Days'})
result_nn.head()

Unnamed: 0,case_id,Stay
0,318439,0-10
1,318440,51-60
2,318441,21-30
3,318442,21-30
4,318443,51-60


## Results

In [45]:
# Naive Bayes
print(result_nb.groupby('Stay')['case_id'].nunique())

Stay
0-10                   2598
11-20                 26827
21-30                 72206
31-40                 15639
41-50                   469
51-60                 13651
61-70                    92
71-80                   955
81-90                   296
91-100                    2
More than 100 Days     4322
Name: case_id, dtype: int64


In [46]:
# XGBoost
print(result_xgb.groupby('Stay')['case_id'].nunique())

Stay
0-10                   4373
11-20                 39337
21-30                 58261
31-40                 12100
41-50                    61
51-60                 19217
61-70                    16
71-80                   302
81-90                  1099
91-100                   78
More than 100 Days     2213
Name: case_id, dtype: int64


In [47]:
# Neural Networks
print(result_nn.groupby('Stay')['case_id'].nunique())

Stay
0-10                   5379
11-20                 41215
21-30                 55240
31-40                 10926
41-50                     9
51-60                 20016
71-80                    29
81-90                  1126
More than 100 Days     3117
Name: case_id, dtype: int64
