In [5]:
import os
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder




In [6]:
random.seed(0)

In [7]:
## loading the file
health_care = pd.read_csv('./healthcare/train_data.csv', na_values=['NA', 'NaN', '?'])

In [8]:
# Checking for NA values
health_care.isnull().values.any()

True

In [9]:
# Counting  NA values
health_care.isnull().sum().sum()

4645

In [10]:
for column in health_care.columns :
    print(f"Number of NA values in {column} is : { health_care[column].isnull().sum()} ")


Number of NA values in case_id is : 0 
Number of NA values in Hospital_code is : 0 
Number of NA values in Hospital_type_code is : 0 
Number of NA values in City_Code_Hospital is : 0 
Number of NA values in Hospital_region_code is : 0 
Number of NA values in Available Extra Rooms in Hospital is : 0 
Number of NA values in Department is : 0 
Number of NA values in Ward_Type is : 0 
Number of NA values in Ward_Facility_Code is : 0 
Number of NA values in Bed Grade is : 113 
Number of NA values in patientid is : 0 
Number of NA values in City_Code_Patient is : 4532 
Number of NA values in Type of Admission is : 0 
Number of NA values in Severity of Illness is : 0 
Number of NA values in Visitors with Patient is : 0 
Number of NA values in Age is : 0 
Number of NA values in Admission_Deposit is : 0 
Number of NA values in Stay is : 0 


In [11]:
# Dropping these columns for now
health_care = health_care.dropna()

In [12]:
for column in health_care.columns :
    print(f"Number of NA values in {column} is : { health_care[column].isnull().sum()} ")


Number of NA values in case_id is : 0 
Number of NA values in Hospital_code is : 0 
Number of NA values in Hospital_type_code is : 0 
Number of NA values in City_Code_Hospital is : 0 
Number of NA values in Hospital_region_code is : 0 
Number of NA values in Available Extra Rooms in Hospital is : 0 
Number of NA values in Department is : 0 
Number of NA values in Ward_Type is : 0 
Number of NA values in Ward_Facility_Code is : 0 
Number of NA values in Bed Grade is : 0 
Number of NA values in patientid is : 0 
Number of NA values in City_Code_Patient is : 0 
Number of NA values in Type of Admission is : 0 
Number of NA values in Severity of Illness is : 0 
Number of NA values in Visitors with Patient is : 0 
Number of NA values in Age is : 0 
Number of NA values in Admission_Deposit is : 0 
Number of NA values in Stay is : 0 


In [13]:
health_care.tail()

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
318433,318434,6,a,6,X,3,radiotherapy,Q,F,4.0,86499,23.0,Emergency,Moderate,3,41-50,4144.0,11-20
318434,318435,24,a,1,X,2,anesthesia,Q,E,4.0,325,8.0,Urgent,Moderate,4,81-90,6699.0,31-40
318435,318436,7,a,4,X,3,gynecology,R,F,4.0,125235,10.0,Emergency,Minor,3,71-80,4235.0,11-20
318436,318437,11,b,2,Y,3,anesthesia,Q,D,3.0,91081,8.0,Trauma,Minor,5,11-20,3761.0,11-20
318437,318438,19,a,7,Y,5,gynecology,Q,C,2.0,21641,8.0,Emergency,Minor,2,11-20,4752.0,0-10


The columns to be pre-processed are :
1. Hospital_type_code
2. Hospital_region_code
3. Department
4. Ward_type
5. Ward_Facility_Code
6. City_Code_Patient
7. Type of Admission
8. Severity of Illness
9. Age
11. Stay



In [14]:
health_care.columns

Index(['case_id', 'Hospital_code', 'Hospital_type_code', 'City_Code_Hospital',
       'Hospital_region_code', 'Available Extra Rooms in Hospital',
       'Department', 'Ward_Type', 'Ward_Facility_Code', 'Bed Grade',
       'patientid', 'City_Code_Patient', 'Type of Admission',
       'Severity of Illness', 'Visitors with Patient', 'Age',
       'Admission_Deposit', 'Stay'],
      dtype='object')

In [15]:
## Pre processing these columns

string_columns = ['Hospital_type_code', 'Hospital_region_code', 'Department', 'Ward_Type', 'Ward_Facility_Code',  'Type of Admission', 'Severity of Illness', 'Age' ]
encoder = LabelEncoder()
ohEncoder = OneHotEncoder()

## label_encoded data
X_le = health_care.drop(['Stay', 'case_id'], axis=1)
y_le = health_care['Stay']

y_le = encoder.fit_transform(y_le)
for column in string_columns :
    X_le[column] = encoder.fit(X_le[column]).transform(X_le[column])
    

In [16]:
X_le.head()

Unnamed: 0,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit
0,8,2,3,2,3,3,2,5,2.0,31397,7.0,0,0,2,5,4911.0
1,2,2,5,2,2,3,3,5,2.0,31397,7.0,1,0,2,5,5954.0
2,10,4,1,0,2,1,3,4,2.0,31397,7.0,1,0,2,5,4745.0
3,26,1,2,1,2,3,2,3,2.0,31397,7.0,1,0,2,5,7272.0
4,26,1,2,1,2,3,3,3,2.0,31397,7.0,1,0,2,5,5558.0


In [17]:
y_le = 10*y_le + 5
y_le

array([ 5, 45, 35, ..., 15, 15,  5])

In [18]:
## getting the inputs and labels

X = X_le.values
y = y_le

In [19]:
print('X.shape', X.shape)
print('y.shape', y.shape)


X.shape (313793, 16)
y.shape (313793,)


In [20]:
for column in string_columns :
    print(f'max {column} is : {X_le[column].max()}')

max Hospital_type_code is : 6
max Hospital_region_code is : 2
max Department is : 4
max Ward_Type is : 5
max Ward_Facility_Code is : 5
max Type of Admission is : 2
max Severity of Illness is : 2
max Age is : 9


In [21]:
for column in health_care.columns :
    x = pd.unique(health_care[column])
    print(f'number of unique elements of  {column} is : {len(x)}')

number of unique elements of  case_id is : 313793
number of unique elements of  Hospital_code is : 32
number of unique elements of  Hospital_type_code is : 7
number of unique elements of  City_Code_Hospital is : 11
number of unique elements of  Hospital_region_code is : 3
number of unique elements of  Available Extra Rooms in Hospital is : 18
number of unique elements of  Department is : 5
number of unique elements of  Ward_Type is : 6
number of unique elements of  Ward_Facility_Code is : 6
number of unique elements of  Bed Grade is : 4
number of unique elements of  patientid is : 90344
number of unique elements of  City_Code_Patient is : 37
number of unique elements of  Type of Admission is : 3
number of unique elements of  Severity of Illness is : 3
number of unique elements of  Visitors with Patient is : 28
number of unique elements of  Age is : 10
number of unique elements of  Admission_Deposit is : 7283
number of unique elements of  Stay is : 11


## Mean of stay

In [22]:
print(f'The mean of stay is {y_le.mean()} days')

The mean of stay is 31.960034162648626 days


# Linear Regression

In [23]:
from sklearn.linear_model import LinearRegression

In [24]:
model = LinearRegression(fit_intercept=True)


In [25]:
model.fit(X, y)

LinearRegression()

In [26]:
a, b, score = model.coef_, model.intercept_, model.score(X, y)

In [27]:
print('score : %.2f' % score)

score : 0.37


In [28]:
from sklearn.metrics import mean_squared_error
y_pred = model.predict(X)
print('Mean of stay %.2f' % y.mean())
print('Predicted mean of stay: %.2f' % y_pred.mean())
print('RMSE : %.2f' % np.sqrt(mean_squared_error(y, y_pred)))
print('Max predicted length of stay %.2f' % y_pred.max())

Mean of stay 31.96
Predicted mean of stay: 31.96
RMSE : 17.11
Max predicted length of stay 245.65


In [29]:
print('Mean model RMSE : %.2f' % np.sqrt(mean_squared_error(y, y.mean()*np.ones(len(y))  )) )


Mean model RMSE : 21.48


In [30]:
l = ['case_id', 'Stay', 'patientid']
columns = [ c for c in health_care.columns if c not in l ]
print(columns)

['Hospital_code', 'Hospital_type_code', 'City_Code_Hospital', 'Hospital_region_code', 'Available Extra Rooms in Hospital', 'Department', 'Ward_Type', 'Ward_Facility_Code', 'Bed Grade', 'City_Code_Patient', 'Type of Admission', 'Severity of Illness', 'Visitors with Patient', 'Age', 'Admission_Deposit']


In [112]:
for c in columns :
    Xc = X_le[c].values
    model = LinearRegression(fit_intercept=True)
    reg = model.fit(Xc.reshape(-1,1), y)
    sc = model.score(Xc.reshape(-1,1),y)
    yp = model.predict(Xc.reshape(-1,1))
    print(f'Feature used is : {c}')
    print('score : %.4f' % sc)
    print('Mean of stay %.2f' % y.mean())
    print('Predicted mean of stay: %.2f' % yp.mean())
    print('RMSE : %.2f' % np.sqrt(mean_squared_error(y, yp)))
    print('Max predicted length of stay %.2f' % yp.max())
    print('-------------------------------------------------')
    

Feature used is : Hospital_code
score : 0.0018
Mean of stay 31.96
Predicted mean of stay: 31.96
RMSE : 21.46
Max predicted length of stay 33.42
-------------------------------------------------
Feature used is : Hospital_type_code
score : 0.0035
Mean of stay 31.96
Predicted mean of stay: 31.96
RMSE : 21.44
Max predicted length of stay 35.87
-------------------------------------------------
Feature used is : City_Code_Hospital
score : 0.0000
Mean of stay 31.96
Predicted mean of stay: 31.96
RMSE : 21.48
Max predicted length of stay 32.34
-------------------------------------------------
Feature used is : Hospital_region_code
score : 0.0001
Mean of stay 31.96
Predicted mean of stay: 31.96
RMSE : 21.48
Max predicted length of stay 32.38
-------------------------------------------------
Feature used is : Available Extra Rooms in Hospital
score : 0.0148
Mean of stay 31.96
Predicted mean of stay: 31.96
RMSE : 21.32
Max predicted length of stay 39.11
-------------------------------------------

In [56]:
import sklearn
from sklearn.linear_model import Lasso
from sklearn import metrics

# Create linear regression
regressor = Lasso(alpha=0)

# Fit/train LASSO
regressor.fit(X,y)
# Predict
pred = regressor.predict(X)

a= regressor.coef_
b = regressor.intercept_
c = np.sqrt(metrics.mean_squared_error(pred,y))

print('RMSE : %.2f' % c)
print('score : %.2f' % regressor.score(X,y))



RMSE : 18.99
score : 0.22


In [51]:
print('score : %.2f' % regressor.score(X,y))

score : 0.37


In [60]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping


In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
model = Sequential()
model.add(Dense(64, input_dim=X.shape[1], activation='relu'))
model.add(Dense(1)) # Output
model.compile(loss='mean_squared_error', optimizer='adam')
monitor = EarlyStopping(monitor='loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
model.fit(X_train,y_train,verbose=0,epochs=100)
pred = model.predict(X_test)
# Measure RMSE error.  
score = np.sqrt(metrics.mean_squared_error(pred,y_test))
print("Final score (RMSE): {}".format(score))

Final score (RMSE): 16.987498268760206


In [62]:
model = Sequential()
model.add(Dense(64, input_dim=X.shape[1], activation='relu'))
model.add(Dense(64)) 
model.add(Dense(1)) # Output
model.compile(loss='mean_squared_error', optimizer='adam')
monitor = EarlyStopping(monitor='loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
model.fit(X_train,y_train,verbose=0,epochs=100)
pred = model.predict(X_test)
# Measure RMSE error.  
score = np.sqrt(metrics.mean_squared_error(pred,y_test))
print("Final score (RMSE): {}".format(score))

Final score (RMSE): 17.101368794413656


In [77]:
from sklearn.preprocessing import StandardScaler


In [84]:
scaler = StandardScaler()
for i in range(X_train.shape[1]) :
    X_train[:,i]= scaler.fit_transform(X_train[:,i].reshape(-1, 1))[:,0]
    X_test[:,i] = scaler.transform(X_test[:,i].reshape(-1, 1))[:,0]

    

In [85]:
print(X_test.shape)
print(X_train.shape)

(78449, 16)
(235344, 16)


In [86]:
for i in range(X_train.shape[1]) :
    print(X_train[:10,i])

[ 0.54260906  0.54260906  0.07916298  0.7743321  -1.42703678  0.54260906
  0.31088602 -0.50014462  0.54260906  1.3536397 ]
[-0.81954518 -0.81954518 -0.81954518  1.78713849 -0.81954518 -0.81954518
  0.48379665 -0.81954518 -0.81954518  0.48379665]
[ 0.39510011  0.39510011  0.71738868 -1.21634271  0.39510011  0.39510011
 -0.57176558 -1.21634271  0.39510011 -0.57176558]
[-1.03711316 -1.03711316  0.29207248 -1.03711316 -1.03711316 -1.03711316
  1.62125812 -1.03711316 -1.03711316  1.62125812]
[ 1.53969622  1.53969622 -0.16903714  0.68532954 -0.16903714 -0.16903714
 -0.16903714 -0.16903714 -1.02340381 -0.16903714]
[ 1.87889614  1.87889614  0.10001591  0.10001591 -1.67886433  0.10001591
  0.10001591 -1.67886433  1.87889614  0.10001591]
[ 0.13849744 -1.10438305 -1.10438305  0.13849744  0.13849744  0.13849744
  0.13849744  0.13849744  1.38137792  0.13849744]
[ 1.01632609  1.01632609 -0.76082115  0.42394368  1.01632609  1.01632609
 -1.94558599  0.42394368  1.01632609 -1.94558599]
[-1.86151664 -0.

In [87]:
model = Sequential()
model.add(Dense(64, input_dim=X.shape[1], activation='relu'))
model.add(Dense(1)) # Output
model.compile(loss='mean_squared_error', optimizer='adam')
monitor = EarlyStopping(monitor='loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
model.fit(X_train,y_train,verbose=0,epochs=100)
pred = model.predict(X_test)
# Measure RMSE error.  
score = np.sqrt(metrics.mean_squared_error(pred,y_test))
print("Final score (RMSE): {}".format(score))

Final score (RMSE): 15.575867820524966


In [90]:
model = Sequential()
model.add(Dense(64, input_dim=X.shape[1], activation='relu'))
model.add(Dense(1)) # Output
model.compile(loss='mean_squared_error', optimizer='adam')
monitor = EarlyStopping(monitor='loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
model.fit(X_train,y_train,verbose=1,epochs=200)
pred = model.predict(X_test)
# Measure RMSE error.  
score = np.sqrt(metrics.mean_squared_error(pred,y_test))
print("Final score (RMSE): {}".format(score))

Train on 235344 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/2

Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoc

Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200
Final score (RMSE): 15.466665252288074


In [92]:
model = Sequential()
model.add(Dense(32, input_dim=X.shape[1], activation='relu'))
model.add(Dense(32)) 
model.add(Dense(1)) # Output
model.compile(loss='mean_squared_error', optimizer='adam')
monitor = EarlyStopping(monitor='loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
model.fit(X_train,y_train,verbose=1,epochs=50)
pred = model.predict(X_test)
# Measure RMSE error.  
score = np.sqrt(metrics.mean_squared_error(pred,y_test))
print("Final score (RMSE): {}".format(score))

Train on 235344 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Final score (RMSE): 15.569742276039877
