In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder




In [2]:
## Read the training data into a DataFrame
health_care = pd.read_csv(filepath_or_buffer='./healthcare/train_data.csv')


In [3]:
# Boolean mask of missing entries, computed once and reused below
na_mask = health_care.isnull()

# Is there any NA value at all?
print(na_mask.values.any())

# Total number of NA values in the whole frame
print(na_mask.sum().sum())

# NA count for every column
for column, na_count in na_mask.sum().items():
    print(f"Number of NA values in {column} is : {na_count} ")


True
4645
Number of NA values in case_id is : 0 
Number of NA values in Hospital_code is : 0 
Number of NA values in Hospital_type_code is : 0 
Number of NA values in City_Code_Hospital is : 0 
Number of NA values in Hospital_region_code is : 0 
Number of NA values in Available Extra Rooms in Hospital is : 0 
Number of NA values in Department is : 0 
Number of NA values in Ward_Type is : 0 
Number of NA values in Ward_Facility_Code is : 0 
Number of NA values in Bed Grade is : 113 
Number of NA values in patientid is : 0 
Number of NA values in City_Code_Patient is : 4532 
Number of NA values in Type of Admission is : 0 
Number of NA values in Severity of Illness is : 0 
Number of NA values in Visitors with Patient is : 0 
Number of NA values in Age is : 0 
Number of NA values in Admission_Deposit is : 0 
Number of NA values in Stay is : 0 


In [4]:
# Discarding, for now, every row that contains at least one NA value
health_care = health_care.dropna(axis=0, how='any')

In [5]:
# Preview of the first five rows
health_care.head(n=5)

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,1,8,c,3,Z,3,radiotherapy,R,F,2.0,31397,7.0,Emergency,Extreme,2,51-60,4911.0,0-10
1,2,2,c,5,Z,2,radiotherapy,S,F,2.0,31397,7.0,Trauma,Extreme,2,51-60,5954.0,41-50
2,3,10,e,1,X,2,anesthesia,S,E,2.0,31397,7.0,Trauma,Extreme,2,51-60,4745.0,31-40
3,4,26,b,2,Y,2,radiotherapy,R,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,7272.0,41-50
4,5,26,b,2,Y,2,radiotherapy,S,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,5558.0,41-50


The columns to be pre-processed are :
1. Hospital_type_code
2. Hospital_region_code
3. Department
4. Ward_Type
5. Ward_Facility_Code
6. City_Code_Patient
7. Type of Admission
8. Severity of Illness
9. Age
10. Stay



In [6]:
# One-hot encodes the column called `name` of dataframe `df`, in place
def encode_text_dummy(df, name):
    """Replace column `name` of `df` with one indicator column per distinct value.

    For every value ``x`` that ``pd.get_dummies`` finds in ``df[name]``, a column
    named ``"{name}-{x}"`` is added to ``df``. The original column is then
    dropped. ``df`` itself is modified and nothing is returned.
    """
    indicators = pd.get_dummies(df[name])
    for level, indicator in indicators.items():
        df[f"{name}-{level}"] = indicator
    # inplace so that the caller's dataframe is the one that loses the column
    df.drop(columns=name, inplace=True)

# Label Encoding Data

In [7]:
## Label-encode the categorical feature columns and the target

string_columns = [
    'Hospital_type_code',
    'Hospital_region_code',
    'Department',
    'Ward_Type',
    'Ward_Facility_Code',
    'Type of Admission',
    'Severity of Illness',
    'Age',
]
encoder = LabelEncoder()

## Features: everything except the target ('Stay') and the row id ('case_id')
X_le = health_care.drop(columns=['Stay', 'case_id'])

## Target: the encoder's class index k is rescaled to 10*k + 5
y_le = encoder.fit_transform(health_care['Stay']) * 10 + 5

## Features: each listed column is replaced by its integer codes.
## The same encoder object is re-fitted on every column, so after the loop
## it only holds the classes of the last column in `string_columns`.
for column in string_columns:
    X_le[column] = encoder.fit_transform(X_le[column])

In [8]:
# Quick look at the first five encoded feature rows
X_le.head(n=5)

Unnamed: 0,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit
0,8,2,3,2,3,3,2,5,2.0,31397,7.0,0,0,2,5,4911.0
1,2,2,5,2,2,3,3,5,2.0,31397,7.0,1,0,2,5,5954.0
2,10,4,1,0,2,1,3,4,2.0,31397,7.0,1,0,2,5,4745.0
3,26,1,2,1,2,3,2,3,2.0,31397,7.0,1,0,2,5,7272.0
4,26,1,2,1,2,3,3,3,2.0,31397,7.0,1,0,2,5,5558.0


In [9]:
# First five values of the transformed target
y_le[0:5]

array([ 5, 45, 35, 45, 45])

# One-Hot Encoding Data