In [99]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
import numpy as np

In [100]:
url = 'https://raw.githubusercontent.com/AdriannaDJ/healthcare_attrition/main/Resources/watson_healthcare_modified.csv'
df=pd.read_csv(url)
# df=pd.read_csv('../Resources/watson_healthcare_modified.csv')

In [101]:
df = df.drop(columns = ['EmployeeID', 'EmployeeCount', 'StandardHours', 'TrainingTimesLastYear', 'MonthlyRate', 'DailyRate', 'HourlyRate', 'Over18'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1676 entries, 0 to 1675
Data columns (total 27 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1676 non-null   int64 
 1   Attrition                 1676 non-null   object
 2   BusinessTravel            1676 non-null   object
 3   Department                1676 non-null   object
 4   DistanceFromHome          1676 non-null   int64 
 5   Education                 1676 non-null   int64 
 6   EducationField            1676 non-null   object
 7   EnvironmentSatisfaction   1676 non-null   int64 
 8   Gender                    1676 non-null   object
 9   JobInvolvement            1676 non-null   int64 
 10  JobLevel                  1676 non-null   int64 
 11  JobRole                   1676 non-null   object
 12  JobSatisfaction           1676 non-null   int64 
 13  MaritalStatus             1676 non-null   object
 14  MonthlyIncome           

In [102]:
target='Attrition'
# remove target
X=df.drop(columns=[target]).copy()
# get list of columns
cols=list(X.columns)
# start choices dictionary to capture categorical options for HTML
choices={col: None for col in cols}

# get all categorical columns
cat_cols=X.dtypes[X.dtypes=='object'].index
# get all continuous columns
cont_cols=X.dtypes[X.dtypes!='object'].index
# get all categorical features
unique_vals_count=df[cat_cols].nunique().sum()
print(f'There should be {len(cont_cols)-len(cat_cols)+unique_vals_count} columns')

There should be 36 columns


In [103]:
# set X as only continuous features
X=X.drop(columns=cat_cols).copy()

# OHE transform categorical features
ohe=OneHotEncoder(sparse_output=False)
cat_cols_transformed=ohe.fit_transform(df[cat_cols])
categories=ohe.categories_
cat_feature_names=ohe.get_feature_names_out()

# map options to each categorical variable for HTML
for k, v in zip(cat_cols, categories):
    choices[k]=list(v)

In [114]:
X[cat_feature_names]=cat_cols_transformed

y=df[target].copy()
if y.dtype == object:
    unique_values, indices = np.unique(y, return_inverse=True)
    y = indices  # Replace strings with numerical indices

y

array([0, 0, 1, ..., 0, 0, 0])

In [105]:
X_train, X_test, y_train, y_test=train_test_split(X, y, random_state=42)

In [106]:
scaler=StandardScaler()
scaler.fit(X_train)
X_train_scaled=scaler.transform(X_train)
X_test_scaled=scaler.transform(X_test)

In [107]:
# Preview X_train_scaled[0]
len(X_train_scaled[0])

43

In [108]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
num_features = len(X_train_scaled[0])
num_nodes_1 = 80
num_nodes_2 = 20
num_nodes_4 = 1

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=num_nodes_1, activation="relu", input_dim=num_features))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=num_nodes_2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=num_nodes_4, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_40 (Dense)            (None, 80)                3520      
                                                                 
 dense_41 (Dense)            (None, 20)                1620      
                                                                 
 dense_42 (Dense)            (None, 1)                 21        
                                                                 
Total params: 5161 (20.16 KB)
Trainable params: 5161 (20.16 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [109]:
# Compile the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [110]:
# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [111]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

14/14 - 0s - loss: 0.3559 - accuracy: 0.9069 - 163ms/epoch - 12ms/step
Loss: 0.35586756467819214, Accuracy: 0.9069212675094604


In [None]:
# import pickle

# with open('choices.pkl', 'wb') as f:
#     pickle.dump(choices, f)

# with open('scaler.pkl', 'wb') as f:
#     pickle.dump(scaler, f)

# with open('ohe.pkl', 'wb') as f:
#     pickle.dump(ohe, f)

# with open('model.pkl', 'wb') as f:
#     pickle.dump(fit_model, f)