In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [3]:
df=pd.read_csv('../data/watson_healthcare_modified.csv')

In [5]:
df = df.drop(columns = ['EmployeeID', 'EmployeeCount', 'StandardHours', 'TrainingTimesLastYear'])

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1676 entries, 0 to 1675
Data columns (total 31 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1676 non-null   int64 
 1   Attrition                 1676 non-null   object
 2   BusinessTravel            1676 non-null   object
 3   DailyRate                 1676 non-null   int64 
 4   Department                1676 non-null   object
 5   DistanceFromHome          1676 non-null   int64 
 6   Education                 1676 non-null   int64 
 7   EducationField            1676 non-null   object
 8   EnvironmentSatisfaction   1676 non-null   int64 
 9   Gender                    1676 non-null   object
 10  HourlyRate                1676 non-null   int64 
 11  JobInvolvement            1676 non-null   int64 
 12  JobLevel                  1676 non-null   int64 
 13  JobRole                   1676 non-null   object
 14  JobSatisfaction         

In [6]:
target='Attrition'
# remove target
X=df.drop(columns=['Attrition']).copy()
# get list of columns
cols=list(X.columns)
# start choices dictionary to capture categorical options for HTML
choices={col: None for col in cols}

# get all categorical columns
cat_cols=X.dtypes[X.dtypes=='object'].index
# get all continuous columns
cont_cols=X.dtypes[X.dtypes!='object'].index
# get all categorical features
unique_vals_count=df[cat_cols].nunique().sum()
print(f'There should be {len(cont_cols)-len(cat_cols)+unique_vals_count} columns')

There should be 39 columns


In [7]:
# set X as only continuous features
X=X.drop(columns=cat_cols).copy()

# OHE transform categorical features
ohe=OneHotEncoder(sparse_output=False)
cat_cols_transformed=ohe.fit_transform(df[cat_cols])
categories=ohe.categories_
cat_feature_names=ohe.get_feature_names_out()

# map options to each categorical variable for HTML
for k, v in zip(cat_cols, categories): 
    choices[k]=list(v)

In [8]:
X[cat_feature_names]=cat_cols_transformed
y=df[target].copy()

In [9]:
X_train, X_test, y_train, y_test=train_test_split(X, y)

In [14]:
scaler=StandardScaler()
scaler.fit(X_train)
X_train_scaled=scaler.transform(X_train)
X_test_scaled=scaler.transform(X_test)

In [15]:
rfc=RandomForestClassifier()
rfc.fit(X_train_scaled, y_train)
print(f'Train score: {rfc.score(X_train_scaled, y_train)}')
print(f'Test score: {rfc.score(X_test_scaled, y_test)}')

Train score: 0.9968178202068417
Test score: 0.8997613365155132


In [16]:
print(classification_report(y_test, rfc.predict(X_test_scaled)))

              precision    recall  f1-score   support

          No       0.90      0.99      0.94       362
         Yes       0.89      0.30      0.45        57

    accuracy                           0.90       419
   macro avg       0.90      0.65      0.70       419
weighted avg       0.90      0.90      0.88       419



In [17]:
# sample 'yes' prediction
sample_yes=pd.concat([X_test.iloc[idx] for idx, result in enumerate(results) if result=='yes'], axis=1).T
sample_yes.iloc[0]

NameError: name 'results' is not defined

In [18]:
import pickle

with open('choices.pkl', 'wb') as f: 
    pickle.dump(choices, f)
        
with open('scaler.pkl', 'wb') as f: 
    pickle.dump(scaler, f)
    
with open('ohe.pkl', 'wb') as f: 
    pickle.dump(ohe, f)    
    
with open('model.pkl', 'wb') as f: 
    pickle.dump(rfc, f)