In [1]:
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import metrics

In [2]:
df=pd.read_csv('../Resources/watson_healthcare_modified.csv')

In [3]:
##Version 2: Remove additional columns
df = df.drop(columns = ['EmployeeID', 'EmployeeCount', 'StandardHours', 'TrainingTimesLastYear', 'MonthlyRate', 'DailyRate', 'HourlyRate', 'Over18'])

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1676 entries, 0 to 1675
Data columns (total 27 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1676 non-null   int64 
 1   Attrition                 1676 non-null   object
 2   BusinessTravel            1676 non-null   object
 3   Department                1676 non-null   object
 4   DistanceFromHome          1676 non-null   int64 
 5   Education                 1676 non-null   int64 
 6   EducationField            1676 non-null   object
 7   EnvironmentSatisfaction   1676 non-null   int64 
 8   Gender                    1676 non-null   object
 9   JobInvolvement            1676 non-null   int64 
 10  JobLevel                  1676 non-null   int64 
 11  JobRole                   1676 non-null   object
 12  JobSatisfaction           1676 non-null   int64 
 13  MaritalStatus             1676 non-null   object
 14  MonthlyIncome           

In [4]:
# Rename columns name to add number cap suggestion for future user input
df.rename(columns={'Age':'Age (18-60)', 'DistanceFromHome':'Distance From Home (miles)', 'Education':'Education (1-5)', 'EnvironmentSatisfaction':'Env. Satisfaction (1-4)',
                   'JobInvolvement':'Job Involvement (1-4)', 'JobLevel':'Job Level (1-5)', 'JobSatisfaction':'Job Satisfaction (1-4)', 'PerformanceRating':'Performance Rating (1-4)',
                   'RelationshipSatisfaction':'Relationship Satisfaction (1-4)', 'Shift':'Shift (1-3)', 'WorkLifeBalance':'WorkLifeBalance (1-4)'}, inplace=True)
df.head()

Unnamed: 0,Age (18-60),Attrition,BusinessTravel,Department,Distance From Home (miles),Education (1-5),EducationField,Env. Satisfaction (1-4),Gender,Job Involvement (1-4),...,PercentSalaryHike,Performance Rating (1-4),Relationship Satisfaction (1-4),Shift (1-3),TotalWorkingYears,WorkLifeBalance (1-4),YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,No,Travel_Rarely,Cardiology,1,2,Life Sciences,2,Female,3,...,11,3,1,0,8,1,6,4,0,5
1,49,No,Travel_Frequently,Maternity,8,1,Life Sciences,3,Male,2,...,23,4,4,1,10,3,10,7,1,7
2,37,Yes,Travel_Rarely,Maternity,2,2,Other,4,Male,2,...,15,3,2,0,7,3,0,0,0,0
3,33,No,Travel_Frequently,Maternity,3,4,Life Sciences,4,Female,3,...,11,3,3,0,8,3,8,7,3,0
4,27,No,Travel_Rarely,Maternity,2,1,Medical,1,Male,3,...,12,3,4,1,6,3,2,2,2,2


In [5]:
target='Attrition'
# remove target
X=df.drop(columns=['Attrition']).copy()
# get list of columns
cols=list(X.columns)
# start choices dictionary to capture categorical options for HTML
choices={col: None for col in cols}

# get all categorical columns
cat_cols=X.dtypes[X.dtypes=='object'].index
# get all continuous columns
cont_cols=X.dtypes[X.dtypes!='object'].index
# get all categorical features
unique_vals_count=df[cat_cols].nunique().sum()
print(f'There should be {len(cont_cols)-len(cat_cols)+unique_vals_count} columns')

There should be 36 columns


In [6]:
# set X as only continuous features
X=X.drop(columns=cat_cols).copy()

# OHE transform categorical features
ohe=OneHotEncoder(sparse_output=False)
cat_cols_transformed=ohe.fit_transform(df[cat_cols])
categories=ohe.categories_
cat_feature_names=ohe.get_feature_names_out()

# map options to each categorical variable for HTML
for k, v in zip(cat_cols, categories): 
    choices[k]=list(v)

In [7]:
X[cat_feature_names]=cat_cols_transformed
y=df[target].copy()

In [8]:
X_train, X_test, y_train, y_test=train_test_split(X, y)

In [9]:
scaler=StandardScaler()
scaler.fit(X_train)
X_train_scaled=scaler.transform(X_train)
X_test_scaled=scaler.transform(X_test)

In [10]:
# Create adaboost classifer object
abc = AdaBoostClassifier(n_estimators=50,learning_rate=1)
# Train Adaboost Classifer
model = abc.fit(X_train, y_train)
#Predict the response for test dataset
y_pred = model.predict(X_test)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.9355608591885441


In [11]:
import pickle

with open('choices.pkl', 'wb') as f: 
    pickle.dump(choices, f)
        
with open('scaler.pkl', 'wb') as f: 
    pickle.dump(scaler, f)
    
with open('ohe.pkl', 'wb') as f: 
    pickle.dump(ohe, f)    
    
with open('model.pkl', 'wb') as f: 
    pickle.dump(abc, f)