## Part 1: Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers



In [2]:
#import data
#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [3]:
# Determine the number of unique values in each column
attrition_df.nunique()


Age                         43
Attrition                    2
BusinessTravel               3
Department                   3
DistanceFromHome            29
Education                    5
EducationField               6
EnvironmentSatisfaction      4
HourlyRate                  71
JobInvolvement               4
JobLevel                     5
JobRole                      9
JobSatisfaction              4
MaritalStatus                3
NumCompaniesWorked          10
OverTime                     2
PercentSalaryHike           15
PerformanceRating            2
RelationshipSatisfaction     4
StockOptionLevel             4
TotalWorkingYears           40
TrainingTimesLastYear        7
WorkLifeBalance              4
YearsAtCompany              37
YearsInCurrentRole          19
YearsSinceLastPromotion     16
YearsWithCurrManager        18
dtype: int64

In [4]:
#find nulls and data types
attrition_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 27 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   Department                1470 non-null   object
 4   DistanceFromHome          1470 non-null   int64 
 5   Education                 1470 non-null   int64 
 6   EducationField            1470 non-null   object
 7   EnvironmentSatisfaction   1470 non-null   int64 
 8   HourlyRate                1470 non-null   int64 
 9   JobInvolvement            1470 non-null   int64 
 10  JobLevel                  1470 non-null   int64 
 11  JobRole                   1470 non-null   object
 12  JobSatisfaction           1470 non-null   int64 
 13  MaritalStatus             1470 non-null   object
 14  NumCompaniesWorked      

In [5]:
# Create y_df with the Attrition and Department columns
y_df=attrition_df[["Attrition","Department"]]
display(y_df.head())
print(y_df["Department"].unique())

Unnamed: 0,Attrition,Department
0,Yes,Sales
1,No,Research & Development
2,Yes,Research & Development
3,No,Research & Development
4,No,Research & Development


['Sales' 'Research & Development' 'Human Resources']


In [6]:
# Create a list of at least 10 column names to use as X data
#going to use all the columns and do feature selection later.


# Create X_df using your selected columns
X_df=attrition_df.drop(columns=["Attrition","Department"])

# Show the data types for X_df
X_df.info()
print(X_df["BusinessTravel"].unique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   BusinessTravel            1470 non-null   object
 2   DistanceFromHome          1470 non-null   int64 
 3   Education                 1470 non-null   int64 
 4   EducationField            1470 non-null   object
 5   EnvironmentSatisfaction   1470 non-null   int64 
 6   HourlyRate                1470 non-null   int64 
 7   JobInvolvement            1470 non-null   int64 
 8   JobLevel                  1470 non-null   int64 
 9   JobRole                   1470 non-null   object
 10  JobSatisfaction           1470 non-null   int64 
 11  MaritalStatus             1470 non-null   object
 12  NumCompaniesWorked        1470 non-null   int64 
 13  OverTime                  1470 non-null   object
 14  PercentSalaryHike       

In [7]:
print(y_df.value_counts())

Attrition  Department            
No         Research & Development    828
           Sales                     354
Yes        Research & Development    133
           Sales                      92
No         Human Resources            51
Yes        Human Resources            12
Name: count, dtype: int64


In [8]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test=train_test_split(X_df, y_df, test_size=0.2,random_state=42)



In [9]:
# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary

ordinal_cols=["BusinessTravel"]
onehot_cols=["EducationField","JobRole","MaritalStatus","OverTime"]

drop_cols=ordinal_cols+onehot_cols
numeric_cols=X_df.drop(columns=drop_cols).columns.tolist()

#check to make sure did correctly
print(ordinal_cols)
print(onehot_cols)
print(numeric_cols)
print(len(numeric_cols))

#create pipelines
ordinal_pipline=Pipeline(steps=[
    ("ordinal",OrdinalEncoder(categories=[['Non-Travel', 'Travel_Rarely', 'Travel_Frequently']]))
])

onehot_pipeline=Pipeline(steps=[
    ("onehot",OneHotEncoder(handle_unknown="ignore"))
])

numeric_pipeline=Pipeline(steps=[
    ("scaler",StandardScaler())
])

#combine for X_train
processor=ColumnTransformer(
    transformers=[
        ("ord",ordinal_pipline,ordinal_cols),
        ("ohe",onehot_pipeline,onehot_cols),
        ("num",numeric_pipeline,numeric_cols)
    ]
)

#encode for X_train
X_train_encoded=processor.fit_transform(X_train)

#encode for X_test
X_test_encoded=processor.fit_transform(X_test)

['BusinessTravel']
['EducationField', 'JobRole', 'MaritalStatus', 'OverTime']
['Age', 'DistanceFromHome', 'Education', 'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']
20


In [None]:
#check X_train encoded values
X_train_encoded_df=pd.DataFrame(X_train_encoded)
X_train_encoded_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,40
count,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,...,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0
mean,1.097789,0.018707,0.417517,0.105442,0.313776,0.058673,0.085884,0.089286,0.035714,0.173469,...,-4.153896e-16,-1.5105080000000002e-17,1.661558e-17,6.344132000000001e-17,-1.895687e-16,-6.04203e-18,1.5105080000000001e-18,7.854639000000001e-17,-3.9273200000000005e-17,-1.9636600000000002e-17
std,0.529756,0.135548,0.493359,0.307253,0.464224,0.235113,0.280312,0.285277,0.185656,0.378814,...,1.000425,1.000425,1.000425,1.000425,1.000425,1.000425,1.000425,1.000425,1.000425,1.000425
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.4292901,-1.565321,-0.9455246,-1.423397,-2.155277,-2.521633,-1.138573,-1.165051,-0.6761095,-1.142448
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.4292901,-0.6398222,-0.9455246,-0.6553112,-0.6135459,-1.082099,-0.6456433,-0.6113636,-0.6761095,-0.5750836
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.4292901,0.2856771,0.2188073,-0.1432541,0.1573195,0.3574354,-0.3170233,-0.3345197,-0.355244,-0.2914014
75%,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.4292901,1.211176,0.2188073,0.4968173,0.1573195,0.3574354,0.5045267,0.7728559,-0.03437845,0.8433273
max,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,2.329427,1.211176,2.547471,3.697174,2.469916,1.79697,5.433827,3.818139,4.136874,3.680149


In [10]:
# Create a OneHotEncoder for the Department column


# Fit the encoder to the training data


# Create two new variables by applying the encoder
# to the training and testing data

y_train_dept_encode=pd.get_dummies(y_train,columns=["Department"],drop_first=False).drop(columns="Attrition")

y_test_dept_encode=pd.get_dummies(y_test,columns=["Department"],drop_first=False).drop(columns="Attrition")

display(y_train_dept_encode)
display(y_test_dept_encode)


Unnamed: 0,Department_Human Resources,Department_Research & Development,Department_Sales
1097,False,True,False
727,False,True,False
254,False,False,True
1175,False,True,False
1341,False,True,False
...,...,...,...
1130,False,True,False
1294,False,True,False
860,False,True,False
1459,False,True,False


Unnamed: 0,Department_Human Resources,Department_Research & Development,Department_Sales
1041,False,False,True
184,False,True,False
1222,True,False,False
67,False,True,False
220,False,True,False
...,...,...,...
567,False,False,True
560,False,True,False
945,False,True,False
522,False,True,False


In [51]:
# Create a OneHotEncoder for the Attrition column


# Fit the encoder to the training data


# Create two new variables by applying the encoder
# to the training and testing data

y_train_attr_encode=pd.get_dummies(y_train,columns=["Attrition"],drop_first=False).drop(columns=["Department"]) 

y_test_attr_encode=pd.get_dummies(y_test,columns=["Attrition"],drop_first=False).drop(columns=["Department"])



display(y_train_attr_encode)
display(y_test_attr_encode)
y_train_attr_encode.value_counts()

Unnamed: 0,Attrition_No,Attrition_Yes
1097,True,False
727,True,False
254,True,False
1175,True,False
1341,True,False
...,...,...
1130,True,False
1294,True,False
860,False,True
1459,True,False


Unnamed: 0,Attrition_No,Attrition_Yes
1041,True,False
184,True,False
1222,False,True
67,True,False
220,True,False
...,...,...
567,True,False
560,True,False
945,True,False
522,True,False


Attrition_No  Attrition_Yes
True          False            978
False         True             198
Name: count, dtype: int64

## Part 2: Create, Compile, and Train the Model

In [52]:
# Find the number of columns in the X training data.
print(X_train_encoded.shape)

# Create the input layer
input_layer=layers.Input(shape=(41,), name="input_layer")

# Create at least two shared layers
shared_layer1=layers.Dense(128, activation="relu", name="shared_layer1")(input_layer)
shared_layer2=layers.Dense(64, activation="relu", name="shared_layer2")(shared_layer1)

(1176, 41)


In [53]:
# Create a branch for Department
# with a hidden layer and an output layer

# Create the hidden layer
dept_dense=layers.Dense(32, activation="relu", name="dept_dense")(shared_layer2)

# Create the output layer
dept_output=layers.Dense(len(y_train_dept_encode.columns),activation="softmax",name="dept_output")(dept_dense)

In [54]:
# Create a branch for Attrition
# with a hidden layer and an output layer

# Create the hidden layer
attr_dense=layers.Dense(32, activation="relu", name="attr_dense")(shared_layer2)

# Create the output layer
attr_output=layers.Dense(len(y_train_attr_encode.columns),activation="softmax",name="attr_output")(attr_dense)

In [55]:
# Create the model
model = Model(inputs=input_layer, outputs={
    "dept_output": dept_output,
    "attr_output": attr_output
})

# Compile the model
model.compile(
    optimizer="adam",
    loss={
        "dept_output": "categorical_crossentropy",
        "attr_output": "binary_crossentropy"  
    },
    metrics={
        "dept_output": "accuracy",
        "attr_output": "accuracy"
    }
)

# Summarize the model
model.summary()

In [None]:
# Train the model
from keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor="val_loss", patience=5, 
restore_best_weights=True)

model.fit(
    X_train_encoded,
    {"dept_output": y_train_dept_encode,
     "attr_output" : y_train_attr_encode},
     epochs=50,
     batch_size=32,
     validation_data=(X_test_encoded, {
         "dept_output" : y_test_dept_encode,
         "attr_output" : y_test_attr_encode}),
     callbacks=[early_stop]                    
        
     )

Epoch 1/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - attr_output_accuracy: 0.7063 - attr_output_loss: 0.5959 - dept_output_accuracy: 0.5888 - dept_output_loss: 0.8787 - loss: 1.4746 - val_attr_output_accuracy: 0.8673 - val_attr_output_loss: 0.3668 - val_dept_output_accuracy: 0.6905 - val_dept_output_loss: 0.6785 - val_loss: 1.0769
Epoch 2/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - attr_output_accuracy: 0.8292 - attr_output_loss: 0.4310 - dept_output_accuracy: 0.7130 - dept_output_loss: 0.6412 - loss: 1.0723 - val_attr_output_accuracy: 0.8673 - val_attr_output_loss: 0.3398 - val_dept_output_accuracy: 0.8844 - val_dept_output_loss: 0.4521 - val_loss: 0.8255
Epoch 3/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - attr_output_accuracy: 0.8324 - attr_output_loss: 0.3881 - dept_output_accuracy: 0.9061 - dept_output_loss: 0.3704 - loss: 0.7585 - val_attr_output_accuracy: 0.8741 - val_attr_output_

<keras.src.callbacks.history.History at 0x22b3ebc3ef0>

In [None]:
# Evaluate the model with the testing data
results=model.evaluate(X_test_encoded, {"dept_output" : y_test_dept_encode,
         "attr_output" : y_test_attr_encode})

print(model.metrics_names)
print(results)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - attr_output_accuracy: 0.8863 - attr_output_loss: 0.3319 - dept_output_accuracy: 0.9684 - dept_output_loss: 0.0878 - loss: 0.4247 
['loss', 'compile_metrics', 'dept_output_loss', 'attr_output_loss']
[0.4149811565876007, 0.0863058865070343, 0.3007259964942932, 0.9013605713844299, 0.9727891087532043]


In [58]:
# Print the accuracy for both department and attrition
print(f"Attrition Accuracy: {results[3]:.3f}") 
print(f"Department Accuracy: {results[4]:.3f}") 

Attrition Accuracy: 0.901
Department Accuracy: 0.973


In [70]:
#confusion matrix for attr

#predictions
predictions_attr=model.predict(X_test_encoded)["attr_output"]

prediction_attr_classes=(predictions_attr >0.5).astype(int)
#need prediction_attr_classes as 1D array
prediction_attr_classes = np.argmax(predictions_attr, axis=1)

#need a binary array for y_test_attr_encode instead if the 2 columns onehotencoder values
y_test_attr_classes = np.argmax(y_test_attr_encode, axis=1)

from sklearn.metrics import confusion_matrix
cm_attr=confusion_matrix(y_test_attr_classes,prediction_attr_classes)

print("Confusion Matrix for Attrition:")
print(cm_attr)

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
Confusion Matrix for Attrition:
[[253   2]
 [ 27  12]]


In [72]:
#classification report
from sklearn.metrics import classification_report

# Generate classification report for the attrition output
report_attr = classification_report(y_test_attr_classes, prediction_attr_classes)

print("Classification Report for Attrition Output:")
print(report_attr)

Classification Report for Attrition Output:
              precision    recall  f1-score   support

           0       0.90      0.99      0.95       255
           1       0.86      0.31      0.45        39

    accuracy                           0.90       294
   macro avg       0.88      0.65      0.70       294
weighted avg       0.90      0.90      0.88       294



# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. Accuracy is not always the best metric to access performance on imbalanced datasets (which I have).  Looking at precision, recall, F1-score and AUC-ROC may be better indicators of the model performance.  In this case there are 27 cases where the employee left and the model said they stayed...so there is still an issue.  Perhaps actions in #3 would help.
2. For the two shared layers and 2 hidden layers. I used relu.  Relu is simple, helps with vanishing gradient issues, and is computationally efficient.  Attrition is binary classification but department is not so I would not use sigmoid. Tanh works best with outputs between -1 and 1.  By using standardscaler, I scaled to +/- 1 standard deviation and the values do not fall in this range on all features.  For the output layers I used softmax since this is a mutli-class classification problem.
3. The department part of the model looks to be good with accuracy of 97% and loss of 0.1.  The attrition part, however needs some more work with an accuracy of 0.9 and a loss of 0.3.  This may be due to the imbalance in the data.  There are 978 cases where the employee stayed but only 198 that left.  This may be biassing the model toward employees staying.  To correct this, you can add class weights into the model.fit statement.  A higher weight on attrition_yes would penalize the loss more for a misclassification.  Another approach may be to use SMOTE to increase the attrition_yes class sample or randomly remove data from the the attrition_no class.  Getting more attrition_yes data would also help.  I wanted to do some feature removal but the tools I thought I could use were not not working.  One way to make a better model would be to preform a feature importance study and remove redundant or inconsequential features.  As an alternative you can use industry knowledge and pick the top ten influential features.  There is always hyperparameter tuning of learning rates, optimizer parameters, and the number of neurons in the layers that may help get a few more percent points in accuracy.

## Feature Reduction

In [None]:
import shap

background_data = X_train_encoded[:100]

def model_predict(input_data):
    # Get model predictions for department output (or attrition, depending on what you want)
    return model(input_data)["dept_output"].numpy()

explainer = shap.KernelExplainer(model_predict, background_data)

shap_values = explainer.shap_values(X_test_encoded[:1])

# Visualize the SHAP values for a single instance
shap.initjs()
shap_values_dept = shap_values[0, :, 0]
shap_values_attr = shap_values[0, :, 1]  
shap.force_plot(explainer.expected_value[0], shap_values_dept, X_test_encoded[:1])


In [None]:
X_train_encoded.corr()

In [None]:
from sklearn.inspection import permutation_importance
import numpy as np

# Define a custom scoring function
def model_score(X, y, sample_weight=None):
    # Get the prediction using model.predict() (not directly using model object)
    y_pred = model.predict(X)
    accuracy = np.mean(np.argmax(y_pred["dept_output"], axis=-1) == np.argmax(y, axis=-1))
    return accuracy

# Assuming model is trained, X_test_encoded and y_test_dept_encode are available
result = permutation_importance(
    model, X_test_encoded, y_test_dept_encode, n_repeats=10, random_state=42, scoring=model_score
)

# Get feature importance scores
importance = result.importances_mean

# Print the importance scores
for i, feature in enumerate(X_train_encoded.columns):
    print(f"{feature}: {importance[i]}")
