## Part 1: Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [None]:
# Determine the number of unique values in each column.
attrition_df.nunique()

Age                         43
Attrition                    2
BusinessTravel               3
Department                   3
DistanceFromHome            29
Education                    5
EducationField               6
EnvironmentSatisfaction      4
HourlyRate                  71
JobInvolvement               4
JobLevel                     5
JobRole                      9
JobSatisfaction              4
MaritalStatus                3
NumCompaniesWorked          10
OverTime                     2
PercentSalaryHike           15
PerformanceRating            2
RelationshipSatisfaction     4
StockOptionLevel             4
TotalWorkingYears           40
TrainingTimesLastYear        7
WorkLifeBalance              4
YearsAtCompany              37
YearsInCurrentRole          19
YearsSinceLastPromotion     16
YearsWithCurrManager        18
dtype: int64

In [5]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df[['Attrition', 'Department']]
y_df.head()

Unnamed: 0,Attrition,Department
0,Yes,Sales
1,No,Research & Development
2,Yes,Research & Development
3,No,Research & Development
4,No,Research & Development


In [9]:
# Create a list of at least 10 column names to use as X data
X = ['Age', 'BusinessTravel', 'DistanceFromHome', 'Education', 'HourlyRate',
     'EnvironmentSatisfaction', 'PercentSalaryHike', 'YearsAtCompany', 'YearsSinceLastPromotion', 'OverTime']


# Create X_df using your selected columns
X_df = attrition_df[['Age', 'BusinessTravel', 'DistanceFromHome', 'Education', 'HourlyRate',
     'EnvironmentSatisfaction', 'PercentSalaryHike', 'YearsAtCompany', 'YearsSinceLastPromotion', 'OverTime']]

# Show the data types for X_df
X_df.dtypes


Unnamed: 0,0
Age,int64
BusinessTravel,object
DistanceFromHome,int64
Education,int64
HourlyRate,int64
EnvironmentSatisfaction,int64
PercentSalaryHike,int64
YearsAtCompany,int64
YearsSinceLastPromotion,int64
OverTime,object


In [10]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2, random_state=42)


In [12]:
# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary
X_numeric = pd.get_dummies(X_df, columns=['BusinessTravel', 'OverTime'], drop_first=True)
X_numeric.dtypes

Unnamed: 0,0
Age,int64
DistanceFromHome,int64
Education,int64
HourlyRate,int64
EnvironmentSatisfaction,int64
PercentSalaryHike,int64
YearsAtCompany,int64
YearsSinceLastPromotion,int64
BusinessTravel_Travel_Frequently,bool
BusinessTravel_Travel_Rarely,bool


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_numeric, y_df, test_size=0.2, random_state=42)

In [15]:
# Create a StandardScaler
scaler = StandardScaler()

# Fit the StandardScaler to the training data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Scale the training and testing data

X_train_scaled

array([[-1.38855944,  1.44039645, -0.86335572, ..., -0.49041445,
         0.63904869, -0.63641018],
       [-2.04073779, -0.52269928, -0.86335572, ..., -0.49041445,
        -1.56482598, -0.63641018],
       [-0.84507748,  1.31770296, -0.86335572, ..., -0.49041445,
         0.63904869, -0.63641018],
       ...,
       [-1.60595222, -0.76808624,  1.06322176, ...,  2.03909165,
        -1.56482598,  1.57131364],
       [-0.84507748,  0.45884859, -0.86335572, ..., -0.49041445,
         0.63904869,  1.57131364],
       [ 1.43754676, -0.03192534,  0.09993302, ..., -0.49041445,
         0.63904869,  1.57131364]])

In [20]:
# Create a OneHotEncoder for the Department column
from sklearn.preprocessing import OneHotEncoder
department_encoder = OneHotEncoder(sparse_output=False, drop='first') # Set sparse_output=False

# Fit the encoder to the training data
department_encoder.fit(y_train[['Department']])

# Create two new variables by applying the encoder
# to the training and testing data
y_train_department_encoded = department_encoder.transform(y_train[['Department']])
y_test_department_encoded = department_encoder.transform(y_test[['Department']])

y_train_department_encoded_df = pd.DataFrame(
    y_train_department_encoded,
    columns=department_encoder.get_feature_names_out(['Department']),
    index=y_train.index
)

y_test_department_encoded_df = pd.DataFrame(
    y_test_department_encoded,
    columns=department_encoder.get_feature_names_out(['Department']),
    index=y_test.index
)




In [21]:
# Create a OneHotEncoder for the Attrition column
from sklearn.preprocessing import OneHotEncoder
attrition_encoder = OneHotEncoder(sparse_output=False, drop='first') # Set sparse_output=False

# Fit the encoder to the training data
attrition_encoder.fit(y_train[['Attrition']])

# Create two new variables by applying the encoder
# to the training and testing data
y_train_attrition_encoded = attrition_encoder.transform(y_train[['Attrition']])
y_test_attrition_encoded = attrition_encoder.transform(y_test[['Attrition']])

y_train_attrition_encoded_df = pd.DataFrame(
    y_train_attrition_encoded,
    columns=attrition_encoder.get_feature_names_out(['Attrition']),
    index=y_train.index
)

y_test_attrition_encoded_df = pd.DataFrame(
    y_test_attrition_encoded,
    columns=attrition_encoder.get_feature_names_out(['Attrition']),
    index=y_test.index
)



## Create, Compile, and Train the Model

In [22]:
import tensorflow as tf
print(tf.__version__)


2.17.1


In [24]:
# Find the number of columns in the X training data
number_input_features = X_train_scaled.shape[1]
# Create the input layer
input_layer = tf.keras.layers.Input(shape=(number_input_features,))

# Create at least two shared layers
hidden_layer_1 = tf.keras.layers.Dense(units=10, activation='relu')(input_layer)
hidden_layer_2 = tf.keras.layers.Dense(units=6, activation='relu')(hidden_layer_1)

In [25]:
# Create a branch for Department
# with a hidden layer and an output layer

# Create the hidden layer
department_hidden_layer = tf.keras.layers.Dense(units=4, activation='relu')(hidden_layer_2)

# Create the output layer
department_output_layer = tf.keras.layers.Dense(
    units=y_train_department_encoded_df.shape[1],
    activation='softmax',
    name='department_output'
)(department_hidden_layer)


In [26]:
# Create a branch for Attrition
# with a hidden layer and an output layer

# Create the hidden layer
attrition_hidden_layer = tf.keras.layers.Dense(units=4, activation='relu')(hidden_layer_2)

# Create the output layer
attrition_output_layer = tf.keras.layers.Dense(
    units=1,
    activation='sigmoid',
    name='attrition_output'
)(attrition_hidden_layer)


In [27]:
# Create the model
model = tf.keras.models.Model(
    inputs=input_layer,
    outputs=[attrition_output_layer, department_output_layer]
)
# Compile the model
model.compile(
    optimizer='adam',
    loss={
        'attrition_output': 'binary_crossentropy',
        'department_output': 'categorical_crossentropy'
    },
    metrics={
        'attrition_output': 'accuracy',
        'department_output': 'accuracy'
    }
)

# Summarize the model
model.summary()


In [29]:
# Train the model
y_train_targets = {
    'attrition_output': y_train['Attrition'].apply(lambda x: 1 if x == 'Yes' else 0),  # Convert Yes/No to 1/0
    'department_output': y_train_department_encoded_df
}
history = model.fit(
    X_train_scaled,
    y_train_targets,
    validation_split=0.2,
    epochs=100,
    batch_size=32,
    verbose=1
)


Epoch 1/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 25ms/step - attrition_output_accuracy: 0.6500 - attrition_output_loss: 0.6631 - department_output_accuracy: 0.4087 - department_output_loss: 0.6874 - loss: 1.3504 - val_attrition_output_accuracy: 0.7966 - val_attrition_output_loss: 0.6120 - val_department_output_accuracy: 0.6144 - val_department_output_loss: 0.6617 - val_loss: 1.2760
Epoch 2/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - attrition_output_accuracy: 0.8347 - attrition_output_loss: 0.6010 - department_output_accuracy: 0.6302 - department_output_loss: 0.6593 - loss: 1.2603 - val_attrition_output_accuracy: 0.7966 - val_attrition_output_loss: 0.5699 - val_department_output_accuracy: 0.6610 - val_department_output_loss: 0.6520 - val_loss: 1.2270
Epoch 3/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - attrition_output_accuracy: 0.8289 - attrition_output_loss: 0.5616 - department_output_accu

In [31]:
# Evaluate the model with the testing data
y_test_targets = {
    'attrition_output': y_test['Attrition'].apply(lambda x: 1 if x == 'Yes' else 0),  # Convert Yes/No to 1/0
    'department_output': y_test_department_encoded_df
}
evaluation_results = model.evaluate(
    X_test_scaled,
    y_test_targets,
    verbose=1
)
evaluation_results

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - attrition_output_accuracy: 0.8502 - attrition_output_loss: 0.3721 - department_output_accuracy: 0.6917 - department_output_loss: 0.6444 - loss: 1.0169


[0.9936051964759827,
 0.3589197099208832,
 0.6320303678512573,
 0.8639456033706665,
 0.7108843326568604]

In [32]:
# Print the accuracy for both department and attrition
attrition_accuracy = evaluation_results[3]
department_accuracy = evaluation_results[4]
print(f"Attrition Accuracy: {attrition_accuracy}")
print(f"Department Accuracy: {department_accuracy}")

Attrition Accuracy: 0.8639456033706665
Department Accuracy: 0.7108843326568604


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. Accuracy can be a good baseline metric to measure data results but should be evaluated with caution. If the data is imbalanced than accuracy can be a misleading metric.
2. For the Attrition output I used a sigmoid activation because it was a binary classification problem. For the Department output I used softmax because it is multiclass.
3. Two things that come to mind first are experiment with hyperparameter tuning changing number of layers and neurons in the model. Secondly, you could preprocess the data a little more with some feature engineering. Doing these things could have a positive impact on the models results.