## Part 1: Preprocessing

In [29]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [30]:
try:
    # Attempt to import the dependencies
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.preprocessing import OneHotEncoder
    import pandas as pd
    import numpy as np
    from tensorflow.keras.models import Model
    from tensorflow.keras import layers

    # If the imports are successful, print a success message
    print("All dependencies are installed correctly.")
except ImportError as e:
    # If an ImportError occurs, print an error message with details
    print(f"An error occurred: {e}. Please ensure all dependencies are installed correctly.")

All dependencies are installed correctly.


In [31]:
# Determine the number of unique values in each column and sort them in descending order.
attrition_df.nunique().sort_values(ascending=False)

HourlyRate                  71
Age                         43
TotalWorkingYears           40
YearsAtCompany              37
DistanceFromHome            29
YearsInCurrentRole          19
YearsWithCurrManager        18
YearsSinceLastPromotion     16
PercentSalaryHike           15
NumCompaniesWorked          10
JobRole                      9
TrainingTimesLastYear        7
EducationField               6
JobLevel                     5
Education                    5
EnvironmentSatisfaction      4
JobInvolvement               4
JobSatisfaction              4
RelationshipSatisfaction     4
StockOptionLevel             4
WorkLifeBalance              4
Department                   3
BusinessTravel               3
MaritalStatus                3
OverTime                     2
Attrition                    2
PerformanceRating            2
dtype: int64

In [32]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df[['Attrition', 'Department']]
y_df.head()

Unnamed: 0,Attrition,Department
0,Yes,Sales
1,No,Research & Development
2,Yes,Research & Development
3,No,Research & Development
4,No,Research & Development


In [33]:
# Create a list of at least 10 column names to use as X data
columns_list_for_X_data = ['HourlyRate',
                           'Age',
                           'TotalWorkingYears',
                           'YearsAtCompany',
                           'YearsInCurrentRole',
                           'YearsSinceLastPromotion',
                           'YearsWithCurrManager',
                           'DistanceFromHome',
                           'Education',
                           'JobLevel']
# # Create X_df using your selected columns
X_df = attrition_df[columns_list_for_X_data]

# Show the data types for X_df
print(X_df.dtypes)

HourlyRate                 int64
Age                        int64
TotalWorkingYears          int64
YearsAtCompany             int64
YearsInCurrentRole         int64
YearsSinceLastPromotion    int64
YearsWithCurrManager       int64
DistanceFromHome           int64
Education                  int64
JobLevel                   int64
dtype: object


In [34]:
# Check the new DataFrames
display('Input Features are labled as X_df:', X_df.head())
display('Target Variables are labled as y_df:', y_df.head())

'Input Features are labled as X_df:'

Unnamed: 0,HourlyRate,Age,TotalWorkingYears,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,DistanceFromHome,Education,JobLevel
0,94,41,8,6,4,0,5,1,2,2
1,61,49,10,10,7,1,7,8,1,2
2,92,37,7,0,0,0,0,2,2,1
3,56,33,8,8,7,3,0,3,4,1
4,40,27,6,2,2,2,2,2,1,1


'Target Variables are labled as y_df:'

Unnamed: 0,Attrition,Department
0,Yes,Sales
1,No,Research & Development
2,Yes,Research & Development
3,No,Research & Development
4,No,Research & Development


In [35]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df)

display('X_train:', X_train.head())
display('X_test:', X_test.head())
display('y_train:', y_train.head())
display('y_test:', y_test.head())

'X_train:'

Unnamed: 0,HourlyRate,Age,TotalWorkingYears,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,DistanceFromHome,Education,JobLevel
1350,89,27,9,8,7,0,7,2,2,2
993,93,25,6,3,2,1,2,18,1,2
858,50,53,26,7,7,4,7,7,2,5
607,43,49,9,9,8,7,7,11,3,3
1139,35,32,6,3,2,0,2,5,4,1


'X_test:'

Unnamed: 0,HourlyRate,Age,TotalWorkingYears,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,DistanceFromHome,Education,JobLevel
487,83,20,1,1,0,0,0,1,3,1
1443,56,42,24,22,6,4,14,2,3,5
973,60,35,10,9,7,0,0,1,3,2
260,100,32,5,5,1,0,3,7,3,1
97,43,28,5,5,4,0,4,4,3,2


'y_train:'

Unnamed: 0,Attrition,Department
1350,No,Sales
993,No,Sales
858,No,Research & Development
607,Yes,Sales
1139,No,Research & Development


'y_test:'

Unnamed: 0,Attrition,Department
487,No,Research & Development
1443,No,Research & Development
973,No,Research & Development
260,No,Research & Development
97,No,Sales


In [36]:
# Convert the X data to numeric data types for machine learning model with float32 for memory efficiency.
X_train_numeric_dtypes = X_train.astype('float32')
X_test_numeric_dtypes = X_test.astype('float32')

display('X_train_numeric_dtypes:', X_train_numeric_dtypes.head())
display('X_test_numeric_dtypes:', X_test_numeric_dtypes.head())

'X_train_numeric_dtypes:'

Unnamed: 0,HourlyRate,Age,TotalWorkingYears,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,DistanceFromHome,Education,JobLevel
1350,89.0,27.0,9.0,8.0,7.0,0.0,7.0,2.0,2.0,2.0
993,93.0,25.0,6.0,3.0,2.0,1.0,2.0,18.0,1.0,2.0
858,50.0,53.0,26.0,7.0,7.0,4.0,7.0,7.0,2.0,5.0
607,43.0,49.0,9.0,9.0,8.0,7.0,7.0,11.0,3.0,3.0
1139,35.0,32.0,6.0,3.0,2.0,0.0,2.0,5.0,4.0,1.0


'X_test_numeric_dtypes:'

Unnamed: 0,HourlyRate,Age,TotalWorkingYears,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,DistanceFromHome,Education,JobLevel
487,83.0,20.0,1.0,1.0,0.0,0.0,0.0,1.0,3.0,1.0
1443,56.0,42.0,24.0,22.0,6.0,4.0,14.0,2.0,3.0,5.0
973,60.0,35.0,10.0,9.0,7.0,0.0,0.0,1.0,3.0,2.0
260,100.0,32.0,5.0,5.0,1.0,0.0,3.0,7.0,3.0,1.0
97,43.0,28.0,5.0,5.0,4.0,0.0,4.0,4.0,3.0,2.0


In [37]:
# Create a StandardScaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Fit the StandardScaler to the training data
X_scaler = scaler.fit(X_train)

# Scale the training and testing data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [38]:
from sklearn.preprocessing import OneHotEncoder

In [39]:
# Create a OneHotEncoder for the Department column
department_encoder = OneHotEncoder(handle_unknown='ignore')

department_encoder.fit(y_train[['Department']])

# Create two new variables by applying the encoder
# to the training and testing data
encoded_train = department_encoder.transform(y_train[['Department']])
encoded_test = department_encoder.transform(y_test[['Department']])

encoded_train_array = encoded_train.toarray()
encoded_train_array

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       ...,
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [40]:
# Create a OneHotEncoder for the Attrition column
attrition_encoder = OneHotEncoder()

# Fit the encoder to the training data
attrition_encoder = attrition_encoder.fit(y_train[['Attrition']])

# Create two new variables by applying the encoder
# to the training and testing data
encoded_attrition_train = attrition_encoder.transform(y_train[['Attrition']])
encoded_attrition_test = attrition_encoder.transform(y_test[['Attrition']])


## Create, Compile, and Train the Model

In [41]:
# Find the number of columns in the X training data
input_features = len(X_train_scaled[0])

# Create the input layer
inputs = layers.Input(shape=(input_features,))
inputs

# Create at least two shared layers
shared_layer1 = layers.Dense(units=8, activation='relu')
shared_layer2 = layers.Dense(units=8, activation='relu')

In [42]:
# Create a branch for Department with a hidden layer and an output layer
department_branch = shared_layer1(inputs)
department_branch = shared_layer2(department_branch)

# Create the hidden layer
department_hidden = layers.Dense(units=8, activation='relu')(department_branch)

# Create the output layer
department_output = layers.Dense(units=3, activation='softmax', name='department_output')(department_hidden)


In [43]:
# Create a branch for Attrition with a hidden layer and an output layer
attrition_branch = shared_layer1(inputs)
attrition_branch = shared_layer2(attrition_branch)

# Create the hidden layer
attrition_hidden = layers.Dense(units=8, activation='relu')(attrition_branch)

# Create the output layer
attrition_output = layers.Dense(units=2, activation='softmax', name='attrition_output')(attrition_hidden)

In [44]:
# Create the model
model = Model(inputs=inputs, outputs=[department_output, attrition_output])

# Compile the model
model.compile(optimizer='adam',
              loss={'department_output': 'categorical_crossentropy',
                    'attrition_output': 'categorical_crossentropy'},
              metrics=['accuracy'])

# Summarize the model
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 10)]                 0         []                            
                                                                                                  
 dense_4 (Dense)             (None, 8)                    88        ['input_2[0][0]',             
                                                                     'input_2[0][0]']             
                                                                                                  
 dense_5 (Dense)             (None, 8)                    72        ['dense_4[0][0]',             
                                                                     'dense_4[1][0]']             
                                                                                            

In [53]:
from tensorflow.keras import layers, Model

# Assuming 'input_features' is already defined
inputs = layers.Input(shape=(input_features,))

# Shared layers
shared_layer1 = layers.Dense(units=8, activation='relu')
shared_layer2 = layers.Dense(units=8, activation='relu')

# Department branch
department_branch = shared_layer1(inputs)
department_branch = shared_layer2(department_branch)
department_hidden = layers.Dense(units=8, activation='relu')(department_branch)
department_output = layers.Dense(units=3, activation='softmax', name='department_output')(department_hidden)

# Attrition branch
attrition_branch = shared_layer1(inputs)
attrition_branch = shared_layer2(attrition_branch)
attrition_hidden = layers.Dense(units=8, activation='relu')(attrition_branch)
attrition_output = layers.Dense(units=2, activation='softmax', name='attrition_output')(attrition_hidden)

# Recreate the model
model = Model(inputs=inputs, outputs=[department_output, attrition_output])

# Compile the model (you may need to adjust the loss and metrics)
model.compile(optimizer='adam',
              loss={'department_output': 'categorical_crossentropy',
                    'attrition_output': 'categorical_crossentropy'},
              metrics=['accuracy'])

In [54]:
# Check the shapes and types of your input and target data
print("X_train_scaled shape:", X_train_scaled.shape)
print("X_train_scaled type:", type(X_train_scaled))
print("encoded_train_array shape:", encoded_train_array.shape)
print("encoded_train_array type:", type(encoded_train_array))
print("encoded_attrition_train shape:", encoded_attrition_train.shape)
print("encoded_attrition_train type:", type(encoded_attrition_train))

# If the data is not in NumPy array format, convert it
import numpy as np

if not isinstance(X_train_scaled, np.ndarray):
    X_train_scaled = np.array(X_train_scaled)

if not isinstance(encoded_train_array, np.ndarray):
    encoded_train_array = encoded_train_array.toarray()  # Assuming it's a sparse matrix

if not isinstance(encoded_attrition_train, np.ndarray):
    encoded_attrition_train = encoded_attrition_train.toarray()  # Assuming it's a sparse matrix

# Verify that the data is not empty
print("X_train_scaled is empty:", X_train_scaled.size == 0)
print("encoded_train_array is empty:", encoded_train_array.size == 0)
print("encoded_attrition_train is empty:", encoded_attrition_train.size == 0)

# Train the model, assign the History object to a new variable
history = model.fit(X_train_scaled,
                    {'department_output': encoded_train_array, 'attrition_output': encoded_attrition_train},
                    epochs=100,
                    shuffle=True,
                    verbose=2)





X_train_scaled shape: (1102, 10)
X_train_scaled type: <class 'numpy.ndarray'>
encoded_train_array shape: (1102, 3)
encoded_train_array type: <class 'numpy.ndarray'>
encoded_attrition_train shape: (1102, 2)
encoded_attrition_train type: <class 'numpy.ndarray'>
X_train_scaled is empty: False
encoded_train_array is empty: False
encoded_attrition_train is empty: False
Epoch 1/100
35/35 - 2s - loss: 1.6382 - department_output_loss: 1.1051 - attrition_output_loss: 0.5331 - department_output_accuracy: 0.3893 - attrition_output_accuracy: 0.8131 - 2s/epoch - 44ms/step
Epoch 2/100
35/35 - 0s - loss: 1.5086 - department_output_loss: 1.0326 - attrition_output_loss: 0.4760 - department_output_accuracy: 0.6515 - attrition_output_accuracy: 0.8394 - 85ms/epoch - 2ms/step
Epoch 3/100
35/35 - 0s - loss: 1.4286 - department_output_loss: 0.9793 - attrition_output_loss: 0.4493 - department_output_accuracy: 0.6615 - attrition_output_accuracy: 0.8394 - 72ms/epoch - 2ms/step
Epoch 4/100
35/35 - 0s - loss: 1.3

In [57]:
# Evaluate the model with the testing data
results = model.evaluate(X_test_scaled,
                         {'department_output': encoded_test.toarray(),
                          'attrition_output': encoded_attrition_test.toarray()}, # Convert sparse matrices to dense arrays
                         verbose=2)

# Access the metrics as needed
model_loss = results[0]  # Overall loss
department_loss = results[1]  # Loss for department output
department_accuracy = results[2]  # Accuracy for department output
attrition_loss = results[3]  # Loss for attrition output
attrition_accuracy = results[4]  # Accuracy for attrition output

print("Overall Loss:", model_loss)
print("Department Loss:", department_loss)
print("Department Accuracy:", department_accuracy)
print("Attrition Loss:", attrition_loss)
print("Attrition Accuracy:", attrition_accuracy)


12/12 - 0s - loss: 1.2167 - department_output_loss: 0.7490 - attrition_output_loss: 0.4677 - department_output_accuracy: 0.6332 - attrition_output_accuracy: 0.8261 - 136ms/epoch - 11ms/step
Overall Loss: 1.2167096138000488
Department Loss: 0.749003529548645
Department Accuracy: 0.4677060544490814
Attrition Loss: 0.633152186870575
Attrition Accuracy: 0.8260869383811951


In [59]:
# Evaluate the model with the testing data
results = model.evaluate(X_test_scaled,
                         {'department_output': encoded_test.toarray(),
                          'attrition_output': encoded_attrition_test.toarray()}, # Convert sparse matrices to dense arrays
                         verbose=2)

# Access the metrics as needed
model_loss = results[0]  # Overall loss
department_loss = results[1]  # Loss for department output
department_accuracy = results[2]  # Accuracy for department output
attrition_loss = results[3]  # Loss for attrition output
attrition_accuracy = results[4]  # Accuracy for attrition output

# Print the accuracy for both department and attrition
print(f"Department Accuracy: {department_accuracy}, Attrition Accuracy: {attrition_accuracy}")

12/12 - 0s - loss: 1.2167 - department_output_loss: 0.7490 - attrition_output_loss: 0.4677 - department_output_accuracy: 0.6332 - attrition_output_accuracy: 0.8261 - 81ms/epoch - 7ms/step
Department Accuracy: 0.4677060544490814, Attrition Accuracy: 0.8260869383811951


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. It would seem that accuracy is not the best metric to use on this data. This is because the data is imbalanced, with the majority of the data being of the negative class. This means that a model that predicts all data points as the negative class would have a high accuracy, but would not be useful. Instead, we could use precision and recall to evaluate the model.
2. The activation function I chose for the output layer is the sigmoid function. This is because the output layer is a binary classification problem, and the sigmoid function is well-suited for binary classification problems.
3. A few ways that this model might be improved include:
    - Using a more complex model, such as a deep neural network with more layers and neurons.
    - Tuning the hyperparameters of the model, such as the learning rate, batch size, and number of epochs.
    - Using techniques such as data augmentation to increase the size of the training data.
    - Using techniques such as dropout to prevent overfitting.
    - Using techniques such as batch normalization to improve the training of the model.