## Part 1: Preprocessing

In [58]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [59]:
try:
    # Attempt to import the dependencies
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.preprocessing import OneHotEncoder
    import pandas as pd
    import numpy as np
    from tensorflow.keras.models import Model
    from tensorflow.keras import layers
    
    # If the imports are successful, print a success message
    print("All dependencies are installed correctly.")
except ImportError as e:
    # If an ImportError occurs, print an error message with details
    print(f"An error occurred: {e}. Please ensure all dependencies are installed correctly.")

All dependencies are installed correctly.


In [43]:
# Determine the number of unique values in each column and sort them in descending order.
attrition_df.nunique().sort_values(ascending=False)

HourlyRate                  71
Age                         43
TotalWorkingYears           40
YearsAtCompany              37
DistanceFromHome            29
YearsInCurrentRole          19
YearsWithCurrManager        18
YearsSinceLastPromotion     16
PercentSalaryHike           15
NumCompaniesWorked          10
JobRole                      9
TrainingTimesLastYear        7
EducationField               6
JobLevel                     5
Education                    5
EnvironmentSatisfaction      4
JobInvolvement               4
JobSatisfaction              4
RelationshipSatisfaction     4
StockOptionLevel             4
WorkLifeBalance              4
Department                   3
BusinessTravel               3
MaritalStatus                3
OverTime                     2
Attrition                    2
PerformanceRating            2
dtype: int64

In [44]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df[['Attrition', 'Department']]
y_df.head()

Unnamed: 0,Attrition,Department
0,Yes,Sales
1,No,Research & Development
2,Yes,Research & Development
3,No,Research & Development
4,No,Research & Development


In [61]:
# Create a list of at least 10 column names to use as X data
columns_list_for_X_data = ['HourlyRate',
                           'Age',
                           'TotalWorkingYears',
                           'YearsAtCompany',
                           'YearsInCurrentRole',
                           'YearsSinceLastPromotion',
                           'YearsWithCurrManager',
                           'DistanceFromHome',
                           'Education',
                           'JobLevel']
# # Create X_df using your selected columns
X_df = attrition_df[columns_list_for_X_data]

# Show the data types for X_df
print(X_df.dtypes)

HourlyRate                 int64
Age                        int64
TotalWorkingYears          int64
YearsAtCompany             int64
YearsInCurrentRole         int64
YearsSinceLastPromotion    int64
YearsWithCurrManager       int64
DistanceFromHome           int64
Education                  int64
JobLevel                   int64
dtype: object


In [46]:
# Check the new DataFrames
display('Input Features are labled as X_df:', X_df.head())
display('Target Variables are labled as y_df:', y_df.head())

'Input Features are labled as X_df:'

Unnamed: 0,HourlyRate,Age,TotalWorkingYears,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,DistanceFromHome,Education,JobLevel
0,94,41,8,6,4,0,5,1,2,2
1,61,49,10,10,7,1,7,8,1,2
2,92,37,7,0,0,0,0,2,2,1
3,56,33,8,8,7,3,0,3,4,1
4,40,27,6,2,2,2,2,2,1,1


'Target Variables are labled as y_df:'

Unnamed: 0,Attrition,Department
0,Yes,Sales
1,No,Research & Development
2,Yes,Research & Development
3,No,Research & Development
4,No,Research & Development


In [62]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df)

display('X_train:', X_train.head())
display('X_test:', X_test.head())
display('y_train:', y_train.head())
display('y_test:', y_test.head())

'X_train:'

Unnamed: 0,HourlyRate,Age,TotalWorkingYears,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,DistanceFromHome,Education,JobLevel
537,32,27,9,9,7,1,7,10,2,3
716,64,41,21,18,16,0,11,9,3,5
1364,42,28,7,7,7,0,7,1,2,2
1244,78,30,10,9,7,0,7,2,4,1
456,44,31,10,5,4,0,1,7,3,3


'X_test:'

Unnamed: 0,HourlyRate,Age,TotalWorkingYears,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,DistanceFromHome,Education,JobLevel
1238,51,23,3,3,2,1,2,4,1,1
87,96,51,10,4,2,0,3,9,4,1
1290,93,34,11,7,1,0,7,9,4,2
794,45,34,7,6,2,0,4,3,1,2
1070,55,28,5,4,2,1,3,7,3,2


'y_train:'

Unnamed: 0,Attrition,Department
537,No,Research & Development
716,No,Research & Development
1364,No,Sales
1244,No,Research & Development
456,No,Sales


'y_test:'

Unnamed: 0,Attrition,Department
1238,No,Research & Development
87,No,Research & Development
1290,Yes,Research & Development
794,No,Research & Development
1070,No,Sales


In [63]:
# Convert the X data to numeric data types for machine learning model with float32 for memory efficiency.
X_train_numeric_dtypes = X_train.astype('float32')
X_test_numeric_dtypes = X_test.astype('float32')

display('X_train_numeric_dtypes:', X_train_numeric_dtypes.head())
display('X_test_numeric_dtypes:', X_test_numeric_dtypes.head())

'X_train_numeric_dtypes:'

Unnamed: 0,HourlyRate,Age,TotalWorkingYears,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,DistanceFromHome,Education,JobLevel
537,32.0,27.0,9.0,9.0,7.0,1.0,7.0,10.0,2.0,3.0
716,64.0,41.0,21.0,18.0,16.0,0.0,11.0,9.0,3.0,5.0
1364,42.0,28.0,7.0,7.0,7.0,0.0,7.0,1.0,2.0,2.0
1244,78.0,30.0,10.0,9.0,7.0,0.0,7.0,2.0,4.0,1.0
456,44.0,31.0,10.0,5.0,4.0,0.0,1.0,7.0,3.0,3.0


'X_test_numeric_dtypes:'

Unnamed: 0,HourlyRate,Age,TotalWorkingYears,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,DistanceFromHome,Education,JobLevel
1238,51.0,23.0,3.0,3.0,2.0,1.0,2.0,4.0,1.0,1.0
87,96.0,51.0,10.0,4.0,2.0,0.0,3.0,9.0,4.0,1.0
1290,93.0,34.0,11.0,7.0,1.0,0.0,7.0,9.0,4.0,2.0
794,45.0,34.0,7.0,6.0,2.0,0.0,4.0,3.0,1.0,2.0
1070,55.0,28.0,5.0,4.0,2.0,1.0,3.0,7.0,3.0,2.0


In [64]:
# Create a StandardScaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Fit the StandardScaler to the training data
X_scaler = scaler.fit(X_train)

# Scale the training and testing data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [65]:
from sklearn.preprocessing import OneHotEncoder

In [66]:
# Create a OneHotEncoder for the Department column
department_encoder = OneHotEncoder(handle_unknown='ignore')

department_encoder.fit(y_train[['Department']])

# Create two new variables by applying the encoder
# to the training and testing data
encoded_train = department_encoder.transform(y_train[['Department']])
encoded_test = department_encoder.transform(y_test[['Department']])

encoded_train_array = encoded_train.toarray()
encoded_train_array

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       ...,
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [70]:
# Create a OneHotEncoder for the Attrition column
attrition_encoder = OneHotEncoder()

# Fit the encoder to the training data
attrition_encoder = attrition_encoder.fit(y_train[['Attrition']])

# Create two new variables by applying the encoder
# to the training and testing data
encoded_attrition_train = attrition_encoder.transform(y_train[['Attrition']])
encoded_attrition_test = attrition_encoder.transform(y_test[['Attrition']])


## Create, Compile, and Train the Model

In [53]:
# Find the number of columns in the X training data
input_features = len(X_train_scaled[0])

# Create the input layer
inputs = layers.Input(shape=(input_features,))
inputs

# Create at least two shared layers
shared_layer1 = layers.Dense(units=8, activation='relu')
shared_layer2 = layers.Dense(units=8, activation='relu')

In [54]:
# Create a branch for Department with a hidden layer and an output layer
department_branch = shared_layer1(inputs)
department_branch = shared_layer2(department_branch)

# Create the hidden layer
department_hidden = layers.Dense(units=8, activation='relu')(department_branch)

# Create the output layer
department_output = layers.Dense(units=3, activation='softmax', name='department_output')(department_hidden)


In [71]:
# Create a branch for Attrition with a hidden layer and an output layer
attrition_branch = shared_layer1(inputs)
attrition_branch = shared_layer2(attrition_branch)

# Create the hidden layer
attrition_hidden = layers.Dense(units=8, activation='relu')(attrition_branch)

# Create the output layer
attrition_output = layers.Dense(units=2, activation='softmax', name='attrition_output')(attrition_hidden)

In [72]:
# Create the model
model = Model(inputs=inputs, outputs=[department_output, attrition_output])

# Compile the model
model.compile(optimizer='adam',
              loss={'department_output': 'categorical_crossentropy',
                    'attrition_output': 'categorical_crossentropy'},
              metrics=['accuracy'])

# Summarize the model
model.summary()

In [73]:
# Train the model
model = model.fit(X_train_scaled,
                    {'department_output': encoded_train_array, 'attrition_output': encoded_attrition_train},
                    epochs=100,
                    shuffle=True,
                    verbose=2)

# Evaluate the model
model_loss, model_accuracy = model.evaluate(X_test_scaled,
                                             {'department_output': encoded_test.toarray(),
                                              'attrition_output': encoded_attrition_test.toarray()},
                                             verbose=2)

# Print the model loss and accuracy results
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/100


TypeError: Failed to convert elements of SparseTensor(indices=Tensor("data_2:0", shape=(None, 2), dtype=int64), values=Tensor("data_3:0", shape=(None,), dtype=float32), dense_shape=Tensor("data_4:0", shape=(2,), dtype=int64)) to Tensor. Consider casting elements to a supported type. See https://www.tensorflow.org/api_docs/python/tf/dtypes for supported TF dtypes.

In [77]:
# Evaluate the model with the testing data
model_loss, model_accuracy = model.evaluate(X_test_scaled,
                                             {'department_output': encoded_test.toarray(),
                                              'attrition_output': encoded_attrition_test.toarray()},
                                             verbose=2)


ValueError: For a model with multiple outputs, when providing the `metrics` argument as a list, it should have as many entries as the model has outputs. Received:
metrics=['accuracy']
of length 1 whereas the model has 2 outputs.

In [78]:
# Print the accuracy for both department and attrition
print(f"Department Accuracy: {model_accuracy[0]}, Attrition Accuracy: {model_accuracy[1]}")

NameError: name 'model_accuracy' is not defined

# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. It would seem that accuracy is not the best metric to use on this data. This is because the data is imbalanced, with the majority of the data being of the negative class. This means that a model that predicts all data points as the negative class would have a high accuracy, but would not be useful. Instead, we could use precision and recall to evaluate the model.
2. The activation function I chose for the output layer is the sigmoid function. This is because the output layer is a binary classification problem, and the sigmoid function is well-suited for binary classification problems.
3. A few ways that this model might be improved include:
    - Using a more complex model, such as a deep neural network with more layers and neurons.
    - Tuning the hyperparameters of the model, such as the learning rate, batch size, and number of epochs.
    - Using techniques such as data augmentation to increase the size of the training data.
    - Using techniques such as dropout to prevent overfitting.
    - Using techniques such as batch normalization to improve the training of the model.