## Part 1: Preprocessing

In [19]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras import layers

# Load the employee attrition dataset from the course-hosted CSV file
attrition_csv_url = 'https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv'
attrition_df = pd.read_csv(attrition_csv_url)

# Preview the first five rows (rendered as the cell output)
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [20]:
# Count the distinct values in every column of the dataset
unique_value_counts = attrition_df.nunique()
unique_value_counts

Age                         43
Attrition                    2
BusinessTravel               3
Department                   3
DistanceFromHome            29
Education                    5
EducationField               6
EnvironmentSatisfaction      4
HourlyRate                  71
JobInvolvement               4
JobLevel                     5
JobRole                      9
JobSatisfaction              4
MaritalStatus                3
NumCompaniesWorked          10
OverTime                     2
PercentSalaryHike           15
PerformanceRating            2
RelationshipSatisfaction     4
StockOptionLevel             4
TotalWorkingYears           40
TrainingTimesLastYear        7
WorkLifeBalance              4
YearsAtCompany              37
YearsInCurrentRole          19
YearsSinceLastPromotion     16
YearsWithCurrManager        18
dtype: int64

In [21]:
# Build the target frame from the two columns the model predicts
target_columns = ['Attrition', 'Department']
y_df = attrition_df[target_columns]


In [22]:
# Feature columns used as model inputs (ten in total)
selected_columns = [
    'Age',
    'DistanceFromHome',
    'Education',
    'EnvironmentSatisfaction',
    'HourlyRate',
    'JobInvolvement',
    'JobLevel',
    'JobRole',
    'MaritalStatus',
    'WorkLifeBalance',
]

# Restrict the full dataset to the chosen feature columns
X_df = attrition_df.loc[:, selected_columns]

# Print each feature's dtype so the non-numeric columns stand out
print(X_df.dtypes)


Age                         int64
DistanceFromHome            int64
Education                   int64
EnvironmentSatisfaction     int64
HourlyRate                  int64
JobInvolvement              int64
JobLevel                    int64
JobRole                    object
MaritalStatus              object
WorkLifeBalance             int64
dtype: object


In [23]:
# Split the data into training and testing sets.
# train_test_split is already imported in the first cell, so it is not re-imported here.
# Both frames are converted to NumPy arrays, so later cells select columns by position.
X = X_df.to_numpy()
y = y_df.to_numpy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the dimensions of each split
for label, split in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(label, split.shape)

X_train shape: (1176, 10)
X_test shape: (294, 10)
y_train shape: (1176, 2)
y_test shape: (294, 2)


In [24]:
# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary
from sklearn.preprocessing import OneHotEncoder

# Columns to one-hot encode. JobRole and MaritalStatus hold strings;
# WorkLifeBalance is an integer column that is treated as a category here.
categorical_columns = ['JobRole', 'MaritalStatus', 'WorkLifeBalance']

# Initialize OneHotEncoder with dense (non-sparse) output; categories not seen
# during fit are encoded as all-zero rows instead of raising.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old name, so try the current keyword first and fall back to the
# old one on installs that predate the rename.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [25]:
# Positions of the categorical columns inside the NumPy feature arrays
categorical_positions = [X_df.columns.get_loc(col) for col in categorical_columns]

# Learn the categories from the training rows only and encode them
X_train_categorical_encoded = encoder.fit_transform(X_train[:, categorical_positions])

# Encode the testing rows with the categories learned from the training rows
X_test_categorical_encoded = encoder.transform(X_test[:, categorical_positions])



In [26]:
# Build the encoded training feature table from X_train itself (not from X_df).
# X_df holds every row of the dataset in its original order, while the one-hot
# array holds only the shuffled training rows; concatenating those two pairs
# unrelated rows, pads the one-hot columns with NaN, and lets test rows into
# the data the scaler is fitted on.
numeric_columns = [col for col in X_df.columns if col not in categorical_columns]
numeric_positions = [X_df.columns.get_loc(col) for col in numeric_columns]

# X_train is an object array (it mixes strings and integers), so restore the
# numeric dtypes recorded in X_df.
X_train_numeric = pd.DataFrame(
    X_train[:, numeric_positions], columns=numeric_columns
).astype(X_df[numeric_columns].dtypes.to_dict())

X_train_onehot = pd.DataFrame(
    X_train_categorical_encoded,
    columns=encoder.get_feature_names_out(categorical_columns),
)

# Both frames carry the default 0..n-1 index, so the axis=1 concat pairs row i
# with row i. Column order (numeric first, one-hot after) matches X_test_encoded.
X_train_encoded = pd.concat([X_train_numeric, X_train_onehot], axis=1)
assert len(X_train_encoded) == len(X_train), "encoded features must have one row per training sample"

# Print the data types after encoding the categorical variables
print(X_train_encoded.dtypes)

Age                                    int64
DistanceFromHome                       int64
Education                              int64
EnvironmentSatisfaction                int64
HourlyRate                             int64
JobInvolvement                         int64
JobLevel                               int64
JobRole_Healthcare Representative    float64
JobRole_Human Resources              float64
JobRole_Laboratory Technician        float64
JobRole_Manager                      float64
JobRole_Manufacturing Director       float64
JobRole_Research Director            float64
JobRole_Research Scientist           float64
JobRole_Sales Executive              float64
JobRole_Sales Representative         float64
MaritalStatus_Divorced               float64
MaritalStatus_Married                float64
MaritalStatus_Single                 float64
WorkLifeBalance_1                    float64
WorkLifeBalance_2                    float64
WorkLifeBalance_3                    float64
WorkLifeBa

In [27]:
# Get the positions of the categorical columns in X_df
categorical_indices = [X_df.columns.get_loc(col) for col in categorical_columns]

# One-hot encode the categorical testing columns with the already-fitted encoder
X_test_categorical_encoded = encoder.transform(X_test[:, categorical_indices])

# Remaining (not yet scaled) columns first, one-hot columns after.
# X_test mixes strings and integers, so the combined array has object dtype.
X_test_numeric = np.delete(X_test, categorical_indices, axis=1)
X_test_encoded = np.hstack([X_test_numeric, X_test_categorical_encoded])


In [28]:
# Create a StandardScaler and fit it on the training features only
scaler = StandardScaler().fit(X_train_encoded)

# Standardize both splits with the statistics learned from the training features
X_train_scaled = scaler.transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)
# NOTE(review): the model cells below train and evaluate on
# X_train_categorical_encoded / X_test_categorical_encoded, not on these scaled
# arrays - confirm whether that is intended.



In [29]:
# Column position of Department inside the y arrays (same order as y_df)
department_position = y_df.columns.get_loc('Department')

# Create a OneHotEncoder for the Department column with dense output.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the current name first and fall back for older installs.
try:
    department_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    department_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# y_train already has its columns in y_df order (Attrition, Department), so it
# is used as-is; re-selecting both columns would only produce an identical copy.

# Fit the encoder to the training data
department_encoder.fit(y_train[:, [department_position]])

# Create two new variables by applying the encoder
# to the training and testing data
y_train_department_encoded = department_encoder.transform(y_train[:, [department_position]])
y_test_department_encoded = department_encoder.transform(y_test[:, [department_position]])





In [30]:
# Column position of Attrition inside the y arrays (same order as y_df).
# Using one looked-up position for fit and transform keeps them consistent.
attrition_position = y_df.columns.get_loc('Attrition')

# Create a OneHotEncoder for the Attrition column with dense output.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the current name first and fall back for older installs.
try:
    attrition_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    attrition_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Fit the encoder to the training data
attrition_encoder.fit(y_train[:, [attrition_position]])

# Create two new variables by applying the encoder
# to the training and testing data
y_train_encoded = attrition_encoder.transform(y_train[:, [attrition_position]])
y_test_encoded = attrition_encoder.transform(y_test[:, [attrition_position]])




## Create, Compile, and Train the Model

In [31]:
from tensorflow.keras.layers import Input, Dense

# Width of the model input = number of columns in the one-hot encoded training array.
# NOTE(review): only the one-hot categorical features are counted here (and
# passed to fit/evaluate below); the numeric columns and the scaled arrays are
# not fed to the model - confirm whether that is intended.
num_input_features = X_train_categorical_encoded.shape[-1]

# Input layer taking one feature vector per sample
input_layer = Input(shape=(num_input_features,))

# Two fully connected layers shared by both output branches
shared_layer_1 = Dense(units=64, activation='relu')(input_layer)
shared_layer_2 = Dense(units=64, activation='relu')(shared_layer_1)

In [32]:
# Department branch: one hidden layer followed by a softmax output layer.
# The output has one unit per distinct Department value in y_df.
num_departments = y_df['Department'].nunique(dropna=False)

# Hidden layer fed by the last shared layer
department_hidden_layer = Dense(units=64, activation='relu')(shared_layer_2)

# Output layer; its name is the key used for this head's loss, metric and targets
department_output_layer = Dense(
    units=num_departments,
    activation='softmax',
    name='department_output',
)(department_hidden_layer)


In [33]:
# Attrition branch: one hidden layer followed by a softmax output layer.
# The output has one unit per distinct Attrition value in y_df.
num_classes = y_df['Attrition'].nunique(dropna=False)

# Hidden layer fed by the last shared layer
attrition_hidden_layer = Dense(units=64, activation='relu')(shared_layer_2)

# Output layer; its name is the key used for this head's loss, metric and targets
attrition_output_layer = Dense(
    units=num_classes,
    activation='softmax',
    name='attrition_output',
)(attrition_hidden_layer)


In [34]:
from tensorflow.keras.optimizers import Adam

# Assemble the two-headed model: one shared input, two softmax outputs
model = Model(
    inputs=input_layer,
    outputs=[department_output_layer, attrition_output_layer],
)

# Both heads use categorical cross-entropy and report accuracy; the dictionary
# keys are the names given to the two output layers.
output_names = ('department_output', 'attrition_output')
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss={name: 'categorical_crossentropy' for name in output_names},
    metrics={name: 'accuracy' for name in output_names},
)

# Summarize the model
model.summary()


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 16)]                 0         []                            
                                                                                                  
 dense_4 (Dense)             (None, 64)                   1088      ['input_2[0][0]']             
                                                                                                  
 dense_5 (Dense)             (None, 64)                   4160      ['dense_4[0][0]']             
                                                                                                  
 dense_6 (Dense)             (None, 64)                   4160      ['dense_5[0][0]']             
                                                                                            

In [42]:
# One-hot targets for each output head, keyed by output-layer name
train_targets = {
    'department_output': y_train_department_encoded,
    'attrition_output': y_train_encoded,
}
validation_targets = {
    'department_output': y_test_department_encoded,
    'attrition_output': y_test_encoded,
}

# Train for 50 epochs in batches of 64, validating on the testing split
model_history = model.fit(
    X_train_categorical_encoded,
    train_targets,
    validation_data=(X_test_categorical_encoded, validation_targets),
    epochs=50,
    batch_size=64,
)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [40]:
# Print the dtype of each array involved in training, one per line
for array in (
    y_train_department_encoded,
    y_train_encoded,
    y_test_department_encoded,
    X_train_categorical_encoded,
    X_test_encoded,
):
    print(array.dtype)

float64
float64
float64
float64
object


In [46]:
# Evaluate the model with the testing data, using the same inputs and
# per-output targets that were passed as validation data during training
evaluation_targets = {
    'department_output': y_test_department_encoded,
    'attrition_output': y_test_encoded,
}
evaluation_results = model.evaluate(X_test_categorical_encoded, evaluation_targets)

# Print the evaluation results
print("Evaluation results:", evaluation_results)

Evaluation results: [0.5653046369552612, 0.09516385942697525, 0.4701407849788666, 0.9523809552192688, 0.8673469424247742]


In [44]:
# Print the accuracy for both department and attrition.
# Positions 3 and 4 of the evaluation list are read as the two accuracies.
department_accuracy, attrition_accuracy = evaluation_results[3:5]
print("Department Output Accuracy:", department_accuracy)
print("Attrition Output Accuracy:", attrition_accuracy)


Department Output Accuracy: 0.9523809552192688
Attrition Output Accuracy: 0.8673469424247742


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

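1. Accuracy alone is not the best metric for this data. When one class is much rarer than the other, a model can score high accuracy while rarely identifying the minority class. Metrics like precision, recall, or F1-score provide a more nuanced evaluation of the model's performance, especially for the minority class.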
2. ReLU (Rectified Linear Unit) for the hidden layers: ReLU is a simple and computationally efficient activation function that has been shown to work well in many deep learning architectures.
Softmax for both output layers: softmax is commonly used for multi-class classification problems where the output is categorical and mutually exclusive, which matches the one-hot encoded Department and Attrition targets.
3. Hyperparameter tuning; trying different architectures, such as adding more layers or adjusting the number of neurons in each layer, to improve performance;
feature engineering;
and ensemble methods.