## Part 1: Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers
from sklearn.preprocessing import OneHotEncoder

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [2]:
# Determine the number of unique values in each column.
attrition_df.nunique()

Age                         43
Attrition                    2
BusinessTravel               3
Department                   3
DistanceFromHome            29
Education                    5
EducationField               6
EnvironmentSatisfaction      4
HourlyRate                  71
JobInvolvement               4
JobLevel                     5
JobRole                      9
JobSatisfaction              4
MaritalStatus                3
NumCompaniesWorked          10
OverTime                     2
PercentSalaryHike           15
PerformanceRating            2
RelationshipSatisfaction     4
StockOptionLevel             4
TotalWorkingYears           40
TrainingTimesLastYear        7
WorkLifeBalance              4
YearsAtCompany              37
YearsInCurrentRole          19
YearsSinceLastPromotion     16
YearsWithCurrManager        18
dtype: int64

In [3]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df[["Attrition","Department"]]
display(y_df.head())



Unnamed: 0,Attrition,Department
0,Yes,Sales
1,No,Research & Development
2,Yes,Research & Development
3,No,Research & Development
4,No,Research & Development


In [4]:
display(y_df["Attrition"].value_counts())
display(y_df["Department"].value_counts())

Attrition
No     1233
Yes     237
Name: count, dtype: int64

Department
Research & Development    961
Sales                     446
Human Resources            63
Name: count, dtype: int64

In [5]:
# Create a list of at least 10 column names to use as X data
x_column_list = ["Education",
                 "Age",
                 "DistanceFromHome",
                 "JobSatisfaction",
                 "OverTime",
                 "StockOptionLevel",
                 "WorkLifeBalance",
                 "YearsAtCompany",
                 "YearsSinceLastPromotion",
                 "NumCompaniesWorked"]


# Create X_df using your selected columns
X_df = attrition_df[x_column_list]


# Show the data types for X_df
display(X_df.dtypes)


Education                   int64
Age                         int64
DistanceFromHome            int64
JobSatisfaction             int64
OverTime                   object
StockOptionLevel            int64
WorkLifeBalance             int64
YearsAtCompany              int64
YearsSinceLastPromotion     int64
NumCompaniesWorked          int64
dtype: object

In [6]:
X_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Education                1470 non-null   int64 
 1   Age                      1470 non-null   int64 
 2   DistanceFromHome         1470 non-null   int64 
 3   JobSatisfaction          1470 non-null   int64 
 4   OverTime                 1470 non-null   object
 5   StockOptionLevel         1470 non-null   int64 
 6   WorkLifeBalance          1470 non-null   int64 
 7   YearsAtCompany           1470 non-null   int64 
 8   YearsSinceLastPromotion  1470 non-null   int64 
 9   NumCompaniesWorked       1470 non-null   int64 
dtypes: int64(9), object(1)
memory usage: 115.0+ KB


In [7]:
sum(X_df["OverTime"].isnull())

0

In [8]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, random_state=42)

In [9]:
#reset my indices so future concat steps work properly and do not add nan values
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [10]:
print(len(X_train["OverTime"]))
print(sum(X_train["OverTime"].isnull()))

1102
0


In [11]:
print(len(X_test["OverTime"]))
print(sum(X_test["OverTime"].isnull()))

368
0


In [12]:
# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary
X_train["OverTime"].value_counts()


OverTime
No     780
Yes    322
Name: count, dtype: int64

In [13]:
X_test["OverTime"].value_counts()

OverTime
No     274
Yes     94
Name: count, dtype: int64

In [14]:
#Make the encoder
X_overtime_encoder = OneHotEncoder(sparse_output=False)

X_overtime_encoder.fit(X_train[["OverTime"]])

#Encode X_train
train_overtime_encoded = X_overtime_encoder.transform(X_train[["OverTime"]])

#Encode X_test
test_overtime_encoded = X_overtime_encoder.transform(X_test[["OverTime"]])

#Make a dataframe with the overtime endcoded data for train and test
overtime_columns = X_overtime_encoder.get_feature_names_out(["OverTime"])

train_overtime_encoded_df = pd.DataFrame(train_overtime_encoded, columns=overtime_columns)
print("Train_OverTime_Encoded_Df")
display(train_overtime_encoded_df.head())
print(f"No: {sum(train_overtime_encoded_df['OverTime_No'])}")
print(f"Yes: {sum(train_overtime_encoded_df['OverTime_Yes'])}")
print(len(train_overtime_encoded_df))

print()

test_overtime_encoded_df = pd.DataFrame(test_overtime_encoded, columns=overtime_columns)
print("Test_OverTime_Encoded_Df")
display(test_overtime_encoded_df.head())
print(f"No: {sum(test_overtime_encoded_df['OverTime_No'])}")
print(f"Yes: {sum(test_overtime_encoded_df['OverTime_Yes'])}")
print(len(test_overtime_encoded_df))

Train_OverTime_Encoded_Df


Unnamed: 0,OverTime_No,OverTime_Yes
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0


No: 780.0
Yes: 322.0
1102

Test_OverTime_Encoded_Df


Unnamed: 0,OverTime_No,OverTime_Yes
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0


No: 274.0
Yes: 94.0
368


In [15]:
print(f"Number of columns in X_train: {len(X_train.columns)}")
print(len(X_train.index))

print(f"Number of columns in X_test: {len(X_test.columns)}")
print(len(X_test.index))

Number of columns in X_train: 10
1102
Number of columns in X_test: 10
368


In [16]:
X_train.head()

Unnamed: 0,Education,Age,DistanceFromHome,JobSatisfaction,OverTime,StockOptionLevel,WorkLifeBalance,YearsAtCompany,YearsSinceLastPromotion,NumCompaniesWorked
0,3,29,7,1,No,0,3,3,1,3
1,4,36,1,3,No,0,3,1,0,6
2,3,34,3,1,No,0,3,13,3,3
3,3,27,9,4,No,0,3,7,0,1
4,3,32,10,4,No,0,2,10,0,1


In [17]:
#drop the original columns
X_train_no_overtime = X_train.drop(columns=["OverTime"])
X_test_no_overtime = X_test.drop(columns=["OverTime"])

print(f"Number of columns in X_train_no_overtime: {len(X_train_no_overtime.columns)}")
print(len(X_train_no_overtime.index))

print(f"Number of columns in X_test_no_overtime: {len(X_test_no_overtime.columns)}")
print(len(X_test_no_overtime.index))



Number of columns in X_train_no_overtime: 9
1102
Number of columns in X_test_no_overtime: 9
368


In [18]:
X_train_no_overtime.head()

Unnamed: 0,Education,Age,DistanceFromHome,JobSatisfaction,StockOptionLevel,WorkLifeBalance,YearsAtCompany,YearsSinceLastPromotion,NumCompaniesWorked
0,3,29,7,1,0,3,3,1,3
1,4,36,1,3,0,3,1,0,6
2,3,34,3,1,0,3,13,3,3
3,3,27,9,4,0,3,7,0,1
4,3,32,10,4,0,2,10,0,1


In [19]:
#combine the encoded columns with the original data, replace the correct columns

X_train_recombined = pd.concat([
    X_train_no_overtime,
    train_overtime_encoded_df],
    axis='columns'
)

X_test_recombined = pd.concat([
    X_test_no_overtime,
    test_overtime_encoded_df],
    axis='columns'
)

print(f"Number of columns in X_train_recombined: {len(X_train_recombined.columns)}")
print(len(X_train_recombined.index))

print(f"Number of columns in X_test_recombined: {len(X_test_recombined.columns)}")
print(len(X_test_recombined.index))

Number of columns in X_train_recombined: 11
1102
Number of columns in X_test_recombined: 11
368


In [20]:
print("X_train_recombined")
display(X_train_recombined.head())
print(f"No: {sum(X_train_recombined['OverTime_No'])}")
print(f"Yes: {sum(X_train_recombined['OverTime_Yes'])}")

X_train_recombined


Unnamed: 0,Education,Age,DistanceFromHome,JobSatisfaction,StockOptionLevel,WorkLifeBalance,YearsAtCompany,YearsSinceLastPromotion,NumCompaniesWorked,OverTime_No,OverTime_Yes
0,3,29,7,1,0,3,3,1,3,1.0,0.0
1,4,36,1,3,0,3,1,0,6,1.0,0.0
2,3,34,3,1,0,3,13,3,3,1.0,0.0
3,3,27,9,4,0,3,7,0,1,1.0,0.0
4,3,32,10,4,0,2,10,0,1,1.0,0.0


No: 780.0
Yes: 322.0


In [21]:
print("X_test_recombined")
display(X_test_recombined.head())
print(f"No: {sum(X_test_recombined['OverTime_No'])}")
print(f"Yes: {sum(X_test_recombined['OverTime_Yes'])}")

X_test_recombined


Unnamed: 0,Education,Age,DistanceFromHome,JobSatisfaction,StockOptionLevel,WorkLifeBalance,YearsAtCompany,YearsSinceLastPromotion,NumCompaniesWorked,OverTime_No,OverTime_Yes
0,3,28,5,1,0,3,5,1,0,1.0,0.0
1,2,53,13,1,2,3,4,1,1,1.0,0.0
2,1,24,22,3,1,3,1,0,1,1.0,0.0
3,3,45,7,1,1,3,1,0,2,1.0,0.0
4,2,36,5,2,0,4,13,3,8,1.0,0.0


No: 274.0
Yes: 94.0


In [22]:
# Create a StandardScaler
X_scaler = StandardScaler()


# Fit the StandardScaler to the training data
X_scaler.fit(X_train_recombined)

# Scale the training and testing data
X_train_scaled = X_scaler.transform(X_train_recombined)
X_test_scaled = X_scaler.transform(X_test_recombined)

X_scaled_columns = X_scaler.get_feature_names_out()
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_scaled_columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_scaled_columns)



In [23]:
# Create a OneHotEncoder for the Department column
department_encoder = OneHotEncoder(sparse_output=False)

# Fit the encoder to the training data
department_encoder.fit(y_train[["Department"]])

# Create two new variables by applying the encoder
# to the training and testing data
train_department_encoded = department_encoder.transform(y_train[["Department"]])
test_department_encoded = department_encoder.transform(y_test[["Department"]])

department_encoded_columns = department_encoder.get_feature_names_out()

print(department_encoded_columns)
display(train_department_encoded)
display(test_department_encoded)


['Department_Human Resources' 'Department_Research & Development'
 'Department_Sales']


array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

array([[0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       ...,
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [24]:
# Create a OneHotEncoder for the Attrition column
attrition_encoder = OneHotEncoder(sparse_output=True)

# Fit the encoder to the training data
attrition_encoder.fit(y_train[["Attrition"]])

# Create two new variables by applying the encoder
# to the training and testing data
train_attrition_encoded = attrition_encoder.transform(y_train[["Attrition"]]).todense()
test_attrition_encoded = attrition_encoder.transform(y_test[["Attrition"]]).todense()

attrition_encoded_columns = attrition_encoder.get_feature_names_out()

print(attrition_encoded_columns)
display(train_attrition_encoded)
display(test_attrition_encoded)


['Attrition_No' 'Attrition_Yes']


matrix([[1., 0.],
        [1., 0.],
        [1., 0.],
        ...,
        [0., 1.],
        [1., 0.],
        [1., 0.]])

matrix([[1., 0.],
        [1., 0.],
        [0., 1.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [0., 1.],
        [0., 1.],
        [1., 0.],
        [0., 1.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [0., 1.],
        [1., 0.],
        [1., 0.],
        [0., 1.],
        [0., 1.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [0., 1.],
        [1., 0.],
        [0., 1.],
        [0., 1.],
        [0., 1.],
        [0., 1.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1

## Create, Compile, and Train the Model

In [25]:
# Find the number of columns in the X training data
input_nodes = len(X_train_scaled_df.columns)

# Create the input layer
input_layer = layers.Input(shape=(input_nodes,), name="input_layer")

# Create at least two shared layers
shared_dense_1 = layers.Dense(11,activation='relu')(input_layer)
shared_dense_2 = layers.Dense(11, activation='relu')(shared_dense_1)


In [26]:
# Create a branch for Department
# with a hidden layer and an output layer

# Create the hidden layer
department_dense = layers.Dense(11, activation='relu')(shared_dense_2)

# Create the output layer
# since we might see other departments that are not currently in our data in the future we use sigmoid since we have to consider each department independently of each other
department_output = layers.Dense(len(department_encoded_columns),
                                 activation='sigmoid',
                                 name='department_output')(department_dense)


In [27]:
# Create a branch for Attrition
# with a hidden layer and an output layer

# Create the hidden layer
attrition_dense = layers.Dense(10, activation='relu')(shared_dense_2)

# Create the output layer
# since we expect binary output and have 2 nodes we use softmax
attrition_output = layers.Dense(len(attrition_encoded_columns),
                                activation='softmax',
                                name='attrition_output')(attrition_dense)


In [28]:
# Create the model
model = Model(inputs=input_layer,
              outputs = [
                  department_output,
                  attrition_output
              ])

# Compile the model
model.compile(optimizer='adam',
              loss={
                  'department_output': 'categorical_crossentropy',
                  'attrition_output': 'binary_crossentropy'
              },
              metrics={
                  'department_output': 'accuracy',
                  'attrition_output': 'accuracy'
              })

# Summarize the model
display(model.summary())

None

In [29]:
# Train the model
model.fit(X_train_scaled,
          {
              'department_output': train_department_encoded,
              'attrition_output': train_attrition_encoded
          },
          epochs=100,
          batch_size=32)


Epoch 1/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - attrition_output_accuracy: 0.7419 - department_output_accuracy: 0.1448 - loss: 1.8867
Epoch 2/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - attrition_output_accuracy: 0.8174 - department_output_accuracy: 0.5145 - loss: 1.6432 
Epoch 3/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - attrition_output_accuracy: 0.8385 - department_output_accuracy: 0.6383 - loss: 1.4135 
Epoch 4/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - attrition_output_accuracy: 0.8212 - department_output_accuracy: 0.6643 - loss: 1.3051 
Epoch 5/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - attrition_output_accuracy: 0.8359 - department_output_accuracy: 0.6422 - loss: 1.2540
Epoch 6/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - attrition_output_accuracy: 0.8328 - departmen

<keras.src.callbacks.history.History at 0x1d9cad17b20>

In [30]:
# Evaluate the model with the testing data
results = model.evaluate(np.array(X_test_scaled),
                         {
                             'department_output': test_department_encoded,
                             'attrition_output': test_attrition_encoded
                         })

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 971us/step - attrition_output_accuracy: 0.8786 - department_output_accuracy: 0.6303 - loss: 1.2301


In [31]:
categories = enumerate(["Department", "Attrition"])

print(results)

for i, cat in enumerate(categories):
    print(f"{cat} accuracy: {results[i+1]}")

[1.205636739730835, 0.8722826242446899, 0.6385869383811951]
(0, 'Department') accuracy: 0.8722826242446899
(1, 'Attrition') accuracy: 0.6385869383811951


In [18]:
# Print the accuracy for both department and attrition
categories = enumerate(["Department", "Attrition"])

print(results)

for i, cat in enumerate(categories):
    print(f"{cat} accuracy: {results[i+1]}")

Department predictions accuracy: 0.5271739363670349
Attrition predictions accuracy: 0.8260869383811951


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. I would say that accuracy is the most important measure becuase we need to know which department the people who are attriting are from, and its imparitive that we correctly guess who is going to attrit so that we do not push out anybody accidentally by treating them as if they were going to leave anyway, while likewise not wasting resources on people who are about to leave.
2. I chose sigmoid for "Department" and softmax for "Attrition". I chose sigmoid for "Department" becuase a person can only belong to one department, therefore they should be considered independently of each other, and if we add a department in the future that our system was not trained on it will show that the person does not belong to any of the existing departments by giving low values to each instead of forcing all the values to add to 1. I chose softmax for "Attrition" because it was a binary output where I had 2 output nodes. Doing it like this we can choose the higher scoring of the two outputs to say if it was a yes or a no.
3. Having more data would probably help, there was not very many people who worked in HR in this dataset and it probably threw off results. Adding more layers might also help, I only added 2 layers shared layers but having more might help the algorithm learn better.