<a href="https://colab.research.google.com/github/GabeJJ388/neural-network-challenge-2/blob/main/attrition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Part 1: Preprocessing

In [38]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [39]:
# Determine the number of unique values in each column
attrition_df.nunique()

Unnamed: 0,0
Age,43
Attrition,2
BusinessTravel,3
Department,3
DistanceFromHome,29
Education,5
EducationField,6
EnvironmentSatisfaction,4
HourlyRate,71
JobInvolvement,4


In [40]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df[['Attrition', 'Department']]

In [41]:
# Create a list of at least 10 column names to use as X data
X_data = ['Age', 'BusinessTravel', 'HourlyRate', 'YearsAtCompany', 'DistanceFromHome', 'Education', 'EducationField', 'NumCompaniesWorked', 'EnvironmentSatisfaction', 'HourlyRate']

# Create X_df using your selected columns
X_df = attrition_df[X_data]

# Show the data types for X_df
X_df.dtypes

Unnamed: 0,0
Age,int64
BusinessTravel,object
HourlyRate,int64
YearsAtCompany,int64
DistanceFromHome,int64
Education,int64
EducationField,object
NumCompaniesWorked,int64
EnvironmentSatisfaction,int64
HourlyRate,int64


In [42]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, random_state=42)


In [43]:
X_train['BusinessTravel'].value_counts()

Unnamed: 0_level_0,count
BusinessTravel,Unnamed: 1_level_1
Travel_Rarely,783
Travel_Frequently,213
Non-Travel,106


In [44]:
import numpy as np
# Ensure BusinessTravel is of type object (though OneHotEncoder works on object type too)
X_train['BusinessTravel'] = X_train['BusinessTravel'].astype("object")
X_test['BusinessTravel'] = X_test['BusinessTravel'].astype("object")
X_train.info()

# Initialize OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
business_travel_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop=None)
# Fit encoder on training data
business_travel_encoder.fit(X_train[['BusinessTravel']])

# Get feature names for new columns
encoded_columns = business_travel_encoder.get_feature_names_out(['BusinessTravel'])

# Transform and convert to DataFrame
X_train_encoded = pd.DataFrame(business_travel_encoder.transform(X_train[['BusinessTravel']]),
                               columns=encoded_columns, index=X_train.index)

X_test_encoded = pd.DataFrame(business_travel_encoder.transform(X_test[['BusinessTravel']]),
                              columns=encoded_columns, index=X_test.index)

# Drop the original column and merge the encoded features
X_train = pd.concat([X_train.drop(columns=['BusinessTravel']), X_train_encoded], axis=1)
X_test = pd.concat([X_test.drop(columns=['BusinessTravel']), X_test_encoded], axis=1)

# Check the transformed X_train
X_train.head()



<class 'pandas.core.frame.DataFrame'>
Index: 1102 entries, 1343 to 1126
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Age                      1102 non-null   int64 
 1   BusinessTravel           1102 non-null   object
 2   HourlyRate               1102 non-null   int64 
 3   YearsAtCompany           1102 non-null   int64 
 4   DistanceFromHome         1102 non-null   int64 
 5   Education                1102 non-null   int64 
 6   EducationField           1102 non-null   object
 7   NumCompaniesWorked       1102 non-null   int64 
 8   EnvironmentSatisfaction  1102 non-null   int64 
 9   HourlyRate               1102 non-null   int64 
dtypes: int64(8), object(2)
memory usage: 94.7+ KB


Unnamed: 0,Age,HourlyRate,YearsAtCompany,DistanceFromHome,Education,EducationField,NumCompaniesWorked,EnvironmentSatisfaction,HourlyRate.1,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely
1343,29,59,3,7,3,Life Sciences,3,4,59,0.0,0.0,1.0
1121,36,73,1,1,4,Life Sciences,6,2,73,0.0,0.0,1.0
1048,34,81,13,3,3,Other,3,4,81,0.0,0.0,1.0
1393,27,44,7,9,3,Marketing,1,4,44,0.0,0.0,1.0
527,32,55,10,10,3,Marketing,1,4,55,0.0,0.0,1.0


In [45]:
X_train['EducationField'].value_counts()

Unnamed: 0_level_0,count
EducationField,Unnamed: 1_level_1
Life Sciences,456
Medical,349
Marketing,117
Technical Degree,93
Other,68
Human Resources,19


In [46]:
# Ensure BusinessTravel is of type object (though OneHotEncoder works on object type too)
X_train['EducationField'] = X_train['EducationField'].astype("object")
X_test['EducationField'] = X_test['EducationField'].astype("object")
X_train.info()

# Initialize OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
business_travel_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop=None)
# Fit encoder on training data
business_travel_encoder.fit(X_train[['EducationField']])

# Get feature names for new columns
encoded_columns = business_travel_encoder.get_feature_names_out(['EducationField'])

# Transform and convert to DataFrame
X_train_encoded = pd.DataFrame(business_travel_encoder.transform(X_train[['EducationField']]),
                               columns=encoded_columns, index=X_train.index)

X_test_encoded = pd.DataFrame(business_travel_encoder.transform(X_test[['EducationField']]),
                              columns=encoded_columns, index=X_test.index)

# Drop the original column and merge the encoded features
X_train = pd.concat([X_train.drop(columns=['EducationField']), X_train_encoded], axis=1)
X_test = pd.concat([X_test.drop(columns=['EducationField']), X_test_encoded], axis=1)

# Check the transformed X_train
X_train.head()

<class 'pandas.core.frame.DataFrame'>
Index: 1102 entries, 1343 to 1126
Data columns (total 12 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Age                               1102 non-null   int64  
 1   HourlyRate                        1102 non-null   int64  
 2   YearsAtCompany                    1102 non-null   int64  
 3   DistanceFromHome                  1102 non-null   int64  
 4   Education                         1102 non-null   int64  
 5   EducationField                    1102 non-null   object 
 6   NumCompaniesWorked                1102 non-null   int64  
 7   EnvironmentSatisfaction           1102 non-null   int64  
 8   HourlyRate                        1102 non-null   int64  
 9   BusinessTravel_Non-Travel         1102 non-null   float64
 10  BusinessTravel_Travel_Frequently  1102 non-null   float64
 11  BusinessTravel_Travel_Rarely      1102 non-null   float64
dtypes: float

Unnamed: 0,Age,HourlyRate,YearsAtCompany,DistanceFromHome,Education,NumCompaniesWorked,EnvironmentSatisfaction,HourlyRate.1,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,EducationField_Human Resources,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,EducationField_Other,EducationField_Technical Degree
1343,29,59,3,7,3,3,4,59,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1121,36,73,1,1,4,6,2,73,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1048,34,81,13,3,3,3,4,81,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1393,27,44,7,9,3,1,4,44,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
527,32,55,10,10,3,1,4,55,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [47]:
# Create a StandardScaler
standard_scaler = StandardScaler()

# Fit the StandardScaler to the training data
X_Scaler = standard_scaler.fit(X_train)

# Scale the training and testing data
X_train_scale = X_Scaler.transform(X_train)
X_test_scale = X_Scaler.transform(X_test)

In [48]:
# Create a OneHotEncoder for the Department column
department_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop=None)

# Fit the encoder to the training data
department_encoder.fit(np.array(y_train['Department']).reshape(-1, 1))

# Create two new variables by applying the encoder
# to the training and testing data
y_department_train = department_encoder.transform(np.array(y_train['Department']).reshape(-1, 1))
y_department_test = department_encoder.transform(np.array(y_test['Department']).reshape(-1, 1))

y_department_train

array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [49]:
## Create a OneHotEncoder for the Attrition column
attrition = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop=None)

# Fit the encoder to the training data
attrition.fit(np.array(y_train['Attrition']).reshape(-1, 1))

# Create two new variables by applying the encoder
# to the training and testing data
y_attrition_train = attrition.transform(np.array(y_train['Attrition']).reshape(-1, 1))
y_attrition_test = attrition.transform(np.array(y_test['Attrition']).reshape(-1, 1))

y_attrition_train

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [0., 1.],
       [1., 0.],
       [1., 0.]])

## Part 2: Create, Compile, and Train the Model

In [52]:
# Find the number of columns in the X training data.

print (X_train_scale.shape[1])

# Create the input layer
input_layer = layers.Input(shape=(X_train_scale.shape[1],), name = 'input_layer')

# Create at least two shared layers
shared_layer_1 = layers.Dense(units=64, activation='relu', name='shared_layer_1') ( input_layer)
shared_layer_2 = layers.Dense(units=128, activation='relu', name='shared_layer_2') (shared_layer_1)

17


In [53]:
# Create a branch for Department
# with a hidden layer and an output layer
# Create the hidden layer
department_hidden = layers.Dense(units=34, activation='relu', name='department_hidden') (shared_layer_2)
# Create the output layer
department_output = layers.Dense(units= y_department_train.shape[1], activation='softmax', name='department_output') (department_hidden)



In [54]:
# Create a branch for Attrition
# with a hidden layer and an output layer
# Create the hidden layer
# Create the output layer
attrition_hidden = layers.Dense(units=34, activation='relu', name='attrition_hidden') (shared_layer_2)
# Create the output layer
attrition_output = layers.Dense(units= y_attrition_train.shape[1], activation='softmax', name='attrition_output') (attrition_hidden)




In [56]:
# Create the model
model = Model(inputs=input_layer, outputs={"department_output": department_output, "attrition_output": attrition_output})

# Compile the model
# Compile the model
model.compile(optimizer='adam',
              loss={
                  'department_output': 'categorical_crossentropy',
                  'attrition_output': 'binary_crossentropy'
              },
              metrics={
                  'department_output': 'accuracy',
                  'attrition_output': 'accuracy'
              })
# Summarize the model
model.summary()

In [57]:
# Train the model
fit_model = model.fit(X_train_scale, {"department_output": y_department_train, "attrition_output": y_attrition_train}, epochs=100)

Epoch 1/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - attrition_output_accuracy: 0.7754 - attrition_output_loss: 0.5863 - department_output_accuracy: 0.5700 - department_output_loss: 0.9180 - loss: 1.5047
Epoch 2/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - attrition_output_accuracy: 0.8270 - attrition_output_loss: 0.4665 - department_output_accuracy: 0.7649 - department_output_loss: 0.6482 - loss: 1.1146
Epoch 3/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - attrition_output_accuracy: 0.8173 - attrition_output_loss: 0.4535 - department_output_accuracy: 0.7805 - department_output_loss: 0.5671 - loss: 1.0204
Epoch 4/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - attrition_output_accuracy: 0.8324 - attrition_output_loss: 0.4135 - department_output_accuracy: 0.7649 - department_output_loss: 0.5759 - loss: 0.9897
Epoch 5/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━

In [59]:
# Evaluate the model with the testing data
evaluation = model.evaluate(X_test_scale, {"department_output": y_department_test, "attrition_output": y_attrition_test})
print(evaluation)

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - attrition_output_accuracy: 0.7946 - attrition_output_loss: 1.0953 - department_output_accuracy: 0.6426 - department_output_loss: 2.2273 - loss: 3.3082
[3.531686305999756, 2.4061663150787354, 1.2198630571365356, 0.7853260636329651, 0.616847813129425]


In [60]:
# Print the accuracy for both department and attrition
print(f"Department Accuracy: {evaluation[3]}")
print(f"Attrition Accuracy: {evaluation[4]}")

Department Accuracy: 0.7853260636329651
Attrition Accuracy: 0.616847813129425


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1.
2.
3.

In [61]:
#1) The accuracy is the best metric to use and the Department Accuracy is better than the Attrition Accuracy.
    #Using the accuracy shows how efficient the model would be at determining attricion.
#2) For the activation functions I used Relu, This was the most fiting activation for binary values.
#3) The accuracy needs to be improved. My model showed high accuracy during it's activation and training,
    #I believe that feeding the model much more data regarding attrition would result in better accuracy