## Part 1: Preprocessing

In [99]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, InputLayer
from tensorflow.keras import layers

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [36]:
# Determine the number of unique values in each column.
unique_values = attrition_df.nunique()
print(unique_values)

Age                         43
Attrition                    2
BusinessTravel               3
Department                   3
DistanceFromHome            29
Education                    5
EducationField               6
EnvironmentSatisfaction      4
HourlyRate                  71
JobInvolvement               4
JobLevel                     5
JobRole                      9
JobSatisfaction              4
MaritalStatus                3
NumCompaniesWorked          10
OverTime                     2
PercentSalaryHike           15
PerformanceRating            2
RelationshipSatisfaction     4
StockOptionLevel             4
TotalWorkingYears           40
TrainingTimesLastYear        7
WorkLifeBalance              4
YearsAtCompany              37
YearsInCurrentRole          19
YearsSinceLastPromotion     16
YearsWithCurrManager        18
dtype: int64


In [37]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df[['Attrition', 'Department']]
y_df.head()


Unnamed: 0,Attrition,Department
0,Yes,Sales
1,No,Research & Development
2,Yes,Research & Development
3,No,Research & Development
4,No,Research & Development


In [38]:
# Create a list of at least 10 column names to use as X data
selected_columns = ['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EnvironmentSatisfaction', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager']


# Create X_df using your selected columns
X_df = attrition_df[selected_columns]
X_df.head()
# List all columns to confirm names
attrition_df.columns

# Show the data types for X_df
attrition_df.dtypes

Unnamed: 0,0
Age,int64
Attrition,object
BusinessTravel,object
Department,object
DistanceFromHome,int64
Education,int64
EducationField,object
EnvironmentSatisfaction,int64
HourlyRate,int64
JobInvolvement,int64


In [125]:
# Checking class balance in the target variable 'Attrition'
class_counts = y_df['Attrition'].value_counts(normalize=True)
print("Class Distribution of Attrition:\n", class_counts)

# If the classes are imbalanced, suggest using metrics like F1-score in addition to accuracy.
if class_counts.min() < 0.4:
    print("Warning: Detected imbalanced classes. Consider using F1-score alongside accuracy for better evaluation of model performance.")
else:
    print("Classes are reasonably balanced. Accuracy can be an appropriate metric.")

Class Distribution of Attrition:
 Attrition
No     0.838776
Yes    0.161224
Name: proportion, dtype: float64


In [39]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2, random_state=42)


In [40]:
# Convert your X data to numeric data types however you see fit
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)
# Add new code cells as necessary
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

In [41]:
# Create a StandardScaler
scaler = StandardScaler()

# Fit the StandardScaler to the training data
X_train_scaled = scaler.fit_transform(X_train)

# Scale the training and testing data
X_test_scaled = scaler.transform(X_test)


In [67]:
# Create a OneHotEncoder for the Department column
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit the encoder to the training data
X_train_encoded = encoder.fit_transform(X_train)

# Encode the training and testing data
X_test_encoded = encoder.transform(X_test)
# Create two new variables by applying the encoder to the training and testing data
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoder.get_feature_names_out(), index=X_train.index)
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoder.get_feature_names_out(), index=X_test.index)




In [78]:
# Create a OneHotEncoder for the Attrition column
encoder = OneHotEncoder(drop='first', sparse_output=False)
attrition_encoded = encoder.fit_transform(attrition_df[['Attrition']])
attrition_df['Attrition_Encoded'] = attrition_encoded
attrition_df.head() # Display to verify encoding
# Fit the encoder to the training data
X = attrition_df.drop(columns=['Attrition', 'Attrition_Encoded'])
y = attrition_df['Attrition_Encoded']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
categorical_cols = ['BusinessTravel', 'Department', 'EducationField']
encoder = OneHotEncoder(drop='first', sparse_output=False)
X_train_encoded = encoder.fit_transform(X_train[categorical_cols])
X_test_encoded = encoder.transform(X_test[categorical_cols])
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoder.get_feature_names_out(), index=X_train.index)

# Create two new variables by applying the encoder to the training and testing
# data
X_train_final = np.concatenate([X_train.drop(columns=categorical_cols), X_train_encoded], axis=1)
X_test_final = np.concatenate([X_test.drop(columns=categorical_cols), X_test_encoded], axis=1)
print("Training data shape:", X_train_final.shape)
print("Testing data shape:", X_test_final.shape)

Training data shape: (1176, 32)
Testing data shape: (294, 32)


## Create, Compile, and Train the Model

In [90]:
# Step 1: Find the number of columns in the X training data
num_columns_X_train = X_train_final.shape[1]

# Step 2: Initialize the Sequential model
model = Sequential()

# Step 3: Create the input layer
model.add(InputLayer(input_shape=(num_columns_X_train,)))

# Step 4: Create at least two shared layers
model.add(Dense(64, activation='relu'))  # First shared layer
model.add(Dense(32, activation='relu'))  # Second shared layer

# Display the model summary to verify the architecture
model.summary()

In [102]:
# Create a branch for Department
department_input = Input(shape=(1,), name='Department_Input')
# with a hidden layer and an output layer
hidden_layer = Dense(10, activation='relu', name='Department_Hidden_Layer')(department_input)
# Create the output layer
output_layer = Dense(1, activation='sigmoid', name='Department_Output_Layer')(hidden_layer)


In [103]:
# Create a branch for Attrition
attrition_input = Input(shape=(1,), name='Attrition_Input')
# with a hidden layer and an output layer

# Create the hidden layer
attrition_hidden_layer = Dense(10, activation='relu', name='Attrition_Hidden_Layer')(attrition_input)

# Create the output layer
attrition_output_layer = Dense(1, activation='sigmoid', name='Attrition_Output_Layer')(attrition_hidden_layer)


In [105]:
# Input layers for Department and Attrition branches
department_input = Input(shape=(1,), name='Department_Input')
attrition_input = Input(shape=(1,), name='Attrition_Input')

# Hidden and output layers for Department branch
department_hidden_layer = Dense(10, activation='relu', name='Department_Hidden_Layer')(department_input)
department_output_layer = Dense(1, activation='sigmoid', name='Department_Output_Layer')(department_hidden_layer)

# Hidden and output layers for Attrition branch
attrition_hidden_layer = Dense(10, activation='relu', name='Attrition_Hidden_Layer')(attrition_input)
attrition_output_layer = Dense(1, activation='sigmoid', name='Attrition_Output_Layer')(attrition_hidden_layer)




In [135]:
# Train the model



In [128]:
# Evaluate the model with the testing data


In [131]:
# Print the accuracy for both department and attrition


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. Accuracy may not be the best metric for this data, especially for the attrition prediction task. This is because attrition data is often imbalanced—usually, a small percentage of employees leave a company, so a model could have high accuracy by predicting that most employees stay. In this case, metrics like precision, recall, or F1 score might be better for the attrition prediction, as they would offer insights into how well the model identifies actual leavers.

For the department prediction, accuracy might be a reasonable metric if the data is more balanced across departments. However, if certain departments are overrepresented, it could still lead to skewed results.
2. For this branched neural network, two different activation functions are suitable based on the target outputs:

Attrition Prediction (Binary Classification): A sigmoid activation function is ideal because it outputs probabilities between 0 and 1, which is useful for binary classification. The sigmoid function helps in making a clear prediction about whether an employee will leave or not.

Department Prediction (Multi-Class Classification): A softmax activation function is appropriate for the department prediction branch since it outputs a probability distribution across multiple classes. This allows the model to predict the department most likely suited for the employee.
3. Hyperparameter Tuning: Adjusting parameters like learning rate, batch size, and number of epochs could improve model performance.

Additional Layers or Units: Adding more layers or units to the shared and branch-specific layers could enhance the model’s ability to capture complex patterns, particularly if it’s underfitting.

Regularization: Techniques like dropout or L2 regularization can prevent overfitting and improve generalization.

Data Augmentation and Feature Engineering: Adding more meaningful features, engineering new ones, or balancing the dataset could help the model learn more effectively, especially if there is a class imbalance.

Advanced Metrics for Evaluation: Adding metrics like precision, recall, and F1 score for binary classification and possibly AUC-ROC for attrition prediction can give more insights and aid in tuning the model.