## Part 1: Preprocessing

In [50]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers
from sklearn.preprocessing import OneHotEncoder


#  Import and read the attrition data
attrition_df = pd.read_csv('attrition.csv')
attrition_df.head()


Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [6]:
# Determine the number of unique values in each column.
attrition_df.nunique()



Age                         43
Attrition                    2
BusinessTravel               3
Department                   3
DistanceFromHome            29
Education                    5
EducationField               6
EnvironmentSatisfaction      4
HourlyRate                  71
JobInvolvement               4
JobLevel                     5
JobRole                      9
JobSatisfaction              4
MaritalStatus                3
NumCompaniesWorked          10
OverTime                     2
PercentSalaryHike           15
PerformanceRating            2
RelationshipSatisfaction     4
StockOptionLevel             4
TotalWorkingYears           40
TrainingTimesLastYear        7
WorkLifeBalance              4
YearsAtCompany              37
YearsInCurrentRole          19
YearsSinceLastPromotion     16
YearsWithCurrManager        18
dtype: int64

In [13]:
# Load your dataset
df = pd.read_csv('attrition.csv')
# Create y_df with the Attrition and Department columns
y_df = df[['Attrition', 'Department']]

In [26]:

# Display all columns in the dataset to confirm their names
print("Available columns in the dataset:")
print(df.columns.tolist())  # Shows all column names in a clear list format

# Define the list of at least 10 column names to use as X data
# Update these based on the exact names displayed from the print statement above
selected_columns = ['Age', 'YearsAtCompany', 'EducationField', 'JobRole', 
                    'MaritalStatus', 'PerformanceRating', 'WorkLifeBalance', 
                    'DistanceFromHome', 'EnvironmentSatisfaction', 'JobSatisfaction']  # Replace with valid column names

# Create X_df using the selected columns
X_df = df[selected_columns]

# Show the data types for X_df
print("\nData types for each column in X_df:")
print(X_df.dtypes)


Available columns in the dataset:
['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome', 'Education', 'EducationField', 'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus', 'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']

Data types for each column in X_df:
Age                         int64
YearsAtCompany              int64
EducationField             object
JobRole                    object
MaritalStatus              object
PerformanceRating           int64
WorkLifeBalance             int64
DistanceFromHome            int64
EnvironmentSatisfaction     int64
JobSatisfaction             int64
dtype: object


In [27]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split



In [29]:
# Identify non-numeric columns in X_df
non_numeric_columns = X_df.select_dtypes(include=['object']).columns
print("Non-numeric columns:", non_numeric_columns)

# Convert categorical columns to numeric
# Option A: Use one-hot encoding for columns with a manageable number of unique values
X_df = pd.get_dummies(X_df, columns=non_numeric_columns, drop_first=True)

# Display the transformed X_df to ensure all columns are now numeric
print("Data types after encoding:")
print(X_df.dtypes)



Non-numeric columns: Index([], dtype='object')
Data types after encoding:
Age                                int64
YearsAtCompany                     int64
PerformanceRating                  int64
WorkLifeBalance                    int64
DistanceFromHome                   int64
EnvironmentSatisfaction            int64
JobSatisfaction                    int64
EducationField_Life Sciences        bool
EducationField_Marketing            bool
EducationField_Medical              bool
EducationField_Other                bool
EducationField_Technical Degree     bool
JobRole_Human Resources             bool
JobRole_Laboratory Technician       bool
JobRole_Manager                     bool
JobRole_Manufacturing Director      bool
JobRole_Research Director           bool
JobRole_Research Scientist          bool
JobRole_Sales Executive             bool
JobRole_Sales Representative        bool
MaritalStatus_Married               bool
MaritalStatus_Single                bool
dtype: object


In [31]:
# Split the data into training and testing sets
X_train, X_test = train_test_split(X_df_numeric, test_size=0.2, random_state=42)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler to the training data
scaler.fit(X_train)

# Scale the training and testing data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Optional: Convert the scaled arrays back to DataFrames for easier handling and interpretation
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

# Display the scaled data to confirm
print("Scaled training data (first 5 rows):")
print(X_train_scaled.head())
print("\nScaled testing data (first 5 rows):")
print(X_test_scaled.head())

Scaled training data (first 5 rows):
        Age  YearsAtCompany  PerformanceRating  WorkLifeBalance  \
0 -1.388559       -0.974263          -0.429290         0.357435   
1 -2.040738       -1.138573          -0.429290         0.357435   
2 -0.845077       -0.645643          -0.429290         0.357435   
3  0.241886       -0.317023           2.329427         0.357435   
4 -0.627685        0.504527          -0.429290         0.357435   

   DistanceFromHome  EnvironmentSatisfaction  JobSatisfaction  \
0          1.440396                 0.279706        -1.582336   
1         -0.522699                -0.639104         1.152834   
2          1.317703                 1.198515         1.152834   
3          0.336155                 1.198515        -0.670613   
4          1.317703                -0.639104         0.241111   

   EducationField_Life Sciences  EducationField_Marketing  \
0                     -0.846634                 -0.343323   
1                      1.181149                

In [42]:

# Separate features and targets
X_df = df.drop(columns=['Attrition', 'Department'])  # Features excluding target columns
y_df = df[['Attrition', 'Department']]               # Target columns

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2, random_state=42)

# Initialize the OneHotEncoder for the Department column
department_encoder = OneHotEncoder(sparse_output=False, drop='first')  # `sparse_output` used instead of `sparse`

# Fit the encoder to the Department column in the training data
department_encoder.fit(y_train[['Department']])

# Transform the Department column in both the training and testing sets
department_train_encoded = department_encoder.transform(y_train[['Department']])
department_test_encoded = department_encoder.transform(y_test[['Department']])

# Convert the encoded arrays to DataFrames for easier handling and interpretation
department_train_encoded = pd.DataFrame(department_train_encoded, columns=department_encoder.get_feature_names_out(['Department']))
department_test_encoded = pd.DataFrame(department_test_encoded, columns=department_encoder.get_feature_names_out(['Department']))

# Optional: Display the encoded Department columns to confirm
print("Encoded Department column for training data (first 5 rows):")
print(department_train_encoded.head())
print("\nEncoded Department column for testing data (first 5 rows):")
print(department_test_encoded.head())


Encoded Department column for training data (first 5 rows):
   Department_Research & Development  Department_Sales
0                                1.0               0.0
1                                1.0               0.0
2                                0.0               1.0
3                                1.0               0.0
4                                1.0               0.0

Encoded Department column for testing data (first 5 rows):
   Department_Research & Development  Department_Sales
0                                0.0               1.0
1                                1.0               0.0
2                                0.0               0.0
3                                1.0               0.0
4                                1.0               0.0


In [43]:
# Initialize the OneHotEncoder for the Attrition column
attrition_encoder = OneHotEncoder(sparse_output=False, drop='first')  # drop='first' to create a binary variable for yes/no

# Fit the encoder to the Attrition column in the training data
attrition_encoder.fit(y_train[['Attrition']])

# Transform the Attrition column in both the training and testing sets
attrition_train_encoded = attrition_encoder.transform(y_train[['Attrition']])
attrition_test_encoded = attrition_encoder.transform(y_test[['Attrition']])

# Convert the encoded arrays to DataFrames for easier handling and interpretation
attrition_train_encoded = pd.DataFrame(attrition_train_encoded, columns=attrition_encoder.get_feature_names_out(['Attrition']))
attrition_test_encoded = pd.DataFrame(attrition_test_encoded, columns=attrition_encoder.get_feature_names_out(['Attrition']))

# Display the encoded Attrition columns to confirm
print("Encoded Attrition column for training data (first 5 rows):")
print(attrition_train_encoded.head())
print("\nEncoded Attrition column for testing data (first 5 rows):")
print(attrition_test_encoded.head())


Encoded Attrition column for training data (first 5 rows):
   Attrition_Yes
0            0.0
1            0.0
2            0.0
3            0.0
4            0.0

Encoded Attrition column for testing data (first 5 rows):
   Attrition_Yes
0            0.0
1            0.0
2            1.0
3            0.0
4            0.0


## Create, Compile, and Train the Model

In [51]:

#  Find the number of columns in X_train
input_shape = X_train.shape[1]  # Number of features in the training set

# Create the input layer
input_layer = Input(shape=(input_shape,))

# Create shared layers
# Add a few dense layers (adjust the number of neurons as needed)
shared_layer1 = Dense(64, activation='relu')(input_layer)
shared_layer2 = Dense(32, activation='relu')(shared_layer1)



In [54]:
# Create a branch for the Department prediction
# Hidden layer for the Department branch
department_hidden = Dense(16, activation='relu')(shared_layer2)  # Adjust the number of neurons as needed

# Output layer for the Department branch
# Assuming `department_train_encoded.shape[1]` is the number of unique Department categories
department_output = Dense(department_train_encoded.shape[1], activation='softmax', name='department')(department_hidden)


In [55]:
# Create a branch for the Attrition prediction
# Hidden layer for the Attrition branch
attrition_hidden = Dense(16, activation='relu')(shared_layer2)  # Adjust the number of neurons as needed

# Output layer for the Attrition branch
# Since Attrition is a binary classification, use 1 neuron with sigmoid activation
attrition_output = Dense(1, activation='sigmoid', name='attrition')(attrition_hidden)


In [57]:
# Create the model
# Define the model with inputs and outputs for both branches
model = Model(inputs=input_layer, outputs=[department_output, attrition_output])

# Compile the model
# Use different losses for each output: categorical_crossentropy for multi-class and binary_crossentropy for binary
model.compile(optimizer='adam',
              loss={'department': 'categorical_crossentropy', 'attrition': 'binary_crossentropy'},
              metrics={'department': 'accuracy', 'attrition': 'accuracy'})

# Summarize the model
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 25)]         0           []                               
                                                                                                  
 dense_6 (Dense)                (None, 64)           1664        ['input_4[0][0]']                


                                                                                                  
 dense_7 (Dense)                (None, 32)           2080        ['dense_6[0][0]']                
                                                                                                  
 dense_10 (Dense)               (None, 16)           528         ['dense_7[0][0]']                
                                                                                                  
 dense_11 (Dense)               (None, 16)           528         ['dense_7[0][0]']                
                                                                                                  
 department (Dense)             (None, 2)            34          ['dense_10[0][0]']               
                                                                                                  
 attrition (Dense)              (None, 1)            17          ['dense_11[0][0]']               
          

In [None]:

#Train the model
history = model.fit(
    X_train_scaled,
    {'department': department_train_encoded, 'attrition': attrition_train_encoded},
    epochs=30,
    batch_size=32,
    validation_split=0.2,
    verbose=1
)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [63]:
# Evaluate the model using the testing data
evaluation = model.evaluate(
    X_test_scaled,
    {'department': department_test_encoded, 'attrition': attrition_test_encoded},
    verbose=1
)

# Print the evaluation results
print("\nEvaluation Results:")
print(f"Department Loss: {evaluation[1]}")
print(f"Department Accuracy: {evaluation[3]}")
print(f"Attrition Loss: {evaluation[2]}")
print(f"Attrition Accuracy: {evaluation[4]}")



Evaluation Results:
Department Loss: 8.879435539245605
Department Accuracy: 0.9149659872055054
Attrition Loss: 8.829614639282227
Attrition Accuracy: 0.8707482814788818


In [64]:
# Evaluate the model using the testing data
evaluation = model.evaluate(
    X_test_scaled,
    {'department': department_test_encoded, 'attrition': attrition_test_encoded},
    verbose=1
)

# Print the accuracy for both Department and Attrition
department_accuracy = evaluation[3]  # Department accuracy
attrition_accuracy = evaluation[4]   # Attrition accuracy

print("\nModel Accuracy:")
print(f"Department Accuracy: {department_accuracy * 100:.2f}%")
print(f"Attrition Accuracy: {attrition_accuracy * 100:.2f}%")



Model Accuracy:
Department Accuracy: 91.50%
Attrition Accuracy: 87.07%


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. Accuracy is good, but it may not be enough if we have more of one type than another (like if most people stay instead of leave). In that case, precision (how correct we are) and recall (how many we catch) can help us understand better.
2. For Department, I used softmax because it helps choose one answer out of many (like picking the right department).
For Attrition, I used sigmoid because it’s best for yes-or-no questions (like “Will they leave?”).
3. Stop Overfitting: Add dropout layers that ignore some parts of the model during training to make it better at generalizing.
Tune Settings: Try different numbers for layers, learning rate, etc., to see what works best.
Balance Data: If one category has more data than another, we can adjust the model to pay more attention to the smaller category.
Add Good Features: Adding useful information can help the model learn better.





