## Part 1: Preprocessing

In [2]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [3]:
# Determine the number of unique values in each column.
attrition_df.nunique()

Age                         43
Attrition                    2
BusinessTravel               3
Department                   3
DistanceFromHome            29
Education                    5
EducationField               6
EnvironmentSatisfaction      4
HourlyRate                  71
JobInvolvement               4
JobLevel                     5
JobRole                      9
JobSatisfaction              4
MaritalStatus                3
NumCompaniesWorked          10
OverTime                     2
PercentSalaryHike           15
PerformanceRating            2
RelationshipSatisfaction     4
StockOptionLevel             4
TotalWorkingYears           40
TrainingTimesLastYear        7
WorkLifeBalance              4
YearsAtCompany              37
YearsInCurrentRole          19
YearsSinceLastPromotion     16
YearsWithCurrManager        18
dtype: int64

In [4]:
# Check to see if dataframe has any null values
display(attrition_df.isnull().sum())

Age                         0
Attrition                   0
BusinessTravel              0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EnvironmentSatisfaction     0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
NumCompaniesWorked          0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64

In [5]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df[['Attrition', 'Department']]
y_df.head()


Unnamed: 0,Attrition,Department
0,Yes,Sales
1,No,Research & Development
2,Yes,Research & Development
3,No,Research & Development
4,No,Research & Development


In [6]:
#Generate list of column names
attrition_df.columns


Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EnvironmentSatisfaction', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [8]:
# Create a list of at least 10 column names to use as X data
selected_cols = ['Age', 'Education', 'EducationField',
                 'JobRole', 'JobSatisfaction', 'PerformanceRating',
                 'TotalWorkingYears', 'TrainingTimesLastYear',
                 'YearsAtCompany', 'YearsInCurrentRole',
                 'YearsSinceLastPromotion']

# Create X_df using your selected columns
X_df = attrition_df[selected_cols]
display(X_df.head())

# Show the data types for X_df
display(X_df.dtypes)


Unnamed: 0,Age,Education,EducationField,JobRole,JobSatisfaction,PerformanceRating,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion
0,41,2,Life Sciences,Sales Executive,4,3,8,0,6,4,0
1,49,1,Life Sciences,Research Scientist,2,4,10,3,10,7,1
2,37,2,Other,Laboratory Technician,3,3,7,3,0,0,0
3,33,4,Life Sciences,Research Scientist,3,3,8,3,8,7,3
4,27,1,Medical,Laboratory Technician,2,3,6,3,2,2,2


Age                         int64
Education                   int64
EducationField             object
JobRole                    object
JobSatisfaction             int64
PerformanceRating           int64
TotalWorkingYears           int64
TrainingTimesLastYear       int64
YearsAtCompany              int64
YearsInCurrentRole          int64
YearsSinceLastPromotion     int64
dtype: object

In [9]:
# Display X_df column values
for cols in X_df.columns:
  print(f"-------------- {cols} --------------")
  print(X_df[cols].unique())
  print("--------------------------------------------")
  print("\n")

-------------- Age --------------
[41 49 37 33 27 32 59 30 38 36 35 29 31 34 28 22 53 24 21 42 44 46 39 43
 50 26 48 55 45 56 23 51 40 54 58 20 25 19 57 52 47 18 60]
--------------------------------------------


-------------- Education --------------
[2 1 4 3 5]
--------------------------------------------


-------------- EducationField --------------
['Life Sciences' 'Other' 'Medical' 'Marketing' 'Technical Degree'
 'Human Resources']
--------------------------------------------


-------------- JobRole --------------
['Sales Executive' 'Research Scientist' 'Laboratory Technician'
 'Manufacturing Director' 'Healthcare Representative' 'Manager'
 'Sales Representative' 'Research Director' 'Human Resources']
--------------------------------------------


-------------- JobSatisfaction --------------
[4 2 3 1]
--------------------------------------------


-------------- PerformanceRating --------------
[3 4]
--------------------------------------------


-------------- TotalWorkingYea

In [10]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, random_state=42)

# Display X_train, X_test, y_train, y_test after splitting
display(X_train.head())
display(X_train.shape)

display(y_train.head())
display(y_train.shape)

display(X_test.head())
display(X_test.shape)

display(y_test.head())
display(y_test.shape)


Unnamed: 0,Age,Education,EducationField,JobRole,JobSatisfaction,PerformanceRating,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion
1343,29,3,Life Sciences,Laboratory Technician,1,3,11,2,3,2,1
1121,36,4,Life Sciences,Sales Executive,3,3,15,5,1,0,0
1048,34,3,Other,Sales Executive,1,3,15,2,13,9,3
1393,27,3,Marketing,Sales Executive,4,3,7,5,7,7,0
527,32,3,Marketing,Sales Executive,4,3,10,2,10,7,0


(1102, 11)

Unnamed: 0,Attrition,Department
1343,No,Research & Development
1121,No,Sales
1048,No,Sales
1393,No,Sales
527,No,Sales


(1102, 2)

Unnamed: 0,Age,Education,EducationField,JobRole,JobSatisfaction,PerformanceRating,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion
1041,28,3,Medical,Sales Executive,1,3,6,4,5,4,1
184,53,2,Medical,Manufacturing Director,1,3,5,3,4,2,1
1222,24,1,Human Resources,Human Resources,3,3,1,2,1,0,0
67,45,3,Life Sciences,Research Scientist,1,3,25,2,1,0,0
220,36,2,Life Sciences,Laboratory Technician,2,3,16,3,13,11,3


(368, 11)

Unnamed: 0,Attrition,Department
1041,No,Sales
184,No,Research & Development
1222,Yes,Human Resources
67,No,Research & Development
220,No,Research & Development


(368, 2)

In [11]:
# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Preprocess X_train "EducationField" and "JobRole" columns (one-hot encoding)
categorical_cols = ['EducationField', 'JobRole']

# Create Encoder
categorical_encoder = OneHotEncoder(drop='first', sparse_output=False)

# Fit and transform X_train categorical columns
categorical_encoded = categorical_encoder.fit_transform(X_train[categorical_cols])
#display(categorical_encoded)

# Get Encoded Column names
enc_categorical_cols = categorical_encoder.get_feature_names_out(categorical_cols)
#display(enc_categorical_cols)

# Create Encoded dataframe
df_categorical_encoded = pd.DataFrame(categorical_encoded, columns=enc_categorical_cols)
#display(df_categorical_encoded.head())

# Reindex X_train before concatenate to align with df_categorical_encoded DataFrame
X_train_reindexed = X_train.reset_index(drop=True)

# Concatenate the encoded columns to the original DataFrame
X_train_encoded = pd.concat([X_train_reindexed, df_categorical_encoded], axis=1)

# Drop the original "EducationField" and "JobRole" columns
X_train_encoded = X_train_encoded.drop(['EducationField', 'JobRole'], axis=1)

# Display Dataframe and shape
display(X_train_encoded.head())
display(X_train_encoded.shape)

# Check to see if dataframes contain any nulls
#display(X_train_encoded.isnull().sum())




Unnamed: 0,Age,Education,JobSatisfaction,PerformanceRating,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,EducationField_Life Sciences,...,EducationField_Other,EducationField_Technical Degree,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative
0,29,3,1,3,11,2,3,2,1,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,36,4,3,3,15,5,1,0,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,34,3,1,3,15,2,13,9,3,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,27,3,4,3,7,5,7,7,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,32,3,4,3,10,2,10,7,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


(1102, 22)

In [12]:
# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary

# Preprocess X_test "EducationField" and "JobRole" columns (one-hot encoding)
# Transform X_test categorical columns
test_categorical_encoded = categorical_encoder.transform(X_test[categorical_cols])
#display(test_categorical_encoded)

# Create Encoded dataframe
df_test_categorical_encoded = pd.DataFrame(test_categorical_encoded, columns=enc_categorical_cols)
#display(df_test_categorical_encoded.head())

# Reindex X_test before concatenate to align with df_categorical_encoded DataFrame
X_test_reindexed = X_test.reset_index(drop=True)

# Concatenate the encoded columns to the original DataFrame
X_test_encoded = pd.concat([X_test_reindexed, df_test_categorical_encoded], axis=1)

# Drop the original "EducationField" and "JobRole" columns
X_test_encoded = X_test_encoded.drop(['EducationField', 'JobRole'], axis=1)

# Display Dataframe and shape
display(X_test_encoded.head())
display(X_test_encoded.shape)

# Check to see if dataframes contain any nulls
#display(X_test_encoded.isnull().sum())

Unnamed: 0,Age,Education,JobSatisfaction,PerformanceRating,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,EducationField_Life Sciences,...,EducationField_Other,EducationField_Technical Degree,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative
0,28,3,1,3,6,4,5,4,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,53,2,1,3,5,3,4,2,1,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,24,1,3,3,1,2,1,0,0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,45,3,1,3,25,2,1,0,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,36,2,2,3,16,3,13,11,3,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


(368, 22)

In [13]:
# Display X_train and X_test datatypes
display(X_train_encoded.dtypes)
display(X_test_encoded.dtypes)


Age                                  int64
Education                            int64
JobSatisfaction                      int64
PerformanceRating                    int64
TotalWorkingYears                    int64
TrainingTimesLastYear                int64
YearsAtCompany                       int64
YearsInCurrentRole                   int64
YearsSinceLastPromotion              int64
EducationField_Life Sciences       float64
EducationField_Marketing           float64
EducationField_Medical             float64
EducationField_Other               float64
EducationField_Technical Degree    float64
JobRole_Human Resources            float64
JobRole_Laboratory Technician      float64
JobRole_Manager                    float64
JobRole_Manufacturing Director     float64
JobRole_Research Director          float64
JobRole_Research Scientist         float64
JobRole_Sales Executive            float64
JobRole_Sales Representative       float64
dtype: object

Age                                  int64
Education                            int64
JobSatisfaction                      int64
PerformanceRating                    int64
TotalWorkingYears                    int64
TrainingTimesLastYear                int64
YearsAtCompany                       int64
YearsInCurrentRole                   int64
YearsSinceLastPromotion              int64
EducationField_Life Sciences       float64
EducationField_Marketing           float64
EducationField_Medical             float64
EducationField_Other               float64
EducationField_Technical Degree    float64
JobRole_Human Resources            float64
JobRole_Laboratory Technician      float64
JobRole_Manager                    float64
JobRole_Manufacturing Director     float64
JobRole_Research Director          float64
JobRole_Research Scientist         float64
JobRole_Sales Executive            float64
JobRole_Sales Representative       float64
dtype: object

In [14]:
# Display X_train and X_test columns
display(X_train_encoded.columns)
display(X_test_encoded.columns)

Index(['Age', 'Education', 'JobSatisfaction', 'PerformanceRating',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'YearsAtCompany',
       'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'EducationField_Life Sciences', 'EducationField_Marketing',
       'EducationField_Medical', 'EducationField_Other',
       'EducationField_Technical Degree', 'JobRole_Human Resources',
       'JobRole_Laboratory Technician', 'JobRole_Manager',
       'JobRole_Manufacturing Director', 'JobRole_Research Director',
       'JobRole_Research Scientist', 'JobRole_Sales Executive',
       'JobRole_Sales Representative'],
      dtype='object')

Index(['Age', 'Education', 'JobSatisfaction', 'PerformanceRating',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'YearsAtCompany',
       'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'EducationField_Life Sciences', 'EducationField_Marketing',
       'EducationField_Medical', 'EducationField_Other',
       'EducationField_Technical Degree', 'JobRole_Human Resources',
       'JobRole_Laboratory Technician', 'JobRole_Manager',
       'JobRole_Manufacturing Director', 'JobRole_Research Director',
       'JobRole_Research Scientist', 'JobRole_Sales Executive',
       'JobRole_Sales Representative'],
      dtype='object')

In [15]:
# Create a StandardScaler
std_scaler = StandardScaler()
cols_to_scale = ['Age', 'Education', 'JobSatisfaction',
                 'TotalWorkingYears', 'TrainingTimesLastYear', 'YearsAtCompany',
                 'YearsInCurrentRole', 'YearsSinceLastPromotion']

# Fit the StandardScaler to the training data
X_train_encoded_scaled = X_train_encoded[cols_to_scale]
X_train_scaled = std_scaler.fit_transform(X_train_encoded_scaled)
#display(X_train_scaled)
df_X_train_scaled = pd.DataFrame(X_train_scaled, columns=cols_to_scale)
display(df_X_train_scaled)

# Scale the training and testing data
X_test_encoded_scaled = X_test_encoded[cols_to_scale]
X_test_scaled = std_scaler.transform(X_test_encoded_scaled)
#display(X_test_scaled)
df_X_test_scaled = pd.DataFrame(X_test_scaled, columns=cols_to_scale)
display(df_X_test_scaled)


Unnamed: 0,Age,Education,JobSatisfaction,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion
0,-0.852159,0.094018,-1.569331,-0.014296,-0.615111,-0.642460,-0.603137,-0.360548
1,-0.093088,1.053349,0.251093,0.498037,1.695749,-0.970918,-1.155867,-0.682269
2,-0.309965,0.094018,-1.569331,0.498037,-0.615111,0.999830,1.331416,0.282892
3,-1.069036,0.094018,1.161305,-0.526629,1.695749,0.014456,0.778687,-0.682269
4,-0.526843,0.094018,1.161305,-0.142379,-0.615111,0.507143,0.778687,-0.682269
...,...,...,...,...,...,...,...,...
1097,-0.201527,0.094018,0.251093,-0.142379,0.155176,0.507143,1.331416,1.248052
1098,0.449105,0.094018,-0.659119,-0.014296,0.155176,-0.642460,-0.603137,-0.360548
1099,-1.611229,1.053349,1.161305,-1.295129,1.695749,-1.135147,-1.155867,-0.682269
1100,-0.852159,-0.865313,-0.659119,-0.142379,-0.615111,-0.478231,-0.326772,-0.682269


Unnamed: 0,Age,Education,JobSatisfaction,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion
0,-0.960597,0.094018,-1.569331,-0.654713,0.925462,-0.314002,-0.050408,-0.360548
1,1.750369,-0.865313,-1.569331,-0.782796,0.155176,-0.478231,-0.603137,-0.360548
2,-1.394352,-1.824643,0.251093,-1.295129,-0.615111,-0.970918,-1.155867,-0.682269
3,0.882860,0.094018,-1.569331,1.778870,-0.615111,-0.970918,-1.155867,-0.682269
4,-0.093088,-0.865313,-0.659119,0.626121,0.155176,0.999830,1.884145,0.282892
...,...,...,...,...,...,...,...,...
363,-1.177475,-0.865313,1.161305,-0.782796,-0.615111,-0.806689,-0.603137,-0.038828
364,0.015351,0.094018,-0.659119,-0.398546,-0.615111,0.178685,-0.050408,1.569772
365,0.340667,-0.865313,1.161305,-0.270463,0.155176,0.342914,1.055051,0.604612
366,-1.285913,0.094018,-0.659119,-0.654713,0.155176,-0.806689,-1.155867,-0.360548


In [16]:
# Create a LabelEncoder
# Label encoding for binary; one-hot encoding for multiple categories

pr_encoder = LabelEncoder()
# Column to Label Encode 'PerformanceRating'

# Fit_transform the LabelEncoder to the training data
df_X_train_scaled['PerformanceRating'] = pr_encoder.fit_transform(X_train_encoded[['PerformanceRating']])
display(df_X_train_scaled)
display(df_X_train_scaled.shape)
#display(df_X_train_scaled['PerformanceRating'].value_counts())

# LabelEncode to the test data
df_X_test_scaled['PerformanceRating'] = pr_encoder.transform(X_test_encoded[['PerformanceRating']])
display(df_X_test_scaled)
display(df_X_test_scaled.shape)
#display(df_X_test_scaled['PerformanceRating'].value_counts())

  y = column_or_1d(y, warn=True)


Unnamed: 0,Age,Education,JobSatisfaction,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,PerformanceRating
0,-0.852159,0.094018,-1.569331,-0.014296,-0.615111,-0.642460,-0.603137,-0.360548,0
1,-0.093088,1.053349,0.251093,0.498037,1.695749,-0.970918,-1.155867,-0.682269,0
2,-0.309965,0.094018,-1.569331,0.498037,-0.615111,0.999830,1.331416,0.282892,0
3,-1.069036,0.094018,1.161305,-0.526629,1.695749,0.014456,0.778687,-0.682269,0
4,-0.526843,0.094018,1.161305,-0.142379,-0.615111,0.507143,0.778687,-0.682269,0
...,...,...,...,...,...,...,...,...,...
1097,-0.201527,0.094018,0.251093,-0.142379,0.155176,0.507143,1.331416,1.248052,0
1098,0.449105,0.094018,-0.659119,-0.014296,0.155176,-0.642460,-0.603137,-0.360548,0
1099,-1.611229,1.053349,1.161305,-1.295129,1.695749,-1.135147,-1.155867,-0.682269,0
1100,-0.852159,-0.865313,-0.659119,-0.142379,-0.615111,-0.478231,-0.326772,-0.682269,0


(1102, 9)

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Unnamed: 0,Age,Education,JobSatisfaction,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,PerformanceRating
0,-0.960597,0.094018,-1.569331,-0.654713,0.925462,-0.314002,-0.050408,-0.360548,0
1,1.750369,-0.865313,-1.569331,-0.782796,0.155176,-0.478231,-0.603137,-0.360548,0
2,-1.394352,-1.824643,0.251093,-1.295129,-0.615111,-0.970918,-1.155867,-0.682269,0
3,0.882860,0.094018,-1.569331,1.778870,-0.615111,-0.970918,-1.155867,-0.682269,0
4,-0.093088,-0.865313,-0.659119,0.626121,0.155176,0.999830,1.884145,0.282892,0
...,...,...,...,...,...,...,...,...,...
363,-1.177475,-0.865313,1.161305,-0.782796,-0.615111,-0.806689,-0.603137,-0.038828,0
364,0.015351,0.094018,-0.659119,-0.398546,-0.615111,0.178685,-0.050408,1.569772,0
365,0.340667,-0.865313,1.161305,-0.270463,0.155176,0.342914,1.055051,0.604612,0
366,-1.285913,0.094018,-0.659119,-0.654713,0.155176,-0.806689,-1.155867,-0.360548,0


(368, 9)

In [17]:
# Combine all columns back together to create X_train and X_test

# Create list of the OneHotEncoder columns
ohe_df_cols = ['EducationField_Life Sciences', 'EducationField_Marketing',
               'EducationField_Medical', 'EducationField_Other',
               'EducationField_Technical Degree', 'JobRole_Human Resources',
               'JobRole_Laboratory Technician', 'JobRole_Manager',
               'JobRole_Manufacturing Director', 'JobRole_Research Director',
               'JobRole_Research Scientist', 'JobRole_Sales Executive',
               'JobRole_Sales Representative']

# Create X_train and X_test dataframes with the OneHotEncoder columns
df_X_train_ohe = X_train_encoded[ohe_df_cols]
df_X_test_ohe = X_test_encoded[ohe_df_cols]

# Concatenate the encoded columns to the scaled X_train and X_test DataFrames
X_train_clean = pd.concat([df_X_train_scaled, df_X_train_ohe], axis=1)
display(X_train_clean)
display(X_train_clean.shape)

X_test_clean = pd.concat([df_X_test_scaled, df_X_test_ohe], axis=1)
display(X_test_clean)
display(X_train_clean.shape)

Unnamed: 0,Age,Education,JobSatisfaction,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,PerformanceRating,EducationField_Life Sciences,...,EducationField_Other,EducationField_Technical Degree,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative
0,-0.852159,0.094018,-1.569331,-0.014296,-0.615111,-0.642460,-0.603137,-0.360548,0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.093088,1.053349,0.251093,0.498037,1.695749,-0.970918,-1.155867,-0.682269,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.309965,0.094018,-1.569331,0.498037,-0.615111,0.999830,1.331416,0.282892,0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,-1.069036,0.094018,1.161305,-0.526629,1.695749,0.014456,0.778687,-0.682269,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-0.526843,0.094018,1.161305,-0.142379,-0.615111,0.507143,0.778687,-0.682269,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1097,-0.201527,0.094018,0.251093,-0.142379,0.155176,0.507143,1.331416,1.248052,0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1098,0.449105,0.094018,-0.659119,-0.014296,0.155176,-0.642460,-0.603137,-0.360548,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1099,-1.611229,1.053349,1.161305,-1.295129,1.695749,-1.135147,-1.155867,-0.682269,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1100,-0.852159,-0.865313,-0.659119,-0.142379,-0.615111,-0.478231,-0.326772,-0.682269,0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


(1102, 22)

Unnamed: 0,Age,Education,JobSatisfaction,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,PerformanceRating,EducationField_Life Sciences,...,EducationField_Other,EducationField_Technical Degree,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative
0,-0.960597,0.094018,-1.569331,-0.654713,0.925462,-0.314002,-0.050408,-0.360548,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.750369,-0.865313,-1.569331,-0.782796,0.155176,-0.478231,-0.603137,-0.360548,0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,-1.394352,-1.824643,0.251093,-1.295129,-0.615111,-0.970918,-1.155867,-0.682269,0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.882860,0.094018,-1.569331,1.778870,-0.615111,-0.970918,-1.155867,-0.682269,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-0.093088,-0.865313,-0.659119,0.626121,0.155176,0.999830,1.884145,0.282892,0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
363,-1.177475,-0.865313,1.161305,-0.782796,-0.615111,-0.806689,-0.603137,-0.038828,0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
364,0.015351,0.094018,-0.659119,-0.398546,-0.615111,0.178685,-0.050408,1.569772,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
365,0.340667,-0.865313,1.161305,-0.270463,0.155176,0.342914,1.055051,0.604612,0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
366,-1.285913,0.094018,-0.659119,-0.654713,0.155176,-0.806689,-1.155867,-0.360548,0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


(1102, 22)

In [18]:
# Display y_train column values
for cols in y_train.columns:
  print(f"-------------- y_train {cols} --------------")
  print(y_train[cols].unique())
  print("--------------------------------------------")
  print("\n")
# Display y_test column values
for cols in y_test.columns:
  print(f"-------------- y_test {cols} --------------")
  print(y_test[cols].unique())
  print("--------------------------------------------")
  print("\n")

-------------- y_train Attrition --------------
['No' 'Yes']
--------------------------------------------


-------------- y_train Department --------------
['Research & Development' 'Sales' 'Human Resources']
--------------------------------------------


-------------- y_test Attrition --------------
['No' 'Yes']
--------------------------------------------


-------------- y_test Department --------------
['Sales' 'Research & Development' 'Human Resources']
--------------------------------------------




In [19]:
# Create a OneHotEncoder for the Department column

# Get value counts for Department values:  'Sales', 'Research & Development', 'Human Resources'
display(y_train['Department'].value_counts())

# Create a OneHotEncoder for the Department column
department_encoder = OneHotEncoder(drop='first', sparse_output=False)

# Fit_transform the encoder to the training data
#y_train_reindex = y_train.reset_index(drop=True)
y_train_department_encoded = department_encoder.fit_transform(y_train[['Department']])
department_columns = department_encoder.get_feature_names_out(['Department'])

# Create new variable by applying the encoder
# to the training data
y_train_department = pd.DataFrame(y_train_department_encoded, columns=department_columns)
display(y_train_department.head())

# Transform the encoder to the testing data
#y_test_reindex = y_test.reset_index(drop=True)
y_test_department_encoded = department_encoder.transform(y_test[['Department']])

# Create new variable by applying the encoder
# to the testing data
y_test_department = pd.DataFrame(y_test_department_encoded, columns=department_columns)
display(y_test_department.head())


Research & Development    721
Sales                     336
Human Resources            45
Name: Department, dtype: int64

Unnamed: 0,Department_Research & Development,Department_Sales
0,1.0,0.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,0.0,1.0


Unnamed: 0,Department_Research & Development,Department_Sales
0,0.0,1.0
1,1.0,0.0
2,0.0,0.0
3,1.0,0.0
4,1.0,0.0


In [20]:
# Create a OneHotEncoder for the Attrition column

# Get value counts for Attrition values:  'No' 'Yes'
display(y_train['Attrition'].value_counts())

# Create a OneHotEncoder for the Attrition column
attrition_encoder = OneHotEncoder(drop='if_binary', sparse_output=False)

# Fit_transform the encoder to the training data
y_train_attrition_encoded = attrition_encoder.fit_transform(y_train[['Attrition']])
attrition_columns = attrition_encoder.get_feature_names_out(['Attrition'])

# Create new variable by applying the encoder
# to the training data
y_train_attrition = pd.DataFrame(y_train_attrition_encoded, columns=attrition_columns)
display(y_train_attrition.head())

# Transform the encoder to the testing data
y_test_attrition_encoded = attrition_encoder.transform(y_test[['Attrition']])

# Create new variable by applying the encoder
# to the testing data
y_test_attrition = pd.DataFrame(y_test_attrition_encoded, columns=attrition_columns)
display(y_test_attrition.head())


No     913
Yes    189
Name: Attrition, dtype: int64

Unnamed: 0,Attrition_Yes
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


Unnamed: 0,Attrition_Yes
0,0.0
1,0.0
2,1.0
3,0.0
4,0.0


## Create, Compile, and Train the Model

In [24]:
# Find the number of columns in the X training data
X_cols = X_train_clean.shape[1]

# Create the input layer
input_layer = layers.Input(shape=(X_cols,), name='input_features')

# Create at least two shared layers
shared_layer1 = layers.Dense(64, activation='relu')(input_layer)
shared_layer2 = layers.Dense(128, activation='relu')(shared_layer1)

In [25]:
# Create a branch for Department
# with a hidden layer and an output layer

# Create the hidden layer
department_dense = layers.Dense(32, activation='relu')(shared_layer2)

# Create the output layer
department_output = layers.Dense(len(y_train_department.columns),
                                 activation='softmax',
                                 name='department_output')(department_dense)


In [26]:
# Create a branch for Attrition
# with a hidden layer and an output layer

# Create the hidden layer
attrition_dense = layers.Dense(32, activation='relu')(shared_layer2)

# Create the output layer
attrition_output = layers.Dense(len(y_train_attrition.columns),
                                 activation='sigmoid',
                                 name='attrition_output')(attrition_dense)


In [28]:
# Create the model
model = Model(inputs=input_layer, outputs=[department_output, attrition_output])

# Compile the model
model.compile(optimizer='adam',
              loss={'department_output': 'categorical_crossentropy', 'attrition_output': 'binary_crossentropy'},
              metrics={'department_output': 'accuracy', 'attrition_output': 'accuracy'})


# Summarize the model
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_features (InputLayer  [(None, 22)]                 0         []                            
 )                                                                                                
                                                                                                  
 dense (Dense)               (None, 64)                   1472      ['input_features[0][0]']      
                                                                                                  
 dense_1 (Dense)             (None, 128)                  8320      ['dense[0][0]']               
                                                                                                  
 dense_2 (Dense)             (None, 32)                   4128      ['dense_1[0][0]']         

In [29]:
# Train the model
model.fit(
    X_train_clean,
    {
        'department_output': y_train_department,
        'attrition_output': y_train_attrition
    },
    epochs=100,
    batch_size=32,
    validation_split=0.2
)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x7cb1f7129150>

In [30]:
# Evaluate the model with the testing data
test_results = model.evaluate(X_test_clean, {'department_output': y_test_department, 'attrition_output': y_test_attrition})
display(model.metrics_names)
display(test_results)



['loss',
 'department_output_loss',
 'attrition_output_loss',
 'department_output_accuracy',
 'attrition_output_accuracy']

[389.507080078125,
 389.0963439941406,
 0.41075506806373596,
 0.426630437374115,
 0.8695651888847351]

In [31]:
# Print the accuracy for both department and attrition
print(f"Department Accuracy: {test_results[3]}")
print(f"Attrition Accuracy: {test_results[4]}")

Department Accuracy: 0.426630437374115
Attrition Accuracy: 0.8695651888847351


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1.

Accuracy is the correct metric to use for this because there is balance in the data.  R&D is 65%, Sales is 30% and HR is 4%.  The accuracy score being ~80% works for predicting a department that would be a good fit for an employee.  If this is done incorrectly, there is a relatively low cost due to the employee being part of the decision process.


2.

Department Output - This predicts three departments.  For multi-class classifications, softmax is recommended, so that was my choice.

Attribution Output - This gave a yes/no output.  for binary classification, sigmoid is recommended, so that was my choice for this output layer.


3.
*   Reduce the number of inputs, or add more for a better classification
*   Increase the neurons quantity
*   Increase the epochs quantity
*   Split the models to better focus on the departments recommended for employees and the employee attrition.





