## Part 1: Preprocessing

In [72]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [73]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [74]:
# Determine the number of unique values in each column.
attrition_df.nunique()

Age                         43
Attrition                    2
BusinessTravel               3
Department                   3
DistanceFromHome            29
Education                    5
EducationField               6
EnvironmentSatisfaction      4
HourlyRate                  71
JobInvolvement               4
JobLevel                     5
JobRole                      9
JobSatisfaction              4
MaritalStatus                3
NumCompaniesWorked          10
OverTime                     2
PercentSalaryHike           15
PerformanceRating            2
RelationshipSatisfaction     4
StockOptionLevel             4
TotalWorkingYears           40
TrainingTimesLastYear        7
WorkLifeBalance              4
YearsAtCompany              37
YearsInCurrentRole          19
YearsSinceLastPromotion     16
YearsWithCurrManager        18
dtype: int64

In [75]:
# Create y_df with the Attrition and Department columns

y_df = attrition_df[['Attrition', 'Department']]


In [76]:
# Create a list of at least 10 column names to use as X data

feature_columns = ['Education', 'Age', 'DistanceFromHome', 'JobSatisfaction', 
                   'OverTime', 'StockOptionLevel', 'WorkLifeBalance', 
                   'YearsAtCompany', 'YearsSinceLastPromotion', 'NumCompaniesWorked']


# Create X_df using your selected columns

X_df = attrition_df[feature_columns]

# Show the data types for X_df
print(X_df.dtypes)


Education                   int64
Age                         int64
DistanceFromHome            int64
JobSatisfaction             int64
OverTime                   object
StockOptionLevel            int64
WorkLifeBalance             int64
YearsAtCompany              int64
YearsSinceLastPromotion     int64
NumCompaniesWorked          int64
dtype: object


In [77]:

X_df = pd.get_dummies(attrition_df[feature_columns], columns=['OverTime'])
y_df = attrition_df[['Attrition', 'Department']]  




In [78]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2, random_state=42)

# verify the split
print(f"Training feature set size: {X_train.shape}")
print(f"Testing feature set size: {X_test.shape}")
print(f"Training target set size: {y_train.shape}")
print(f"Testing target set size: {y_test.shape}")


Training feature set size: (1176, 11)
Testing feature set size: (294, 11)
Training target set size: (1176, 2)
Testing target set size: (294, 2)


In [79]:
# Convert Boolean to Integer
X_df['OverTime_No'] = X_df['OverTime_No'].astype(int)
X_df['OverTime_Yes'] = X_df['OverTime_Yes'].astype(int)
X_train['OverTime_No'] = X_df['OverTime_No'].astype(int)
X_train['OverTime_Yes'] = X_df['OverTime_Yes'].astype(int)

# Verify
print(X_df.dtypes)
print(X_train.dtypes)


Education                  int64
Age                        int64
DistanceFromHome           int64
JobSatisfaction            int64
StockOptionLevel           int64
WorkLifeBalance            int64
YearsAtCompany             int64
YearsSinceLastPromotion    int64
NumCompaniesWorked         int64
OverTime_No                int32
OverTime_Yes               int32
dtype: object
Education                  int64
Age                        int64
DistanceFromHome           int64
JobSatisfaction            int64
StockOptionLevel           int64
WorkLifeBalance            int64
YearsAtCompany             int64
YearsSinceLastPromotion    int64
NumCompaniesWorked         int64
OverTime_No                int32
OverTime_Yes               int32
dtype: object


In [80]:
# Data preview
print(X_df.head())
print(X_train.head())


   Education  Age  DistanceFromHome  JobSatisfaction  StockOptionLevel  \
0          2   41                 1                4                 0   
1          1   49                 8                2                 1   
2          2   37                 2                3                 0   
3          4   33                 3                3                 0   
4          1   27                 2                2                 1   

   WorkLifeBalance  YearsAtCompany  YearsSinceLastPromotion  \
0                1               6                        0   
1                3              10                        1   
2                3               0                        0   
3                3               8                        3   
4                3               2                        2   

   NumCompaniesWorked  OverTime_No  OverTime_Yes  
0                   8            0             1  
1                   1            1             0  
2                   6  

In [81]:
# Create a StandardScaler

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()


# Fit the StandardScaler to the training data

scaler.fit(X_train)

# Scale the training and testing data

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Assuming 'attrition_df' is your original DataFrame and 'y_df' contains your target variables

# Include 'OverTime' instead of 'OverTime_No' and 'OverTime_Yes'
feature_columns = ['Education', 'Age', 'DistanceFromHome', 'JobSatisfaction', 'StockOptionLevel', 
                   'WorkLifeBalance', 'YearsAtCompany', 'YearsSinceLastPromotion', 'NumCompaniesWorked', 
                   'OverTime', 'Department']
X_df = attrition_df[feature_columns]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2, random_state=42)


In [82]:
# Import OneHotEncoder from sklearn.preprocessing
from sklearn.preprocessing import OneHotEncoder

# Initialize OneHotEncoder
encoder = OneHotEncoder(sparse=False)

# Fit the encoder to the training data's 'Department' column
# Note: This step is assuming 'Department' is part of X_train. If 'Department' is not included, add it to X_train as described earlier.
encoder.fit(X_train[['Department']])

# Transform 'Department' in training and testing sets to one-hot encoded columns
department_train = encoder.transform(X_train[['Department']])
department_test = encoder.transform(X_test[['Department']])

# Convert the encoded arrays into DataFrames for easier concatenation
department_train_df = pd.DataFrame(department_train, columns=encoder.get_feature_names_out(['Department']))
department_test_df = pd.DataFrame(department_test, columns=encoder.get_feature_names_out(['Department']))

# Reset the index of X_train and X_test to avoid index-related issues during concatenation
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

# Concatenate the new one-hot encoded columns with the original X_train and X_test, excluding the original 'Department' column
X_train_encoded = pd.concat([X_train.drop(['Department'], axis=1), department_train_df], axis=1)
X_test_encoded = pd.concat([X_test.drop(['Department'], axis=1), department_test_df], axis=1)




In [83]:
# Create a OneHotEncoder for the Attrition column
encoder = OneHotEncoder(sparse=False)


# Fit the encoder to the training data
encoder.fit(y_train[['Attrition']])

# Create two new variables by applying the encoder
# to the training and testing data

attrition_train_encoded = encoder.transform(y_train[['Attrition']])
attrition_test_encoded = encoder.transform(y_test[['Attrition']])



## Create, Compile, and Train the Model

In [84]:
# Import necessary modules from Keras
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

# Find the number of columns in the X training data
n_features = X_train_encoded.shape[1]
print(f"Number of input features: {n_features}")

# Create the input layer
input_layer = Input(shape=(n_features,))

# Create at least two shared layers
shared_dense1 = Dense(128, activation='relu')(input_layer)
shared_dense2 = Dense(64, activation='relu')(shared_dense1)

Number of input features: 13


In [85]:
# Assuming X_train is your training feature set
n_features = X_train.shape[1]
print(f"Number of input features: {n_features}")

Number of input features: 11


In [86]:
# Create a hidden layer for the 'Department' branch
# This layer takes the output from the last shared layer (shared_dense2) as input
department_hidden = Dense(32, activation='relu')(shared_dense2)

# Create the output layer for the 'Department' branch
# This layer has a neuron for each department and uses a softmax activation, ideal for multi-class classification
n_departments = 11  # Replace 5 with the actual number of unique departments in your dataset
department_output = Dense(n_departments, activation='softmax', name='department_output')(department_hidden)


In [87]:
# Create a branch for Attrition
# with a hidden layer and an output layer

# Create the hidden layer

attrition_hidden = Dense(32, activation='relu', name='attrition_hidden')(shared_dense2)

# Create the output layer

attrition_output = Dense(1, activation='sigmoid', name='attrition_output')(attrition_hidden)


In [88]:
# Create the model

model = Model(inputs=input_layer, outputs=[department_output, attrition_output])

# Compile the model

model.compile(optimizer='adam', 
              loss={'department_output': 'categorical_crossentropy', 'attrition_output': 'binary_crossentropy'},
              metrics={'department_output': 'accuracy', 'attrition_output': 'accuracy'})

# Summarize the model
model.summary()


In [89]:
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (1176, 11)
X_test shape: (294, 11)
y_train shape: (1176, 2)
y_test shape: (294, 2)


In [90]:
# Correct way to set values using .loc to avoid SettingWithCopyWarning
X_df.loc[:, 'OverTime'] = X_df['OverTime'].apply(lambda x: 1 if x == 'Yes' else 0)

# Similarly, for y_df
y_df.loc[:, 'Attrition'] = y_df['Attrition'].apply(lambda x: 1 if x == 'Yes' else 0)


In [91]:
print(X.shape)  # Should output (number_of_samples, number_of_features)
print(y.shape)  # Should output (number_of_samples,)


(1470, 26)
(1470,)


In [93]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

# Assuming 'Attrition' is your target column and it is categorical ('Yes', 'No')
X = pd.get_dummies(attrition_df.drop('Attrition', axis=1))
y = attrition_df['Attrition'].apply(lambda x: 1 if x == 'Yes' else 0)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")



Accuracy: 0.8775510204081632


In [99]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the RandomForestClassifier
model = RandomForestClassifier(random_state=42)

# Fit the model on the training data
model.fit(X_train, y_train)

# Evaluate the model
print("Model accuracy on test data:", model.score(X_test, y_test))


Model accuracy on test data: 0.8775510204081632


In [104]:
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

num_department = attrition_df['Department'].nunique()


# Input layer
input_layer = Input(shape=(X_train.shape[1],))

# Shared layers
shared_dense1 = Dense(128, activation='relu')(input_layer)
shared_dense2 = Dense(64, activation='relu')(shared_dense1)

# Branch for Department prediction (assuming multiple departments, hence softmax)
dept_dense = Dense(32, activation='relu')(shared_dense2)
dept_output = Dense(num_department, activation='softmax', name='dept_output')(dept_dense)

# Branch for Attrition prediction (binary classification, hence sigmoid)
attr_dense = Dense(32, activation='relu')(shared_dense2)
attr_output = Dense(1, activation='sigmoid', name='attr_output')(attr_dense)

# Create model
model = Model(inputs=input_layer, outputs=[dept_output, attr_output])

# Model summary
model.summary()


In [105]:
model.compile(optimizer='adam', 
              loss={'dept_output': 'categorical_crossentropy', 'attr_output': 'binary_crossentropy'},
              metrics={'dept_output': 'accuracy', 'attr_output': 'accuracy'})


In [109]:
# Assuming 'Department' is a categorical variable and needs one-hot encoding
# and 'Attrition' is a binary classification target

from sklearn.preprocessing import OneHotEncoder

# Instantiate the encoder
encoder_dept = OneHotEncoder(sparse=False)

# Fit and transform 'Department' column
encoder_dept.fit(y_train[['Department']])
y_train_dept = encoder_dept.transform(y_train[['Department']])
y_test_dept = encoder_dept.transform(y_test[['Department']])

# 'Attrition' is likely already binary (0 or 1), so you can directly assign it
y_train_attr = y_train[['Attrition']].values
y_test_attr = y_test[['Attrition']].values


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [None]:
# Print the accuracy for both department and attrition


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. 
2. 
3. 