In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

data = pd.read_csv('HR-Employee-Attrition.csv')
data_classification = data[['Attrition']]
data_values = data.loc[:, data.columns != "Attrition"]
data_train, data_test, labels_train, labels_test = train_test_split(data_values, data_classification, test_size=0.2, random_state=35)
stratify=data_classification

In [6]:
# Data Cleaning

#1 Convert text data to numerical

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder() # since all the non-numerical features are categorical, it should be safe to use label encoding

labels_train['Attrition'] = encoder.fit_transform(labels_train['Attrition'])
labels_test['Attrition'] = encoder.transform(labels_test['Attrition']) # don't fit encoder with testing labels, but transform testing labels with same fitted encoder

features_to_encode = ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'Over18', 'OverTime']
for feature in features_to_encode:
    data_train[feature] = encoder.fit_transform(data_train[feature])
    data_test[feature] = encoder.transform(data_test[feature]) # don't fit encoder with testing data, but transform testing data with same fitted encoder

In [7]:
#2 Resample data
    #2.1 upsample attrition "yes" rows to match number of "no" rows

# pip install imbalanced-learn
from imblearn.over_sampling import SMOTE # downgrade scikit-learn to 1.2.x for this to work
upsampler = SMOTE(random_state=39) ####### KEEP if not using downsampling
data_train, labels_train = upsampler.fit_resample(data_train, labels_train) ###### KEEP if not using downsampling

    #2.2 upsampled the training data above, now downsample testing data so that classes are equal but we're not using synthetic data
from sklearn.utils import resample
testing_merged = labels_test.join(data_test) # PROBLEM: i dont' think these joined properly on index
testing_no = testing_merged[testing_merged['Attrition'] == 0] # no attrition
testing_yes = testing_merged[testing_merged['Attrition'] == 1] # attrition
testing_no_ds = resample(testing_no, replace=True, n_samples=len(testing_yes), random_state=39)
test_ds = pd.concat([testing_no_ds, testing_yes]) # all fields, downsampled
labels_test = test_ds['Attrition'].to_frame()
data_test = test_ds.loc[:, test_ds.columns != "Attrition"]


In [8]:
#3 Feature Selection

from sklearn.feature_selection import SelectKBest, chi2

feature_selector = SelectKBest(chi2, k=14) # drop the 4 features that have the lowest correlation with the answer
data_train_selected = feature_selector.fit_transform(data_train, labels_train)
data_test_selected = feature_selector.transform(data_test)

In [9]:
#4 Normalize

from sklearn.preprocessing import StandardScaler

normalize_scaler = StandardScaler() 
data_train_clean = normalize_scaler.fit_transform(data_train_selected, labels_train)
data_test_clean = normalize_scaler.transform(data_test_selected)

In [10]:
# Model Training
# Note: Most of this model training/evaluation code was adapted from another lab

In [11]:
#1 K Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier

neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(data_train_clean, np.ravel(labels_train))

from sklearn.metrics import classification_report

neighPred = neigh.predict(data_test_clean)
neighPred = list(neighPred)

target_names = ["no", "yes"]
print("K Nearest Neighbors\n")
print(classification_report(labels_test, neighPred, target_names=target_names))

K Nearest Neighbors

              precision    recall  f1-score   support

          no       0.59      0.71      0.64        38
         yes       0.63      0.50      0.56        38

    accuracy                           0.61        76
   macro avg       0.61      0.61      0.60        76
weighted avg       0.61      0.61      0.60        76



In [12]:
#2 Decision Tree

from sklearn import tree

dt_classifier = tree.DecisionTreeClassifier()
dt_model = dt_classifier.fit(data_train_clean, labels_train)

from sklearn.metrics import classification_report

target_names = ["no", "yes"]
predicted_labels_dt = dt_model.predict(data_test_clean)
print("Decision Tree\n")
print(classification_report(labels_test, predicted_labels_dt, target_names=target_names))

Decision Tree

              precision    recall  f1-score   support

          no       0.61      0.82      0.70        38
         yes       0.72      0.47      0.57        38

    accuracy                           0.64        76
   macro avg       0.66      0.64      0.63        76
weighted avg       0.66      0.64      0.63        76



In [13]:
#3 Linear Regression
from sklearn.linear_model import LogisticRegression

lr_classifier = LogisticRegression(random_state=39, max_iter=1000)
lr_model = lr_classifier.fit(data_train_clean, np.ravel(labels_train))

from sklearn.metrics import classification_report 

target_names = ["no", "yes"]
predicted_labels_lr = lr_model.predict(data_test_clean) 
print("Linear Regression\n")
print(classification_report(labels_test, predicted_labels_lr, target_names=target_names))

Linear Regression

              precision    recall  f1-score   support

          no       0.65      0.74      0.69        38
         yes       0.70      0.61      0.65        38

    accuracy                           0.67        76
   macro avg       0.67      0.67      0.67        76
weighted avg       0.67      0.67      0.67        76



In [14]:
#4 Support Vector Machines
from sklearn.svm import SVC 

svc_classifier = SVC()
svc_model = svc_classifier.fit(data_train_clean, np.ravel(labels_train))

from sklearn.metrics import classification_report 

target_names = ["no", "yes"]
predicted_labels_svc = svc_model.predict(data_test_clean) 
print("Support Vector Machines\n")
print(classification_report(labels_test, predicted_labels_svc, target_names=target_names))

Support Vector Machines

              precision    recall  f1-score   support

          no       0.67      0.76      0.72        38
         yes       0.73      0.63      0.68        38

    accuracy                           0.70        76
   macro avg       0.70      0.70      0.70        76
weighted avg       0.70      0.70      0.70        76



In [17]:
#5 Artificial Neural Networks
from sklearn.neural_network import MLPClassifier 
import numpy as np

mlp_classifier = MLPClassifier(random_state=39, activation='tanh', max_iter=900)
mlp_model = mlp_classifier.fit(data_train_clean, np.ravel(labels_train))

from sklearn.metrics import classification_report

target_names = ["no", "yes"]
predicted_labels_mlp = mlp_model.predict(data_test_clean)
print("Artificial Neural Networks\n")
print(classification_report(labels_test, predicted_labels_mlp, target_names=target_names))

Artificial Neural Networks

              precision    recall  f1-score   support

          no       0.58      0.79      0.67        38
         yes       0.67      0.42      0.52        38

    accuracy                           0.61        76
   macro avg       0.62      0.61      0.59        76
weighted avg       0.62      0.61      0.59        76



