In [79]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
data = pd.read_csv('HR-Employee-Attrition.csv')

data_classification = data[['Attrition']]
data_values = data.loc[:, data.columns != "Attrition"]
data_train, data_test, labels_train, labels_test = train_test_split(data_values, data_classification, test_size=0.2, random_state=35)
stratify=data_classification
# print(f"{data_train.shape} {data_test.shape}, {labels_train.shape}, {labels_test.shape}")

In [80]:
# Data Cleaning

#1 Convert text data to numerical

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder() # since all the non-numerical features are categorical, it should be safe to use label encoding

labels_train['Attrition'] = encoder.fit_transform(labels_train['Attrition'])
labels_test['Attrition'] = encoder.transform(labels_test['Attrition']) # don't fit encoder with testing labels, but transform testing labels with same fitted encoder

features_to_encode = ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'Over18', 'OverTime']
for feature in features_to_encode:
#     data[feature] = encoder.fit_transform(data[feature])
    data_train[feature] = encoder.fit_transform(data_train[feature])
    data_test[feature] = encoder.transform(data_test[feature]) # don't fit encoder with testing data, but transform testing data with same fitted encoder

In [81]:
#2 Feature Selection

from sklearn.feature_selection import SelectKBest, chi2

feature_selector = SelectKBest(chi2, k=30) # drop the 4 features that have the lowest correlation with the answer
data_train_selected = feature_selector.fit_transform(data_train, labels_train)
data_test_selected = feature_selector.transform(data_test)


In [82]:
#3 Normalize

from sklearn.preprocessing import StandardScaler

normalize_scaler = StandardScaler() 
# data_normalized = normalize_scaler.fit_transform(data_values_selected, np.ravel(data_classification))
data_train_clean = normalize_scaler.fit_transform(data_train_selected, labels_train)
data_test_clean = normalize_scaler.transform(data_test_selected)

# use in model fitting: data_train_clean, data_test_clean, labels_train, labels_test

In [83]:
#4 Extra Credit?: graphing

In [84]:
# Model Training
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [85]:
#2 K Nearnest Neighbors
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(data_train_clean, np.ravel(labels_train))

neighPred = neigh.predict(data_test_clean)
neighPred = list(neighPred)

target_names = ["no", "yes"]
confusion_matrix(labels_test, neighPred)

print(classification_report(labels_test, neighPred, target_names=target_names))

              precision    recall  f1-score   support

          no       0.89      0.95      0.92       256
         yes       0.35      0.18      0.24        38

    accuracy                           0.85       294
   macro avg       0.62      0.57      0.58       294
weighted avg       0.82      0.85      0.83       294



In [86]:
#3 Decision Tree

from sklearn import tree

dt_classifier = tree.DecisionTreeClassifier()
dt_model = dt_classifier.fit(data_train_clean, labels_train)

from sklearn.metrics import classification_report

target_names = ["no", "yes"]
predicted_labels_dt = dt_model.predict(data_test_clean)
print(classification_report(labels_test, predicted_labels_dt, target_names=target_names))

              precision    recall  f1-score   support

          no       0.91      0.82      0.86       256
         yes       0.26      0.42      0.32        38

    accuracy                           0.77       294
   macro avg       0.58      0.62      0.59       294
weighted avg       0.82      0.77      0.79       294



In [87]:
#4 Linear Regression
from sklearn.linear_model import LogisticRegression

lr_classifier = LogisticRegression(random_state=39, max_iter=1000)
lr_model = lr_classifier.fit(data_train_clean, np.ravel(labels_train))

from sklearn.metrics import classification_report 

target_names = ["no", "yes"]
predicted_labels_lr = lr_model.predict(data_test_clean) 
print(classification_report(labels_test, predicted_labels_lr, target_names=target_names))

              precision    recall  f1-score   support

          no       0.91      0.96      0.94       256
         yes       0.58      0.39      0.47        38

    accuracy                           0.88       294
   macro avg       0.75      0.68      0.70       294
weighted avg       0.87      0.88      0.87       294



In [88]:
#5 Support Vector Machines
from sklearn.svm import SVC 

svc_classifier = SVC()
svc_model = svc_classifier.fit(data_train_clean, np.ravel(labels_train))

from sklearn.metrics import classification_report 

target_names = ["no", "yes"]
predicted_labels_svc = svc_model.predict(data_test_clean) 
print(classification_report(labels_test, predicted_labels_svc, target_names=target_names))

              precision    recall  f1-score   support

          no       0.90      0.98      0.94       256
         yes       0.65      0.29      0.40        38

    accuracy                           0.89       294
   macro avg       0.77      0.63      0.67       294
weighted avg       0.87      0.89      0.87       294



In [89]:
#6 Artificial Neural Networks
from sklearn.neural_network import MLPClassifier 
import numpy as np

mlp_classifier = MLPClassifier(random_state=39, activation='tanh', max_iter=600)
mlp_model = mlp_classifier.fit(data_train_clean, np.ravel(labels_train))

from sklearn.metrics import classification_report

target_names = ["no", "yes"]
predicted_labels_mlp = mlp_model.predict(data_test_clean)
print(classification_report(labels_test, predicted_labels_mlp, target_names=target_names))

              precision    recall  f1-score   support

          no       0.92      0.93      0.92       256
         yes       0.46      0.42      0.44        38

    accuracy                           0.86       294
   macro avg       0.69      0.67      0.68       294
weighted avg       0.86      0.86      0.86       294

