In [1]:
import sys
sys.path.append('./utils/')

import pandas as pd
import numpy as np

from basic_utils import print_full
from ml_utils import get_metrics
from pandoras_box import create_predictor, get_prediction



# Dataset selection

In [2]:
# Candidate dataset file names. The cells below prefix the chosen name with
# './data_sets/' when reading it.

# Dataset 1
a = 'MFG10YearTerminationData_train.csv'

# Dataset 2 (separate train and test files)
b1 = 'WA_Fn-UseC_-HR-Employee-Attrition_train.csv'
b2 = 'WA_Fn-UseC_-HR-Employee-Attrition_test.csv'

# (VERY) simple dataset
c = 'simple_dataset.csv'

# File used for the preview and for training the predictor.
# NOTE: the prediction cell reads `b2` directly, so pointing `file_name` at a
# different dataset does not change the file used for evaluation.
file_name = b1

Let us have a first look at the dataset...

In [3]:
# Read the selected file, then hand the frame to print_full for display.
# The captured output shows n=6 as 3 head + 3 tail rows.
preview_df = pd.read_csv('./data_sets/' + file_name)
print_full(preview_df, n=6)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41.0,No,Travel_Rarely,548,Research & Development,9,4,Life Sciences,1,1772,3,Male,94,3,1,Laboratory Technician,1,Divorced,2289,20520,1,Y,No,20,4,2,80,2,5,2,3,5.0,3,0,4
1,45.0,No,Travel_Rarely,1234,Sales,11,2,Life Sciences,1,1045,4,Female,90,3,4,Manager,4,Married,17650,5404,3,Y,No,13,3,2,80,1,26,4,4,9.0,3,1,1
2,36.0,No,Travel_Rarely,711,Research & Development,5,4,Life Sciences,1,1651,2,Female,42,3,3,Healthcare Representative,1,Married,8008,22792,4,Y,No,12,3,3,80,2,9,6,3,3.0,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1320,50.0,No,Travel_Rarely,328,Research & Development,1,3,Medical,1,249,3,Male,86,2,1,Laboratory Technician,3,Married,3690,3425,2,Y,No,15,3,4,80,1,5,2,2,3.0,2,0,2
1321,39.0,No,Travel_Rarely,722,Sales,24,1,Marketing,1,2056,2,Female,60,2,4,Sales Executive,4,Married,12031,8828,0,Y,No,11,3,1,80,1,21,2,2,20.0,9,9,6
1322,26.0,No,Travel_Frequently,496,Research & Development,11,2,Medical,1,390,1,Male,60,3,2,Healthcare Representative,1,Married,4741,22722,1,Y,Yes,13,3,3,80,1,5,3,3,5.0,3,3,3


# Feature mapping
## b) WA_Fn-UseC_-HR-Employee-Attrition mapping

In [4]:
# Column mapping for the selected dataset. Every name defined here (except
# `target`, which is passed positionally to create_predictor) is forwarded as
# the keyword argument of the same name to create_predictor() and
# get_prediction() in the cells below.
# None presumably marks a field this dataset does not provide -- TODO confirm
# against pandoras_box.

# IDs
employee_id = 'EmployeeNumber'
record_id = None

# Dates
hire_date = None
record_date = None
termination_date = None

# Tenure column, used here in place of the (absent) date columns
length_of_service = 'YearsAtCompany'

# Age / birthday
age = 'Age'
birth_date = None
birth_year = None

# Target
# `target` is also read by the evaluation cell to build the binary result column.
target = 'Attrition'
other_target_fields = []

# Others
job_title = 'JobRole'

# Special field types
# Left empty for this dataset; the expected key/value format is defined by
# pandoras_box -- TODO confirm.
special_field_types = {}

# Pandora's Box

## Create predictor

In [5]:
# Column mapping forwarded unchanged, as keyword arguments, to create_predictor.
field_mapping = dict(
    employee_id=employee_id,
    record_id=record_id,
    hire_date=hire_date,
    record_date=record_date,
    termination_date=termination_date,
    length_of_service=length_of_service,
    age=age,
    birth_date=birth_date,
    birth_year=birth_year,
    other_target_fields=other_target_fields,
    job_title=job_title,
    special_field_types=special_field_types,
)

# Build the predictor from the selected file. The five returned values are
# reused by the prediction cell (all but the metrics).
train_path = './data_sets/' + file_name
selected_feats, woe_dicts, clf, scaler, valid_metrics = create_predictor(
    train_path, target, **field_mapping)

# Display the validation metrics as the cell output.
valid_metrics

(!) Job title enrichment has been performed.

(!) Generation enrichment has been performed.

Feature selection algorithm
--------------------------------------------------------------------------------
+ JobRole (ROC AUC score inc = 34.19%)
+ OverTime (ROC AUC score inc = 10.26%)
+ StockOptionLevel (ROC AUC score inc = 3.81%)
+ JobInvolvement (ROC AUC score inc = 1.51%)
+ EnvironmentSatisfaction (ROC AUC score inc = 1.07%)
+ EducationField (ROC AUC score inc = 1.57%)
+ BusinessTravel (ROC AUC score inc = 0.50%)
+ JobSatisfaction (ROC AUC score inc = 1.30%)
+ RelationshipSatisfaction (ROC AUC score inc = 0.41%)
+ YearsInCurrentRole (ROC AUC score inc = 0.71%)
+ YearsSinceLastPromotion (ROC AUC score inc = 0.91%)
+ generation (ROC AUC score inc = 0.38%)
+ TrainingTimesLastYear (ROC AUC score inc = 0.22%)
+ MaritalStatus (ROC AUC score inc = 0.16%)
+ WorkLifeBalance (ROC AUC score inc = 0.18%)
+ NumCompaniesWorked (ROC AUC score inc = 0.10%)
+ Gender (ROC AUC score inc = 0.23%)
+ MonthlyI

{'accuracy': 0.8716981132075472,
 'log_loss': 0.31267071838509825,
 'roc_auc': 0.85596061177456528}

## Get prediction

In [6]:
# Same column mapping that was given to create_predictor, forwarded as
# keyword arguments.
prediction_mapping = dict(
    employee_id=employee_id,
    record_id=record_id,
    hire_date=hire_date,
    record_date=record_date,
    termination_date=termination_date,
    length_of_service=length_of_service,
    age=age,
    birth_date=birth_date,
    birth_year=birth_year,
    other_target_fields=other_target_fields,
    job_title=job_title,
    special_field_types=special_field_types,
)

# Score the test file of dataset 2. The path is fixed to `b2` and does not
# follow `file_name`.
test_path = './data_sets/' + b2
score, y_hat, df = get_prediction(
    test_path, selected_feats, woe_dicts, clf, scaler, **prediction_mapping)

(!) Job title enrichment has been performed.

(!) Generation enrichment has been performed.



In [7]:
# Create result
# Build a binary 0/1 ground-truth column named 'result' from the target field
# of the scored frame, then evaluate the predictions against it.
assert target, 'You must specify a target field.'
result = 'result'
target_values = np.sort(df[target].unique())
assert len(target_values) == 2, 'There must be 2 unique values in the target field.'

# np.array_equal always yields a single bool. The previous
# `all(target_values == np.array([0, 1]))` depended on `==` returning an
# iterable, which fails with TypeError when NumPy falls back to a scalar
# result for dtypes it cannot compare element-wise.
if np.array_equal(target_values, [0, 1]):
    # Target is already encoded as 0/1: use it as is.
    df[result] = df[target]
else:
    # value_counts() orders values by descending frequency, so index[1] is the
    # less frequent of the two values; it is encoded as 1, the other as 0.
    # NOTE(review): if both values are equally frequent the pick depends on
    # value_counts() ordering -- confirm it matches the encoding used when
    # the predictor was created.
    v = df[target].value_counts().index[1]
    df[result] = (df[target] == v).astype(int)

if termination_date:
    # Rows encoded as 0 get their termination date blanked out.
    df.loc[df[result] == 0, termination_date] = np.datetime64('NaT')

# Prediction real results
# Compare the ground truth with the scores and predicted labels returned by
# get_prediction; the metrics dict is the cell output.
yt = df[result].values
get_metrics(yt, score, y_hat)

{'accuracy': 0.88435374149659862,
 'log_loss': 0.37636227514055814,
 'roc_auc': 0.78523035230352312}