In [6]:
import sys
sys.path.append('./utils/')

import pandas as pd
import numpy as np

from basic_utils import print_full
from ml_utils import get_metrics
from pandoras_box import create_predictor, get_prediction

# Dataset selection

In [7]:
# Candidate CSV files. Names ending in 1 are the "_train" files,
# names ending in 2 are the matching "_test" files.

# Dataset 1 (train, test)
a1, a2 = (
    'MFG10YearTerminationData_train.csv',
    'MFG10YearTerminationData_test.csv',
)

# Dataset 2 (train, test)
b1, b2 = (
    'WA_Fn-UseC_-HR-Employee-Attrition_train.csv',
    'WA_Fn-UseC_-HR-Employee-Attrition_test.csv',
)

# (Very) simple dataset: a single file
c = 'simple_dataset.csv'

# The file the notebook works on from here onwards.
file_name = a1

Let us have a first look at the dataset...

In [8]:
# Load the selected CSV and hand it to print_full for a quick look.
# NOTE(review): n=6 presumably caps the number of rows shown — confirm in basic_utils.
print_full(
    pd.read_csv('./data_sets/' + file_name),
    n=6,
)

Unnamed: 0,EmployeeID,recorddate_key,birthdate_key,orighiredate_key,terminationdate_key,age,length_of_service,city_name,department_name,job_title,store_name,gender_short,gender_full,termreason_desc,termtype_desc,STATUS_YEAR,STATUS,BUSINESS_UNIT,record_id
0,2304,2006-01-01,1941-01-15 00:00:00,1992-07-22,2006-01-15,65.0,13,Vernon,Meats,Meat Cutter,36,F,Female,Retirement,Voluntary,2006,TERMINATED,STORES,0
1,4118,2006-01-01,1946-01-23 00:00:00,1997-07-31,2006-01-23,60.0,8,Vancouver,Meats,Meat Cutter,35,M,Male,Retirement,Voluntary,2006,TERMINATED,STORES,1
2,2303,2006-01-01,1941-01-15 00:00:00,1992-07-22,2006-01-15,65.0,13,New Westminster,Meats,Meat Cutter,21,F,Female,Retirement,Voluntary,2006,TERMINATED,STORES,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44689,3967,2014-12-31,1963-03-06 00:00:00,1997-02-20,1900-01-01,51.0,17,Cranbrook,Produce,Produce Clerk,8,F,Female,Not Applicable,Not Applicable,2014,ACTIVE,STORES,41379
44690,3950,2014-12-31,1963-01-22 00:00:00,1997-01-29,1900-01-01,51.0,17,Nelson,Meats,Meat Cutter,19,F,Female,Not Applicable,Not Applicable,2014,ACTIVE,STORES,41378
44691,7587,2014-12-31,1989-03-19 00:00:00,2010-11-21,1900-01-01,25.0,4,Trail,Customer Service,Cashier,33,M,Male,Not Applicable,Not Applicable,2014,ACTIVE,STORES,41385


# Feature mapping
## a) MFG10YearTerminationData mapping

In [9]:
# Column-name mapping for the MFG10YearTerminationData files.
# Each variable holds the CSV column that plays the named role.

# Identifier columns
employee_id, record_id = 'EmployeeID', 'record_id'

# Date columns
hire_date, record_date, termination_date = (
    'orighiredate_key',
    'recorddate_key',
    'terminationdate_key',
)

# Tenure column
length_of_service = 'length_of_service'

# Age / birthday columns (no birth-year column is mapped for this dataset)
age, birth_date = 'age', 'birthdate_key'
birth_year = None

# Target column
target = 'STATUS'

# Target plus the termination reason/type columns.
# NOTE(review): presumably these are kept out of the feature set — confirm in pandoras_box.
other_target_fields = ['termreason_desc', 'termtype_desc', target]

# Other columns
job_title = 'job_title'

# Columns whose Python type is stated explicitly
special_field_types = dict(store_name=str)

# Pandora's Box

## Create predictor

In [10]:
# Column-name mapping, collected once and expanded into keyword arguments below.
field_mapping = dict(
    employee_id=employee_id,
    record_id=record_id,
    hire_date=hire_date,
    record_date=record_date,
    termination_date=termination_date,
    length_of_service=length_of_service,
    age=age,
    birth_date=birth_date,
    birth_year=birth_year,
    other_target_fields=other_target_fields,
    job_title=job_title,
    special_field_types=special_field_types,
)

# Build the predictor from the selected file. The five returned objects are
# unpacked here; the first four are passed on to get_prediction later.
selected_feats, woe_dicts, clf, scaler, valid_metrics = create_predictor(
    './data_sets/' + file_name, target, **field_mapping)

# Bare last expression so the notebook displays the validation metrics.
valid_metrics

(!) Job title enrichment has been performed.

(!) Length of service enrichment has been performed.

(!) Birth year enrichment has been performed.

(!) Generation enrichment has been performed.

Feature selection algorithm
--------------------------------------------------------------------------------
+ age (ROC AUC score inc = 54.50%)
+ generation (ROC AUC score inc = 6.35%)
+ length_of_service (ROC AUC score inc = 0.19%)
+ STATUS_YEAR (ROC AUC score inc = 5.73%)
+ gender_short (ROC AUC score inc = 1.07%)
+ birth_year (ROC AUC score inc = 0.77%)
+ BUSINESS_UNIT (ROC AUC score inc = 0.40%)
+ city_name (ROC AUC score inc = 0.16%)
store_name has been excluded (ROC AUC score inc = 0.03%)
Feature selection is over.
--------------------------------------------------------------------------------


{'accuracy': 0.971697057836447,
 'log_loss': 0.12708278988400162,
 'roc_auc': 0.63999493983776534}

## Get prediction

In [13]:
# Column-name mapping, collected once and expanded into keyword arguments below.
field_mapping = dict(
    employee_id=employee_id,
    record_id=record_id,
    hire_date=hire_date,
    record_date=record_date,
    termination_date=termination_date,
    length_of_service=length_of_service,
    age=age,
    birth_date=birth_date,
    birth_year=birth_year,
    other_target_fields=other_target_fields,
    job_title=job_title,
    special_field_types=special_field_types,
)

# Score the test file with the objects produced by create_predictor.
# NOTE: `a2` is referenced directly and is not derived from `file_name`, so it
# has to be changed by hand whenever a different dataset is selected.
score, y_hat, df = get_prediction(
    './data_sets/' + a2, selected_feats, woe_dicts, clf, scaler,
    **field_mapping)

(!) Job title enrichment has been performed.

(!) Length of service enrichment has been performed.

(!) Birth year enrichment has been performed.

(!) Generation enrichment has been performed.



In [14]:
# Ground truth: derive a 0/1 `result` column from the raw target field.
assert target, 'You must specify a target field.'
result = 'result'
target_values = np.sort(df[target].unique())
assert len(target_values) == 2, 'There must be 2 unique values in the target field.'

if not all(target_values == np.array([0, 1])):
    # Target is not already 0/1: the second entry of value_counts()
    # (descending frequency by default) is encoded as 1, the other value as 0.
    v = df[target].value_counts().index[1]
    df[result] = (df[target] == v).astype(int)
else:
    # Target is already 0/1: use it unchanged.
    df[result] = df[target]

# Rows labelled 0 get their termination date blanked out.
if termination_date:
    df.loc[df[result].eq(0), termination_date] = np.datetime64('NaT')

# Compare the real labels with the predicted scores and classes.
yt = df[result].values
get_metrics(yt, score, y_hat)

{'accuracy': 0.96956258818786534,
 'log_loss': 0.15444179470079925,
 'roc_auc': 0.56620463625395212}