In [1]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix


Data cleaning

In [2]:
#Do not truncate the column list when a frame is displayed
pd.set_option('display.max_columns', None)

#Main records file: both date columns are parsed as dates, work_postal is read as text
df = pd.read_csv(
    "../data/2021_12_07.csv",
    parse_dates=["start_date", "end_date"],
    dtype={'work_postal': 'str'},
)
#Second CSV; it is not referenced by any other code visible in this notebook
df2 = pd.read_csv("comprate_2021-12-07-10-26-31.csv")

#Rename under_29 to .under_29 to make it easier when sorting
age_groups = df['age_group'].replace({'under_29': '.under_29'})

#Missing age groups take the most frequent (already renamed) value
most_common_age_group = age_groups.value_counts().index[0]
df['age_group'] = age_groups.fillna(most_common_age_group)

#Missing events are labelled unknown
df['event'] = df['event'].fillna('unknown')

FileNotFoundError: [Errno 2] No such file or directory: '2021_12_07.csv'

Gather relevant features from dataset

In [None]:
#Stand-in end date for employees whose latest record has no end_date (still working)
temp_end_date = pd.Timestamp('2021-12-07')

#Unique employee IDs, one feature value is collected per ID
employee_ids = df['emplid_sec'].unique()

#Feature lists, each receives one entry per employee in employee_ids order
#total duration of employment in days
duration = []
#last division / last department employed at
division, department = [], []
#highest comprate during employment
comprate = []
#days since highest comprate
last_pay_raise = []
#highest education level
highest_educ_lvl = []
#age group
age_group = []
#(max comprate - min comprate) / duration
pay_increase_ot = []
#duration of last held jobtitle in days
last_jobtitle_duration = []
#jobtitle held by the employee whose average comprate is the highest
highest_jt = []
#Unknown, Retirement, Termination
event = []
#Loop through each employee records
#Group the records once so each employee's rows are looked up directly
#instead of re-scanning (and copying) the whole dataframe for every ID
records_by_employee = df.groupby('emplid_sec', sort=False)
for ID in employee_ids:
    #All records of the employee in chronological order. Records with a missing
    #end_date sort last, so the final row is always the most recent record.
    #Sorting once here (returning a new frame) replaces the repeated in-place re-sorts.
    employee = records_by_employee.get_group(ID).sort_values(by=['end_date', 'start_date'])
    latest = employee.iloc[-1]

    ##### DURATION #####
    #number of days worked as of 2021-12-07 (Includes end date):
    #one extra day is added per record so every record's end date is counted.
    #skipna=False keeps a missing duration an error instead of silently ignoring it.
    duration.append(int(employee['duration'].sum(skipna=False) + len(employee)))

    ##### DIVISION #####
    #Get the last division they were in
    division.append(latest['division'])

    ##### DEPARTMENT #####
    #Get the last department they were in
    department.append(latest['department'])

    ##### COMP RATE #####
    #Highest and lowest comprate over all records (missing values are skipped)
    max_rate = employee['comprate'].max()
    min_rate = employee['comprate'].min()
    comprate.append(max_rate)

    ##### LAST PAY RAISE #####
    #Get last date of work or temporary last date
    if employee['end_date'].isna().any():
        end = temp_end_date
    else:
        end = employee['end_date'].max()
    #Start date of the most recent record paid at the highest comprate.
    #Selecting by value (rather than sorting by comprate and taking the last row)
    #makes ties deterministic and keeps rows with a missing comprate from being picked.
    at_max_rate = employee[employee['comprate'] == max_rate]
    #Fall back to the latest record when no comprate is available at all
    last_raise = (at_max_rate if len(at_max_rate) else employee).iloc[-1]['start_date']
    #Calculate the difference
    last_pay_raise.append((end - last_raise).days)

    ##### EDUCATION LEVEL #####
    #Get the highest education level
    #NOTE(review): "highest" is the last label in sort order - confirm the labels sort by rank
    highest_educ_lvl.append(sorted(employee['highest_educ_lvl'].tolist())[-1])

    ##### AGE GROUP #####
    #Get the age group they were before they left (last label in sort order)
    age_group.append(sorted(employee['age_group'].tolist())[-1])

    ##### COMPRATE INCREASE OVER TIME #####
    #(max - min) / duration
    pay_increase_ot.append((max_rate - min_rate) / duration[-1])

    ##### DURATION OF CURRENT POSITION #####
    #Get the duration in days of the last jobtitle they held or currently holding
    last_jobtitle = latest['jobtitle']
    last_jobtitle_duration.append(latest['duration'] + 1)
    #Walk backwards in time and keep adding records until the jobtitle changes
    for i in reversed(range(len(employee) - 1)):
        earlier = employee.iloc[i]
        if earlier['jobtitle'] != last_jobtitle:
            break
        last_jobtitle_duration[-1] += earlier['duration'] + 1

    ##### EVENT #####
    #Get the employee's latest event
    #Unknown, Retirement, Termination
    event.append(latest['event'])

#Mean comprate increase over time across all employees
avg_pay_increase_ot = sum(pay_increase_ot) / len(pay_increase_ot)

##### COMPRATE INCREASE OVER TIME COMPARED TO AVERAGE #####
#Relative difference between each employee's increase and the average
#(a fraction of the average, not multiplied by 100)
piot_compared_avg = [
    (val - avg_pay_increase_ot) / avg_pay_increase_ot
    for val in pay_increase_ot
]
#Current technique: a jobtitle is ranked by its average comprate, where the average
#is taken over the highest comprate each employee reached under that jobtitle

#Every jobtitle present in the data
jobtitles = df['jobtitle'].unique()


def _avg_of_employee_max(jt):
    """Return the mean, over employees, of the maximum comprate recorded for jobtitle jt."""
    #Highest comprate per employee among the records carrying this jobtitle
    top_rates = df.loc[df['jobtitle'] == jt].groupby(['emplid_sec'])['comprate'].max()
    return sum(top_rates) / len(top_rates)


#jobtitle -> average of the per-employee maximum comprate
pay_avg_by_jt = {jt: _avg_of_employee_max(jt) for jt in jobtitles}
    

#Loop through each employees
#Group the jobtitle column once so each employee's titles are looked up directly
#instead of filtering and copying the whole dataframe for every ID
jobtitles_by_employee = df.groupby('emplid_sec', sort=False)['jobtitle']
for ID in employee_ids:
    ##### HIGHEST JOB TITLE #####
    #Distinct jobtitles held by the employee, in the order they appear in df
    jobs = jobtitles_by_employee.get_group(ID).unique().tolist()
    #Highest average comprate seen so far. Starting from 0 means a jobtitle is only
    #selected when its average is strictly positive; otherwise "" is kept.
    max_rate = 0
    highest_jt.append("")
    #Loop through each jobtitles
    for j in jobs:
        #Compare the average comprate and keep the max (the first one wins on ties)
        if pay_avg_by_jt[j] > max_rate:
            max_rate = pay_avg_by_jt[j]
            highest_jt[-1] = j

#Sanity check: every feature list must hold exactly one value per employee.
#highest_jt is included here as well, and the failure message names the lists
#that are out of sync instead of only reporting that some length differs.
feature_lengths = {
    'duration': len(duration),
    'division': len(division),
    'department': len(department),
    'comprate': len(comprate),
    'last_pay_raise': len(last_pay_raise),
    'highest_educ_lvl': len(highest_educ_lvl),
    'age_group': len(age_group),
    'pay_increase_ot': len(pay_increase_ot),
    'last_jobtitle_duration': len(last_jobtitle_duration),
    'highest_jt': len(highest_jt),
    'piot_compared_avg': len(piot_compared_avg),
    'event': len(event),
}
mismatched_lengths = {
    name: length
    for name, length in feature_lengths.items()
    if length != len(employee_ids)
}
assert not mismatched_lengths, (
    "feature lists out of sync with {} employees: {}".format(len(employee_ids), mismatched_lengths)
)

Encode categorical features as integer codes and one-hot encode the target

In [None]:

#Integer-encode the categorical features and the target
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
lb = LabelBinarizer()

#Each column gets its own freshly fitted encoder
age_group, division, event, highest_educ_lvl = (
    LabelEncoder().fit_transform(column)
    for column in (age_group, division, event, highest_educ_lvl)
)

#Target: integer-coded latest event of each employee
Y = event

#Feature table; the column order below is the feature order in X
#NOTE(review): 'duraton' looks like a typo for 'duration'; the name is left as is
#because it is runtime data that other code may read
vals = pd.DataFrame({
    'duraton': duration,
    'comprate': comprate,
    'age_group': age_group,
    'division': division,
    'last_pay_raise': last_pay_raise,
})
dataset = vals.values

#First five columns (all of the columns above) as floats
X = dataset[:, :5].astype(float)

# one hot encoded version of the integer target (dummy variables)
dummy_y = np_utils.to_categorical(Y)


ML Model

In [None]:
#define baseline model
def baseline_model(input_dim=5, n_classes=3, hidden_units=8):
    """Build and compile the baseline feed-forward classifier.

    The defaults reproduce the original fixed architecture, so the function can
    still be called without arguments (as KerasClassifier does via build_fn).

    Parameters:
        input_dim: number of input features per sample.
        n_classes: number of output classes (size of the softmax layer).
        hidden_units: number of units in the single hidden ReLU layer.

    Returns:
        A compiled Sequential model using categorical cross-entropy loss,
        the adam optimizer and accuracy as the reported metric.
    """
    #create model: one hidden layer followed by a softmax over the classes
    model = Sequential()
    model.add(Dense(hidden_units, input_dim=input_dim, activation='relu'))
    model.add(Dense(n_classes, activation='softmax'))
    #compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

#Wrap the Keras model so the scikit-learn cross-validation helpers can train it
estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5, verbose =0)
#Shuffled 3-fold split. The fixed random_state makes both evaluations below
#use the same folds (model training itself remains stochastic).
kfold = KFold(n_splits=3, shuffle=True, random_state=0)

#Out-of-fold predictions. cv=kfold is passed explicitly: without it the splitter
#defined above was never used and the default cross-validation was applied instead.
y_pred = cross_val_predict(estimator, X, Y, cv=kfold)

#Rows of the confusion matrix are the true classes, so diagonal / row sum is the
#fraction of each class that was predicted correctly
matrix = confusion_matrix(Y,y_pred)
print(matrix.diagonal()/matrix.sum(axis=1))

#Weighted F1 score of each fold, on the same folds as above
result = cross_validate(estimator, X, Y, scoring = 'f1_weighted', cv=kfold)
print(result['test_score'])

  estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5, verbose =0)
2022-03-01 13:22:15.850609: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[0.43974763 0.77322253 0.55175962]
