In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
df = pd.read_csv("source_2021-12-07-09-51-43.csv", parse_dates = ["start_date", "end_date"], dtype={'work_postal':'str'})

In [2]:
#Replace under_29 by .under_29 to make it easier when sorting
df['age_group'] = df['age_group'].replace(to_replace='under_29', value='.under_29')

#Fill in na values for age_group
df['age_group'] = df['age_group'].fillna(df['age_group'].value_counts().index[0])

#Replace missing values in event column with unknown
df['event'] = df['event'].fillna('unknown')

#Drop all temportary positions
df = df[~df['jobtitle'].str.contains("TEMP")]

In [3]:
temp_end_date = pd.to_datetime('2021-12-07')
employee_ids = df.emplid_sec.unique()

#Define list of features we want in our model
duration = []
comprate = []
last_pay_raise = []
age_group = []
pay_increase_ot = []
piot_compared_avg = []
event = []

for ID in employee_ids:
    #Get all records of the employee
    employee = df[df['emplid_sec'] == ID].copy()
    
    ##### DURATION #####
    #Add up all durations (there are some inaccuracies doing this)
    duration.append(sum(employee['duration'].tolist(), employee.shape[0]))
    
    ##### COMP RATE #####
    #Get the highest comprate
    comprate.append(max(employee['comprate'].tolist()))
    
    ##### LAST PAY RAISE #####
    #Get last date of work or temporary last date
    if(employee['end_date'].isna().sum()):
        end = temp_end_date
    else:
        end = employee['end_date'].sort_values().tolist()[-1]
    #Get date of last pay raise
    employee.sort_values(by=['comprate'], inplace=True)
    last_raise = employee.iloc[-1]['start_date']
    #Calculate the difference
    last_pay_raise.append((end - last_raise).days)
    
    ##### AGE GROUP #####
    #Get the age group they were before they left
    age_group.append(sorted(employee['age_group'].tolist())[-1])
    
    ##### PAY INCREASE OVER TIME #####
    #(max - min) / duration
    max_rate = max(employee['comprate'].tolist())
    min_rate = min(employee['comprate'].tolist())
    pay_increase_ot.append((max_rate - min_rate) / duration[-1])
    
    ##### EVENT #####
    #Get the employee's latest event
    employee.sort_values(by=['end_date'], inplace=True)
    #Currently working
    if(pd.isnull(employee.iloc[-1]['end_date'])):
        event.append('Working')
    else:
        event.append(employee.iloc[-1]['event'])
        
avg_pay_increase_ot = sum(pay_increase_ot) / len(pay_increase_ot)
##### COMPRATE INCREASE OVER TIME COMPARED TO AVERAGE #####
#calculate the % below or above average
for val in pay_increase_ot:
    piot_compared_avg.append((val - avg_pay_increase_ot) / avg_pay_increase_ot)

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict, cross_validate
from sklearn.preprocessing import LabelEncoder
from matplotlib import pyplot
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix

#from sklearn import metrics
#logreg = LogisticRegression()
#logreg.fit(X_train, y_train)

In [5]:
data = {'duration': duration,
        'comprate': comprate,
        'last_pay_raise': last_pay_raise,
        'age_group': age_group,
        'pay_increase_ot': pay_increase_ot,
        'piot_compared_avg': piot_compared_avg,
        'event': event}
model_df = pd.DataFrame(data)

#Drop data points with 'unknown' event
model_df = model_df[model_df['event'] != 'unknown']

#Define input and output parameters for model
X = model_df.iloc[:, :-1]
y = model_df.iloc[:, -1]

#Encode labels for age_group
le = LabelEncoder()
le.fit(X['age_group'])
X['age_group'] = le.transform(X['age_group'])

#Encode labels for y and save the mapping
le.fit(y)
event_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(event_mapping)
y = le.transform(y)

{'Retirement': 0, 'Termination': 1, 'Working': 2}


In [6]:
model = RandomForestClassifier()
model.fit(X, y)
#plot_importance(model)
pyplot.show()
y_pred = cross_val_predict(model, X, y)
matrix = confusion_matrix(y, y_pred)

In [7]:
print(matrix.diagonal()/matrix.sum(axis=1))
result = cross_validate(model, X, y, scoring='f1_weighted')
print(result['test_score'])

[0.69891173 0.82154191 0.2548816 ]
[0.5028179  0.54655555 0.59057661 0.6283987  0.43755138]


In [9]:
import numpy as np
feature_labels = np.array(['duration',
'comprate',
'last_pay_raise', 
'age_group', 
'pay_increase_ot',
'piot_compared_avg',
'event'])
importance = rf.feature_importances_
feature_indexes_by_importance = importance.argsort()
for index in feature_indexes_by_importance:
    print('{}-{:.2f}%'.format(feature_labels[index], (importance[index] *100.0)))

NameError: name 'rf' is not defined