In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
df = pd.read_csv("source_2021-12-07-09-51-43.csv", parse_dates = ["start_date", "end_date"], dtype={'work_postal':'str'})

In [2]:
#Replace under_29 by .under_29 to make it easier when sorting
df['age_group'] = df['age_group'].replace(to_replace='under_29', value='.under_29')

#Fill in na values for age_group
df['age_group'] = df['age_group'].fillna(df['age_group'].value_counts().index[0])

#Replace missing values in event column with unknown
df['event'] = df['event'].fillna('unknown')

#Drop all temportary positions
df = df[~df['jobtitle'].str.contains("TEMP")]

In [3]:
temp_end_date = pd.to_datetime('2021-12-07')
employee_ids = df.emplid_sec.unique()

#Define list of features we want in our model
duration = []
comprate = []
age_group = []
event = []

for ID in employee_ids:
    #Get all records of the employee
    employee = df[df['emplid_sec'] == ID].copy()
    
    ##### DURATION #####
    #Add up all durations (there are some inaccuracies doing this)
    duration.append(sum(employee['duration'].tolist(), employee.shape[0]))
    
    ##### COMP RATE #####
    #Get the highest comprate
    comprate.append(max(employee['comprate'].tolist()))
    
    ##### AGE GROUP #####
    #Get the age group they were before they left
    age_group.append(sorted(employee['age_group'].tolist())[-1])
    
    ##### EVENT #####
    #Get the employee's latest event
    employee.sort_values(by=['end_date'], inplace=True)
    #Currently working
    if(pd.isnull(employee.iloc[-1]['end_date'])):
        event.append('Working')
    else:
        event.append(employee.iloc[-1]['event'])

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, cross_validate
from sklearn.preprocessing import LabelEncoder
from matplotlib import pyplot
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix

In [5]:
data = {'duration': duration,
        'comprate': comprate,
        'age_group': age_group,
         'event'   : event}
model_df = pd.DataFrame(data)

#Drop data points with 'unknown' event
model_df = model_df[model_df['event'] != 'unknown']

#Define input and output parameters for model
X = model_df.iloc[:, :-1]
y = model_df.iloc[:, -1]

#Encode labels for age_group
le = LabelEncoder()
le.fit(X['age_group'])
X['age_group'] = le.transform(X['age_group'])

#Encode labels for y and save the mapping
le.fit(y)
event_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(event_mapping)
y = le.transform(y)

{'Retirement': 0, 'Termination': 1, 'Working': 2}


In [26]:
model = LogisticRegression()
model.fit(X, y)
#plot_importance(model)
pyplot.show()
y_pred = cross_val_predict(model, X, y)
matrix = confusion_matrix(y, y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [7]:
print(matrix.diagonal()/matrix.sum(axis=1))
result = cross_validate(model, X, y, scoring='f1_weighted')
print(result['test_score'])

[0.49516324 0.69871197 0.5166182 ]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

[0.38442051 0.53757625 0.71077558 0.70863031 0.51991794]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [9]:
from sklearn.metrics import accuracy_score
print('Logistic regression accuracy: {:.3f}'.format(accuracy_score(y_test, logreg.predict(X_test))))

Logistic regression accuracy: 0.628


In [10]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier()

In [11]:
print('Random Forest Accuracy: {:.3f}'.format(accuracy_score(y_test, rf.predict(X_test))))

Random Forest Accuracy: 0.746


In [12]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)

SVC()

In [13]:
print('Support vector machine accuracy: {:.3f}'.format(accuracy_score(y_test, svc.predict(X_test))))

Support vector machine accuracy: 0.641


In [14]:
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=7)
modelCV = RandomForestClassifier()
scoring = 'accuracy'
results = model_selection.cross_val_score(modelCV, X_train, y_train, cv=kfold, scoring=scoring)
print("10-fold cross validation average accuracy: %.3f" % (results.mean()))

10-fold cross validation average accuracy: 0.751


In [15]:
#The average accuracy remains very close to the Random Forest model accuracy; 
#hence, we can conclude that the model generalizes well.

In [16]:
from sklearn.metrics import classification_report
print(classification_report(y_test, rf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.68      0.67      0.68       488
           1       0.79      0.77      0.78      1648
           2       0.72      0.75      0.73      1412

    accuracy                           0.75      3548
   macro avg       0.73      0.73      0.73      3548
weighted avg       0.75      0.75      0.75      3548



In [17]:
print(classification_report(y_test, logreg.predict(X_test)))

              precision    recall  f1-score   support

           0       0.52      0.50      0.51       488
           1       0.68      0.69      0.69      1648
           2       0.60      0.60      0.60      1412

    accuracy                           0.63      3548
   macro avg       0.60      0.60      0.60      3548
weighted avg       0.63      0.63      0.63      3548



In [18]:
print(classification_report(y_test, svc.predict(X_test)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       488
           1       0.70      0.82      0.76      1648
           2       0.57      0.65      0.61      1412

    accuracy                           0.64      3548
   macro avg       0.42      0.49      0.46      3548
weighted avg       0.55      0.64      0.59      3548



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
df['division_leavers'].value_counts()

0    70144
1    13202
Name: division_leavers, dtype: int64