In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
df = pd.read_csv("source_2021-12-07-09-51-43.csv", parse_dates = ["start_date", "end_date"], dtype={'work_postal':'str'})

In [2]:
#Replace under_29 by .under_29 to make it easier when sorting
df['age_group'] = df['age_group'].replace(to_replace='under_29', value='.under_29')

#Fill in na values for age_group
df['age_group'] = df['age_group'].fillna(df['age_group'].value_counts().index[0])

#Replace missing values in event column with unknown
df['event'] = df['event'].fillna('unknown')

#Drop all temportary positions
df = df[~df['jobtitle'].str.contains("TEMP")]

In [3]:
temp_end_date = pd.to_datetime('2021-12-07')
employee_ids = df.emplid_sec.unique()

#Define list of features we want in our model
duration = []
comprate = []
last_pay_raise = []
age_group = []
pay_increase_ot = []
piot_compared_avg = []
event = []

for ID in employee_ids:
    #Get all records of the employee
    employee = df[df['emplid_sec'] == ID].copy()
    
    ##### DURATION #####
    #Add up all durations (there are some inaccuracies doing this)
    duration.append(sum(employee['duration'].tolist(), employee.shape[0]))
    
    ##### COMP RATE #####
    #Get the highest comprate
    comprate.append(max(employee['comprate'].tolist()))
    
    ##### LAST PAY RAISE #####
    #Get last date of work or temporary last date
    if(employee['end_date'].isna().sum()):
        end = temp_end_date
    else:
        end = employee['end_date'].sort_values().tolist()[-1]
    #Get date of last pay raise
    employee.sort_values(by=['comprate'], inplace=True)
    last_raise = employee.iloc[-1]['start_date']
    #Calculate the difference
    last_pay_raise.append((end - last_raise).days)
    
    ##### AGE GROUP #####
    #Get the age group they were before they left
    age_group.append(sorted(employee['age_group'].tolist())[-1])
    
    ##### PAY INCREASE OVER TIME #####
    #(max - min) / duration
    max_rate = max(employee['comprate'].tolist())
    min_rate = min(employee['comprate'].tolist())
    pay_increase_ot.append((max_rate - min_rate) / duration[-1])
    
    ##### EVENT #####
    #Get the employee's latest event
    employee.sort_values(by=['end_date'], inplace=True)
    #Currently working
    if(pd.isnull(employee.iloc[-1]['end_date'])):
        event.append('Working')
    else:
        event.append(employee.iloc[-1]['event'])
        
avg_pay_increase_ot = sum(pay_increase_ot) / len(pay_increase_ot)
##### COMPRATE INCREASE OVER TIME COMPARED TO AVERAGE #####
#calculate the % below or above average
for val in pay_increase_ot:
    piot_compared_avg.append((val - avg_pay_increase_ot) / avg_pay_increase_ot)

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, cross_validate
from sklearn.preprocessing import LabelEncoder
from matplotlib import pyplot
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix

#from sklearn import metrics
#logreg = LogisticRegression()
#logreg.fit(X_train, y_train)

In [5]:
data = {'duration': duration,
        'comprate': comprate,
        'last_pay_raise': last_pay_raise,
        'age_group': age_group,
        'pay_increase_ot': pay_increase_ot,
        'piot_compared_avg': piot_compared_avg,
        'event': event}
model_df = pd.DataFrame(data)

#Drop data points with 'unknown' event
model_df = model_df[model_df['event'] != 'unknown']

#Define input and output parameters for model
X = model_df.iloc[:, :-1]
y = model_df.iloc[:, -1]

#Encode labels for age_group
le = LabelEncoder()
le.fit(X['age_group'])
X['age_group'] = le.transform(X['age_group'])

#Encode labels for y and save the mapping
le.fit(y)
event_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(event_mapping)
y = le.transform(y)

{'Retirement': 0, 'Termination': 1, 'Working': 2}


In [6]:
model = LogisticRegression()
model.fit(X, y)
#plot_importance(model)
pyplot.show()
y_pred = cross_val_predict(model, X, y)
matrix = confusion_matrix(y, y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [7]:
print(matrix.diagonal()/matrix.sum(axis=1))
result = cross_validate(model, X, y, scoring='f1_weighted')
print(result['test_score'])

[0.47339782 0.71663244 0.5317823 ]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

[0.38828341 0.55188726 0.70834432 0.70089197 0.57550342]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [19]:
from sklearn.metrics import accuracy_score
print('Logistic regression accuracy: {:.3f}'.format(accuracy_score(y_test, logreg.predict(X_test))))

Logistic regression accuracy: 0.645


In [10]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier()

In [11]:
print('Random Forest Accuracy: {:.3f}'.format(accuracy_score(y_test, rf.predict(X_test))))

Random Forest Accuracy: 0.795


In [23]:
#y_pred = rf.predict(X_test)
#print(classification_report(y_test, y_pred))

In [12]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)

SVC()

In [13]:
print('Support vector machine accuracy: {:.3f}'.format(accuracy_score(y_test, svc.predict(X_test))))

Support vector machine accuracy: 0.651


In [14]:
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=7)
modelCV = RandomForestClassifier()
scoring = 'accuracy'
results = model_selection.cross_val_score(modelCV, X_train, y_train, cv=kfold, scoring=scoring)
print("10-fold cross validation average accuracy: %.3f" % (results.mean()))

10-fold cross validation average accuracy: 0.789


In [None]:
The average accuracy remains very close to the Random Forest model accuracy; 
hence, we can conclude that the model generalizes well.

In [15]:
from sklearn.metrics import classification_report
print(classification_report(y_test, rf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.76      0.73      0.74       488
           1       0.83      0.80      0.82      1648
           2       0.77      0.81      0.79      1412

    accuracy                           0.79      3548
   macro avg       0.79      0.78      0.78      3548
weighted avg       0.80      0.79      0.79      3548



In [21]:
print(classification_report(y_test, logreg.predict(X_test)))

              precision    recall  f1-score   support

           0       0.54      0.48      0.51       488
           1       0.69      0.72      0.70      1648
           2       0.62      0.61      0.62      1412

    accuracy                           0.64      3548
   macro avg       0.62      0.60      0.61      3548
weighted avg       0.64      0.64      0.64      3548



In [17]:
print(classification_report(y_test, svc.predict(X_test)))

              precision    recall  f1-score   support

           0       0.69      0.07      0.13       488
           1       0.71      0.80      0.75      1648
           2       0.58      0.68      0.63      1412

    accuracy                           0.65      3548
   macro avg       0.66      0.52      0.50      3548
weighted avg       0.66      0.65      0.62      3548

