# Pay Type Predictor - Random Forest Model

The goal of this project is trying to build an algorithm to automatically test if a given payment information description belongs to the following three categories respectively: `is_productive`, `use_salaries`, and `use_hours`. These categories are independent, which means a specific payment can belong to more than one categories. 

So far, Random Forest seems to work the best. So in this file, we aim to tune the parameters for better accuracy.

In [1]:
ls data

README.md                       paycode_data.xlsx
paycode_clean_string_only.xlsx  paycode_first_clean.xlsx


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Step 1 - Data Preprocessing

In [3]:
data = pd.read_excel('data/paycode_data.xlsx')
data.head()

Unnamed: 0,report_id,pay_type_id,pay_type_description,Total_Dollars,Total_Hours,Hourly Rate,cr255210db1_pay_pay_types.is_productive,cr255210db1_pay_pay_types.use_salaries,cr255210db1_pay_pay_types.use_hours
0,87,101,MISC-REGULAR,93339554.0,2679091.0,34.840009,1,1,1
1,87,102,MISC-SICK PAY,5258853.0,158087.0,33.265563,0,1,1
2,87,104,MISC-VACATION,6710429.0,196136.0,34.213143,0,1,1
3,87,105,MISC-OTHER TIMEOFF,10103587.0,289865.0,34.856181,0,1,1
4,87,201,UNIFORM-REGULAR,5080.0,180.0,28.222222,1,1,1


In [4]:
data = data.drop(columns = ["report_id","pay_type_id" ])

In [5]:
data.columns = ['pay_type_description', 'total_dollars', 'total_hours', 'hourly_rate', 'is_productive', 'use_salaries', 'use_hours']
data.head()

Unnamed: 0,pay_type_description,total_dollars,total_hours,hourly_rate,is_productive,use_salaries,use_hours
0,MISC-REGULAR,93339554.0,2679091.0,34.840009,1,1,1
1,MISC-SICK PAY,5258853.0,158087.0,33.265563,0,1,1
2,MISC-VACATION,6710429.0,196136.0,34.213143,0,1,1
3,MISC-OTHER TIMEOFF,10103587.0,289865.0,34.856181,0,1,1
4,UNIFORM-REGULAR,5080.0,180.0,28.222222,1,1,1


# Step 2 - Split Data into Training and Testing

To avoid positional bias, we can shuffle the data frame rows.

In [6]:
data = data.sample(frac=1).reset_index(drop=True)
data.head()

Unnamed: 0,pay_type_description,total_dollars,total_hours,hourly_rate,is_productive,use_salaries,use_hours
0,Retroactive Calculation,25933.28,0.0,0.0,0,1,0
1,PTO Leave,1021.88,14.25,71.710877,0,1,1
2,Retro Cashout Hol - NOT EC,51.2,0.0,0.0,0,1,0
3,Double-time Evening Shift,139169.31,1165.06,119.452483,1,1,1
4,,36287.12,2080.1,17.444892,1,1,1


In [7]:
total = len(data)
total

62391

In [8]:
train = data.iloc[ : int(total*0.8)]
train.head()

Unnamed: 0,pay_type_description,total_dollars,total_hours,hourly_rate,is_productive,use_salaries,use_hours
0,Retroactive Calculation,25933.28,0.0,0.0,0,1,0
1,PTO Leave,1021.88,14.25,71.710877,0,1,1
2,Retro Cashout Hol - NOT EC,51.2,0.0,0.0,0,1,0
3,Double-time Evening Shift,139169.31,1165.06,119.452483,1,1,1
4,,36287.12,2080.1,17.444892,1,1,1


In [9]:
train.tail()

Unnamed: 0,pay_type_description,total_dollars,total_hours,hourly_rate,is_productive,use_salaries,use_hours
49907,Employee Exp Reimburse-Non tax,0.0,0.0,0.0,1,0,0
49908,Holiday - Worked OT Base,20414.23,611.5,33.383859,1,1,1
49909,WEEKEND DIFFERENTIAL,26.71,0.0,0.0,1,0,0
49910,Education-Mandatory,4518.34,66.25,68.201358,0,1,1
49911,B12 BNS RET 12,5000.0,0.0,0.0,0,1,0


In [10]:
test = data.iloc[int(total*0.8):]
test.head()

Unnamed: 0,pay_type_description,total_dollars,total_hours,hourly_rate,is_productive,use_salaries,use_hours
49912,Intermittent PTO,10299.72,186.0,55.374839,0,1,1
49913,Holiday - Worked DT Base,7508.21,282.5,26.577735,1,1,1
49914,FAMILY/MEDICAL LEAVE,0.0,0.0,0.0,0,1,0
49915,,6219.4,0.0,0.0,0,0,0
49916,Auto Allowance,14874.09,52.14,285.272152,0,0,0


What if we don't scale the values like we did in `Analysis.ipynb`? Would that give us the same accuracy?  Let's see. 

In [11]:
train_x = train[["total_dollars", "total_hours", "hourly_rate"]]
valid_x = test[["total_dollars", "total_hours", "hourly_rate"]]

# train_x = train[["total_dollars_scaled"]]
# valid_x = test[["total_dollars_scaled"]]

train_y_is_productive = train[["is_productive"]]
valid_y_is_productive = test.is_productive

train_y_use_salaries = train[["use_salaries"]]
valid_y_use_salaries = test.use_salaries

train_y_use_hours = train[["use_hours"]]
valid_y_use_hours = test.use_hours

valid_pay_type = test.pay_type_description
valid_pay_type = [str(name) for name in valid_pay_type]

In [12]:
train_x.head()

Unnamed: 0,total_dollars,total_hours,hourly_rate
0,25933.28,0.0,0.0
1,1021.88,14.25,71.710877
2,51.2,0.0,0.0
3,139169.31,1165.06,119.452483
4,36287.12,2080.1,17.444892


In [13]:
valid_x.head()

Unnamed: 0,total_dollars,total_hours,hourly_rate
49912,10299.72,186.0,55.374839
49913,7508.21,282.5,26.577735
49914,0.0,0.0,0.0
49915,6219.4,0.0,0.0
49916,14874.09,52.14,285.272152


# Random Forest

In [14]:
from sklearn.ensemble import RandomForestRegressor
import pickle

In [15]:
def rf_train(category, rf):
    
    # We assume hyperparameters are already initialized in rf
    # Training 
    print("Category: ", str(category))
    print("Training...")
    
    if category == "is_productive":
        rf.fit(train_x, train_y_is_productive)
    elif category == "use_salaries":
        rf.fit(train_x, train_y_use_salaries)
    elif category == "use_hours":
        rf.fit(train_x, train_y_use_hours)
    else:
        print("ERROR: Input Category Not Recognized!")
        return 
        
    print("Training completed!")

    # Predicting 
    print("Predicting...")
    pred = rf.predict(valid_x)
    train_pred = rf.predict(train_x)

    pred = pred.reshape((len(pred), ))
    train_pred = train_pred.reshape((len(train_pred), ))

    pred = [round(x) for x in pred]
    train_pred = [round(x) for x in train_pred]
    print("Prediction Completed! ")
    
    # Printing Results
    if category == "is_productive":
        count = 0
        for i in range(len(train_pred)):
            if train_pred[i] == train_y_is_productive.is_productive[i]:
                count += 1
        print("Training Accuracy = ", str(count / len(train_pred)))

        count = 0
        vy = list(valid_y_is_productive)
        for i in range(len(valid_x)):
            if pred[i] == vy[i]:
                count += 1
        print("Testing Accuracy = ", str(count / len(valid_x)))
    
    elif category == "use_salaries":
        count = 0
        for i in range(len(train_pred)):
            if train_pred[i] == train_y_use_salaries.use_salaries[i]:
                count += 1
        print("Training Accuracy = ", str(count / len(train_pred)))

        count = 0
        vy = list(valid_y_use_salaries)
        for i in range(len(valid_x)):
            if pred[i] == vy[i]:
                count += 1
        print("Testing Accuracy = ", str(count / len(valid_x)))
    
    elif category == "use_hours": 
        count = 0
        for i in range(len(train_pred)):
            if train_pred[i] == train_y_use_hours.use_hours[i]:
                count += 1
        print("Training Accuracy = ", str(count / len(train_pred)))

        count = 0
        vy = list(valid_y_use_hours)
        for i in range(len(valid_x)):
            if pred[i] == vy[i]:
                count += 1
        print("Testing Accuracy = ", str(count / len(valid_x)))
    
    # Saving Models to /Flask/models
    pickle.dump(rf, open('../Flask/models/RandomForest_' + category + '.pkl', 'wb'))
    print("Model saved at ../Flask/models/RandomForest_"+ category + ".pkl", )

In [16]:
rf_use_hours = RandomForestRegressor(max_depth = 15, n_estimators = 25)
rf_train("use_hours", rf_use_hours)

Category:  use_hours
Training...


  del sys.path[0]


Training completed!
Predicting...
Prediction Completed! 
Training Accuracy =  0.8773841961852861
Testing Accuracy =  0.8533536341052969
Model saved at ../Flask/models/RandomForest_use_hours.pkl


In [17]:
rf_use_salaries = RandomForestRegressor(max_depth = 20, n_estimators = 50)
rf_train("use_salaries", rf_use_salaries)

Category:  use_salaries
Training...


  # This is added back by InteractiveShellApp.init_path()


Training completed!
Predicting...
Prediction Completed! 
Training Accuracy =  0.9262301650905593
Testing Accuracy =  0.8982290247615995
Model saved at ../Flask/models/RandomForest_use_salaries.pkl


In [18]:
rf_is_produtive = RandomForestRegressor(max_depth = 10, n_estimators = 20)
rf_train("is_productive", rf_is_produtive)

Category:  is_productive
Training...


  if __name__ == '__main__':


Training completed!
Predicting...
Prediction Completed! 
Training Accuracy =  0.6920379868568681
Testing Accuracy =  0.6692843977882843
Model saved at ../Flask/models/RandomForest_is_productive.pkl


In [19]:
ls ../Flask/models

README.md                       RandomForest_use_salaries.pkl
RandomForest_is_productive.pkl  generate_models.py
RandomForest_use_hours.pkl


-----

# Enhancement - Adding String Classifier 

In [20]:
ls data

README.md                       paycode_data.xlsx
paycode_clean_string_only.xlsx  paycode_first_clean.xlsx


In [21]:
pay_type_des_df = pd.read_excel('data/paycode_clean_string_only.xlsx')
pay_type_des_df.head()

Unnamed: 0,pay_type_description,cr255210db1_pay_pay_types.is_productive,cr255210db1_pay_pay_types.use_salaries,cr255210db1_pay_pay_types.use_hours
0,0,0,1.0,0
1,TrainingEduOTPremDly,1,1.0,0
2,$ ADJ,0,1.0,0
3,$ BONUS,0,1.0,0
4,$ BONUS CERTIFIED,0,1.0,0


In [22]:
pay_type_des_df.columns = ['pay_type_description', 'is_productive', 'use_salaries', 'use_hours']

In [23]:
pay_type_des_df["pay_type_description"] = pay_type_des_df["pay_type_description"].str.upper()

In [24]:
pay_type_des_df.head()

Unnamed: 0,pay_type_description,is_productive,use_salaries,use_hours
0,,0,1.0,0
1,TRAININGEDUOTPREMDLY,1,1.0,0
2,$ ADJ,0,1.0,0
3,$ BONUS,0,1.0,0
4,$ BONUS CERTIFIED,0,1.0,0


In [25]:
import math

In [26]:
for index, row in pay_type_des_df.iterrows():
    if type(row.pay_type_description) == float and math.isnan(row.pay_type_description):
        print("Got one")

Got one
Got one
Got one
Got one
Got one
Got one
Got one
Got one


In [27]:
string_classifier = {}
for index, row in pay_type_des_df.iterrows():
    if type(row.pay_type_description) == float and math.isnan(row.pay_type_description):
        continue
    string_classifier[str(row.pay_type_description)] = {'is_productive': row.is_productive, 
                                                   'use_salaries': row.use_salaries, 
                                                   'use_hours': row.use_hours}

In [28]:
test.head()

Unnamed: 0,pay_type_description,total_dollars,total_hours,hourly_rate,is_productive,use_salaries,use_hours
49912,Intermittent PTO,10299.72,186.0,55.374839,0,1,1
49913,Holiday - Worked DT Base,7508.21,282.5,26.577735,1,1,1
49914,FAMILY/MEDICAL LEAVE,0.0,0.0,0.0,0,1,0
49915,,6219.4,0.0,0.0,0,0,0
49916,Auto Allowance,14874.09,52.14,285.272152,0,0,0


In [29]:
def rf_train_with_string_classifier(category, rf):
    # We assume hyperparameters are already initialized in rf
    # Training 
    print("Category: ", str(category))
    print("Training...")
    
    if category == "is_productive":
        rf.fit(train_x, train_y_is_productive)
    elif category == "use_salaries":
        rf.fit(train_x, train_y_use_salaries)
    elif category == "use_hours":
        rf.fit(train_x, train_y_use_hours)
    else:
        print("ERROR: Input Category Not Recognized!")
        return 
        
    print("Training completed!")

    # Predicting 
    print("Predicting...")
    pred = rf.predict(valid_x)
    train_pred = rf.predict(train_x)

    pred = pred.reshape((len(pred), ))
    train_pred = train_pred.reshape((len(train_pred), ))
    
    pred = [round(x) for x in pred]
    pred_len = len(pred)
    for i in range(pred_len):
        name = str.upper(valid_pay_type[i])
        if name in string_classifier:
            if category == "is_productive":
                pred[i] = string_classifier[name]['is_productive']
            elif category == "use_salaries":
                pred[i] = string_classifier[name]['use_salaries']
            elif category == "use_hours":
                pred[i] == string_classifier[name]['use_hours']
    
    train_pred = [round(x) for x in train_pred]
    print("Prediction Completed! ")
    
    # Printing Results
    if category == "is_productive":
        count = 0
        for i in range(len(train_pred)):
            if train_pred[i] == train_y_is_productive.is_productive[i]:
                count += 1
        print("Training Accuracy = ", str(count / len(train_pred)))

        count = 0
        vy = list(valid_y_is_productive)
        for i in range(len(valid_x)):
            if pred[i] == vy[i]:
                count += 1
        print("Testing Accuracy = ", str(count / len(valid_x)))
    
    elif category == "use_salaries":
        count = 0
        for i in range(len(train_pred)):
            if train_pred[i] == train_y_use_salaries.use_salaries[i]:
                count += 1
        print("Training Accuracy = ", str(count / len(train_pred)))

        count = 0
        vy = list(valid_y_use_salaries)
        for i in range(len(valid_x)):
            if pred[i] == vy[i]:
                count += 1
        print("Testing Accuracy = ", str(count / len(valid_x)))
    
    elif category == "use_hours": 
        count = 0
        for i in range(len(train_pred)):
            if train_pred[i] == train_y_use_hours.use_hours[i]:
                count += 1
        print("Training Accuracy = ", str(count / len(train_pred)))

        count = 0
        vy = list(valid_y_use_hours)
        for i in range(len(valid_x)):
            if pred[i] == vy[i]:
                count += 1
        print("Testing Accuracy = ", str(count / len(valid_x)))
    
    # Saving Models to /Flask/models
    pickle.dump(rf, open('../Flask/models/RandomForest_' + category + '.pkl', 'wb'))
    print("Model saved at ../Flask/models/RandomForest_"+ category + ".pkl", )

In [30]:
rf_use_hours = RandomForestRegressor(max_depth = 15, n_estimators = 25)
rf_train_with_string_classifier("use_hours", rf_use_hours)

Category:  use_hours
Training...


  if sys.path[0] == '':


Training completed!
Predicting...
Prediction Completed! 
Training Accuracy =  0.8764625741304696
Testing Accuracy =  0.8529529609744371
Model saved at ../Flask/models/RandomForest_use_hours.pkl


In [31]:
rf_use_salaries = RandomForestRegressor(max_depth = 20, n_estimators = 50)
rf_train_with_string_classifier("use_salaries", rf_use_salaries)

Category:  use_salaries
Training...


  # Remove the CWD from sys.path while we load stuff.


Training completed!
Predicting...
Prediction Completed! 
Training Accuracy =  0.9257092482769674
Testing Accuracy =  0.9281192403237439
Model saved at ../Flask/models/RandomForest_use_salaries.pkl


In [32]:
rf_is_produtive = RandomForestRegressor(max_depth = 10, n_estimators = 20)
rf_train_with_string_classifier("is_productive", rf_is_produtive)

Category:  is_productive
Training...


  


Training completed!
Predicting...
Prediction Completed! 
Training Accuracy =  0.6936408078217663
Testing Accuracy =  0.8012661270935171
Model saved at ../Flask/models/RandomForest_is_productive.pkl


So the accuracy does increase with our string classifier! Great! 

In [33]:
# Saving the string classifer 
pickle.dump(string_classifier, open('../Flask/models/string_classifier.pkl', 'wb'))