# Pay Type Predictor - Random Forest Model

The goal of this project is trying to build an algorithm to automatically test if a given payment information description belongs to the following three categories respectively: `is_productive`, `use_salaries`, and `use_hours`. These categories are independent, which means a specific payment can belong to more than one categories. 

So far, Random Forest seems to work the best. So in this file, we aim to tune the parameters for better accuracy.

In [1]:
ls data

README.md                 paycode_data.xlsx         paycode_first_clean.xlsx


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Step 1 - Data Preprocessing

In [3]:
data = pd.read_excel('data/paycode_data.xlsx')
data.head()

Unnamed: 0,report_id,pay_type_id,pay_type_description,Total_Dollars,Total_Hours,Hourly Rate,cr255210db1_pay_pay_types.is_productive,cr255210db1_pay_pay_types.use_salaries,cr255210db1_pay_pay_types.use_hours
0,87,101,MISC-REGULAR,93339554.0,2679091.0,34.840009,1,1,1
1,87,102,MISC-SICK PAY,5258853.0,158087.0,33.265563,0,1,1
2,87,104,MISC-VACATION,6710429.0,196136.0,34.213143,0,1,1
3,87,105,MISC-OTHER TIMEOFF,10103587.0,289865.0,34.856181,0,1,1
4,87,201,UNIFORM-REGULAR,5080.0,180.0,28.222222,1,1,1


In [4]:
data = data.drop(columns = ["report_id","pay_type_id" ])

In [5]:
data.columns = ['pay_type_description', 'total_dollars', 'total_hours', 'hourly_rate', 'is_productive', 'use_salaries', 'use_hours']
data.head()

Unnamed: 0,pay_type_description,total_dollars,total_hours,hourly_rate,is_productive,use_salaries,use_hours
0,MISC-REGULAR,93339554.0,2679091.0,34.840009,1,1,1
1,MISC-SICK PAY,5258853.0,158087.0,33.265563,0,1,1
2,MISC-VACATION,6710429.0,196136.0,34.213143,0,1,1
3,MISC-OTHER TIMEOFF,10103587.0,289865.0,34.856181,0,1,1
4,UNIFORM-REGULAR,5080.0,180.0,28.222222,1,1,1


# Step 2 - Split Data into Training and Testing

To avoid positional bias, we can shuffle the data frame rows.

In [6]:
data = data.sample(frac=1).reset_index(drop=True)
data.head()

Unnamed: 0,pay_type_description,total_dollars,total_hours,hourly_rate,is_productive,use_salaries,use_hours
0,Management Hours 3,25183.47,350.0,71.952771,1,1,1
1,Total Hours Worked,0.0,1113282.43,0.0,1,1,0
2,REIMB_TUITION,303104.64,0.0,0.0,0,0,0
3,HOL MNP Dual Vst HH,1561.18,0.0,0.0,0,1,1
4,XWD WKEND DIFF,3510.16,1507.0,2.329237,0,1,0


In [7]:
total = len(data)
total

62391

In [8]:
train = data.iloc[ : int(total*0.8)]
train.head()

Unnamed: 0,pay_type_description,total_dollars,total_hours,hourly_rate,is_productive,use_salaries,use_hours
0,Management Hours 3,25183.47,350.0,71.952771,1,1,1
1,Total Hours Worked,0.0,1113282.43,0.0,1,1,0
2,REIMB_TUITION,303104.64,0.0,0.0,0,0,0
3,HOL MNP Dual Vst HH,1561.18,0.0,0.0,0,1,1
4,XWD WKEND DIFF,3510.16,1507.0,2.329237,0,1,0


In [9]:
train.tail()

Unnamed: 0,pay_type_description,total_dollars,total_hours,hourly_rate,is_productive,use_salaries,use_hours
49907,Reimbursement -Tuition Taxl PR,3442.37,0.0,0.0,1,1,0
49908,Holiday,4459.02,60.8,73.339145,1,0,0
49909,Base Pay,119752700.0,2523053.47,47.463397,1,1,1
49910,Dom Part-CA Exempt (Noncash),0.0,0.0,0.0,0,0,0
49911,Salaries Reg,12924780.0,412736.0,31.314891,1,1,1


In [10]:
test = data.iloc[int(total*0.8):]
test.head()

Unnamed: 0,pay_type_description,total_dollars,total_hours,hourly_rate,is_productive,use_salaries,use_hours
49912,CI Rad Special Rate Work L6 DOT,2807.14,16.5,170.129697,1,1,1
49913,$TUITION,376155.5,0.0,0.0,0,1,1
49914,Transport Differentail,21636.0,21636.0,1.0,0,1,0
49915,CLIENT SERV. COMMISS,54553.0,0.0,0.0,0,1,0
49916,PTO Payoff (Term-Supp Tax),143420.31,3610.48,39.723336,0,1,0


What if we don't scale the values like we did in `Analysis.ipynb`? Would that give us the same accuracy?  Let's see. 

In [11]:
train_x = train[["total_dollars", "total_hours", "hourly_rate"]]
valid_x = test[["total_dollars", "total_hours", "hourly_rate"]]

# train_x = train[["total_dollars_scaled"]]
# valid_x = test[["total_dollars_scaled"]]

train_y_is_productive = train[["is_productive"]]
valid_y_is_productive = test.is_productive

train_y_use_salaries = train[["use_salaries"]]
valid_y_use_salaries = test.use_salaries

train_y_use_hours = train[["use_hours"]]
valid_y_use_hours = test.use_hours

In [12]:
train_x.head()

Unnamed: 0,total_dollars,total_hours,hourly_rate
0,25183.47,350.0,71.952771
1,0.0,1113282.43,0.0
2,303104.64,0.0,0.0
3,1561.18,0.0,0.0
4,3510.16,1507.0,2.329237


In [13]:
valid_x.head()

Unnamed: 0,total_dollars,total_hours,hourly_rate
49912,2807.14,16.5,170.129697
49913,376155.5,0.0,0.0
49914,21636.0,21636.0,1.0
49915,54553.0,0.0,0.0
49916,143420.31,3610.48,39.723336


# Random Forest

In [14]:
from sklearn.ensemble import RandomForestRegressor
import pickle

In [15]:
def rf_train(category, rf):
    
    # We assume hyperparameters are already initialized in rf
    # Training 
    print("Category: ", str(category))
    print("Training...")
    
    if category == "is_productive":
        rf.fit(train_x, train_y_is_productive)
    elif category == "use_salaries":
        rf.fit(train_x, train_y_use_salaries)
    elif category == "use_hours":
        rf.fit(train_x, train_y_use_hours)
    else:
        print("ERROR: Input Category Not Recognized!")
        return 
        
    print("Training completed!")

    # Predicting 
    print("Predicting...")
    pred = rf.predict(valid_x)
    train_pred = rf.predict(train_x)

    pred = pred.reshape((len(pred), ))
    train_pred = train_pred.reshape((len(train_pred), ))

    pred = [round(x) for x in pred]
    train_pred = [round(x) for x in train_pred]
    print("Prediction Completed! ")
    
    # Printing Results
    if category == "is_productive":
        count = 0
        for i in range(len(train_pred)):
            if train_pred[i] == train_y_is_productive.is_productive[i]:
                count += 1
        print("Training Accuracy = ", str(count / len(train_pred)))

        count = 0
        vy = list(valid_y_is_productive)
        for i in range(len(valid_x)):
            if pred[i] == vy[i]:
                count += 1
        print("Testing Accuracy = ", str(count / len(valid_x)))
    
    elif category == "use_salaries":
        count = 0
        for i in range(len(train_pred)):
            if train_pred[i] == train_y_use_salaries.use_salaries[i]:
                count += 1
        print("Training Accuracy = ", str(count / len(train_pred)))

        count = 0
        vy = list(valid_y_use_salaries)
        for i in range(len(valid_x)):
            if pred[i] == vy[i]:
                count += 1
        print("Testing Accuracy = ", str(count / len(valid_x)))
    
    elif category == "use_hours": 
        count = 0
        for i in range(len(train_pred)):
            if train_pred[i] == train_y_use_hours.use_hours[i]:
                count += 1
        print("Training Accuracy = ", str(count / len(train_pred)))

        count = 0
        vy = list(valid_y_use_hours)
        for i in range(len(valid_x)):
            if pred[i] == vy[i]:
                count += 1
        print("Testing Accuracy = ", str(count / len(valid_x)))
    
    # Saving Models to /Flask/models
    pickle.dump(rf, open('../Flask/models/RandomForest_' + category + '.pkl', 'wb'))
    print("Model saved at ../Flask/models/RandomForest_"+ category + ".pkl", )

In [16]:
rf_use_hours = RandomForestRegressor(max_depth = 15, n_estimators = 25)
rf_train("use_hours", rf_use_hours)

Category:  use_hours
Training...


  del sys.path[0]


Training completed!
Predicting...
Prediction Completed! 
Training Accuracy =  0.8772239140887963
Testing Accuracy =  0.8509495953201378
Model saved at ../Flask/models/RandomForest_use_hours.pkl


In [17]:
rf_use_salaries = RandomForestRegressor(max_depth = 20, n_estimators = 50)
rf_train("use_salaries", rf_use_salaries)

Category:  use_salaries
Training...


  # This is added back by InteractiveShellApp.init_path()


Training completed!
Predicting...
Prediction Completed! 
Training Accuracy =  0.9278930918416413
Testing Accuracy =  0.8937414856959692
Model saved at ../Flask/models/RandomForest_use_salaries.pkl


In [18]:
rf_is_produtive = RandomForestRegressor(max_depth = 10, n_estimators = 20)
rf_train("is_productive", rf_is_produtive)

Category:  is_productive
Training...


  if __name__ == '__main__':


Training completed!
Predicting...
Prediction Completed! 
Training Accuracy =  0.6909360474435006
Testing Accuracy =  0.6613510697972594
Model saved at ../Flask/models/RandomForest_is_productive.pkl


In [19]:
ls ../Flask/models

RandomForest.pkl                RandomForest_use_hours.pkl
RandomForest_is_productive.pkl  RandomForest_use_salaries.pkl
