# Setup

## Dependencies

In [133]:
import os
import pandas as pd

## Importing Data

In [134]:
# # this allows the 'data_prep' file to be run before this file tries to bring in those datasets
# os.system("python data_prep.py")

# this is from the datasets with non-corrected 'gm r4f' dates
# os.system hands back the command's exit status; anything other than 0 means the
# prep script did not finish cleanly, so stop here instead of silently loading
# whatever CSVs are already sitting in 'cleaned_data' from an earlier run
prep_exit_status = os.system("python data_prep_without_corrections.py")
if prep_exit_status != 0:
    raise RuntimeError(
        "data_prep_without_corrections.py exited with status "
        f"{prep_exit_status}; cleaned datasets were not refreshed")

# locations of the cleaned datasets
# (earlier, commented-out versions of these paths pointed at "./data/cleaned_data/")
project_table_data = "./data_copy/cleaned_data/project_table.csv"
project_info_data = "./data_copy/cleaned_data/project_info_table.csv"
workflow_table_data = "./data_copy/cleaned_data/workflow_table.csv"


# imports the 'project table' data
# identifier-like columns are read as strings; every milestone column is parsed as a date
project_table_df = pd.read_csv(
    project_table_data,
    dtype={
        'Claim #': str,
        'Job #': str,
        'Branch': str,
        'Claim Status': str},
    parse_dates=[
        'Rep Agreement Signed', 'Rep Claim Collected', 'FTA Scope Completed',
        'FTA Scope Rejected', 'BC Estimate Completed', 'OB Scope Completed',
        'Sup Job Submitted', 'BC Approved for Production', 'OB Order Built',
        'GM Order Processed', 'PA Permit Applied', 'PA Permit Processed',
        'PA OA Processed', 'PA OA Invoiced', 'PA Notify of Delivery',
        'PA Notify of Start', 'Delivery Date', 'Roof Start',
        'Roof End', 'GM Approved for Inspection', 'GM Change Order Date',
        'GM Labor Adjustment Date', 'RA Inspection Requested', 'RA Inspection Processed',
        'Rep COC Collected', 'SA Job Docs Uploaded', 'BC Project Invoiced', 'BC Project Closed'])

# imports the 'project information' data (all listed columns are read as strings)
project_info_df = pd.read_csv(
    project_info_data,
    dtype={
        'Claim #': str, 'Job #': str, 'Branch': str,
        'City': str, 'Building Department': str, 'Permit Req?': str,
        'Supplier Name': str, 'Crew': str, 'Insurance Company': str,
        'Multi-rejected': str, 'Sup': str, 'Rep': str,
        'FTA': str, 'BC': str, 'OB': str, 'GM': str})

# imports 'workflow table' data (dtypes are left for pandas to infer)
workflow_table_df = pd.read_csv(workflow_table_data)


# Separate Outlier Data

## Workflow Data

In [135]:
# summary statistics for every numeric workflow column; the quartiles are spelled out
# explicitly (these are the same ones describe() reports by default)
workflow_table_df.describe(percentiles=[.25, .5, .75])

Unnamed: 0,Rep Collecting Claim,FTA Completing Scope,BC Completing Estimate,OB Completing Scope,Sup Submitting Job,BC Approving Job,OB Building Order,GM Processing Order,PA Processing OA,PA Invoicing OA,GM Approving for Inspection,RA Requesting Inspection,Rep Collecting COC,SA Uploading Docs,BC Invoicing Project,BC Closed Project,Days in Pipeline
count,2389.0,1897.0,1846.0,1737.0,1099.0,1015.0,956.0,793.0,751.0,683.0,627.0,222.0,545.0,445.0,393.0,202.0,2389.0
mean,6.401005,5.401687,0.704225,1.805412,10.549591,3.349754,1.460251,8.141236,1.29028,3.055637,0.69059,7.801802,5.73578,8.220225,3.223919,16.950495,29.098786
std,12.740876,5.198584,1.335061,3.40948,9.855701,5.311671,3.563851,9.308149,2.74972,3.950553,1.990382,9.063197,7.675873,6.198012,3.993073,16.621257,29.324748
min,0.0,-83.0,-7.0,-35.0,-5.0,-13.0,-9.0,-8.0,-10.0,-20.0,-1.0,0.0,-4.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,0.0,4.0,1.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,2.0,0.0,6.0,7.0
50%,4.0,4.0,0.0,1.0,8.0,1.0,0.0,6.0,1.0,3.0,0.0,5.0,4.0,8.0,1.0,13.0,21.0
75%,8.0,7.0,1.0,3.0,14.0,4.0,1.0,13.0,1.0,5.0,1.0,11.0,8.0,12.0,5.0,23.75,44.0
max,269.0,43.0,17.0,86.0,66.0,46.0,53.0,62.0,42.0,32.0,30.0,67.0,62.0,32.0,21.0,81.0,302.0


## Rep Claim Data

In [136]:
# created a variable to be able to allow a 10% error in 'rep claim days'
# (the 90th percentile of the recorded values; missing values are ignored)
rep_quantile = workflow_table_df['Rep Collecting Claim'].quantile(.90)
num_rep_claims = workflow_table_df['Rep Collecting Claim'].count()

# keep only the non-negative day counts that sit at or below the 90th percentile
rep_claim_df = workflow_table_df.loc[(workflow_table_df['Rep Collecting Claim'] >= 0) & (
    workflow_table_df['Rep Collecting Claim'] <= rep_quantile), :]

# creating a rep claim outlier df: negative day counts or anything above the 90th percentile
outlier_rep_claim_df = workflow_table_df.loc[(workflow_table_df['Rep Collecting Claim'] < 0) | (
    workflow_table_df['Rep Collecting Claim'] > rep_quantile), :]

# confirming no records were lost: the two frames must account for every recorded value
assert len(rep_claim_df) + len(outlier_rep_claim_df) == num_rep_claims, (
    "cleaned + outlier records do not add up to the records with a Rep Claim value")
print(f"Workflow Records with Rep Claim: {num_rep_claims}")
print(f"Quantile: {rep_quantile}")
print(f"Cleaned Records: {len(rep_claim_df)}")
print(f"Outlier Records: {len(outlier_rep_claim_df)}")

Workflow Records with Rep Claim: 2389
Quantile: 15.0
Cleaned Records: 2160
Outlier Records: 229


In [137]:
# how often each 'rep claim days' value occurs among the outlier records
outlier_rep_claim_df.loc[:, 'Rep Collecting Claim'].value_counts()

16.0     24
19.0     24
18.0     24
20.0     18
22.0     17
17.0     16
21.0     14
25.0     13
27.0     11
26.0      9
23.0      6
32.0      5
24.0      5
30.0      4
33.0      4
35.0      3
34.0      3
37.0      3
28.0      3
29.0      2
41.0      2
55.0      1
52.0      1
45.0      1
143.0     1
156.0     1
269.0     1
42.0      1
38.0      1
219.0     1
176.0     1
162.0     1
59.0      1
231.0     1
39.0      1
62.0      1
36.0      1
31.0      1
43.0      1
48.0      1
Name: Rep Collecting Claim, dtype: int64

## FTA Scope Data

In [138]:
# created a variable to be able to allow a 10% error in 'fta scope days'
# (the 90th percentile of the recorded values; missing values are ignored)
fta_quantile = workflow_table_df['FTA Completing Scope'].quantile(.90)
num_fta_scopes = workflow_table_df['FTA Completing Scope'].count()

# keep only the non-negative day counts that sit at or below the 90th percentile
fta_scope_df = workflow_table_df.loc[(workflow_table_df['FTA Completing Scope'] >= 0) & (
    workflow_table_df['FTA Completing Scope'] <= fta_quantile), :]

# creating a fta scope outlier df: negative day counts or anything above the 90th percentile
outlier_fta_scope_df = workflow_table_df.loc[(workflow_table_df['FTA Completing Scope'] < 0) | (
    workflow_table_df['FTA Completing Scope'] > fta_quantile), :]

# confirming no records were lost: the two frames must account for every recorded value
assert len(fta_scope_df) + len(outlier_fta_scope_df) == num_fta_scopes, (
    "cleaned + outlier records do not add up to the records with an FTA Scope value")
print(f"Workflow Records with FTA Scope: {num_fta_scopes}")
print(f"Quantile: {fta_quantile}")
print(f"Cleaned Records: {len(fta_scope_df)}")
print(f"Outlier Records: {len(outlier_fta_scope_df)}")

Workflow Records with FTA Scope: 1897
Quantile: 12.0
Cleaned Records: 1735
Outlier Records: 162


In [139]:
# how often each 'fta scope days' value occurs among the outlier records
outlier_fta_scope_df.loc[:, 'FTA Completing Scope'].value_counts()

 13.0    39
 14.0    31
 15.0    19
 17.0    12
 19.0    11
 18.0    10
 16.0     7
 20.0     7
 21.0     4
 23.0     3
 27.0     3
 32.0     2
 22.0     2
-83.0     1
 25.0     1
 24.0     1
 28.0     1
 33.0     1
 26.0     1
-13.0     1
 34.0     1
 43.0     1
 30.0     1
 39.0     1
 36.0     1
Name: FTA Completing Scope, dtype: int64

## BC Estimate Data

In [140]:
# created a variable to be able to allow a 10% error in 'bc estimate days'
# (the 90th percentile of the recorded values; missing values are ignored)
bc_quantile = workflow_table_df['BC Completing Estimate'].quantile(.90)
num_bc_estimates = workflow_table_df['BC Completing Estimate'].count()

# keep only the non-negative day counts that sit at or below the 90th percentile
bc_estimate_df = workflow_table_df.loc[(workflow_table_df['BC Completing Estimate'] >= 0) & (
    workflow_table_df['BC Completing Estimate'] <= bc_quantile), :]

# creating a bc estimate outlier df: negative day counts or anything above the 90th percentile
outlier_bc_estimate_df = workflow_table_df.loc[(workflow_table_df['BC Completing Estimate'] < 0) | (
    workflow_table_df['BC Completing Estimate'] > bc_quantile), :]

# confirming no records were lost: the two frames must account for every recorded value
assert len(bc_estimate_df) + len(outlier_bc_estimate_df) == num_bc_estimates, (
    "cleaned + outlier records do not add up to the records with a BC Estimate value")
print(f"Workflow Records with BC Estimate: {num_bc_estimates}")
print(f"Quantile: {bc_quantile}")
print(f"Cleaned Records: {len(bc_estimate_df)}")
print(f"Outlier Records: {len(outlier_bc_estimate_df)}")

Workflow Records with BC Estimate: 1846
Quantile: 2.0
Cleaned Records: 1669
Outlier Records: 177


In [141]:
# how often each 'bc estimate days' value occurs among the outlier records
outlier_bc_estimate_df.loc[:, 'BC Completing Estimate'].value_counts()

 3.0     118
 4.0      22
 5.0      12
 6.0       7
 10.0      3
 7.0       3
-1.0       2
 8.0       2
-3.0       1
-4.0       1
-2.0       1
-7.0       1
 14.0      1
 12.0      1
 17.0      1
 13.0      1
Name: BC Completing Estimate, dtype: int64

## OB Scope Data

In [142]:
# created a variable to be able to allow a 10% error in 'ob scoped days'
# (the 90th percentile of the recorded values; missing values are ignored)
ob_quantile = workflow_table_df['OB Completing Scope'].quantile(.90)
num_ob_scopes = workflow_table_df['OB Completing Scope'].count()

# keep only the non-negative day counts that sit at or below the 90th percentile
ob_scope_df = workflow_table_df.loc[(workflow_table_df['OB Completing Scope'] >= 0) & (
    workflow_table_df['OB Completing Scope'] <= ob_quantile), :]

# creating a ob scoped outlier df: negative day counts or anything above the 90th percentile
outlier_ob_scope_df = workflow_table_df.loc[(workflow_table_df['OB Completing Scope'] < 0) | (
    workflow_table_df['OB Completing Scope'] > ob_quantile), :]

# confirming no records were lost: the two frames must account for every recorded value
assert len(ob_scope_df) + len(outlier_ob_scope_df) == num_ob_scopes, (
    "cleaned + outlier records do not add up to the records with an OB Scope value")
print(f"Workflow Records with OB Scope: {num_ob_scopes}")
print(f"Quantile: {ob_quantile}")
print(f"Cleaned Records: {len(ob_scope_df)}")
print(f"Outlier Records: {len(outlier_ob_scope_df)}")

Workflow Records with OB Scope: 1737
Quantile: 5.0
Cleaned Records: 1396
Outlier Records: 341


In [169]:
# how often each 'ob scoped days' value occurs among the outlier records
outlier_ob_scope_df.loc[:, 'OB Completing Scope'].value_counts()

-1.0     170
 6.0      70
 7.0      30
 8.0      15
 10.0     14
 9.0      11
 11.0      9
 12.0      5
 13.0      4
-4.0       3
-2.0       2
 15.0      1
 24.0      1
 16.0      1
-35.0      1
-5.0       1
 86.0      1
-8.0       1
 14.0      1
Name: OB Completing Scope, dtype: int64

In [168]:
# write the OB scope outlier records to CSV, leaving the row index out of the file
outlier_ob_scope_df.to_csv(path_or_buf="data_copy/ob_scope_outlier_data.csv", index=False)

## Sup Submittal Data

In [144]:
# created a variable to be able to allow a 10% error in 'sup submitted days'
# (the 90th percentile of the recorded values; missing values are ignored)
sup_quantile = workflow_table_df['Sup Submitting Job'].quantile(.90)
num_sup_submits = workflow_table_df['Sup Submitting Job'].count()

# keep only the non-negative day counts that sit at or below the 90th percentile
sup_submit_df = workflow_table_df.loc[(workflow_table_df['Sup Submitting Job'] >= 0) & (
    workflow_table_df['Sup Submitting Job'] <= sup_quantile), :]

# creating a sup submitted outlier df: negative day counts or anything above the 90th percentile
outlier_sup_submit_df = workflow_table_df.loc[(workflow_table_df['Sup Submitting Job'] < 0) | (
    workflow_table_df['Sup Submitting Job'] > sup_quantile), :]

# confirming no records were lost: the two frames must account for every recorded value
assert len(sup_submit_df) + len(outlier_sup_submit_df) == num_sup_submits, (
    "cleaned + outlier records do not add up to the records with a Sup Submit value")
print(f"Workflow Records with Sup Submit: {num_sup_submits}")
print(f"Quantile: {sup_quantile}")
print(f"Cleaned Records: {len(sup_submit_df)}")
print(f"Outlier Records: {len(outlier_sup_submit_df)}")

Workflow Records with Sup Submit: 1099
Quantile: 22.0
Cleaned Records: 995
Outlier Records: 104


In [145]:
# how often each 'sup submitted days' value occurs among the outlier records
outlier_sup_submit_df.loc[:, 'Sup Submitting Job'].value_counts()

 23.0    12
 27.0    10
 33.0     7
 24.0     6
 34.0     5
 38.0     4
 29.0     4
 26.0     4
 28.0     3
 41.0     3
 40.0     3
 35.0     3
 25.0     3
 37.0     3
 43.0     3
 36.0     2
 51.0     2
 42.0     2
 56.0     2
 45.0     2
 46.0     2
 31.0     2
 39.0     2
 30.0     2
-4.0      2
-5.0      1
 54.0     1
 44.0     1
 50.0     1
 66.0     1
 49.0     1
 62.0     1
 48.0     1
 60.0     1
 32.0     1
 61.0     1
Name: Sup Submitting Job, dtype: int64

## BC Approval Data

In [146]:
# created a variable to be able to allow a 10% error in 'bc approved days'
# (the 90th percentile of the recorded values; missing values are ignored)
# NOTE: 'bc_quantile' is also assigned by the other BC cells, so after this cell runs
# it holds the 'BC Approving Job' cutoff only
bc_quantile = workflow_table_df['BC Approving Job'].quantile(.90)
num_bc_approvals = workflow_table_df['BC Approving Job'].count()

# keep only the non-negative day counts that sit at or below the 90th percentile
bc_approval_df = workflow_table_df.loc[(workflow_table_df['BC Approving Job'] >= 0) & (
    workflow_table_df['BC Approving Job'] <= bc_quantile), :]

# creating a bc approved outlier df: negative day counts or anything above the 90th percentile
outlier_bc_approval_df = workflow_table_df.loc[(workflow_table_df['BC Approving Job'] < 0) | (
    workflow_table_df['BC Approving Job'] > bc_quantile), :]

# confirming no records were lost: the two frames must account for every recorded value
assert len(bc_approval_df) + len(outlier_bc_approval_df) == num_bc_approvals, (
    "cleaned + outlier records do not add up to the records with a BC Approval value")
print(f"Workflow Records with BC Approval: {num_bc_approvals}")
print(f"Quantile: {bc_quantile}")
print(f"Cleaned Records: {len(bc_approval_df)}")
print(f"Outlier Records: {len(outlier_bc_approval_df)}")

Workflow Records with BC Approval: 1015
Quantile: 8.600000000000023
Cleaned Records: 910
Outlier Records: 105


In [147]:
# how often each 'bc approved days' value occurs among the outlier records
outlier_bc_approval_df.loc[:, 'BC Approving Job'].value_counts()

 13.0    13
 9.0     12
 12.0    11
 11.0    10
 20.0     8
 19.0     7
 14.0     5
 26.0     5
 10.0     4
 21.0     4
 17.0     4
 16.0     3
 25.0     3
 18.0     3
 15.0     2
 23.0     2
-13.0     1
 32.0     1
-4.0      1
 39.0     1
 27.0     1
-11.0     1
 41.0     1
 35.0     1
 46.0     1
Name: BC Approving Job, dtype: int64

## OB Create Order Data

In [148]:
# created a variable to be able to allow a 10% error in 'ob created days'
# (the 90th percentile of the recorded values; missing values are ignored)
# NOTE: 'ob_quantile' is also assigned by the OB scope cell, so after this cell runs
# it holds the 'OB Building Order' cutoff only
ob_quantile = workflow_table_df['OB Building Order'].quantile(.90)
num_ob_orders = workflow_table_df['OB Building Order'].count()

# keep only the non-negative day counts that sit at or below the 90th percentile
ob_order_df = workflow_table_df.loc[(workflow_table_df['OB Building Order'] >= 0) & (
    workflow_table_df['OB Building Order'] <= ob_quantile), :]

# creating a ob created outlier df: negative day counts or anything above the 90th percentile
outlier_ob_order_df = workflow_table_df.loc[(workflow_table_df['OB Building Order'] < 0) | (
    workflow_table_df['OB Building Order'] > ob_quantile), :]

# confirming no records were lost: the two frames must account for every recorded value
assert len(ob_order_df) + len(outlier_ob_order_df) == num_ob_orders, (
    "cleaned + outlier records do not add up to the records with an OB Order value")
print(f"Workflow Records with OB Order: {num_ob_orders}")
print(f"Quantile: {ob_quantile}")
print(f"Cleaned Records: {len(ob_order_df)}")
print(f"Outlier Records: {len(outlier_ob_order_df)}")

Workflow Records with OB Order: 956
Quantile: 4.0
Cleaned Records: 865
Outlier Records: 91


In [149]:
# how often each 'ob created days' value occurs among the outlier records
outlier_ob_order_df.loc[:, 'OB Building Order'].value_counts()

 5.0     30
 6.0     12
 10.0     9
 11.0     9
 7.0      7
 12.0     7
 8.0      4
 9.0      3
 13.0     2
 20.0     2
 40.0     1
 53.0     1
 24.0     1
 28.0     1
 27.0     1
-9.0      1
Name: OB Building Order, dtype: int64

## GM Process Order Data

In [150]:
# created a variable to be able to allow a 10% error in 'gm processed days'
# (the 90th percentile of the recorded values; missing values are ignored)
gm_quantile = workflow_table_df['GM Processing Order'].quantile(.90)
num_gm_orders = workflow_table_df['GM Processing Order'].count()

# keep only the non-negative day counts that sit at or below the 90th percentile
gm_order_df = workflow_table_df.loc[(workflow_table_df['GM Processing Order'] >= 0) & (
    workflow_table_df['GM Processing Order'] <= gm_quantile), :]

# creating a gm processed outlier df: negative day counts or anything above the 90th percentile
outlier_gm_order_df = workflow_table_df.loc[(workflow_table_df['GM Processing Order'] < 0) | (
    workflow_table_df['GM Processing Order'] > gm_quantile), :]

# confirming no records were lost: the two frames must account for every recorded value
assert len(gm_order_df) + len(outlier_gm_order_df) == num_gm_orders, (
    "cleaned + outlier records do not add up to the records with a GM Processed Order value")
print(f"Workflow Records with GM Processed Order: {num_gm_orders}")
print(f"Quantile: {gm_quantile}")
print(f"Cleaned Records: {len(gm_order_df)}")
print(f"Outlier Records: {len(outlier_gm_order_df)}")

Workflow Records with GM Processed Order: 793
Quantile: 20.0
Cleaned Records: 634
Outlier Records: 159


In [151]:
# how often each 'gm processed days' value occurs among the outlier records
outlier_gm_order_df.loc[:, 'GM Processing Order'].value_counts()

-1.0     85
 21.0     9
 27.0     8
 26.0     7
 28.0     7
 23.0     7
 29.0     6
 22.0     4
 37.0     3
 33.0     3
 35.0     3
 24.0     2
 41.0     2
 31.0     1
 46.0     1
 59.0     1
 25.0     1
 39.0     1
 34.0     1
 40.0     1
 54.0     1
 45.0     1
 62.0     1
 53.0     1
 42.0     1
-8.0      1
Name: GM Processing Order, dtype: int64

## PA Process OA Data

In [152]:
# created a variable to be able to allow a 10% error in 'pa oa processed days'
# (the 90th percentile of the recorded values; missing values are ignored)
pa_quantile = workflow_table_df['PA Processing OA'].quantile(.90)
num_pa_oa_processed = workflow_table_df['PA Processing OA'].count()

# keep only the non-negative day counts that sit at or below the 90th percentile
pa_processed_oa_df = workflow_table_df.loc[(workflow_table_df['PA Processing OA'] >= 0) & (
    workflow_table_df['PA Processing OA'] <= pa_quantile), :]

# creating a pa oa processed outlier df: negative day counts or anything above the 90th percentile
outlier_pa_processed_oa_df = workflow_table_df.loc[(workflow_table_df['PA Processing OA'] < 0) | (
    workflow_table_df['PA Processing OA'] > pa_quantile), :]

# confirming no records were lost: the two frames must account for every recorded value
assert len(pa_processed_oa_df) + len(outlier_pa_processed_oa_df) == num_pa_oa_processed, (
    "cleaned + outlier records do not add up to the records with a PA OA Processed value")
print(f"Workflow Records with PA OA Processed: {num_pa_oa_processed}")
print(f"Quantile: {pa_quantile}")
print(f"Cleaned Records: {len(pa_processed_oa_df)}")
print(f"Outlier Records: {len(outlier_pa_processed_oa_df)}")

Workflow Records with PA OA Processed: 751
Quantile: 3.0
Cleaned Records: 684
Outlier Records: 67


In [153]:
# how often each 'pa oa processed days' value occurs among the outlier records
outlier_pa_processed_oa_df.loc[:, 'PA Processing OA'].value_counts()

 4.0     24
 5.0     18
 6.0      6
 10.0     4
 7.0      2
 11.0     2
 8.0      2
-10.0     1
 20.0     1
 42.0     1
-1.0      1
 15.0     1
 14.0     1
 18.0     1
 12.0     1
 30.0     1
Name: PA Processing OA, dtype: int64

## PA Invoicing OA Data

In [154]:
# created a variable to be able to allow a 10% error in 'pa oa invoiced days'
# (the 90th percentile of the recorded values; missing values are ignored)
# NOTE: 'pa_quantile' is also assigned by the PA processing cell, so after this cell runs
# it holds the 'PA Invoicing OA' cutoff only
pa_quantile = workflow_table_df['PA Invoicing OA'].quantile(.90)
num_pa_oa_invoiced = workflow_table_df['PA Invoicing OA'].count()

# keep only the non-negative day counts that sit at or below the 90th percentile
pa_invoiced_oa_df = workflow_table_df.loc[(workflow_table_df['PA Invoicing OA'] >= 0) & (
    workflow_table_df['PA Invoicing OA'] <= pa_quantile), :]

# creating a pa oa invoiced outlier df: negative day counts or anything above the 90th percentile
outlier_pa_invoiced_oa_df = workflow_table_df.loc[(workflow_table_df['PA Invoicing OA'] < 0) | (
    workflow_table_df['PA Invoicing OA'] > pa_quantile), :]

# confirming no records were lost: the two frames must account for every recorded value
assert len(pa_invoiced_oa_df) + len(outlier_pa_invoiced_oa_df) == num_pa_oa_invoiced, (
    "cleaned + outlier records do not add up to the records with a PA OA Invoiced value")
print(f"Workflow Records with PA OA Invoiced: {num_pa_oa_invoiced}")
print(f"Quantile: {pa_quantile}")
print(f"Cleaned Records: {len(pa_invoiced_oa_df)}")
print(f"Outlier Records: {len(outlier_pa_invoiced_oa_df)}")

Workflow Records with PA OA Invoiced: 683
Quantile: 7.0
Cleaned Records: 592
Outlier Records: 91


In [155]:
# how often each 'pa oa invoiced days' value occurs among the outlier records
outlier_pa_invoiced_oa_df.loc[:, 'PA Invoicing OA'].value_counts()

 10.0    15
-1.0     13
-5.0     13
 8.0     10
-6.0      5
-8.0      5
 11.0     4
-7.0      4
 18.0     3
 9.0      3
 14.0     3
-3.0      2
 12.0     2
 27.0     1
 20.0     1
 15.0     1
-20.0     1
 13.0     1
-2.0      1
 17.0     1
 19.0     1
 32.0     1
Name: PA Invoicing OA, dtype: int64

## GM Approving for Inspection Data

In [156]:
# created a variable to be able to allow a 10% error in 'gm approved for inspection days'
# (the 90th percentile of the recorded values; missing values are ignored)
# NOTE: 'gm_quantile' is also assigned by the GM order cell, so after this cell runs
# it holds the 'GM Approving for Inspection' cutoff only
gm_quantile = workflow_table_df['GM Approving for Inspection'].quantile(.90)
num_gm_approved_inspection = workflow_table_df['GM Approving for Inspection'].count()

# keep only the non-negative day counts that sit at or below the 90th percentile
gm_approved_inspection_df = workflow_table_df.loc[(workflow_table_df['GM Approving for Inspection'] >= 0) & (
    workflow_table_df['GM Approving for Inspection'] <= gm_quantile), :]

# creating a gm approved for inspection outlier df: negative day counts or anything above the 90th percentile
outlier_gm_approved_inspection_df = workflow_table_df.loc[(workflow_table_df['GM Approving for Inspection'] < 0) | (
    workflow_table_df['GM Approving for Inspection'] > gm_quantile), :]

# confirming no records were lost: the two frames must account for every recorded value
assert len(gm_approved_inspection_df) + len(outlier_gm_approved_inspection_df) == num_gm_approved_inspection, (
    "cleaned + outlier records do not add up to the records with a GM Approved for Inspection value")
print(f"Workflow Records with GM Approved for Inspection: {num_gm_approved_inspection}")
print(f"Quantile: {gm_quantile}")
print(f"Cleaned Records: {len(gm_approved_inspection_df)}")
print(f"Outlier Records: {len(outlier_gm_approved_inspection_df)}")

Workflow Records with GM Approved for Inspection: 627
Quantile: 3.0
Cleaned Records: 492
Outlier Records: 135


In [157]:
# how often each 'gm approved for inspection days' value occurs among the outlier records
outlier_gm_approved_inspection_df.loc[:, 'GM Approving for Inspection'].value_counts()

-1.0     78
 4.0     27
 5.0     17
 6.0     11
 30.0     1
 8.0      1
Name: GM Approving for Inspection, dtype: int64

## RA Requesting Inspection Data


In [158]:
# created a variable to be able to allow a 10% error in 'ra requesting inspection days'
# (the 90th percentile of the recorded values; missing values are ignored)
ra_quantile = workflow_table_df['RA Requesting Inspection'].quantile(.90)
num_ra_requested = workflow_table_df['RA Requesting Inspection'].count()

# keep only the non-negative day counts that sit at or below the 90th percentile
ra_requested_inspection_df = workflow_table_df.loc[(workflow_table_df['RA Requesting Inspection'] >= 0) & (
    workflow_table_df['RA Requesting Inspection'] <= ra_quantile), :]

# creating a ra requesting inspection outlier df: negative day counts or anything above the 90th percentile
outlier_ra_requested_inspection_df = workflow_table_df.loc[(workflow_table_df['RA Requesting Inspection'] < 0) | (
    workflow_table_df['RA Requesting Inspection'] > ra_quantile), :]

# confirming no records were lost: the two frames must account for every recorded value
assert len(ra_requested_inspection_df) + len(outlier_ra_requested_inspection_df) == num_ra_requested, (
    "cleaned + outlier records do not add up to the records with an RA Requested Inspection value")
print(f"Workflow Records with RA Requested Inspection: {num_ra_requested}")
print(f"Quantile: {ra_quantile}")
print(f"Cleaned Records: {len(ra_requested_inspection_df)}")
print(f"Outlier Records: {len(outlier_ra_requested_inspection_df)}")

Workflow Records with RA Requested Inspection: 222
Quantile: 19.0
Cleaned Records: 202
Outlier Records: 20


In [159]:
# how often each 'ra requesting inspection days' value occurs among the outlier records
outlier_ra_requested_inspection_df.loc[:, 'RA Requesting Inspection'].value_counts()

25.0    3
20.0    3
22.0    2
24.0    2
67.0    1
31.0    1
47.0    1
27.0    1
28.0    1
41.0    1
36.0    1
42.0    1
21.0    1
26.0    1
Name: RA Requesting Inspection, dtype: int64

## Rep Collecting COC Data

In [160]:
# created a variable to be able to allow a 10% error in 'rep collect coc days'
# (the 90th percentile of the recorded values; missing values are ignored)
# NOTE: 'rep_quantile' is also assigned by the rep claim cell, so after this cell runs
# it holds the 'Rep Collecting COC' cutoff only
rep_quantile = workflow_table_df['Rep Collecting COC'].quantile(.90)
num_rep_collected = workflow_table_df['Rep Collecting COC'].count()

# keep only the non-negative day counts that sit at or below the 90th percentile
rep_collected_df = workflow_table_df.loc[(workflow_table_df['Rep Collecting COC'] >= 0) & (
    workflow_table_df['Rep Collecting COC'] <= rep_quantile), :]

# creating a rep collect coc outlier df: negative day counts or anything above the 90th percentile
outlier_rep_collected_df = workflow_table_df.loc[(workflow_table_df['Rep Collecting COC'] < 0) | (
    workflow_table_df['Rep Collecting COC'] > rep_quantile), :]

# confirming no records were lost: the two frames must account for every recorded value
assert len(rep_collected_df) + len(outlier_rep_collected_df) == num_rep_collected, (
    "cleaned + outlier records do not add up to the records with a Rep COC Collected value")
print(f"Workflow Records with Rep COC Collected: {num_rep_collected}")
print(f"Quantile: {rep_quantile}")
print(f"Cleaned Records: {len(rep_collected_df)}")
print(f"Outlier Records: {len(outlier_rep_collected_df)}")

Workflow Records with Rep COC Collected: 545
Quantile: 15.0
Cleaned Records: 440
Outlier Records: 105


In [161]:
# how often each 'rep collect coc days' value occurs among the outlier records
outlier_rep_collected_df.loc[:, 'Rep Collecting COC'].value_counts()

-1.0     52
 16.0    10
 20.0     6
 17.0     4
 24.0     3
 19.0     3
 22.0     3
 21.0     2
 23.0     2
 26.0     2
 37.0     2
 33.0     2
 18.0     2
 28.0     2
 29.0     1
 25.0     1
-4.0      1
 62.0     1
 46.0     1
-3.0      1
 35.0     1
 50.0     1
 32.0     1
 30.0     1
Name: Rep Collecting COC, dtype: int64

## SA Uploading Docs Data

In [162]:
# created a variable to be able to allow a 10% error in 'sa uploaded days'
# (the 90th percentile of the recorded values; missing values are ignored)
sa_quantile = workflow_table_df['SA Uploading Docs'].quantile(.90)
num_sa_uploaded = workflow_table_df['SA Uploading Docs'].count()

# keep only the non-negative day counts that sit at or below the 90th percentile
sa_uploaded_df = workflow_table_df.loc[(workflow_table_df['SA Uploading Docs'] >= 0) & (
    workflow_table_df['SA Uploading Docs'] <= sa_quantile), :]

# creating a sa uploaded outlier df: negative day counts or anything above the 90th percentile
outlier_sa_uploaded_df = workflow_table_df.loc[(workflow_table_df['SA Uploading Docs'] < 0) | (
    workflow_table_df['SA Uploading Docs'] > sa_quantile), :]

# confirming no records were lost: the two frames must account for every recorded value
assert len(sa_uploaded_df) + len(outlier_sa_uploaded_df) == num_sa_uploaded, (
    "cleaned + outlier records do not add up to the records with an SA Docs Uploaded value")
print(f"Workflow Records with SA Docs Uploaded: {num_sa_uploaded}")
print(f"Quantile: {sa_quantile}")
print(f"Cleaned Records: {len(sa_uploaded_df)}")
print(f"Outlier Records: {len(outlier_sa_uploaded_df)}")

Workflow Records with SA Docs Uploaded: 445
Quantile: 14.600000000000023
Cleaned Records: 400
Outlier Records: 45


In [163]:
# how often each 'sa uploaded days' value occurs among the outlier records
outlier_sa_uploaded_df.loc[:, 'SA Uploading Docs'].value_counts()

15.0    7
21.0    6
18.0    5
25.0    4
16.0    4
19.0    4
24.0    3
17.0    3
27.0    2
26.0    2
20.0    2
32.0    1
28.0    1
23.0    1
Name: SA Uploading Docs, dtype: int64

## BC Invoicing Project Data

In [164]:
# created a variable to be able to allow a 10% error in 'bc invoiced days'
# (the 90th percentile of the recorded values; missing values are ignored)
# NOTE: 'bc_quantile' is also assigned by the other BC cells, so after this cell runs
# it holds the 'BC Invoicing Project' cutoff only
bc_quantile = workflow_table_df['BC Invoicing Project'].quantile(.90)
num_bc_invoiced = workflow_table_df['BC Invoicing Project'].count()

# keep only the non-negative day counts that sit at or below the 90th percentile
bc_invoiced_df = workflow_table_df.loc[(workflow_table_df['BC Invoicing Project'] >= 0) & (
    workflow_table_df['BC Invoicing Project'] <= bc_quantile), :]

# creating a bc invoiced outlier df: negative day counts or anything above the 90th percentile
outlier_bc_invoiced_df = workflow_table_df.loc[(workflow_table_df['BC Invoicing Project'] < 0) | (
    workflow_table_df['BC Invoicing Project'] > bc_quantile), :]

# confirming no records were lost: the two frames must account for every recorded value
assert len(bc_invoiced_df) + len(outlier_bc_invoiced_df) == num_bc_invoiced, (
    "cleaned + outlier records do not add up to the records with a BC Project Invoiced value")
print(f"Workflow Records with BC Project Invoiced: {num_bc_invoiced}")
print(f"Quantile: {bc_quantile}")
print(f"Cleaned Records: {len(bc_invoiced_df)}")
print(f"Outlier Records: {len(outlier_bc_invoiced_df)}")

Workflow Records with BC Project Invoiced: 393
Quantile: 8.0
Cleaned Records: 355
Outlier Records: 38


In [165]:
# how often each 'bc invoiced days' value occurs among the outlier records
outlier_bc_invoiced_df.loc[:, 'BC Invoicing Project'].value_counts()

9.0     8
12.0    5
10.0    5
14.0    4
11.0    4
15.0    3
17.0    3
16.0    2
18.0    1
21.0    1
13.0    1
20.0    1
Name: BC Invoicing Project, dtype: int64

## BC Closed Project Data

In [166]:
# created a variable to be able to allow a 10% error in 'bc closed days'
# (the 90th percentile of the recorded values; missing values are ignored)
# NOTE: 'bc_quantile' is also assigned by the other BC cells, so after this cell runs
# it holds the 'BC Closed Project' cutoff only
bc_quantile = workflow_table_df['BC Closed Project'].quantile(.90)
num_bc_closed = workflow_table_df['BC Closed Project'].count()

# keep only the non-negative day counts that sit at or below the 90th percentile
bc_closed_df = workflow_table_df.loc[(workflow_table_df['BC Closed Project'] >= 0) & (
    workflow_table_df['BC Closed Project'] <= bc_quantile), :]

# creating a bc closed outlier df: negative day counts or anything above the 90th percentile
outlier_bc_closed_df = workflow_table_df.loc[(workflow_table_df['BC Closed Project'] < 0) | (
    workflow_table_df['BC Closed Project'] > bc_quantile), :]

# confirming no records were lost: the two frames must account for every recorded value
assert len(bc_closed_df) + len(outlier_bc_closed_df) == num_bc_closed, (
    "cleaned + outlier records do not add up to the records with a BC Project Closed value")
print(f"Workflow Records with BC Project Closed: {num_bc_closed}")
print(f"Quantile: {bc_quantile}")
print(f"Cleaned Records: {len(bc_closed_df)}")
print(f"Outlier Records: {len(outlier_bc_closed_df)}")

Workflow Records with BC Project Closed: 202
Quantile: 39.900000000000006
Cleaned Records: 181
Outlier Records: 21


In [167]:
# how often each 'bc closed days' value occurs among the outlier records
outlier_bc_closed_df.loc[:, 'BC Closed Project'].value_counts()

41.0    5
81.0    2
51.0    2
43.0    2
49.0    1
40.0    1
48.0    1
69.0    1
78.0    1
47.0    1
67.0    1
63.0    1
68.0    1
42.0    1
Name: BC Closed Project, dtype: int64