# Setup

## Dependencies

In [1]:
import os
import pandas as pd
from datetime import datetime
from datetime import timedelta
from datetime import date

## Importing Data

In [2]:
# Run the data-prep script first so the cleaned CSVs read below exist and are current.
# os.system returns the command's exit status; non-zero means the script failed,
# and the CSVs on disk could then be missing or left over from an earlier run.
data_prep_status = os.system("python data_prep.py")
if data_prep_status != 0:
    raise RuntimeError(
        f"data_prep.py exited with status {data_prep_status}; "
        "not loading cleaned data that may be missing or stale")

# this is from the datasets with non-corrected 'gm r4f' dates
# os.system("python data_prep_without_corrections.py")

# paths of the cleaned datasets
project_table_data = "./data/cleaned_data/project_table.csv"
project_info_data = "./data/cleaned_data/project_info_table.csv"
workflow_table_data = "./data/cleaned_data/workflow_table.csv"

# imports the 'project table' data: the identifier/status columns are read as
# strings and every milestone column is parsed as a date
project_table_df = pd.read_csv(
    project_table_data,
    dtype={
        'Claim #': str,
        'Job #': str,
        'Branch': str,
        'Claim Status': str},
    parse_dates=[
        'Rep Agreement Signed', 'Rep Claim Collected', 'FTA Scope Completed',
        'FTA Scope Rejected', 'BC Estimate Completed', 'OB Scope Completed',
        'Sup Job Submitted', 'BC Approved for Production', 'OB Order Built',
        'GM Order Processed', 'PA Permit Applied', 'PA Permit Processed',
        'PA OA Processed', 'PA OA Invoiced', 'PA Notify of Delivery',
        'PA Notify of Start', 'Delivery Date', 'Roof Start',
        'Roof End', 'GM Approved for Inspection', 'GM Change Order Date',
        'GM Labor Adjustment Date', 'RA Inspection Requested', 'RA Inspection Processed',
        'Rep COC Collected', 'SA Job Docs Uploaded', 'BC Project Invoiced', 'BC Project Closed'])

# imports the 'project information' data with the listed columns read as strings
project_info_df = pd.read_csv(project_info_data, dtype={
    'Claim #': str, 'Job #': str, 'Branch': str,
    'City': str, 'Building Department': str, 'Permit Req?': str,
    'Supplier Name': str, 'Crew': str, 'Insurance Company': str,
    'Multi-rejected': str, 'Sup': str, 'Rep': str,
    'FTA': str, 'BC': str, 'OB': str, 'GM': str})

# imports 'workflow table' data (column types are left to pandas' inference)
workflow_table_df = pd.read_csv(workflow_table_data)


In [3]:
# show the column labels of the loaded project table
project_table_df.columns

Index(['Claim #', 'Job #', 'Branch', 'Claim Status', 'Rep Agreement Signed',
       'Rep Claim Collected', 'FTA Scope Completed', 'FTA Scope Rejected',
       'BC Estimate Completed', 'OB Scope Completed', 'Sup Job Submitted',
       'BC Approved for Production', 'OB Order Built', 'GM Order Processed',
       'PA Permit Applied', 'PA Permit Processed', 'PA OA Processed',
       'PA OA Invoiced', 'PA Notify of Delivery', 'PA Notify of Start',
       'Delivery Date', 'Roof Start', 'Roof End', 'GM Approved for Inspection',
       'GM Change Order Date', 'GM Labor Adjustment Date',
       'RA Inspection Requested', 'RA Inspection Processed',
       'Rep COC Collected', 'SA Job Docs Uploaded', 'BC Project Invoiced',
       'BC Project Closed'],
      dtype='object')

In [4]:
# show the column labels of the loaded project information table
project_info_df.columns

Index(['Claim #', 'Job #', 'Branch', 'City', 'Building Department',
       'Permit Req?', 'Supplier Name', 'Crew', 'Insurance Company',
       'Multi-rejected', 'Scope Rejections', 'Change Orders',
       'Labor Adjustments', 'Sup', 'Rep', 'FTA', 'BC', 'OB', 'GM'],
      dtype='object')

In [5]:
# show the column labels of the loaded workflow table
workflow_table_df.columns

Index(['Claim #', 'Rep Collecting Claim', 'FTA Completing Scope',
       'BC Completing Estimate', 'OB Completing Scope', 'Sup Submitting Job',
       'BC Approving Job', 'OB Building Order', 'GM Processing Order',
       'PA Processing OA', 'PA Invoicing OA', 'GM Approving for Inspection',
       'RA Requesting Inspection', 'Rep Collecting COC', 'SA Uploading Docs',
       'BC Invoicing Project', 'BC Closed Project', 'Days in Pipeline'],
      dtype='object')

# Workflow Data

In [24]:
# summary statistics for the project table columns
project_table_df.describe()

Unnamed: 0,Claim #,Job #,Branch,Claim Status,Rep Agreement Signed,Rep Claim Collected,FTA Scope Completed,FTA Scope Rejected,BC Estimate Completed,OB Scope Completed,...,Roof End,GM Approved for Inspection,GM Change Order Date,GM Labor Adjustment Date,RA Inspection Requested,RA Inspection Processed,Rep COC Collected,SA Job Docs Uploaded,BC Project Invoiced,BC Project Closed
count,2476,1120.0,2478,129,2479,2479,2015,867,1969,1866,...,863,645,100,319,280,181,615,523,464,241
unique,2473,1120.0,5,1,157,145,137,96,108,106,...,102,92,19,1,47,40,88,67,71,60
top,H02643410,1936120.0,FCO,Dead,2019-07-11 00:00:00,2019-07-16 00:00:00,2019-07-23 00:00:00,2019-07-24 00:00:00,2019-07-16 00:00:00,2019-07-25 00:00:00,...,2019-08-01 00:00:00,2019-07-22 00:00:00,2019-06-11 00:00:00,2019-04-23 00:00:00,2019-07-23 00:00:00,2019-07-25 00:00:00,2019-07-24 00:00:00,2019-07-02 00:00:00,2019-07-25 00:00:00,2019-07-24 00:00:00
freq,2,1.0,951,129,142,92,51,30,147,52,...,29,21,19,319,42,22,24,35,30,13
first,,,,,2018-08-19 00:00:00,2019-02-04 00:00:00,2019-02-04 00:00:00,2019-02-08 00:00:00,2019-02-14 00:00:00,2019-02-14 00:00:00,...,2019-02-26 00:00:00,2019-02-26 00:00:00,2019-04-23 00:00:00,2019-04-23 00:00:00,2019-04-17 00:00:00,2019-04-19 00:00:00,2019-03-01 00:00:00,2019-03-11 00:00:00,2019-03-11 00:00:00,2019-03-14 00:00:00
last,,,,,2019-07-27 00:00:00,2019-07-27 00:00:00,2019-07-27 00:00:00,2019-07-26 00:00:00,2019-07-26 00:00:00,2019-07-26 00:00:00,...,2019-10-16 00:00:00,2019-07-27 00:00:00,2019-07-26 00:00:00,2019-04-23 00:00:00,2019-07-26 00:00:00,2019-07-29 00:00:00,2019-07-26 00:00:00,2019-07-26 00:00:00,2019-07-26 00:00:00,2019-07-26 00:00:00


In [6]:
# summary statistics for the workflow table columns
workflow_table_df.describe()

Unnamed: 0,Rep Collecting Claim,FTA Completing Scope,BC Completing Estimate,OB Completing Scope,Sup Submitting Job,BC Approving Job,OB Building Order,GM Processing Order,PA Processing OA,PA Invoicing OA,GM Approving for Inspection,RA Requesting Inspection,Rep Collecting COC,SA Uploading Docs,BC Invoicing Project,BC Closed Project,Days in Pipeline
count,2479.0,2015.0,1969.0,1865.0,1191.0,1118.0,1085.0,884.0,827.0,714.0,645.0,280.0,615.0,522.0,464.0,237.0,2479.0
mean,6.406616,5.577171,0.735399,2.225737,10.361041,3.301431,1.654378,8.642534,1.280532,4.317927,0.942636,7.296429,5.960976,8.773946,3.519397,17.827004,31.900766
std,12.605553,5.405767,1.405409,3.521391,9.817098,5.244208,3.777408,9.186198,2.698963,3.352341,1.881125,9.281422,8.000312,7.76667,4.203214,17.59236,31.556575
min,0.0,-83.0,-7.0,-34.0,-10.0,-13.0,-9.0,-7.0,-10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,1.0,4.0,1.0,0.0,2.0,0.0,2.0,0.0,1.0,1.0,3.0,0.0,6.0,9.0
50%,4.0,4.0,0.0,1.0,7.0,1.0,0.0,6.0,1.0,4.0,0.0,4.0,3.0,8.0,2.0,14.0,22.0
75%,8.0,8.0,1.0,3.0,14.0,4.0,2.0,13.0,1.0,6.0,1.0,9.0,7.0,12.0,6.0,25.0,49.0
max,269.0,43.0,23.0,87.0,66.0,46.0,53.0,63.0,42.0,33.0,30.0,67.0,62.0,71.0,21.0,81.0,337.0


## Rep Claim Data

In [7]:
# 90th-percentile cut-off for 'Rep Collecting Claim' (the top 10% is treated as error)
# NOTE: rep_quantile is reassigned later in the 'Rep Collecting COC' section
rep_claim_days = workflow_table_df['Rep Collecting Claim']
rep_quantile = rep_claim_days.quantile(.90)

# keep records whose day count is non-negative and no greater than the cut-off
rep_claim_df = workflow_table_df.loc[rep_claim_days.between(0, rep_quantile), :]

# records with a negative day count or one above the cut-off are the outliers
outlier_rep_claim_df = workflow_table_df.loc[
    rep_claim_days.lt(0) | rep_claim_days.gt(rep_quantile), :]

# print the counts so cleaned + outlier can be compared with the non-null total
print(f"Workflow Records with Rep Claim: {rep_claim_days.count()}")
print(f"Quantile: {rep_quantile}")
print(f"Cleaned Records: {len(rep_claim_df)}")
print(f"Outlier Records: {len(outlier_rep_claim_df)}")

Workflow Records with Rep Claim: 2479
Quantile: 15.0
Cleaned Records: 2235
Outlier Records: 244


### Outlier Data

In [8]:
# outlier_rep_claim_df['Rep Collecting Claim'].value_counts()

### Analysis Data

In [10]:
# columns contributed by 'project_table_df' to the rep claim analysis
rep_claim_project_analysis_df = project_table_df.loc[:, [
    'Claim #', 'Job #', 'Claim Status', 'Rep Agreement Signed', 'Rep Claim Collected']]

# columns contributed by 'project_info_df'
rep_claim_info_analysis_df = project_info_df.loc[:, [
    'Claim #', 'Job #', 'Branch', 'City', 'Building Department']]

# columns contributed by 'workflow_table_df'
rep_claim_workflow_analysis_df = workflow_table_df.loc[:, ['Claim #', 'Rep Collecting Claim']]

# final column order of the analysis table
rep_claim_column_order = [
    'Claim #', 'Job #', 'Branch',
    'Building Department', 'City', 'Rep Agreement Signed',
    'Rep Claim Collected', 'Rep Collecting Claim', 'Claim Status']

# join project and info rows on claim + job, add the workflow days on claim,
# then arrange the columns
rep_claim_analysis_df = (
    pd.merge(rep_claim_project_analysis_df, rep_claim_info_analysis_df,
             on=["Claim #", "Job #"])
    .merge(rep_claim_workflow_analysis_df, on="Claim #")
    .loc[:, rep_claim_column_order]
)

rep_claim_analysis_df.head()

Unnamed: 0,Claim #,Job #,Branch,Building Department,City,Rep Agreement Signed,Rep Claim Collected,Rep Collecting Claim,Claim Status
0,00140067991A028,1938689,DEN,,Denver,2019-07-18,2019-07-18,0.0,
1,040357502,1938476,FCO,,fort collins,2019-07-15,2019-07-15,0.0,
2,01001691605,1938472,DEN,"Westminster, City of (DEN)",Westminster,2019-07-15,2019-07-15,0.0,
3,020487835-7,1938470,DEN,,Westminster,2019-07-15,2019-07-15,0.0,
4,23255128-002,1938447,OMA,,Omaha,2019-07-15,2019-07-16,1.0,


## FTA Scope Data

In [34]:
# 90th-percentile cut-off for 'FTA Completing Scope' (the top 10% is treated as error)
fta_scope_days = workflow_table_df['FTA Completing Scope']
fta_quantile = fta_scope_days.quantile(.90)

# keep records whose day count is non-negative and no greater than the cut-off
fta_scope_df = workflow_table_df.loc[fta_scope_days.between(0, fta_quantile), :]

# records with a negative day count or one above the cut-off are the outliers
outlier_fta_scope_df = workflow_table_df.loc[
    fta_scope_days.lt(0) | fta_scope_days.gt(fta_quantile), :]

# print the counts so cleaned + outlier can be compared with the non-null total
print(f"Workflow Records with FTA Scope: {fta_scope_days.count()}")
print(f"Quantile: {fta_quantile}")
print(f"Cleaned Records: {len(fta_scope_df)}")
print(f"Outlier Records: {len(outlier_fta_scope_df)}")

Workflow Records with FTA Scope: 2015
Quantile: 12.0
Cleaned Records: 1813
Outlier Records: 202


### Outlier Data

In [35]:
# outlier_fta_scope_df['FTA Completing Scope'].value_counts()

### Analysis Data

In [47]:
# columns contributed by 'project_table_df' to the FTA scope analysis
fta_scope_project_analysis_df = project_table_df.loc[:, [
    'Claim #', 'Job #', 'Branch',
    'Claim Status', 'Rep Claim Collected',
    'FTA Scope Completed', 'FTA Scope Rejected']]

# columns contributed by 'project_info_df'
fta_scope_info_analysis_df = project_info_df.loc[:, [
    'Claim #', 'Job #',
    'City', 'Building Department', 'Multi-rejected',
    'Scope Rejections']]

# columns contributed by 'workflow_table_df'
fta_scope_workflow_analysis_df = workflow_table_df.loc[:, ['Claim #', 'FTA Completing Scope']]

# final column order of the analysis table
fta_scope_column_order = [
    'Claim #', 'Job #', 'Branch',
    'Building Department', 'City', 'Rep Claim Collected',
    'FTA Scope Completed', 'FTA Scope Rejected', 'FTA Completing Scope',
    'Scope Rejections', 'Multi-rejected', 'Claim Status']

# join project and info rows on claim + job, add the workflow days on claim,
# arrange the columns, and keep only rows that have an 'FTA Scope Completed' date
fta_scope_analysis_df = (
    pd.merge(fta_scope_project_analysis_df, fta_scope_info_analysis_df,
             on=["Claim #", "Job #"])
    .merge(fta_scope_workflow_analysis_df, on="Claim #")
    .loc[:, fta_scope_column_order]
    .loc[lambda merged: merged['FTA Scope Completed'].notna(), :]
)

# tally the 'Multi-rejected' values among the remaining rows
fta_scope_analysis_df["Multi-rejected"].value_counts()

False    593
True     276
Name: Multi-rejected, dtype: int64

In [28]:
# preview the first rows of the FTA scope analysis table
fta_scope_analysis_df.head()

Unnamed: 0,Claim #,Job #,Branch,Building Department,City,Rep Claim Collected,FTA Scope Completed,FTA Scope Rejected,FTA Completing Scope,Scope Rejections,Multi-rejected,Claim Status
0,00140067991A028,1938689,DEN,,Denver,2019-07-18,2019-07-20,2019-07-23,5.0,1.0,False,
1,040357502,1938476,FCO,,fort collins,2019-07-15,2019-07-22,NaT,7.0,,,
2,01001691605,1938472,DEN,"Westminster, City of (DEN)",Westminster,2019-07-15,2019-07-17,NaT,2.0,,,
3,020487835-7,1938470,DEN,,Westminster,2019-07-15,2019-07-16,NaT,1.0,,,
4,23255128-002,1938447,OMA,,Omaha,2019-07-16,2019-07-17,2019-07-23,7.0,1.0,False,


## BC Estimate Data

In [90]:
# 90th-percentile cut-off for 'BC Completing Estimate' (the top 10% is treated as error)
# NOTE: bc_quantile is reassigned by the later BC approval / invoicing / closed sections
bc_estimate_days = workflow_table_df['BC Completing Estimate']
bc_quantile = bc_estimate_days.quantile(.90)
num_bc_estimates = bc_estimate_days.count()

# keep records whose day count is non-negative and no greater than the cut-off
bc_estimate_df = workflow_table_df.loc[bc_estimate_days.between(0, bc_quantile), :]

# records with a negative day count or one above the cut-off are the outliers
outlier_bc_estimate_df = workflow_table_df.loc[
    bc_estimate_days.lt(0) | bc_estimate_days.gt(bc_quantile), :]

# print the counts so cleaned + outlier can be compared with the non-null total
print(f"Workflow Records with BC Estimate: {num_bc_estimates}")
print(f"Quantile: {bc_quantile}")
print(f"Cleaned Records: {len(bc_estimate_df)}")
print(f"Outlier Records: {len(outlier_bc_estimate_df)}")

Workflow Records with BC Estimate: 1969
Quantile: 2.0
Cleaned Records: 1793
Outlier Records: 176


In [91]:
# count how many outlier records carry each 'BC Completing Estimate' value
outlier_bc_estimate_df['BC Completing Estimate'].value_counts()

 3.0     116
 4.0      21
 5.0      11
 6.0       7
 10.0      3
 7.0       3
 8.0       3
-1.0       2
 12.0      1
-4.0       1
 17.0      1
 14.0      1
 13.0      1
 9.0       1
-7.0       1
 23.0      1
-2.0       1
-3.0       1
Name: BC Completing Estimate, dtype: int64

## OB Scope Data

In [92]:
# 90th-percentile cut-off for 'OB Completing Scope' (the top 10% is treated as error)
# NOTE: ob_quantile is reassigned later in the 'OB Create Order' section
ob_scope_days = workflow_table_df['OB Completing Scope']
ob_quantile = ob_scope_days.quantile(.90)
num_ob_scopes = ob_scope_days.count()

# keep records whose day count is non-negative and no greater than the cut-off
ob_scope_df = workflow_table_df.loc[ob_scope_days.between(0, ob_quantile), :]

# records with a negative day count or one above the cut-off are the outliers
outlier_ob_scope_df = workflow_table_df.loc[
    ob_scope_days.lt(0) | ob_scope_days.gt(ob_quantile), :]

# print the counts so cleaned + outlier can be compared with the non-null total
print(f"Workflow Records with OB Scope: {num_ob_scopes}")
print(f"Quantile: {ob_quantile}")
print(f"Cleaned Records: {len(ob_scope_df)}")
print(f"Outlier Records: {len(outlier_ob_scope_df)}")

Workflow Records with OB Scope: 1865
Quantile: 6.0
Cleaned Records: 1711
Outlier Records: 154


In [93]:
# count how many outlier records carry each 'OB Completing Scope' value
outlier_ob_scope_df['OB Completing Scope'].value_counts()

 7.0     59
 8.0     22
 10.0    13
 9.0     11
-1.0     11
 11.0    11
 12.0     8
 13.0     4
-4.0      3
-2.0      2
 16.0     1
 25.0     1
 17.0     1
 15.0     1
-34.0     1
-7.0      1
 87.0     1
 14.0     1
-5.0      1
 49.0     1
Name: OB Completing Scope, dtype: int64

In [94]:
# TODO: investigate what is causing the OB scope outliers

In [95]:
# export the OB scope outliers for closer inspection; the target folder is
# created first because DataFrame.to_csv does not create missing directories
# NOTE(review): the other outlier exports in this notebook write under
# 'data_copy/outliers' — confirm which folder is intended
ob_scope_outlier_path = "data/outliers/ob_scope_outlier_data.csv"
os.makedirs(os.path.dirname(ob_scope_outlier_path), exist_ok=True)
outlier_ob_scope_df.to_csv(ob_scope_outlier_path, index=False)

## Sup Submittal Data

In [96]:
# 90th-percentile cut-off for 'Sup Submitting Job' (the top 10% is treated as error)
sup_submit_days = workflow_table_df['Sup Submitting Job']
sup_quantile = sup_submit_days.quantile(.90)
num_sup_submits = sup_submit_days.count()

# keep records whose day count is non-negative and no greater than the cut-off
sup_submit_df = workflow_table_df.loc[sup_submit_days.between(0, sup_quantile), :]

# records with a negative day count or one above the cut-off are the outliers
outlier_sup_submit_df = workflow_table_df.loc[
    sup_submit_days.lt(0) | sup_submit_days.gt(sup_quantile), :]

# print the counts so cleaned + outlier can be compared with the non-null total
print(f"Workflow Records with Sup Submit: {num_sup_submits}")
print(f"Quantile: {sup_quantile}")
print(f"Cleaned Records: {len(sup_submit_df)}")
print(f"Outlier Records: {len(outlier_sup_submit_df)}")

Workflow Records with Sup Submit: 1191
Quantile: 22.0
Cleaned Records: 1078
Outlier Records: 113


In [97]:
# count how many outlier records carry each 'Sup Submitting Job' value
outlier_sup_submit_df['Sup Submitting Job'].value_counts()

 23.0    13
 27.0    10
 33.0     8
 34.0     7
 24.0     6
 38.0     5
 37.0     4
 26.0     4
 28.0     3
 29.0     3
 56.0     3
 40.0     3
 39.0     3
 35.0     3
 31.0     3
 43.0     3
 25.0     3
 30.0     2
-2.0      2
 41.0     2
-4.0      2
 42.0     2
 36.0     2
 51.0     2
 45.0     2
 46.0     2
 44.0     1
 32.0     1
 50.0     1
 61.0     1
 49.0     1
 62.0     1
 48.0     1
-5.0      1
 60.0     1
-10.0     1
 66.0     1
Name: Sup Submitting Job, dtype: int64

## BC Approval Data

In [98]:
# 90th-percentile cut-off for 'BC Approving Job' (the top 10% is treated as error)
# NOTE: this overwrites the bc_quantile set in the 'BC Estimate' section
bc_approval_days = workflow_table_df['BC Approving Job']
bc_quantile = bc_approval_days.quantile(.90)
num_bc_approvals = bc_approval_days.count()

# keep records whose day count is non-negative and no greater than the cut-off
bc_approval_df = workflow_table_df.loc[bc_approval_days.between(0, bc_quantile), :]

# records with a negative day count or one above the cut-off are the outliers
outlier_bc_approval_df = workflow_table_df.loc[
    bc_approval_days.lt(0) | bc_approval_days.gt(bc_quantile), :]

# print the counts so cleaned + outlier can be compared with the non-null total
print(f"Workflow Records with BC Approval: {num_bc_approvals}")
print(f"Quantile: {bc_quantile}")
print(f"Cleaned Records: {len(bc_approval_df)}")
print(f"Outlier Records: {len(outlier_bc_approval_df)}")

Workflow Records with BC Approval: 1118
Quantile: 8.300000000000068
Cleaned Records: 1003
Outlier Records: 115


In [99]:
# count how many outlier records carry each 'BC Approving Job' value
outlier_bc_approval_df['BC Approving Job'].value_counts()

 13.0    15
 9.0     14
 12.0    13
 11.0    12
 20.0     8
 19.0     7
 14.0     5
 26.0     5
 10.0     4
 17.0     4
 21.0     4
 25.0     4
 18.0     3
 16.0     3
 23.0     2
 15.0     2
 27.0     1
 41.0     1
 35.0     1
-11.0     1
 39.0     1
 29.0     1
-13.0     1
 32.0     1
-4.0      1
 46.0     1
Name: BC Approving Job, dtype: int64

## OB Create Order Data

In [100]:
# 90th-percentile cut-off for 'OB Building Order' (the top 10% is treated as error)
# NOTE: this overwrites the ob_quantile set in the 'OB Scope' section
ob_order_days = workflow_table_df['OB Building Order']
ob_quantile = ob_order_days.quantile(.90)
num_ob_orders = ob_order_days.count()

# keep records whose day count is non-negative and no greater than the cut-off
ob_order_df = workflow_table_df.loc[ob_order_days.between(0, ob_quantile), :]

# records with a negative day count or one above the cut-off are the outliers
outlier_ob_order_df = workflow_table_df.loc[
    ob_order_days.lt(0) | ob_order_days.gt(ob_quantile), :]

# print the counts so cleaned + outlier can be compared with the non-null total
print(f"Workflow Records with OB Order: {num_ob_orders}")
print(f"Quantile: {ob_quantile}")
print(f"Cleaned Records: {len(ob_order_df)}")
print(f"Outlier Records: {len(outlier_ob_order_df)}")

Workflow Records with OB Order: 1085
Quantile: 5.0
Cleaned Records: 990
Outlier Records: 95


In [101]:
# count how many outlier records carry each 'OB Building Order' value
outlier_ob_order_df['OB Building Order'].value_counts()

 7.0     15
 6.0     14
 11.0    12
 8.0      9
 10.0     9
 9.0      9
 12.0     7
 13.0     4
 15.0     4
 20.0     4
 16.0     2
-9.0      1
 53.0     1
 27.0     1
 28.0     1
 24.0     1
 40.0     1
Name: OB Building Order, dtype: int64

## GM Process Order Data

In [102]:
# 90th-percentile cut-off for 'GM Processing Order' (the top 10% is treated as error)
# NOTE: gm_quantile is reassigned later in the 'GM Approving for Inspection' section
gm_order_days = workflow_table_df['GM Processing Order']
gm_quantile = gm_order_days.quantile(.90)
num_gm_orders = gm_order_days.count()

# keep records whose day count is non-negative and no greater than the cut-off
gm_order_df = workflow_table_df.loc[gm_order_days.between(0, gm_quantile), :]

# records with a negative day count or one above the cut-off are the outliers
outlier_gm_order_df = workflow_table_df.loc[
    gm_order_days.lt(0) | gm_order_days.gt(gm_quantile), :]

# print the counts so cleaned + outlier can be compared with the non-null total
print(f"Workflow Records with GM Processed Order: {num_gm_orders}")
print(f"Quantile: {gm_quantile}")
print(f"Cleaned Records: {len(gm_order_df)}")
print(f"Outlier Records: {len(outlier_gm_order_df)}")

Workflow Records with GM Processed Order: 884
Quantile: 20.0
Cleaned Records: 796
Outlier Records: 88


In [103]:
# count how many outlier records carry each 'GM Processing Order' value
outlier_gm_order_df['GM Processing Order'].value_counts()

 21.0    12
 22.0     9
 29.0     8
 27.0     7
 28.0     7
 24.0     7
 30.0     5
 23.0     4
 38.0     3
 34.0     3
 36.0     3
 25.0     2
 41.0     2
 26.0     2
 42.0     2
 47.0     1
 60.0     1
-7.0      1
 40.0     1
 35.0     1
 55.0     1
 46.0     1
 63.0     1
 54.0     1
 37.0     1
 43.0     1
 32.0     1
Name: GM Processing Order, dtype: int64

In [104]:
# export the GM order outliers to determine what is causing them; the target
# folder is created first because DataFrame.to_csv does not create missing directories
# NOTE(review): the input data is read from './data/...' while this writes under
# 'data_copy/...' — confirm the folder is intended
gm_order_outlier_path = "data_copy/outliers/gm_order_outlier_data.csv"
os.makedirs(os.path.dirname(gm_order_outlier_path), exist_ok=True)
outlier_gm_order_df.to_csv(gm_order_outlier_path, index=False)

## PA Process OA Data

In [105]:
# 90th-percentile cut-off for 'PA Processing OA' (the top 10% is treated as error)
# NOTE: pa_quantile is reassigned later in the 'PA Invoicing OA' section
pa_processed_oa_days = workflow_table_df['PA Processing OA']
pa_quantile = pa_processed_oa_days.quantile(.90)
num_pa_oa_processed = pa_processed_oa_days.count()

# keep records whose day count is non-negative and no greater than the cut-off
pa_processed_oa_df = workflow_table_df.loc[pa_processed_oa_days.between(0, pa_quantile), :]

# records with a negative day count or one above the cut-off are the outliers
outlier_pa_processed_oa_df = workflow_table_df.loc[
    pa_processed_oa_days.lt(0) | pa_processed_oa_days.gt(pa_quantile), :]

# print the counts so cleaned + outlier can be compared with the non-null total
print(f"Workflow Records with PA OA Processed: {num_pa_oa_processed}")
print(f"Quantile: {pa_quantile}")
print(f"Cleaned Records: {len(pa_processed_oa_df)}")
print(f"Outlier Records: {len(outlier_pa_processed_oa_df)}")

Workflow Records with PA OA Processed: 827
Quantile: 3.0
Cleaned Records: 756
Outlier Records: 71


In [106]:
# count how many outlier records carry each 'PA Processing OA' value
outlier_pa_processed_oa_df['PA Processing OA'].value_counts()

 4.0     24
 5.0     20
 6.0      7
 10.0     4
 7.0      2
 11.0     2
 8.0      2
 30.0     1
 17.0     1
 12.0     1
-10.0     1
 20.0     1
 14.0     1
 15.0     1
-1.0      1
 42.0     1
 18.0     1
Name: PA Processing OA, dtype: int64

## PA Invoicing OA Data

In [107]:
# 90th-percentile cut-off for 'PA Invoicing OA' (the top 10% is treated as error)
# NOTE: this overwrites the pa_quantile set in the 'PA Process OA' section
pa_invoiced_oa_days = workflow_table_df['PA Invoicing OA']
pa_quantile = pa_invoiced_oa_days.quantile(.90)
num_pa_oa_invoiced = pa_invoiced_oa_days.count()

# keep records whose day count is non-negative and no greater than the cut-off
pa_invoiced_oa_df = workflow_table_df.loc[pa_invoiced_oa_days.between(0, pa_quantile), :]

# records with a negative day count or one above the cut-off are the outliers
outlier_pa_invoiced_oa_df = workflow_table_df.loc[
    pa_invoiced_oa_days.lt(0) | pa_invoiced_oa_days.gt(pa_quantile), :]

# print the counts so cleaned + outlier can be compared with the non-null total
print(f"Workflow Records with PA OA Invoiced: {num_pa_oa_invoiced}")
print(f"Quantile: {pa_quantile}")
print(f"Cleaned Records: {len(pa_invoiced_oa_df)}")
print(f"Outlier Records: {len(outlier_pa_invoiced_oa_df)}")

Workflow Records with PA OA Invoiced: 714
Quantile: 8.0
Cleaned Records: 668
Outlier Records: 46


In [108]:
# count how many outlier records carry each 'PA Invoicing OA' value
outlier_pa_invoiced_oa_df['PA Invoicing OA'].value_counts()

11.0    14
9.0     10
10.0     4
15.0     3
12.0     3
19.0     3
13.0     2
33.0     1
21.0     1
20.0     1
18.0     1
14.0     1
16.0     1
28.0     1
Name: PA Invoicing OA, dtype: int64

In [109]:
# export the PA invoiced-OA outliers to see what is causing them; the target
# folder is created first because DataFrame.to_csv does not create missing directories
# NOTE(review): the input data is read from './data/...' while this writes under
# 'data_copy/...' — confirm the folder is intended
pa_invoiced_oa_outlier_path = "data_copy/outliers/pa_invoiced_oa_outlier_data.csv"
os.makedirs(os.path.dirname(pa_invoiced_oa_outlier_path), exist_ok=True)
outlier_pa_invoiced_oa_df.to_csv(pa_invoiced_oa_outlier_path, index=False)

## GM Approving for Inspection Data

In [110]:
# 90th-percentile cut-off for 'GM Approving for Inspection' (the top 10% is treated as error)
# NOTE: this overwrites the gm_quantile set in the 'GM Process Order' section
gm_approved_inspection_days = workflow_table_df['GM Approving for Inspection']
gm_quantile = gm_approved_inspection_days.quantile(.90)
num_gm_approved_inspection = gm_approved_inspection_days.count()

# keep records whose day count is non-negative and no greater than the cut-off
gm_approved_inspection_df = workflow_table_df.loc[
    gm_approved_inspection_days.between(0, gm_quantile), :]

# records with a negative day count or one above the cut-off are the outliers
outlier_gm_approved_inspection_df = workflow_table_df.loc[
    gm_approved_inspection_days.lt(0) | gm_approved_inspection_days.gt(gm_quantile), :]

# print the counts so cleaned + outlier can be compared with the non-null total
print(f"Workflow Records with GM Approved for Inspection: {num_gm_approved_inspection}")
print(f"Quantile: {gm_quantile}")
print(f"Cleaned Records: {len(gm_approved_inspection_df)}")
print(f"Outlier Records: {len(outlier_gm_approved_inspection_df)}")

Workflow Records with GM Approved for Inspection: 645
Quantile: 3.0
Cleaned Records: 587
Outlier Records: 58


In [111]:
# count how many outlier records carry each 'GM Approving for Inspection' value
outlier_gm_approved_inspection_df['GM Approving for Inspection'].value_counts()

4.0     26
5.0     17
6.0     13
30.0     1
8.0      1
Name: GM Approving for Inspection, dtype: int64

In [112]:
# export the GM approved-for-inspection outliers to see what is causing them; the
# target folder is created first because DataFrame.to_csv does not create missing directories
# NOTE(review): the input data is read from './data/...' while this writes under
# 'data_copy/...' — confirm the folder is intended
gm_approved_inspection_outlier_path = "data_copy/outliers/gm_approved_inspection_outlier_data.csv"
os.makedirs(os.path.dirname(gm_approved_inspection_outlier_path), exist_ok=True)
outlier_gm_approved_inspection_df.to_csv(gm_approved_inspection_outlier_path, index=False)

## RA Requesting Inspection Data


In [113]:
# 90th-percentile cut-off for 'RA Requesting Inspection' (the top 10% is treated as error)
ra_requested_inspection_days = workflow_table_df['RA Requesting Inspection']
ra_quantile = ra_requested_inspection_days.quantile(.90)
num_ra_requested = ra_requested_inspection_days.count()

# keep records whose day count is non-negative and no greater than the cut-off
ra_requested_inspection_df = workflow_table_df.loc[
    ra_requested_inspection_days.between(0, ra_quantile), :]

# records with a negative day count or one above the cut-off are the outliers
outlier_ra_requested_inspection_df = workflow_table_df.loc[
    ra_requested_inspection_days.lt(0) | ra_requested_inspection_days.gt(ra_quantile), :]

# print the counts so cleaned + outlier can be compared with the non-null total
print(f"Workflow Records with RA Requested Inspection: {num_ra_requested}")
print(f"Quantile: {ra_quantile}")
print(f"Cleaned Records: {len(ra_requested_inspection_df)}")
print(f"Outlier Records: {len(outlier_ra_requested_inspection_df)}")

Workflow Records with RA Requested Inspection: 280
Quantile: 18.0
Cleaned Records: 254
Outlier Records: 26


In [114]:
# count how many outlier records carry each 'RA Requesting Inspection' value
outlier_ra_requested_inspection_df['RA Requesting Inspection'].value_counts()

19.0    4
20.0    3
25.0    3
27.0    2
22.0    2
26.0    1
21.0    1
42.0    1
36.0    1
45.0    1
67.0    1
24.0    1
28.0    1
61.0    1
47.0    1
31.0    1
41.0    1
Name: RA Requesting Inspection, dtype: int64

## Rep Collecting COC Data

In [115]:
# 90th-percentile cut-off for 'Rep Collecting COC' (the top 10% is treated as error)
# NOTE: this overwrites the rep_quantile set in the 'Rep Claim' section
rep_collected_days = workflow_table_df['Rep Collecting COC']
rep_quantile = rep_collected_days.quantile(.90)
num_rep_collected = rep_collected_days.count()

# keep records whose day count is non-negative and no greater than the cut-off
rep_collected_df = workflow_table_df.loc[rep_collected_days.between(0, rep_quantile), :]

# records with a negative day count or one above the cut-off are the outliers
outlier_rep_collected_df = workflow_table_df.loc[
    rep_collected_days.lt(0) | rep_collected_days.gt(rep_quantile), :]

# print the counts so cleaned + outlier can be compared with the non-null total
print(f"Workflow Records with Rep COC Collected: {num_rep_collected}")
print(f"Quantile: {rep_quantile}")
print(f"Cleaned Records: {len(rep_collected_df)}")
print(f"Outlier Records: {len(outlier_rep_collected_df)}")

Workflow Records with Rep COC Collected: 615
Quantile: 15.0
Cleaned Records: 556
Outlier Records: 59


In [116]:
# count how many outlier records carry each 'Rep Collecting COC' value
outlier_rep_collected_df['Rep Collecting COC'].value_counts()

16.0    10
20.0     6
19.0     4
23.0     4
17.0     4
24.0     3
22.0     3
35.0     2
21.0     2
26.0     2
33.0     2
18.0     2
37.0     2
28.0     2
29.0     1
47.0     1
43.0     1
25.0     1
62.0     1
46.0     1
34.0     1
50.0     1
32.0     1
49.0     1
30.0     1
Name: Rep Collecting COC, dtype: int64

In [117]:
# export the rep COC-collection outliers to see what is causing them; the target
# folder is created first because DataFrame.to_csv does not create missing directories
# NOTE(review): the input data is read from './data/...' while this writes under
# 'data_copy/...' — confirm the folder is intended
rep_collected_outlier_path = "data_copy/outliers/rep_collected_outlier_data.csv"
os.makedirs(os.path.dirname(rep_collected_outlier_path), exist_ok=True)
outlier_rep_collected_df.to_csv(rep_collected_outlier_path, index=False)

## SA Uploading Docs Data

In [118]:
# 90th-percentile cut-off for 'SA Uploading Docs' (the top 10% is treated as error)
sa_uploaded_days = workflow_table_df['SA Uploading Docs']
sa_quantile = sa_uploaded_days.quantile(.90)
num_sa_uploaded = sa_uploaded_days.count()

# keep records whose day count is non-negative and no greater than the cut-off
sa_uploaded_df = workflow_table_df.loc[sa_uploaded_days.between(0, sa_quantile), :]

# records with a negative day count or one above the cut-off are the outliers
outlier_sa_uploaded_df = workflow_table_df.loc[
    sa_uploaded_days.lt(0) | sa_uploaded_days.gt(sa_quantile), :]

# print the counts so cleaned + outlier can be compared with the non-null total
print(f"Workflow Records with SA Docs Uploaded: {num_sa_uploaded}")
print(f"Quantile: {sa_quantile}")
print(f"Cleaned Records: {len(sa_uploaded_df)}")
print(f"Outlier Records: {len(outlier_sa_uploaded_df)}")

Workflow Records with SA Docs Uploaded: 522
Quantile: 16.0
Cleaned Records: 473
Outlier Records: 49


In [119]:
# count how many outlier records carry each 'SA Uploading Docs' value
outlier_sa_uploaded_df['SA Uploading Docs'].value_counts()

21.0    8
26.0    5
19.0    4
25.0    4
32.0    3
24.0    3
18.0    3
27.0    3
28.0    2
37.0    2
23.0    2
20.0    2
17.0    2
22.0    1
54.0    1
33.0    1
29.0    1
44.0    1
71.0    1
Name: SA Uploading Docs, dtype: int64

## BC Invoicing Project Data

In [120]:
# 90th-percentile cut-off for 'BC Invoicing Project' (the top 10% is treated as error)
# NOTE: this overwrites the bc_quantile set in the earlier BC sections
bc_invoiced_days = workflow_table_df['BC Invoicing Project']
bc_quantile = bc_invoiced_days.quantile(.90)
num_bc_invoiced = bc_invoiced_days.count()

# keep records whose day count is non-negative and no greater than the cut-off
bc_invoiced_df = workflow_table_df.loc[bc_invoiced_days.between(0, bc_quantile), :]

# records with a negative day count or one above the cut-off are the outliers
outlier_bc_invoiced_df = workflow_table_df.loc[
    bc_invoiced_days.lt(0) | bc_invoiced_days.gt(bc_quantile), :]

# print the counts so cleaned + outlier can be compared with the non-null total
print(f"Workflow Records with BC Project Invoiced: {num_bc_invoiced}")
print(f"Quantile: {bc_quantile}")
print(f"Cleaned Records: {len(bc_invoiced_df)}")
print(f"Outlier Records: {len(outlier_bc_invoiced_df)}")

Workflow Records with BC Project Invoiced: 464
Quantile: 9.0
Cleaned Records: 422
Outlier Records: 42


In [121]:
# count how many outlier records carry each 'BC Invoicing Project' value
outlier_bc_invoiced_df['BC Invoicing Project'].value_counts()

14.0    7
15.0    6
10.0    6
12.0    5
11.0    5
16.0    3
17.0    3
13.0    3
18.0    2
21.0    1
20.0    1
Name: BC Invoicing Project, dtype: int64

## BC Closed Project Data

In [122]:
# 90th-percentile cut-off for 'BC Closed Project' (the top 10% is treated as error)
# NOTE: this overwrites the bc_quantile set in the earlier BC sections
bc_closed_days = workflow_table_df['BC Closed Project']
bc_quantile = bc_closed_days.quantile(.90)
num_bc_closed = bc_closed_days.count()

# keep records whose day count is non-negative and no greater than the cut-off
bc_closed_df = workflow_table_df.loc[bc_closed_days.between(0, bc_quantile), :]

# records with a negative day count or one above the cut-off are the outliers
outlier_bc_closed_df = workflow_table_df.loc[
    bc_closed_days.lt(0) | bc_closed_days.gt(bc_quantile), :]

# print the counts so cleaned + outlier can be compared with the non-null total
print(f"Workflow Records with BC Project Closed: {num_bc_closed}")
print(f"Quantile: {bc_quantile}")
print(f"Cleaned Records: {len(bc_closed_df)}")
print(f"Outlier Records: {len(outlier_bc_closed_df)}")

Workflow Records with BC Project Closed: 237
Quantile: 41.0
Cleaned Records: 216
Outlier Records: 21


In [123]:
# count how many outlier records carry each 'BC Closed Project' value
outlier_bc_closed_df['BC Closed Project'].value_counts()

81.0    2
49.0    2
51.0    2
43.0    2
47.0    2
48.0    1
69.0    1
78.0    1
67.0    1
63.0    1
70.0    1
77.0    1
71.0    1
68.0    1
50.0    1
42.0    1
Name: BC Closed Project, dtype: int64