# Setup
**This analysis includes all projects for the 2019 Season (includes projects marked as "Dead")**

## Dependencies

In [1]:
import pandas as pd
from datetime import datetime

## Import Data

In [24]:
# imports the '[TVA] Workflow Analysis' data
all_project_data = "./data/workflow_analysis.csv"

# imports the '[TVA] Project Workflow Analysis' data
all_production_data = "./data/project_workflow_analysis.csv"

# imports '[TVA] Project Info Analysis' data
info_data = "./data/info_table.csv"

# imports '[TVA] FTA Scope Analysis' data
rejection_data = "./data/rejection_table.csv"

# Parsing Data

## Workflow Analysis
**Includes projects marked as 'Dead'**

In [26]:
project_df = pd.read_csv(
    all_project_data, dtype={
        'Claim #':str,
        'Job #':str,
        'Branch':str,
        'Claim Status':str},
    parse_dates=[
        'Claim # Date',
        'FTA Scope. Req Date',
        'Submit for Estimate Date',
        '[OB] Created Scope Calc',
        '[B] Created Estimate Date',
        'Job Submittal Date',
        '[B] - Date Approved by BC',
        '[OB] Completed',
        'COC Rcvd Date [A]'],
)

# having trouble recognizing 'coc' date as 'datetime', manually converted the dtype.
project_df['COC Rcvd Date [A]'] = pd.to_datetime(project_df['COC Rcvd Date [A]'], errors='coerce')

project_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2257 entries, 0 to 2256
Data columns (total 13 columns):
Claim #                      2246 non-null object
Job #                        972 non-null object
Branch                       2256 non-null object
Claim Status                 113 non-null object
Claim # Date                 2257 non-null datetime64[ns]
FTA Scope. Req Date          2257 non-null datetime64[ns]
Submit for Estimate Date     1818 non-null datetime64[ns]
[B] Created Estimate Date    1798 non-null datetime64[ns]
[OB] Created Scope Calc      1658 non-null datetime64[ns]
Job Submittal Date           1066 non-null datetime64[ns]
[B] - Date Approved by BC    972 non-null datetime64[ns]
[OB] Completed               912 non-null datetime64[ns]
COC Rcvd Date [A]            513 non-null datetime64[ns]
dtypes: datetime64[ns](9), object(4)
memory usage: 229.3+ KB


## Project Workflow Analysis

**Projects that have been submitted to production (OB Orderbuilt)**

In [25]:
production_df = pd.read_csv(
    all_production_data, dtype={
        'Job #':str,
        'Supplier Name':str,
        'Building Department':str},
    parse_dates=[
        'Permit Applied [A]',
        'Order Date',
        'Permit Received',
        'OA Date',
        'Invoice Date',
        'Ntfd H.O. Dlvry',
        'Dlvry Start',
        'Ntfd H.O. Start',
        'Roof Start',
        'Roof Complete Date',
        'R4F',
        'Requested Final Insp',
        'Final Inspection Date',
    ]
)

# the data I would like to move over to the 'info' df
moving_data_df = production_df[['Job #','Supplier Name', 'Building Department']]

# removing the data I will be moving over to the 'info' df from the 'production' df
del production_df['Supplier Name'], production_df['Building Department']

production_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 912 entries, 0 to 911
Data columns (total 14 columns):
Job #                    912 non-null object
Permit Applied [A]       501 non-null datetime64[ns]
Order Date               755 non-null datetime64[ns]
Permit Received          449 non-null datetime64[ns]
OA Date                  706 non-null datetime64[ns]
Invoice Date             638 non-null datetime64[ns]
Ntfd H.O. Dlvry          730 non-null datetime64[ns]
Dlvry Start              743 non-null datetime64[ns]
Ntfd H.O. Start          606 non-null datetime64[ns]
Roof Start               741 non-null datetime64[ns]
Roof Complete Date       741 non-null datetime64[ns]
R4F                      558 non-null datetime64[ns]
Requested Final Insp     212 non-null datetime64[ns]
Final Inspection Date    129 non-null datetime64[ns]
dtypes: datetime64[ns](13), object(1)
memory usage: 99.8+ KB


## FTA Scope Analysis
**Includes projects marked as 'Dead'**

In [29]:
rejection_info_df = pd.read_csv(
    rejection_data,dtype={
        'Claim #': str, 
        'Job #': str},
    parse_dates=['Created'])

# rejection_info_df.head()

### Latest Rejection
**Determine the most recent rejection for any project**

In [31]:
# 'idmax()' of the 'Created' column provides the most current rejection date
reject_df = rejection_info_df.loc[rejection_info_df.groupby('Claim #')['Created'].idxmax()]

# renaming the 'created' column for clarity
reject_df = reject_df.rename(columns={"Created": "Scope Rejection Date"})

# reject_df.head()

###  Multi-Rejection Counts
**Determine the amount of rejections for each project and if it was rejected more than once**

In [33]:
# creating a 'df' to count how many times a job was rejected
rejection_count_df = (rejection_info_df.groupby("Claim #").count())

# # resets the 'claim #' from being the index
rejection_count_df.reset_index(inplace=True)

# created a list to collect the boolean response to multi-rejection count
multi_reject_list = []

# iterates over 'Created' to determine if it has been multi-rejected
for index, row in rejection_count_df.iterrows():

    if row['Created'] <= 1:
        multi_reject = False
        multi_reject_list.append(multi_reject)

    else:
        multi_reject = True
        multi_reject_list.append(multi_reject)

# adding the 'boolean' list to the 'improvements' df
rejection_count_df["Multi-rejected"] = multi_reject_list

# renaming the 'created' column for clarity
rejection_count_df = rejection_count_df.rename(columns={"Created": "Scope Rejections"})

# rejection_count_df.head()

## Project Info Analysis
**Includes projects marked as 'Dead'**

In [36]:
info_df = pd.read_csv(info_data, dtype={'Job #':str})

info_df = info_df.rename(columns={
    'Sup Name': 'Sup',
    'Rep Name': 'Rep',
    '[BC] Name': 'BC',
    'Full Name': 'OB',
    'Full Name.1': 'FTA',
    'Full Name.2': 'GM'
})

# merging the info from the 'production' df more applicable to the 'info' df 
# as well as the 'rejection_count' df information
info_df = info_df.merge(moving_data_df, how='left', on='Job #').merge(rejection_count_df, how='left', on='Claim #')

# info_df.head()

# Merge Data

## Merge Project, Production, Rejection, and Rejection Count dfs
**Merging dfs on the shared 'Claim #' Column ('Job # for 'production_df')**

In [64]:
# Merged the 'project df' merged with 'latest rejection date' merged with 
# rejection count and 'multi-rejection' boolean merged with 'production df' 
merged_df = project_df.merge(reject_df, how='left', on='Claim #').merge(production_df, how='left', on='Job #')

# merged_df.columns

## Renaming Merged Data

In [66]:
# renaming the columnns to be able to read easier
all_project_df = merged_df.rename(columns={
    'FTA Scope. Req Date': 'Rep Claim Collected',
    'Scope Rejection Date': 'FTA Scope Rejected',
    'Submit for Estimate Date': 'FTA Scope Completed',
    '[B] Created Estimate Date':'BC Estimate Completed',
    '[OB] Created Scope Calc': 'OB Scope Completed',
    'Job Submittal Date': 'Sup Job Submitted',
    '[B] - Date Approved by BC': 'BC Approved for Production',
    '[OB] Completed': 'OB Order Built',
    'Permit Applied [A]': 'PA Permit Applied',
    'Order Date': 'GM Order Processed',
    'Permit Received': 'PA Permit Processed',
    'OA Date': 'PA OA Processed',
    'Invoice Date':'PA OA Invoiced',
    'Ntfd H.O. Dlvry': 'PA Notify of Delivery',
    'Dlvry Start': 'Delivery Date',
    'Ntfd H.O. Start': 'PA Notify of Start',
    'Roof Complete Date': 'Roof End',
    'R4F': 'GM Approved for Inspection',
    'Requested Final Insp': 'PA Inspection Requested',
    'Final Inspection Date': 'PA Inspection Processed',
    'COC Rcvd Date [A]': 'SA COC Processed'
})

# all_project_df.columns

## Organizing Merged Data to follow Workflow
**Datestamps order is for asthetics only, coding preforms implicit date diffs**

In [67]:
all_project_df = all_project_df[[
    'Claim #',
    'Job #',
    'Branch',
    'Claim Status',
    'Claim # Date',
    'Rep Claim Collected',
    'FTA Scope Completed',
    'FTA Scope Rejected',
    'BC Estimate Completed',
    'OB Scope Completed',
    'Sup Job Submitted',
    'BC Approved for Production',
    'OB Order Built',
    'GM Order Processed',
    'PA Permit Applied',
    'PA Permit Processed',
    'PA OA Processed',
    'PA OA Invoiced',
    'PA Notify of Delivery',
    'PA Notify of Start',
    'Delivery Date',
    'Roof Start',
    'Roof End',
    'GM Approved for Inspection',
    'PA Inspection Requested',
    'PA Inspection Processed', 
    'SA COC Processed'
]]

# all_project_df.columns

# Comparing Data

## Gathering Date Differences
**Finding any 'Submit for Estimate' and 'Rejection Date' Subsitutions**

In [68]:
# will store this project info
claim_num = []
branch_list = []
claim_status_list = []

# will store these date diffs
rep_claim_diff = []
fta_scope_diff = []
ob_scope_diff = []
bc_estimate_diff = []
sup_pfynr_diff = []
bc_approval_diff = []
ob_order_build_diff = []
gm_create_order_diff = []
pa_oa_processed_diff = []
pa_invoice_diff = []
gm_approval_diff = []
pa_request_inspection_diff = []
sa_processed_coc_diff = []


# this data applies to leadtimes, not workflow
pa_permit_applied_diff = []
pa_permit_processed_diff = []
pa_notify_delivery_diff = []
pa_notify_start_diff = []


# iterating over the df to create 'date diff' variables
for index, row in all_project_df.iterrows():

    # creating 'date_diff' variables for each step in the workflow
    rep_claim_date_diff = float((row['Rep Claim Collected'] - row['Claim # Date']).days)

    # if the record has NOT had the FTA Scope Rejected...
    if row['FTA Scope Rejected'] == 'NaT':

        # if the bc estimate was created prior to July 16th...
        if row['BC Estimate Completed'] <= datetime(2019, 7, 15):

            # then compare the 'bc estimate' date to the 'ob scope calc' date
            # as well as 'ob scope' date to 'fta scope' date
            fta_date_diff = (row['FTA Scope Completed'] - row['Rep Claim Collected']).days
            ob_scope_date_diff = (row['OB Scope Completed'] - row['FTA Scope Completed']).days
            bc_estimate_date_diff = (row['BC Estimate Completed'] - row['OB Scope Completed']).days
            sup_pfynr_date_diff = (row['Sup Job Submitted'] - row['BC Estimate Completed']).days

        # if the record was addressed during the 'blip'...
        elif row['BC Estimate Completed'] == datetime(2019, 7, 16) or row['BC Estimate Completed'] == datetime(2019, 7, 17):

            # then compare the 'bc estimate' to the 'blip' date, and the 'ob scope' date to the new 'bc date'
            fta_date_diff = (row['FTA Scope Completed'] - row['Rep Claim Collected']).days
            bc_estimate_date_diff = (row['BC Estimate Completed'] - datetime(2019, 7, 15)).days
            ob_scope_date_diff = (row['OB Scope Completed'] - row['BC Estimate Completed']).days
            sup_pfynr_date_diff = (row['Sup Job Submitted'] - row['OB Scope Completed']).days

        # if the bc estimate was created after the 'blip' on July 16th...
        else:

            # then use the new workflow dates to compare the date diffs
            fta_date_diff = (row['FTA Scope Completed'] - row['Rep Claim Collected']).days
            bc_estimate_date_diff = (row['BC Estimate Completed'] - row['FTA Scope Completed']).days
            ob_scope_date_diff = (row['OB Scope Completed'] - row['BC Estimate Completed']).days
            sup_pfynr_date_diff = (row['Sup Job Submitted'] - row['OB Scope Completed']).days

    # if the record HAS has the FTA Scope Rejected...
    else:

        # and was rejected prior to the 'blip'...
        if row['BC Estimate Completed'] <= datetime(2019, 7, 15):

            # use 'rejected' date and clarify sup date diffs
            fta_date_diff = (row['FTA Scope Rejected'] - row['Rep Claim Collected']).days
            bc_estimate_date_diff = (row['BC Estimate Completed'] - row['FTA Scope Completed']).days
            ob_scope_date_diff = (row['OB Scope Completed'] - row['FTA Scope Rejected']).days
            sup_pfynr_date_diff = (row['Sup Job Submitted'] - row['BC Estimate Completed']).days

        # if rejected after the 'blip'...
        else:

            # use 'rejected' date and clarify sup date diffs
            fta_date_diff = (row['FTA Scope Rejected'] - row['Rep Claim Collected']).days
            bc_estimate_date_diff = (row['BC Estimate Completed'] - row['FTA Scope Completed']).days
            ob_scope_date_diff = (row['OB Scope Completed'] - row['FTA Scope Rejected']).days
            sup_pfynr_date_diff = (row['Sup Job Submitted'] - row['OB Scope Completed']).days

    bc_approval_date_diff = (row['BC Approved for Production'] - row['Sup Job Submitted']).days
    ob_orderbuild_date_diff = (row['OB Order Built'] - row['BC Approved for Production']).days
    gm_create_order_date_diff = (row['GM Order Processed'] - row['OB Order Built']).days
    pa_oa_processed_date_diff = (row['PA OA Processed'] - row['GM Order Processed']).days
    pa_invoice_date_diff = (row['PA OA Invoiced'] - row['PA OA Processed']).days
    gm_approval_date_diff = (row['GM Approved for Inspection'] - row['Roof End']).days
    pa_requested_inspection_date_diff = (row['PA Inspection Requested'] - row['GM Approved for Inspection']).days
    sa_processed_coc_date_diff = (row['SA COC Processed'] - row['Roof End']).days

    # these provide the lead times of tasks not directly impacting the workflow.
    pa_permit_applied_date_diff = (row['PA Permit Applied'] - row['BC Approved for Production']).days
    pa_permit_processed_date_diff = (row['PA Permit Processed'] - row['PA Permit Applied']).days
    pa_notify_delivery_date_diff = (row['Delivery Date'] - row['PA Notify of Delivery']).days
    pa_notify_start_date_diff = (row['Roof Start'] - row['PA Notify of Start']).days

    # appending 'date diff' values to lists to create each df column
    claim_num.append(row["Claim #"])
    branch_list.append(row['Branch'])
    claim_status_list.append(row['Claim Status'])
    rep_claim_diff.append(rep_claim_date_diff)
    fta_scope_diff.append(fta_date_diff)
    ob_scope_diff.append(ob_scope_date_diff)
    bc_estimate_diff.append(bc_estimate_date_diff)
    sup_pfynr_diff.append(sup_pfynr_date_diff)
    bc_approval_diff.append(bc_approval_date_diff)
    ob_order_build_diff.append(ob_orderbuild_date_diff)
    gm_create_order_diff.append(gm_create_order_date_diff)
    pa_oa_processed_diff.append(pa_oa_processed_date_diff)
    pa_invoice_diff.append(pa_invoice_date_diff)
    gm_approval_diff.append(gm_approval_date_diff)
    pa_request_inspection_diff.append(pa_requested_inspection_date_diff)
    sa_processed_coc_diff.append(sa_processed_coc_date_diff)
    
    # this data applies to leadtimes, not workflow
    pa_permit_applied_diff.append(pa_permit_applied_date_diff)
    pa_permit_processed_diff.append(pa_permit_processed_date_diff)
    pa_notify_delivery_diff.append(pa_notify_delivery_date_diff)
    pa_notify_start_diff.append(pa_notify_start_date_diff)

## Creating 'Workflow Days' df
**The days between each Teammate step in the workflow (excluding some production processes)**

In [72]:
# creating the 'days_df' to hold all date values for each role in the project
days_df = pd.DataFrame({
    "Claim #": claim_num,
    "Rep Collecting Claim": rep_claim_diff,
    "FTA Completing Scope": fta_scope_diff,
    "BC Completing Estimate": bc_estimate_diff,
    "OB Completing Scope": ob_scope_diff,
    "Sup Submitting Job": sup_pfynr_diff,
    "BC Approving Job": bc_approval_diff,
    "OB Building Order": ob_order_build_diff,
    "GM Processing Order": gm_create_order_diff,
    "PA Processing OA": pa_oa_processed_diff,
    "PA Invoicing OA": pa_invoice_diff,
    'GM Approving for Inspection': gm_approval_diff,
    'PA Requesting Inspection': pa_request_inspection_diff,
    'SA Processing COC': sa_processed_coc_diff,
    
})

# creating a column holding the running tally across a row (project)
# can be done because not including 'date diffs' on non-workflow items
days_df['Days in Pipeline'] = days_df.sum(axis=1)

days_df.head()

Unnamed: 0,Claim #,Rep Collecting Claim,FTA Completing Scope,BC Completing Estimate,OB Completing Scope,Sup Submitting Job,BC Approving Job,OB Building Order,GM Processing Order,PA Processing OA,PA Invoicing OA,GM Approving for Inspection,PA Requesting Inspection,SA Processing COC,Days in Pipeline
0,40048935,0.0,,3.0,,6.0,4.0,0.0,14.0,3.0,1.0,0.0,,6.0,37.0
1,jdi33101,0.0,12.0,9.0,0.0,22.0,4.0,7.0,0.0,1.0,,,,0.0,55.0
2,40044962,4.0,,6.0,,8.0,7.0,0.0,5.0,3.0,10.0,0.0,6.0,8.0,57.0
3,065123G57,0.0,,5.0,,15.0,1.0,5.0,20.0,0.0,6.0,,,2.0,54.0
4,068603B59,10.0,7.0,3.0,1.0,42.0,3.0,1.0,1.0,0.0,8.0,,,3.0,79.0


# Export Data

In [73]:
all_project_df.to_csv("data/cleaned_data/project_table.csv", index=False)

days_df.to_csv("data/cleaned_data/workflow_table.csv", index=False)

info_df.to_csv("data/cleaned_data/project_info_table.csv", index=False)