# Setup
**This analysis covers all projects for the 2019 season, including projects marked as "Dead".**

## Dependencies

In [1]:
import pandas as pd
from datetime import datetime

## Import Data

In [2]:
# location of the '[W] Scope Improvements' data
improvements_data = "./data/improvement_table.csv"

# location of the '[W] Customers Table Audit' data
all_project_data = "./data/all_projects.csv"

# Parsing Data

## All Projects

**Includes projects marked as 'Dead'**

In [3]:
# load every project for the season: 'Claim #', 'Job #', 'Branch' and
# 'Claim Status' are read as strings, and each workflow date column is parsed
# as a date
project_df = pd.read_csv(
    all_project_data,
    dtype=dict.fromkeys(['Claim #', 'Job #', 'Branch', 'Claim Status'], str),
    parse_dates=[
        'Claim # Date',
        'FTA Scope. Req Date',
        'Submit for Estimate Date',
        '[OB] Created Scope Calc',
        '[B] Created Estimate Date',
        'Job Submittal Date',
        '[B] - Date Approved by BC',
        '[OB] Completed',
    ],
)

# number of non-null values in each column of 'project_df'
project_summary = project_df.count()
project_summary

Claim #                      1850
Job #                         852
Branch                       1860
Claim # Date                 1861
FTA Scope. Req Date          1861
Submit for Estimate Date     1633
[OB] Created Scope Calc      1503
[B] Created Estimate Date    1481
Job Submittal Date            940
[B] - Date Approved by BC     852
[OB] Completed                809
Claim Status                  100
dtype: int64

## All Rejections

**Includes projects marked as 'Dead'**

In [4]:
# 'improvements_df' holds one row per FTA scope rejection; 'Created' is parsed
# as a date and 'Claim #' is read as a string, the same way 'project_df' reads it
improvements_df = pd.read_csv(
    improvements_data,
    parse_dates=['Created'],
    dtype={'Claim #': str},
)

##  Multi-Rejection Counts
**Determine the number of rejections for each project**

In [5]:
# how many times each job was rejected: group by 'Claim #', count the non-null
# 'Created' values, turn 'Claim #' back into a regular column, and name the
# count 'Scope Rejections' so it is ready to merge
improvement_counts_df = (
    improvements_df
    .groupby("Claim #")
    .count()
    .reset_index()
    .rename(columns={"Created": "Scope Rejections"})
)

## Latest Rejections
**Determine the most recent rejection for any project**

In [6]:
# 'latest_rejection_df' holds, for each claim, the row with the most recent
# 'Created' date; 'idxmax()' returns the index label of that row.
# Rows without a 'Created' date are set aside first: a claim whose dates are all
# missing has no maximum for 'idxmax()' to point at, and the '.loc' lookup
# would fail on it.
dated_rejections = improvements_df.dropna(subset=['Created'])
latest_rejection_df = dated_rejections.loc[
    dated_rejections.groupby('Claim #')['Created'].idxmax()]

# renaming 'Created' to 'Rejection Date' to make it easier to merge
latest_rejection_df = latest_rejection_df.rename(columns={"Created": "Rejection Date"})

# Merge Data

## Merge 'All Projects' and 'Multi-Rejection' dataframes
**Merging dfs on the shared 'Claim #' Column**

In [7]:
# left join keeps every project; a project with no entry in the rejection
# counts gets a 'Scope Rejections' value of 0
first_merge_df = (
    project_df
    .merge(improvement_counts_df, how='left', on='Claim #')
    .fillna({'Scope Rejections': 0})
)

first_merge_df.head()

Unnamed: 0,Claim #,Job #,Branch,Claim # Date,FTA Scope. Req Date,Submit for Estimate Date,[OB] Created Scope Calc,[B] Created Estimate Date,Job Submittal Date,[B] - Date Approved by BC,[OB] Completed,Claim Status,Scope Rejections
0,HO0000002220666,1937704,KCI,2019-07-04,2019-07-04,2019-07-08,2019-07-09,2019-07-11,2019-07-12,2019-07-12,NaT,,0.0
1,218952GJ,1937584,FCO,2019-06-28,2019-07-02,2019-07-03,2019-07-08,2019-07-08,2019-07-09,2019-07-09,2019-07-09,,0.0
2,PP0018371868,1937550,KCI,2019-06-27,2019-06-27,2019-07-02,2019-07-03,2019-07-05,2019-07-10,2019-07-10,2019-07-11,,0.0
3,40280291,1937511,FCO,2019-06-27,2019-07-02,2019-07-03,2019-07-10,2019-07-10,2019-07-11,2019-07-12,NaT,,0.0
4,066286T79,1937486,FCO,2019-06-26,2019-06-26,2019-06-27,2019-06-28,2019-06-28,2019-07-09,2019-07-10,2019-07-10,,0.0


## Combine with 'Latest Rejections'
**Merging dfs on the shared 'Claim #' Column**

In [11]:
# left join adds each project's most recent 'Rejection Date'; projects without
# a match keep a missing value in that column
final_merged_df = first_merge_df.merge(latest_rejection_df, on='Claim #', how='left')
final_merged_df.head()

Unnamed: 0,Claim #,Job #,Branch,Claim # Date,FTA Scope. Req Date,Submit for Estimate Date,[OB] Created Scope Calc,[B] Created Estimate Date,Job Submittal Date,[B] - Date Approved by BC,[OB] Completed,Claim Status,Scope Rejections,Rejection Date
0,HO0000002220666,1937704,KCI,2019-07-04,2019-07-04,2019-07-08,2019-07-09,2019-07-11,2019-07-12,2019-07-12,NaT,,0.0,NaT
1,218952GJ,1937584,FCO,2019-06-28,2019-07-02,2019-07-03,2019-07-08,2019-07-08,2019-07-09,2019-07-09,2019-07-09,,0.0,NaT
2,PP0018371868,1937550,KCI,2019-06-27,2019-06-27,2019-07-02,2019-07-03,2019-07-05,2019-07-10,2019-07-10,2019-07-11,,0.0,NaT
3,40280291,1937511,FCO,2019-06-27,2019-07-02,2019-07-03,2019-07-10,2019-07-10,2019-07-11,2019-07-12,NaT,,0.0,NaT
4,066286T79,1937486,FCO,2019-06-26,2019-06-26,2019-06-27,2019-06-28,2019-06-28,2019-07-09,2019-07-10,2019-07-10,,0.0,NaT


# Comparing Data

## Gathering Date Differences
**Finding any 'Submit for Estimate' and 'Rejection Date' Substitutions**

In [12]:
# Day counts for each step in the workflow, computed column-wise on
# 'final_merged_df'. Subtracting one date column from another gives a timedelta
# column and '.dt.days' takes its day component; when either date is missing
# the result for that row is NaN.

# date the scope was handed over: the most recent 'Rejection Date' if the
# project has been rejected, otherwise the 'Submit for Estimate Date'
scope_handoff_date = final_merged_df['Rejection Date'].fillna(
    final_merged_df['Submit for Estimate Date'])

# 'date diff' columns, one per step
rep_claim_days = (
    final_merged_df["FTA Scope. Req Date"] - final_merged_df["Claim # Date"]).dt.days
fta_scope_days = (
    scope_handoff_date - final_merged_df["FTA Scope. Req Date"]).dt.days
ob_scope_days = (
    final_merged_df['[OB] Created Scope Calc'] - scope_handoff_date).dt.days
bc_estimate_days = (
    final_merged_df['[B] Created Estimate Date'] - final_merged_df['[OB] Created Scope Calc']).dt.days
sup_pfynr_days = (
    final_merged_df["Job Submittal Date"] - final_merged_df["[B] Created Estimate Date"]).dt.days
bc_approval_days = (
    final_merged_df["[B] - Date Approved by BC"] - final_merged_df["Job Submittal Date"]).dt.days
ob_orderbuild_days = (
    final_merged_df['[OB] Completed'] - final_merged_df['[B] - Date Approved by BC']).dt.days

# adding up all of the steps; '+' propagates NaN, so a project that is missing
# any step has a NaN total rather than a partial sum
total_days_col = (rep_claim_days + fta_scope_days + ob_scope_days + bc_estimate_days
                  + sup_pfynr_days + bc_approval_days + ob_orderbuild_days)

# lists holding the values for each 'days_df' column
claim_num = final_merged_df["Claim #"].tolist()
job_num = final_merged_df["Job #"].tolist()
rep_claim_diff = rep_claim_days.tolist()
fta_scope_diff = fta_scope_days.tolist()
ob_scope_diff = ob_scope_days.tolist()
bc_estimate_diff = bc_estimate_days.tolist()
sup_pfynr_diff = sup_pfynr_days.tolist()
bc_approval_diff = bc_approval_days.tolist()
ob_order_build_diff = ob_orderbuild_days.tolist()
total_days = total_days_col.tolist()



## Creating 'Workflow Days' df
**The days between each Teammate step in the workflow**

In [13]:
# 'days_df' has the claim and job numbers plus one day-count column per
# workflow step and the overall total
workflow_day_columns = {
    "claim_#": claim_num,
    "job_#": job_num,
    "rep_claim": rep_claim_diff,
    "fta_scope": fta_scope_diff,
    "ob_scope": ob_scope_diff,
    "bc_estimate": bc_estimate_diff,
    "sup_pfynr": sup_pfynr_diff,
    "bc_approval": bc_approval_diff,
    "ob_orderbuild": ob_order_build_diff,
    "total_days": total_days,
}
days_df = pd.DataFrame(workflow_day_columns)

# Export Data

In [14]:
# write the 'projects' and 'workflow days' tables to CSV, leaving the
# DataFrame index out of both files
for csv_path, frame in (
    ("data/project_table.csv", final_merged_df),
    ("data/workflow_days.csv", days_df),
):
    frame.to_csv(csv_path, index=False)


In [16]:
# preview the first five rows of the workflow day counts
days_df.head(n=5)

Unnamed: 0,claim_#,job_#,rep_claim,fta_scope,ob_scope,bc_estimate,sup_pfynr,bc_approval,ob_orderbuild,total_days
0,HO0000002220666,1937704,0,4.0,1.0,2.0,1.0,0.0,,
1,218952GJ,1937584,4,1.0,5.0,0.0,1.0,0.0,0.0,11.0
2,PP0018371868,1937550,0,5.0,1.0,2.0,5.0,0.0,1.0,14.0
3,40280291,1937511,5,1.0,7.0,0.0,1.0,1.0,,
4,066286T79,1937486,0,1.0,1.0,0.0,11.0,1.0,0.0,14.0


In [21]:
# mean of the total workflow days with the 'rep_claim' step taken out;
# 'mean()' skips rows where the difference is NaN
days_df['total_days'].sub(days_df['rep_claim']).mean()

22.08910891089109