# Setup

## Dependencies

In [1]:
import pandas as pd
import sqlite3
from datetime import datetime

## Importing Data

In [10]:
# Relative paths of the two CSV exports this notebook reads:
# the project table and the improvements table.
project_data = "./data/customers_table.csv"
improvements_data = "./data/improvements_table.csv"

# Workflow Data

## Separate completed Jobs to 'in_production_df'

**Jobs where '[OB] Completed' and 'Job #' are not blank**

In [5]:
# Read the project table into a df.
# 'Claim #' and 'Job #' are read as strings so identifiers such as
# '001824322-412' keep their leading zeros and dashes; the workflow
# milestone columns are parsed as datetimes.
project_df = pd.read_csv(
    project_data,
    dtype={'Claim #': str, 'Job #': str},
    parse_dates=['Claim # Date', 'FTA Scope. Req Date',
                 'Submit for Estimate Date', '[OB] Created Scope Calc',
                 '[B] Created Estimate Date', 'Job Submittal Date',
                 '[B] - Date Approved by BC', '[OB] Completed'],
)

# boolean masks reused by both splits below
has_job_number = project_df["Job #"].notna()
is_completed = project_df['[OB] Completed'].notna()

# 'in_production_df' holds the rows that have both a 'Job #' and an
# '[OB] Completed' timestamp, to avoid 'NaN' date values
in_production_df = project_df.loc[is_completed & has_job_number, :]

# 'pre_production_df' holds the rows that have no 'Job #'
pre_production_df = project_df.loc[~has_job_number, :]

project_df.head()

Unnamed: 0,Claim #,Job #,Claim # Date,FTA Scope. Req Date,Submit for Estimate Date,[OB] Created Scope Calc,[B] Created Estimate Date,Job Submittal Date,[B] - Date Approved by BC,[OB] Completed
0,5006023896,1937392,2019-06-24,2019-06-24,2019-06-25,2019-06-27,2019-06-27,2019-06-27,2019-06-27,2019-06-28 14:52:00
1,001824322-412,1937164,2019-06-17,2019-06-17,2019-06-23,2019-06-24,2019-06-24,2019-06-27,2019-06-27,2019-06-28 11:05:00
2,3011280610-1-1,1937159,2019-06-19,2019-06-21,2019-06-24,2019-06-25,2019-06-26,2019-06-27,2019-06-28,NaT
3,017159602-91A-020,1937148,2019-06-15,2019-06-15,2019-06-17,2019-06-20,2019-06-20,2019-06-27,2019-06-27,2019-06-27 13:59:00
4,HO2224472,1937146,2019-06-15,2019-06-16,2019-06-25,2019-06-28,2019-06-28,2019-06-30,2019-07-01,NaT


In [50]:
# created 'improvements_df' to hold all dates of fta scope rejections to use correct date
improvements_df = pd.read_csv(improvements_data,
                              dtype={'Claim #': str, 'Job #': str},
                              parse_dates=['Submit for Estimate Date', 'Created'])

# 'improvement_dates_df' keeps only the improvement rows that have a 'Job #'
improvement_dates_df = improvements_df.loc[improvements_df["Job #"].notna(), :]

# 'unique_improv_dates_df' holds, for each 'Claim #', the row with the most
# recent 'Created' timestamp (idxmax returns that row's index label per group)
unique_improv_dates_df = improvement_dates_df.loc[
    improvement_dates_df.groupby('Claim #')['Created'].idxmax()]

# will need to update the csv to remove 'submit for estimate' date

# Display the per-claim rows with 'Created' relabelled as 'Improvement Date'.
# rename() returns a new frame, so 'unique_improv_dates_df' itself keeps its
# 'Created' column.
# NOTE(review): this call was previously made on an undefined name 'df';
# 'unique_improv_dates_df' is assumed to be the intended frame - confirm.
unique_improv_dates_df.rename(columns={"Created": "Improvement Date"})

# **Want to compare the 'in_production_df' against the 'improvement_dates_df' before creating a 'days_df'**

In [51]:
# Non-null counts per column for the three frames, one after another,
# with a dashed rule between them.
print(
    in_production_df.count(),
    '----------------------------------------',
    improvement_dates_df.count(),
    '----------------------------------------',
    unique_improv_dates_df.count(),
    sep='\n',
)

Claim #                      641
Job #                        642
Claim # Date                 642
FTA Scope. Req Date          642
Submit for Estimate Date     642
[OB] Created Scope Calc      642
[B] Created Estimate Date    642
Job Submittal Date           642
[B] - Date Approved by BC    642
[OB] Completed               642
dtype: int64
----------------------------------------
Claim #                     423
Job #                       424
Submit for Estimate Date    424
Created                     424
dtype: int64
----------------------------------------
Claim #                     290
Job #                       290
Submit for Estimate Date    290
Created                     290
dtype: int64


In [52]:
# preview the first five rows of the per-claim improvement frame
unique_improv_dates_df.iloc[:5]

Unnamed: 0,Claim #,Job #,Submit for Estimate Date,Created
5,00125106691A002,1937116,2019-06-20,2019-06-21 11:56:00
6,001771944,1935841,2019-05-16,2019-05-17 14:00:00
12,00225357330,1935033,2019-02-04,2019-02-08 09:52:00
14,002761823-615,1935789,2019-05-14,2019-05-14 15:55:00
15,00315128417,1935190,2019-03-26,2019-03-27 13:48:00


## Create the 'days_df' holding days information

In [25]:
# (output column, earlier milestone, later milestone) for each step in the workflow;
# every output column is the whole-day gap "later milestone - earlier milestone"
workflow_steps = [
    ("rep_claim", "Claim # Date", "FTA Scope. Req Date"),
    ("fta_scope", "FTA Scope. Req Date", "Submit for Estimate Date"),
    ("ob_scope", "Submit for Estimate Date", "[OB] Created Scope Calc"),
    ("bc_estimate", "[OB] Created Scope Calc", "[B] Created Estimate Date"),
    ("sup_pfynr", "[B] Created Estimate Date", "Job Submittal Date"),
    ("bc_approval", "Job Submittal Date", "[B] - Date Approved by BC"),
    ("ob_orderbuild", "[B] - Date Approved by BC", "[OB] Completed"),
]

# column-wise date differences instead of iterating row by row;
# '.dt.days' is the day component of each timedelta (NaN where a date is missing)
step_days = {
    step: (in_production_df[later] - in_production_df[earlier]).dt.days
    for step, earlier, later in workflow_steps
}

# creating the 'days_df' to hold all date values for each role in the project
days_df = pd.DataFrame({
    "claim_#": in_production_df["Claim #"],
    "job_#": in_production_df["Job #"],
    **step_days,
})

# 'total_days' is the sum of the per-step day counts; skipna=False keeps the
# total missing when any step is missing, as a plain sum over the steps would
days_df["total_days"] = days_df[list(step_days)].sum(axis=1, skipna=False)

# renumber rows 0..n-1 rather than carrying over the 'in_production_df' index
days_df = days_df.reset_index(drop=True)


# Export Data

In [27]:
# Write each frame to its own csv file. 'index=False' leaves the row labels
# out of the file (pandas would write them by default).
csv_exports = {
    "data/in_production.csv": in_production_df,
    "data/pre_production.csv": pre_production_df,
    "data/workflow_days.csv": days_df,
}

for csv_path, frame in csv_exports.items():
    frame.to_csv(csv_path, index=False)