In [3]:
import pandas as pd
import numpy as np
import oracledb
import os
from datetime import datetime
import warnings

# Silence every Python warning for the rest of the session
warnings.filterwarnings(action="ignore")

# Show floats without decimals or scientific notation
pd.options.display.float_format = '{:.0f}'.format

# Reference point for all date arithmetic below
today = datetime.today()

# Days elapsed since the most recent Sunday (0 when today is a Sunday).
# isoweekday() numbers Monday..Sunday as 1..7, so modulo 7 maps Sunday to 0.
newestsunday = today.isoweekday() % 7

# Days elapsed since the Sunday eight weeks before that one
oldestsunday = 8 * 7 + newestsunday

In [4]:
# --- SQL text -----------------------------------------------------------------
# Both queries interpolate the integer day offsets `oldestsunday` / `newestsunday`
# that were computed locally from today's date.
#
# NOTE(review): TO_CHAR(date, 'D') and TRUNC(date, 'D') depend on the Oracle
# session's NLS_TERRITORY; the week bucketing below assumes Sunday is day 1 —
# confirm for the sessions this runs under.
# NOTE(review): the EDW window is anchored on TRUNC(SYSDATE) while the Blue Yonder
# window is anchored on trunc(sysdate,'D'), with the same offsets subtracted from
# each; confirm the two windows are meant to cover the same weeks.

# EDW sales facts: ordered / shipped quantity in purchase UOM per division, SKU
# and week-start date (STARTDATE).
query_edw = f"""
SELECT od.ID_DIVISION
	,pd.ID_SKU
	,pgd.CD_HPIS_MAJOR || ' - ' || pgd.DS_HPIS_MAJOR AS "Major HPIS Class - Description"
	,(pd.AT_CORP_BASE_COST * pd.RT_PURCHASE_UOM_CONVERSION) AS "Unit_Cost_PUOM"
    ,SUM(sf.QT_ORDER / pd.RT_PURCHASE_UOM_CONVERSION) AS ORDERED_QTY_PUOM
    ,SUM(sf.QT_SHIP / pd.RT_PURCHASE_UOM_CONVERSION) AS SHIPPED_QTY_PUOM
	,CASE 
		WHEN TO_CHAR(td.KY_TIME, 'D') = 1
			THEN TRUNC(td.KY_TIME)
		ELSE TRUNC(td.KY_TIME, 'IW') - 1
		END AS STARTDATE
FROM DBINSTANCE.SALES_FACTS sf
JOIN DBINSTANCE.PRODUCT_DIM pd ON pd.KY_PRODUCT = sf.KY_PRODUCT
JOIN DBINSTANCE.ORG_DIM od ON sf.KY_ORG = od.KY_ORG
JOIN DBINSTANCE.TIME_DIM td ON sf.KY_TIME = td.KY_TIME
JOIN DBINSTANCE.INVOICE_TYPE_DIM itd ON itd.KY_INVOICE_TYPE = sf.KY_INVOICE_TYPE
LEFT JOIN DBINSTANCE.PRODUCT_GROUP_DIM pgd ON pd.CD_HPIS = pgd.CD_HPIS_CLASS
WHERE sf.QT_ORDER > 0
	AND td.KY_TIME BETWEEN TRUNC(SYSDATE) - {oldestsunday}
		AND TRUNC(SYSDATE) - {newestsunday} 		-- Pulls 60 days worth of data
	AND pd.ID_SKU NOT LIKE '8%' 			-- Excludes customer owned inventory
	AND itd.CD_INVOICE_TYPE <> 'REBILL'
	AND itd.CD_INVOICE_TYPE <> 'DIRECT' 	-- Filters out direct orders (supplier directly to customer)
	AND itd.FG_ORIGINAL_ORDER = 'ORIGINAL' 	-- Filters out backorders
	AND pd.FG_CURRENT = 'Y'
	AND pd.FG_ACTIVE_SKU = 'ACTIVE'
    AND od.ID_DIVISION IN (03, 08, 14, 16, 20, 21, 30, 37, 41, 44, 45, 48, 49, 50, 51, 53, 56, 58, 59, 60, 64, 65, 66, 67, 68, 69, 70, 71, 78, 80, 82, 84, 85, 87, 89, 90, 91, 92, 93, 94, 96, 98)
    GROUP BY od.ID_DIVISION, ID_SKU, pgd.CD_HPIS_MAJOR || ' - ' || pgd.DS_HPIS_MAJOR, (pd.AT_CORP_BASE_COST * pd.RT_PURCHASE_UOM_CONVERSION),
         CASE
             WHEN TO_CHAR(td.KY_TIME, 'D') = 1 THEN TRUNC(td.KY_TIME)
             ELSE TRUNC(td.KY_TIME, 'IW') - 1
         END
"""

# Blue Yonder forecast history: lag 1 and lag 4 forecast components per
# location / item / STARTDATE, with missing components returned as 0.
query_by = f"""
SELECT SUBSTR(s.loc, LENGTH(s.loc) - 1, 2) AS "ID_DIVISION",
   i.VENDORNUM,
   hf.DMDUNIT AS "ID_SKU",
   hf.LAG,
   hf.STARTDATE,
   nvl(hf.BASEFCST,0) AS "BASEFCST",
   nvl(hf.NONBASEFCST,0) AS "NONBASEFCST",
   nvl(hf.TOTFCST,0) AS "TOTFCST",
   nvl(hf.RECONCILEDFCST,0) AS "RECONCILEDFCST",
   nvl(hf.FCSTOVERRIDE,0) AS "FCSTOVERRIDE",
   nvl(hf.EXTERNALEVENTS,0) AS "EXTERNALEVENTS"
FROM SCPDBINSTANCE.fcstperfstatichist hf
JOIN SCPDBINSTANCE.sku s ON hf.dmdunit = s.item AND hf.loc = s.loc
JOIN SCPDBINSTANCE.item i ON hf.dmdunit = i.item AND s.item = i.item
WHERE hf.lag in (1, 4)
AND s.loc in ('DIV 03', 'DIV 08', 'DIV 14', 'DIV 16', 'DIV 20', 'DIV 21', 'DIV 30', 'DIV 37', 'DIV 41', 'DIV 44', 'DIV 45', 'DIV 48', 'DIV 49', 'DIV 50', 'DIV 51', 'DIV 53', 'DIV 56', 'DIV 58', 'DIV 59', 'DIV 60', 'DIV 64', 'DIV 65', 'DIV 66', 'DIV 67', 'DIV 68', 'DIV 69', 'DIV 70', 'DIV 71', 'DIV 78', 'DIV 80', 'DIV 82', 'DIV 84', 'DIV 85', 'DIV 87', 'DIV 89', 'DIV 90', 'DIV 91', 'DIV 92', 'DIV 93', 'DIV 94', 'DIV 96', 'DIV 98')
AND i.item NOT LIKE '8%' -- Excludes customer owned inventory
AND hf.startdate between trunc(sysdate,'D') - {oldestsunday} and trunc(sysdate,'D') - {newestsunday}
"""

# --- Execute ------------------------------------------------------------------
# Establish the database connection.
# NOTE(review): user / password / host below are placeholders; supply real values
# from the environment or a secrets store rather than committing them here.
connection = oracledb.connect(user='usernamehere',
                              password='pswdhere',
                              dsn=oracledb.makedsn('hostnamehere', '1234', service_name='DATABASE_NAME_HERE'))
try:
    # Read each query straight into a dataframe. No explicit cursor is opened:
    # pandas drives the connection itself.
    df_fa_edw = pd.read_sql(query_edw, con=connection)
    df_fa_by = pd.read_sql(query_by, con=connection)
finally:
    # Always release the session, even when one of the reads fails.
    connection.close()

print("Sales Facts # of records returned: ", len(df_fa_edw))
print("Blue Yonder # of records returned: ", len(df_fa_by))

Sales Facts # of records returned:  1249385
Blue Yonder # of records returned:  6038735


In [5]:
# Sanity-check the date coverage of both pulls
print('# of unique STARTDATEs from Blue Yonder DATA:', df_fa_by['STARTDATE'].nunique())

# Per-STARTDATE views of the EDW pull: total ordered quantity, row count,
# and number of distinct SKUs
edw_by_startdate = df_fa_edw.groupby('STARTDATE')
sumbySTARTDATE = edw_by_startdate['ORDERED_QTY_PUOM'].sum().reset_index()
countbySTARTDATE = edw_by_startdate['ID_SKU'].count().reset_index()
countuniquebySTARTDATE = edw_by_startdate['ID_SKU'].nunique().reset_index()

for weekly_summary in (sumbySTARTDATE, countbySTARTDATE, countuniquebySTARTDATE):
    print(weekly_summary)

# of unique STARTDATEs from Blue Yonder DATA: 8
   STARTDATE  ORDERED_QTY_PUOM
0 2024-09-08           1630146
1 2024-09-15           1542804
2 2024-09-22           1552258
3 2024-09-29           5416777
4 2024-10-06           1580497
5 2024-10-13           1544337
6 2024-10-20           1526002
7 2024-10-27           1666588
   STARTDATE  ID_SKU
0 2024-09-08  158355
1 2024-09-15  157516
2 2024-09-22  157627
3 2024-09-29  156389
4 2024-10-06  154958
5 2024-10-13  154778
6 2024-10-20  155052
7 2024-10-27  154710
   STARTDATE  ID_SKU
0 2024-09-08   47250
1 2024-09-15   47443
2 2024-09-22   47176
3 2024-09-29   46634
4 2024-10-06   46324
5 2024-10-13   46185
6 2024-10-20   46502
7 2024-10-27   46228


In [10]:
# Bring in our new DC-item segmentation data.
# Both columns used as merge keys later (ID_DIVISION and ID_SKU) are read as text
# so Excel cannot turn numeric-looking codes into integers, which would stop them
# from matching the keys of the database pulls.
# NOTE(review): assumes ID_SKU comes back from the database queries as text — confirm.
seg = pd.read_excel(
    'Inputs/ABCXYZ segmentation.xlsx',
    dtype={'ID_DIVISION': str, 'ID_SKU': str},
)

In [11]:
# Reduce STARTDATE in both pulls to plain calendar dates so the merge keys compare
# on the day only
df_fa_edw['STARTDATE'] = pd.to_datetime(df_fa_edw['STARTDATE']).dt.date
df_fa_by['STARTDATE'] = pd.to_datetime(df_fa_by['STARTDATE']).dt.date

### Bring in EDW sales data to BY histfcst data
# Left join: every forecast row is kept; rows without a matching sales record get
# NaN in the EDW columns.
# NOTE(review): confirm whether "no sales row" should instead count as zero
# ordered / shipped quantity in the error metrics computed later.
df_fa = pd.merge(df_fa_by, df_fa_edw, on=['ID_DIVISION', 'ID_SKU', 'STARTDATE'], how='left')

### Bring in segmentation data
df_fa = pd.merge(df_fa, seg, on=['ID_DIVISION', 'ID_SKU'], how='left')

# Fill missing values in the 'Segment' column with 'Unclassified'.
# The result is assigned back rather than using fillna(inplace=True) on the
# selected column: the in-place form acts on an intermediate object and is not
# guaranteed to update df_fa (it does not under pandas Copy-on-Write).
df_fa['Segment'] = df_fa['Segment'].fillna('Unclassified')

# Convert the date column back to datetime64 for downstream grouping / export
df_fa['STARTDATE'] = pd.to_datetime(df_fa['STARTDATE'])

# Capture the most recent startdate (used later in the output file name)
lastSTARTDATE = df_fa['STARTDATE'].max().strftime('%Y-%m-%d')

In [12]:
# Start with a copy of the DataFrame so this cell can be re-run from df_fa
df_fa2 = df_fa.copy()

# Clip negative values in 'TOTFCST'
df_fa2['TOTFCST'] = df_fa2['TOTFCST'].clip(lower=0)


def calculate_abs_error(forecast_column, actual_column):
    """Return the row-wise absolute difference between two columns of ``df_fa2``.

    The columns are looked up on the module-level ``df_fa2`` at call time, so
    both must already exist on it when this is called.
    """
    return (df_fa2[forecast_column] - df_fa2[actual_column]).abs()


# Stage 1: derived forecast columns. These are added in their own step because
# the error columns in stage 2 read them back from df_fa2; building everything in
# a single assign() evaluates all values first and fails with a KeyError.
df_fa2 = df_fa2.assign(
    **{
        'BASEFCST+RECONCILEDFCST': df_fa2['BASEFCST'] + df_fa2['RECONCILEDFCST'],
        'TotalATSFcstAdd': df_fa2['FCSTOVERRIDE'] + df_fa2['EXTERNALEVENTS'],
        'TOTFCST w/o ATS': df_fa2['TOTFCST'] - (df_fa2['FCSTOVERRIDE'] + df_fa2['EXTERNALEVENTS']),
    }
)

# Stage 2: error columns (column order matches the original single-step layout)
df_fa2 = df_fa2.assign(
    **{
        'BASEFCST Abs Error': calculate_abs_error('BASEFCST', 'ORDERED_QTY_PUOM'),
        'BASEFCST+RECONCILEDFCST Abs Error': calculate_abs_error('BASEFCST+RECONCILEDFCST', 'ORDERED_QTY_PUOM'),
        'TOTFCST Abs Error': calculate_abs_error('TOTFCST', 'ORDERED_QTY_PUOM'),
        'TOTFCST w/o ATS Abs Error': calculate_abs_error('TOTFCST w/o ATS', 'ORDERED_QTY_PUOM'),
        'Ordered > TOTFCST (Over)': df_fa2['ORDERED_QTY_PUOM'].sub(df_fa2['TOTFCST']).clip(lower=0),
        'Ordered < TOTFCST (Under)': df_fa2['TOTFCST'].sub(df_fa2['ORDERED_QTY_PUOM']).clip(lower=0),
        'Ordered Absolute Error Delta': (df_fa2['TOTFCST'] - df_fa2['ORDERED_QTY_PUOM']).abs(),
        'Shipped > TOTFCST (Over)': df_fa2['SHIPPED_QTY_PUOM'].sub(df_fa2['TOTFCST']).clip(lower=0),
        'Shipped < TOTFCST (Under)': df_fa2['TOTFCST'].sub(df_fa2['SHIPPED_QTY_PUOM']).clip(lower=0),
        'Shipped Absolute Error Delta': (df_fa2['TOTFCST'] - df_fa2['SHIPPED_QTY_PUOM']).abs()
    }
)

# Set 'ATS Error' based on conditions, first match wins:
#   1. an ATS adjustment exists, TOTFCST is 0 and something was ordered
#        -> the whole ordered quantity
#   2. an ATS adjustment exists
#        -> TOTFCST abs error minus the abs error of the forecast without ATS
#   3. otherwise 0
# np.select gives condition 1 priority. Two sequential .loc assignments in this
# order would let condition 2 overwrite every row condition 1 had just set,
# because condition 1's rows are a subset of condition 2's.
has_ats_adjustment = df_fa2['TotalATSFcstAdd'].abs() > 0
zero_forecast_with_orders = (
    has_ats_adjustment
    & (df_fa2['TOTFCST'] == 0)
    & (df_fa2['ORDERED_QTY_PUOM'] > 0)
)
df_fa2['ATS Error'] = np.select(
    [zero_forecast_with_orders, has_ats_adjustment],
    [
        df_fa2['ORDERED_QTY_PUOM'],
        df_fa2['TOTFCST Abs Error'] - df_fa2['TOTFCST w/o ATS Abs Error'],
    ],
    default=0,
)

# Create currency columns: each quantity column times the unit cost, prefixed '$'
columns_to_multiply = [
    'BASEFCST', 'NONBASEFCST', 'TOTFCST', 'RECONCILEDFCST', 'FCSTOVERRIDE',
    'EXTERNALEVENTS', 'ORDERED_QTY_PUOM', 'SHIPPED_QTY_PUOM', 'BASEFCST+RECONCILEDFCST',
    'TotalATSFcstAdd', 'TOTFCST w/o ATS', 'BASEFCST Abs Error', 'BASEFCST+RECONCILEDFCST Abs Error',
    'TOTFCST Abs Error', 'TOTFCST w/o ATS Abs Error', 'ATS Error'
]
df_fa2 = df_fa2.assign(**{f'${col}': df_fa2[col] * df_fa2['Unit_Cost_PUOM'] for col in columns_to_multiply})

# Roll up per specifications.
# numeric_only=True keeps text columns such as ID_SKU out of the sum; without it
# recent pandas versions concatenate the strings of every row in a group.
df_fa2_rollup = (
    df_fa2.groupby(
        ['ID_DIVISION', 'VENDORNUM', 'Major HPIS Class - Description', 'Segment', 'LAG', 'STARTDATE'],
        as_index=False
    )
    .sum(numeric_only=True)
)

# Separate out LAGs
df_fa2_rollup_lag1 = df_fa2_rollup[df_fa2_rollup['LAG'] == 1]
df_fa2_rollup_lag4 = df_fa2_rollup[df_fa2_rollup['LAG'] == 4]

print("# of rows in the final rolled-up report: ", len(df_fa2_rollup_lag1))

# of rows in the final rolled up report:  440633


In [13]:
# Columns summed by every rollup below, declared once: the quantity measures
# first, followed by their '$'-prefixed currency counterparts.
_quantity_measures = [
    'BASEFCST',
    'NONBASEFCST',
    'TOTFCST',
    'RECONCILEDFCST',
    'FCSTOVERRIDE',
    'EXTERNALEVENTS',
    'ORDERED_QTY_PUOM',
    'SHIPPED_QTY_PUOM',
    'BASEFCST+RECONCILEDFCST',
    'TotalATSFcstAdd',
    'TOTFCST w/o ATS',
    'Ordered > TOTFCST (Over)',
    'Ordered < TOTFCST (Under)',
    'Ordered Absolute Error Delta',
    'Shipped > TOTFCST (Over)',
    'Shipped < TOTFCST (Under)',
    'Shipped Absolute Error Delta',
    'BASEFCST Abs Error',
    'BASEFCST+RECONCILEDFCST Abs Error',
    'TOTFCST Abs Error',
    'TOTFCST w/o ATS Abs Error',
    'ATS Error',
]
_currency_measures = [
    '$BASEFCST',
    '$NONBASEFCST',
    '$TOTFCST',
    '$RECONCILEDFCST',
    '$FCSTOVERRIDE',
    '$EXTERNALEVENTS',
    '$ORDERED_QTY_PUOM',
    '$SHIPPED_QTY_PUOM',
    '$BASEFCST+RECONCILEDFCST',
    '$TotalATSFcstAdd',
    '$TOTFCST w/o ATS',
    '$BASEFCST Abs Error',
    '$BASEFCST+RECONCILEDFCST Abs Error',
    '$TOTFCST Abs Error',
    '$TOTFCST w/o ATS Abs Error',
    '$ATS Error',
]
columns_to_aggregate = _quantity_measures + _currency_measures

# Keep only the rows whose LAG equals 1
df_fa2_lag1 = df_fa2.loc[df_fa2['LAG'] == 1]

# Sum a chosen set of columns within each group of the given key columns
def perform_rollup(dataframe, groupby_columns, aggregate_columns):
    """Group ``dataframe`` by ``groupby_columns`` and sum ``aggregate_columns``.

    Returns a new DataFrame with the group keys restored as ordinary columns,
    followed by the summed columns.
    """
    grouped = dataframe.groupby(groupby_columns)
    totals = grouped[aggregate_columns].sum()
    return totals.reset_index()

# DC-level totals: one row per division and STARTDATE
df_fa2_DCrollup_lag1 = perform_rollup(df_fa2_lag1, ['ID_DIVISION', 'STARTDATE'], columns_to_aggregate)

# Network-level totals: one row per STARTDATE across all divisions
df_fa2_Networkrollup_lag1 = perform_rollup(df_fa2_lag1, ['STARTDATE'], columns_to_aggregate)

# Report the size of each rollup
for rollup_label, rollup_frame in (
    ("# of rows in DC rollup:", df_fa2_DCrollup_lag1),
    ("# of rows in Network rollup:", df_fa2_Networkrollup_lag1),
):
    print(rollup_label, len(rollup_frame))

# of rows in DC rollup:  320
# of rows in Network rollup:  8


In [15]:
### Create dataframes that identify large discrepancies when ATS qtys come into play

# Work only with the rows whose LAG equals 1
df_fa2_lag1 = df_fa2.loc[df_fa2['LAG'].eq(1)]

# Helper function to get records with large discrepancies between Ordered and Shipped quantities
def filter_large_discrepancy(dataframe, min_ratio=4):
    """Return the rows where ordered quantity is at least ``min_ratio`` times shipped.

    Parameters
    ----------
    dataframe : DataFrame with 'ORDERED_QTY_PUOM' and 'SHIPPED_QTY_PUOM' columns.
    min_ratio : multiple of the shipped quantity that the ordered quantity must
        reach or exceed. Defaults to 4, the threshold previously hard-coded.

    Rows with a shipped quantity of 0 or less are excluded, as are rows where
    either quantity is missing (comparisons with NaN are False).
    """
    ordered = dataframe['ORDERED_QTY_PUOM']
    shipped = dataframe['SHIPPED_QTY_PUOM']
    return dataframe[(ordered >= shipped * min_ratio) & (shipped > 0)]

# Helper function to get top SKUs by Ordered Absolute Error Delta for each DC
def get_top_skus_by_error(dataframe, top_n=10):
    """Return every row of ``dataframe`` belonging to each division's top SKUs.

    SKUs are ranked within each 'ID_DIVISION' by their summed
    'Ordered Absolute Error Delta'; the ``top_n`` highest per division are kept
    and all of their original rows are returned, in the original row order.

    Parameters
    ----------
    dataframe : DataFrame with 'ID_DIVISION', 'ID_SKU' and
        'Ordered Absolute Error Delta' columns.
    top_n : number of SKUs to keep per division.
    """
    error_column = 'Ordered Absolute Error Delta'
    grouped = dataframe.groupby(['ID_DIVISION', 'ID_SKU'])[error_column].sum().reset_index()

    # Stable descending sort followed by head() per division. Unlike
    # groupby.apply(nlargest), this does not depend on the grouping column being
    # handed to the applied function, so 'ID_DIVISION' is always still present
    # for the merge below. The stable sort keeps tied SKUs in their original order.
    top_skus = (
        grouped.sort_values(error_column, ascending=False, kind='stable')
        .groupby('ID_DIVISION')
        .head(top_n)
    )

    # Merge back to get complete records for these top SKUs
    return pd.merge(dataframe, top_skus[['ID_DIVISION', 'ID_SKU']], on=['ID_DIVISION', 'ID_SKU'], how='inner')

# Helper function to roll up aggregated data
def perform_rollup(dataframe, groupby_columns, aggregate_columns=None):
    """Group ``dataframe`` by ``groupby_columns`` and sum the aggregate columns.

    Parameters
    ----------
    dataframe : DataFrame to aggregate.
    groupby_columns : column name or list of names to group by.
    aggregate_columns : columns to sum. When omitted, the module-level
        ``columns_to_aggregate`` list is used.

    The optional third argument keeps this definition compatible with callers
    that pass the column list explicitly as well as those that rely on the
    shared list.

    Returns a new DataFrame with the group keys as ordinary columns.
    """
    if aggregate_columns is None:
        aggregate_columns = columns_to_aggregate
    return dataframe.groupby(groupby_columns)[aggregate_columns].sum().reset_index()

# DC-SKU-LAG-STARTDATE rows flagged by the ordered-vs-shipped discrepancy filter
df_fa2_lag1_discrep = filter_large_discrepancy(df_fa2_lag1)

# Per DC, the 10 SKUs with the highest ordered error delta: collect their rows,
# roll them up by the keys below, then rank within each DC (largest delta first)
top10_rollup_keys = [
    'ID_DIVISION', 'ID_SKU', 'VENDORNUM', 'Major HPIS Class - Description',
    'Segment', 'LAG', 'STARTDATE',
]
df_fa2_lag1_top10 = perform_rollup(
    get_top_skus_by_error(df_fa2_lag1, top_n=10),
    top10_rollup_keys,
).sort_values(by=['ID_DIVISION', 'Ordered Absolute Error Delta'], ascending=[True, False])

# Per DC, the 100 ID_SKU - STARTDATE rows with the highest ATS Error,
# trimmed to the columns needed for the report
df_fa2_lag1_sorted = df_fa2_lag1.sort_values(by=['ID_DIVISION', 'ATS Error'], ascending=[True, False])
top100_columns = ['ID_DIVISION', 'ID_SKU', 'STARTDATE', 'ATS Error', '$ATS Error']
df_fa2_lag1_top100ATSerror = df_fa2_lag1_sorted.groupby('ID_DIVISION').head(100)[top100_columns]

# Display row counts
for count_label, counted_frame in (
    ("# of rows in large discrepancy records:", df_fa2_lag1_discrep),
    ("# of rows in top 10 SKUs by error:", df_fa2_lag1_top10),
    ("# of rows in top 100 ATS Error records per DC:", df_fa2_lag1_top100ATSerror),
):
    print(count_label, len(counted_frame))



### Create dataframe used to define ATS Error Logic

# Human-readable description of how 'ATS Error' is derived; exported as its own
# sheet. Condition 1 is worded to match the filter actually applied when the
# column is computed: TOTFCST equal to 0 and a positive ordered quantity.
ATSErrorlogic = pd.DataFrame({
    'Condition': ['Condition 1', 'Condition 2', 'Condition 3'],
    'Logic': [
        "IF ABS('TotalATSFcstAdd') > 0 AND 'TOTFCST' == 0 AND 'ORDERED_QTY_PUOM' > 0, THEN 'ATS Error' = 'ORDERED_QTY_PUOM'",
        "IF ABS('TotalATSFcstAdd') > 0, THEN 'ATS Error' = 'TOTFCST Abs Error' - 'TOTFCST w/o ATS Abs Error'",
        "ELSE 'ATS Error' = 0"
    ]
})

In [12]:
# List of dataframes and their respective sheet names, in workbook order
dataframes = [
    (df_fa2_Networkrollup_lag1, 'Network rollup'),
    (df_fa2_DCrollup_lag1, 'DC rollup'),
    (df_fa2_rollup_lag1, 'lag1 by Vendor-Segment'),
    (df_fa2_rollup_lag4, 'lag4 by Vendor-Segment'),
    (df_fa2_lag1_top100ATSerror, 'Top 100 DC-Item ATS Error'),
    (df_fa2_lag1_top10, 'Top 10 DC-Item Order deltas'),
    (df_fa2_lag1_discrep, 'Large Order v Ship Discrep'),
    (ATSErrorlogic, 'ATS Error Logic')
]


# Function to format each worksheet
def format_worksheet(writer, sheet_name, df):
    """Apply header styling, column widths, filters and a frozen header row.

    Parameters
    ----------
    writer : the ExcelWriter whose ``sheets`` mapping contains ``sheet_name``.
    sheet_name : name of the worksheet to format.
    df : the DataFrame written to that sheet; its column labels drive the
        header text, the column widths and which columns get currency format.

    Reads the module-level ``header_format`` and ``currency_format`` objects,
    which must have been created on the writer's workbook before this is called.
    """
    worksheet = writer.sheets[sheet_name]
    for col_num, value in enumerate(df.columns):
        worksheet.write(0, col_num, value, header_format)
        # Width (header length + 2) and number format are set in ONE call: a
        # second set_column(..., None, fmt) would replace the column settings and
        # drop the width back to the default for the '$' columns.
        column_format = currency_format if value.startswith('$') else None
        worksheet.set_column(col_num, col_num, len(value) + 2, column_format)

    worksheet.autofilter(0, 0, 0, len(df.columns) - 1)  # Add dropdown filters
    worksheet.freeze_panes(1, 0)  # Freeze the top row


# Create the Excel writer as a context manager so the file is finalised and the
# handle released even if writing or formatting one of the sheets fails
with pd.ExcelWriter(f'Outputs/Forecast Accuracy 8wks thru {lastSTARTDATE}.xlsx', engine='xlsxwriter') as Filewriter:
    # Write all dataframes to their respective sheets in a loop
    for df, sheet_name in dataframes:
        df.to_excel(Filewriter, sheet_name=sheet_name, index=False)

    # Get the xlsxwriter workbook object
    workbook = Filewriter.book

    # Define formats
    header_format = workbook.add_format({
        'bold': True,
        'text_wrap': True,
        'valign': 'center',
        'align': 'center'
    })
    currency_format = workbook.add_format({'num_format': '$#,##0.00'})

    # Apply formatting to all worksheets in a loop
    for df, sheet_name in dataframes:
        format_worksheet(Filewriter, sheet_name, df)