# Setup & Test Notebook

**Purpose**: Verify your environment is set up correctly for Hassett forecasting.

Run this notebook first to ensure everything works!

## 1. Import Required Packages

In [2]:
# Core data science
import numpy as np
import pandas as pd
import sqlite3
from pathlib import Path

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Time series
from statsmodels.tsa.seasonal import STL

# Machine learning
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Pandas rendering: no column cap, at most 100 rows, floats with two decimals.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.float_format', '{:.2f}'.format),
):
    pd.set_option(option_name, option_value)

# Shared look for the figures drawn later in the notebook.
sns.set_palette('husl') if False else plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# Reaching this line means every import above succeeded.
print("‚úÖ All packages imported successfully!")
version_report = [
    f"\nVersions:",
    f"  pandas: {pd.__version__}",
    f"  numpy: {np.__version__}",
]
print("\n".join(version_report))

‚úÖ All packages imported successfully!

Versions:
  pandas: 2.2.3
  numpy: 2.3.5


## 2. Set Up Project Paths

In [1]:
# Resolve the project layout relative to the notebook's working directory.
import sys
from pathlib import Path

# The project root is taken to be the parent of the current working directory.
# NOTE(review): assumes the kernel is started from a direct subdirectory of the
# project (e.g. a notebooks/ folder) - confirm for your setup.
project_root = Path.cwd().parent
src_path = project_root / 'src'

# Put src/ at the front of sys.path so project modules can be imported.
# Guarded so that re-running this cell does not add a duplicate entry each time.
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

# Define data paths
data_dir = project_root / 'data'
models_dir = project_root / 'models'
docs_dir = project_root / 'docs'

print("üìÅ Project Structure:")
print(f"  Root: {project_root}")
print(f"  Data: {data_dir}")
print(f"  Source: {src_path}")
print(f"  Models: {models_dir}")
print(f"\n‚úÖ Paths configured!")

üìÅ Project Structure:
  Root: /Users/frankgiles/Downloads/hassett-forecasting
  Data: /Users/frankgiles/Downloads/hassett-forecasting/data
  Source: /Users/frankgiles/Downloads/hassett-forecasting/src
  Models: /Users/frankgiles/Downloads/hassett-forecasting/models

‚úÖ Paths configured!


## 3. Connect to Azure Databricks

In [1]:
# Connect to the Azure Databricks SQL warehouse and run a one-row smoke test.
from databricks import sql

# Establish connection to Azure Databricks.
# No secret is embedded here; auth_type selects the connector's OAuth flow.
conn = sql.connect(
    server_hostname="adb-434028626745069.9.azuredatabricks.net",
    http_path="/sql/1.0/warehouses/23a9897d305fb7e2",
    auth_type="databricks-oauth"
)
print("‚úÖ Connection to Azure Databricks established successfully!")

# `conn` is deliberately left open: later cells pass it to pd.read_sql.
# The cursor is only needed for this check, so the context manager closes it
# as soon as the block exits instead of leaving it open for the whole session.
with conn.cursor() as cursor:
    # Test the connection with a simple query
    cursor.execute("SELECT 1 as test")
    result = cursor.fetchone()
    print(f"‚úÖ Query test passed: {result}")

    # List available tables (uncomment to run; must stay inside this
    # `with` block because the cursor is closed once the block ends)
    # cursor.execute("SHOW TABLES")
    # tables = cursor.fetchall()
    # print("\nüìä Available Tables:")
    # for table in tables:
    #     print(f"  - {table}")

[WARN] pyarrow is not installed by default since databricks-sql-connector 4.0.0,any arrow specific api (e.g. fetchmany_arrow) and cloud fetch will be disabled.If you need these features, please run pip install pyarrow or pip install databricks-sql-connector[pyarrow] to install


‚úÖ Connection to Azure Databricks established successfully!
‚úÖ Query test passed: Row(test=1)


## 4. Query Data from Databricks

In [None]:
# Query Hassett report table - Sample for testing
# First, check how many rows exist in the source table
count_query = "SELECT COUNT(*) as row_count FROM decus_domesticops_prod.dbo.tmp_hassett_report"
count_result = pd.read_sql(count_query, conn)
# Positional access avoids depending on the frame's index labels; int() gives a
# plain Python integer for the formatting and arithmetic below.
total_rows_in_db = int(count_result['row_count'].iloc[0])
print(f"Total rows in source table: {total_rows_in_db:,}")

# Fetch a sample for testing (10,000 rows).
# NOTE(review): LIMIT without ORDER BY does not promise which rows come back,
# so the sample may differ between runs - confirm that is acceptable here.
SAMPLE_SIZE = 10000
query = f"SELECT * FROM decus_domesticops_prod.dbo.tmp_hassett_report LIMIT {SAMPLE_SIZE}"
print(f"\nFetching {SAMPLE_SIZE:,} sample rows from Databricks...")
df_hassett = pd.read_sql(query, conn)
print(f"‚úÖ Loaded {len(df_hassett):,} rows (SAMPLE dataset)")
print(f"Columns ({len(df_hassett.columns)}): {list(df_hassett.columns)}")

# An empty source table would make the ratio meaningless (0 / 0), so report
# that case explicitly instead of printing a bogus percentage.
if total_rows_in_db > 0:
    print(f"üìä Sample represents {len(df_hassett)/total_rows_in_db*100:.1f}% of total data")
else:
    print("Source table is empty; sample percentage not available")

print(f"\nFirst 5 rows:")
df_hassett.head()

## 4. Test Tier Mapping Data

In [None]:
# Locate the ODC tier mapping CSV inside the data directory
tier_path = data_dir / 'odc_tier_mapping.csv'

if not tier_path.exists():
    # Nothing to load: tell the user where the file was expected.
    print(f"‚ö†Ô∏è  Tier mapping not found at: {tier_path}")
else:
    tiers = pd.read_csv(tier_path)
    print("‚úÖ Tier mapping loaded!\n")
    print("üìä ODC Tiers:")
    display(tiers)

    print("\nüìà Tier Summary:")
    # Per-tier rollup: number of ODC rows and the sum of their total_2024 values.
    per_tier = tiers.groupby('tier').agg({
        'ODC': 'count',
        'total_2024': 'sum'
    })
    tier_summary = per_tier.rename(columns={'ODC': 'count'})
    print(tier_summary)

## 5. Quick Forecasting Test

In [None]:
# Simple 2024 baseline forecast test against the local SQLite copy of the data.
from contextlib import closing

# `db_path` is not defined by any cell visible in this notebook, so define it
# here. The file name matches the "Copy hassett.db to <data_dir>/" hint printed
# by the environment summary.
db_path = data_dir / 'hassett.db'

if db_path.exists():
    # Get Week 50 from 2024 as baseline
    query = """
    SELECT 
        ODC,
        ProductType,
        SUM(PIECES) as total_pieces
    FROM hassett_report
    WHERE ProductType IN ('MAX', 'EXP')
        AND strftime('%Y', DATE_SHIP) = '2024'
        AND strftime('%W', DATE_SHIP) = '50'
    GROUP BY ODC, ProductType
    ORDER BY total_pieces DESC
    LIMIT 10
    """

    # A dedicated name keeps the notebook-wide Databricks `conn` intact (it was
    # previously overwritten and then closed here). closing() guarantees the
    # SQLite handle is released even if the query raises.
    with closing(sqlite3.connect(db_path)) as sqlite_conn:
        baseline = pd.read_sql(query, sqlite_conn)

    print("üìä Top 10 ODC-Product Combinations (2024 Week 50 Baseline):\n")
    display(baseline)

    # Visualize: one bar group per ODC, one bar per product type
    fig, ax = plt.subplots(figsize=(10, 6))
    baseline_pivot = baseline.pivot(index='ODC', columns='ProductType', values='total_pieces')
    baseline_pivot.plot(kind='bar', ax=ax, width=0.8)
    ax.set_ylabel('Pieces (Week 50, 2024)')
    ax.set_title('Top ODCs by Product Type - Week 50 Baseline')
    ax.legend(title='Product')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    print("\n‚úÖ Forecasting test complete!")
else:
    # Make the skip visible instead of finishing silently.
    print(f"SQLite database not found at: {db_path} - skipping forecasting test")

## 6. Environment Summary

In [None]:
# Environment summary for the local-file setup (SQLite database + tier CSV).
print("="*60)
print("ENVIRONMENT SUMMARY")
print("="*60)

# `db_path` is not defined by any cell visible in this notebook; the file name
# comes from the remediation hint printed at the bottom of this cell.
db_path = data_dir / 'hassett.db'

# Probe the filesystem once so the status list and the remediation hints
# below are guaranteed to agree with each other.
db_found = db_path.exists()
tier_found = tier_path.exists()

checks = [
    ("Python packages", True),   # reaching this cell implies the imports ran
    ("Project paths", True),
    ("Database connection", db_found),
    ("Tier mapping", tier_found),
]

print("\n‚úÖ Status Check:")
for check, status in checks:
    symbol = "‚úÖ" if status else "‚ùå"
    print(f"  {symbol} {check}")

all_good = all(status for _, status in checks)

if all_good:
    print("\n" + "="*60)
    print("üéâ ALL CHECKS PASSED! You're ready to start forecasting!")
    print("="*60)
    print("\nNext steps:")
    print("  1. Open 01_quick_forecast.ipynb for a forecasting demo")
    print("  2. Open 02_data_exploration.ipynb to explore the data")
    print("  3. Review docs/META_ANALYSIS_100_EXPERIMENTS.md")
else:
    print("\n" + "="*60)
    print("‚ö†Ô∏è  SOME CHECKS FAILED")
    print("="*60)
    print("\nPlease:")
    # Only list the remediation steps that actually apply.
    if not db_found:
        print(f"  - Copy hassett.db to {data_dir}/")
    if not tier_found:
        print(f"  - Copy odc_tier_mapping.csv to {data_dir}/")

In [None]:
# Connect to the Azure Databricks SQL warehouse and run a one-row smoke test.
from databricks import sql
import pandas as pd

# Establish connection to Azure Databricks
conn = sql.connect(
    server_hostname="adb-434028626745069.9.azuredatabricks.net",
    http_path="/sql/1.0/warehouses/23a9897d305fb7e2",
    auth_type="databricks-oauth"
)
print("‚úÖ Connection to Azure Databricks established successfully!")

# Test the connection with a simple query.
# The cursor is only needed for this check, so the context manager closes it
# on exit; `conn` itself stays open for the pd.read_sql calls in later cells.
with conn.cursor() as cursor:
    cursor.execute("SELECT 1 as test")
    result = cursor.fetchone()
print(f"‚úÖ Query test passed: {result}")


In [4]:
# Open the Databricks connection used by the rest of the notebook and verify
# it with a trivial query.
from databricks import sql
import pandas as pd

# Establish connection to Azure Databricks
conn = sql.connect(
    server_hostname="adb-434028626745069.9.azuredatabricks.net",
    http_path="/sql/1.0/warehouses/23a9897d305fb7e2",
    auth_type="databricks-oauth"
)
print("‚úÖ Connection to Azure Databricks established successfully!")

# Test the connection with a simple query.
# Scope the cursor to this check so it is closed when the block exits rather
# than lingering for the lifetime of the kernel; `conn` is kept open on purpose.
with conn.cursor() as cursor:
    cursor.execute("SELECT 1 as test")
    result = cursor.fetchone()
print(f"‚úÖ Query test passed: {result}")


‚úÖ Connection to Azure Databricks established successfully!
‚úÖ Query test passed: Row(test=1)


In [5]:
# Cell 2: Core data science
import numpy as np
import pandas as pd
import sqlite3
from pathlib import Path

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Time series
from statsmodels.tsa.seasonal import STL

# Machine learning
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Pandas display settings, applied from a single table of option -> value.
display_options = {
    'display.max_columns': None,                # never truncate columns
    'display.max_rows': 100,
    'display.float_format': '{:.2f}'.format,    # two decimals for floats
}
for option_key, option_setting in display_options.items():
    pd.set_option(option_key, option_setting)

# Figure styling shared by every plot in the notebook.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# Getting this far means none of the imports above failed.
print("‚úÖ All packages imported successfully!")
print(f"\nVersions:")
pandas_version, numpy_version = pd.__version__, np.__version__
print(f"  pandas: {pd.__version__}")
print(f"  numpy: {np.__version__}")

‚úÖ All packages imported successfully!

Versions:
  pandas: 2.2.3
  numpy: 2.3.5


In [6]:
# Cell 4: Get project root directory and derive the standard sub-folders.
import sys
from pathlib import Path

# The project root is taken to be the parent of the current working directory.
# NOTE(review): assumes the kernel runs from a direct subdirectory of the
# project - confirm for your setup.
project_root = Path.cwd().parent
src_path = project_root / 'src'

# Add src to path for imports. The membership test keeps the cell idempotent:
# running it again no longer prepends another copy of the same entry.
src_entry = str(src_path)
if src_entry not in sys.path:
    sys.path.insert(0, src_entry)

# Define data paths
data_dir = project_root / 'data'
models_dir = project_root / 'models'
docs_dir = project_root / 'docs'

print("üìÅ Project Structure:")
print(f"  Root: {project_root}")
print(f"  Data: {data_dir}")
print(f"  Source: {src_path}")
print(f"  Models: {models_dir}")
print(f"\n‚úÖ Paths configured!")

üìÅ Project Structure:
  Root: /Users/frankgiles/Downloads/hassett-forecasting
  Data: /Users/frankgiles/Downloads/hassett-forecasting/data
  Source: /Users/frankgiles/Downloads/hassett-forecasting/src
  Models: /Users/frankgiles/Downloads/hassett-forecasting/models

‚úÖ Paths configured!


In [7]:
# Cell 6: Connect to Azure Databricks and run a one-row smoke test.
# The import had been folded into the comment line above, which left this cell
# dependent on `sql` surviving from an earlier cell; it is a real statement again.
from databricks import sql

# Establish connection to Azure Databricks
conn = sql.connect(
    server_hostname="adb-434028626745069.9.azuredatabricks.net",
    http_path="/sql/1.0/warehouses/23a9897d305fb7e2",
    auth_type="databricks-oauth"
)
print("‚úÖ Connection to Azure Databricks established successfully!")

# Test the connection with a simple query.
# The context manager closes the cursor once the check is done; `conn` stays
# open because the following cells hand it to pd.read_sql.
with conn.cursor() as cursor:
    cursor.execute("SELECT 1 as test")
    result = cursor.fetchone()
print(f"‚úÖ Query test passed: {result}")

‚úÖ Connection to Azure Databricks established successfully!
‚úÖ Query test passed: Row(test=1)


In [9]:
# Cell 8: Pull the entire Hassett report table (no LIMIT) and verify that the
# number of rows received matches the server-side count.

# Step 1: ask the warehouse how many rows the source table holds.
count_query = "SELECT COUNT(*) as row_count FROM decus_domesticops_prod.dbo.tmp_hassett_report"
count_result = pd.read_sql(count_query, conn)
row_count_column = count_result['row_count']
total_rows_in_db = row_count_column[0]
print(f"Total rows in source table: {total_rows_in_db:,}")

# Step 2: download every row and every column.
query = "SELECT * FROM decus_domesticops_prod.dbo.tmp_hassett_report"
print(f"\nFetching ALL rows from Databricks...")
df_hassett = pd.read_sql(query, conn)
print(f"‚úÖ Loaded {len(df_hassett):,} rows (COMPLETE dataset)")
print(f"Columns ({len(df_hassett.columns)}): {list(df_hassett.columns)}")

# Step 3: compare what arrived with what the count query reported.
if len(df_hassett) == total_rows_in_db:
    print(f"\n‚úÖ Confirmed: All {total_rows_in_db:,} rows loaded successfully")
else:
    print(f"\n‚ö†Ô∏è  WARNING: Loaded {len(df_hassett):,} rows but source has {total_rows_in_db:,} rows!")

print(f"\nFirst 5 rows:")
df_hassett.head()

  count_result = pd.read_sql(count_query, conn)


Total rows in source table: 360,296

Fetching ALL rows from Databricks...


  df_hassett = pd.read_sql(query, conn)


KeyboardInterrupt: 

In [10]:
# Query Hassett report table - Sample for testing
# First, check how many rows exist in the source table
count_query = "SELECT COUNT(*) as row_count FROM decus_domesticops_prod.dbo.tmp_hassett_report"
count_result = pd.read_sql(count_query, conn)
# .iloc[0] reads the first row by position rather than by index label, and
# int() yields a plain Python integer for the formatting and division below.
total_rows_in_db = int(count_result['row_count'].iloc[0])
print(f"Total rows in source table: {total_rows_in_db:,}")

# Fetch a sample for testing (10,000 rows).
# NOTE(review): without an ORDER BY the rows returned by LIMIT are not
# guaranteed to be the same on every run - confirm that is acceptable.
SAMPLE_SIZE = 10000
query = f"SELECT * FROM decus_domesticops_prod.dbo.tmp_hassett_report LIMIT {SAMPLE_SIZE}"
print(f"\nFetching {SAMPLE_SIZE:,} sample rows from Databricks...")
df_hassett = pd.read_sql(query, conn)
print(f"‚úÖ Loaded {len(df_hassett):,} rows (SAMPLE dataset)")
print(f"Columns ({len(df_hassett.columns)}): {list(df_hassett.columns)}")

# Guard the ratio: with an empty source table it would be 0 / 0.
if total_rows_in_db > 0:
    print(f"üìä Sample represents {len(df_hassett)/total_rows_in_db*100:.1f}% of total data")
else:
    print("Source table is empty; sample percentage not available")

print(f"\nFirst 5 rows:")
df_hassett.head()

  count_result = pd.read_sql(count_query, conn)


Total rows in source table: 360,296

Fetching 10,000 sample rows from Databricks...


  df_hassett = pd.read_sql(query, conn)


‚úÖ Loaded 10,000 rows (SAMPLE dataset)
Columns (128): ['HASSETT', 'FACILITY', 'SHIPPER_NAME', 'SHIPPER_ADR_1', 'SHIPPER_ADR_2', 'SHIPPER_CITY', 'SHIPPER_STATE', 'SHIPPER_ZIP', 'SHIPPER_PHONE', 'SHIPPER_CONTACT', 'SHIPPER_REF_NO', 'SHIPPER', 'CONSIGNEE_NAME', 'CONSIGNEE_ADDRESS_1', 'CONSIGNEE_ADDRESS_2', 'CONSIGNEE_CITY', 'CONSIGNEE_STATE', 'CONSIGNEE_ZIP', 'CONSIGNEE_PHONE', 'CONSIGNEE_CONTACT', 'CONSIGNEE_REF_NO', 'CONSIGNEE', 'BILL_TO_NAME', 'BILL_TO_ADR_1', 'BILL_TO_ADR_2', 'BILL_TO_CITY', 'BILL_TO_STATE', 'BILL_TO_ZIP', 'BILL_TO_PHONE', 'BILL_TO_CONTACT', 'BILL_TO', 'DESCRIPTION_1', 'DESCRIPTION_2', 'DESCRIPTION_3', 'DESCRIPTION_4', 'SPECIAL_INSTRUCTIONS_1', 'SPECIAL_INSTRUCTIONS_2', 'SPECIAL_INSTRUCTIONS_3', 'RECEIVED_BY_POD', 'ORIGIN', 'DESTIN', 'CARRIER', 'FLIGHT', 'INVOICE', 'SERVICE_1', 'SERVICE_2', 'SERVICE_3', 'SERVICE_4', 'PIECES', 'WEIGHT', 'CUBIC_WEIGHT', 'COPY_COUNT', 'TOTAL_COST', 'AIR_COST', 'PICK_UP_COST', 'DELIVERY_COST', 'EXCESS_VALUE_COST', 'INSURANCE_COST', 'ADV_

Unnamed: 0,HASSETT,FACILITY,SHIPPER_NAME,SHIPPER_ADR_1,SHIPPER_ADR_2,SHIPPER_CITY,SHIPPER_STATE,SHIPPER_ZIP,SHIPPER_PHONE,SHIPPER_CONTACT,SHIPPER_REF_NO,SHIPPER,CONSIGNEE_NAME,CONSIGNEE_ADDRESS_1,CONSIGNEE_ADDRESS_2,CONSIGNEE_CITY,CONSIGNEE_STATE,CONSIGNEE_ZIP,CONSIGNEE_PHONE,CONSIGNEE_CONTACT,CONSIGNEE_REF_NO,CONSIGNEE,BILL_TO_NAME,BILL_TO_ADR_1,BILL_TO_ADR_2,BILL_TO_CITY,BILL_TO_STATE,BILL_TO_ZIP,BILL_TO_PHONE,BILL_TO_CONTACT,BILL_TO,DESCRIPTION_1,DESCRIPTION_2,DESCRIPTION_3,DESCRIPTION_4,SPECIAL_INSTRUCTIONS_1,SPECIAL_INSTRUCTIONS_2,SPECIAL_INSTRUCTIONS_3,RECEIVED_BY_POD,ORIGIN,DESTIN,CARRIER,FLIGHT,INVOICE,SERVICE_1,SERVICE_2,SERVICE_3,SERVICE_4,PIECES,WEIGHT,CUBIC_WEIGHT,COPY_COUNT,TOTAL_COST,AIR_COST,PICK_UP_COST,DELIVERY_COST,EXCESS_VALUE_COST,INSURANCE_COST,ADV_ORIGIN_COST,ADV_DESTIN_COST,OTHER_COST,FREIGHT_COD_COST,MECHANDISE_COD_COST,SURCHARGE_AIR_COST,SURCHARGE_CARTAGE_COST,SURCHARGE_SECURITY_COST,TOTAL2_COST,DATE_ISSUE,DATE_SHIP,INVOICE_DATE,DATE_RECEIVED,DEPARTURE,ARRIVAL,TIME_RECEIVED,owner,fileName,LoadDate,LoadDatetime,DW_INSERT_DATE,svc1,svc2,svc3,svc4,BillTo,ShipperName,ConsigneeName,ConsigneeCity,WeightBand,HAXServiceLevel,ProductType,ShipDay,ReceiveDay,WeekEnding,ODC,DDC,TransitDays,AdjTransitDays,InvoiceLagDays,NightFlag,WeekendFlag,CrossZoneFlag,HighValueFlag,Avg6WeekCost,StdDev6WeekCost,Avg6WeekWeight,StdDev6WeekWeight,Avg6WeekTransitDays,Pctl95InvoiceLag,Avg6WeekInvoiceLag,Avg6WeekPieces,Avg6WeekNightRate,Avg6WeekWeekendRate,TruckType,TruckFlow,ExpectedTransitDays,TransitVariance,OpsPortal_DN,OTM_Container,OTM_TU_Count,TransitException,NoPOD_Issue,NoPOD_Code,NoPOD_ID,CostAnomalyFlag,CostPctDiff,TagType,TagType_ID,ExpectedDeliveryDate
0,52689565,5,DHL GLOBAL MAIL OF LOS ANGELES,3963 WORKMAN MILL ROAD,UNIT A,WHITTIER,CA,90601,562-760-4724,MARK STEVE VALDEZ,,41430,DHL GLOBAL MAIL OF ATLANTA,1370 DISCOVERY INDUSTRIAL,COURT SE,MABLETON,GA,30126,901-647-8877,Lakiesha Long,1.0,,DHL GLOBAL MAIL OF ATLANTA,PO BOX 189103,,PLANTATION,FL,33318,678/363-3390,,41356.0,,09/02/25 ISSUE,,,,,,,LAX,ATL,006,,,,SDS,,,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025-09-02,2025-09-02,,,,,,frank.giles@dhl.com,Hassett_Raw_Data_2025352025-09-02T12:04:30.342...,2025-09-02,2025-09-02 12:14:57.123,2025-12-12 18:25:18.828,,SDS,,,41356,DHL GLOBAL MAIL OF LOS ANGELES,DHL GLOBAL MAIL OF ATLANTA,MABLETON,0-10,SDS,EXP,Tuesday,,2025-09-06,LAX,ATL,,,0,0,0,1,0,,,,,,0.0,0.0,49.5,0.0,0.0,,,0.0,,,,,Delayed,,,,,,CTL,150.0,2025-09-02
1,52675822,5,HASSETT EXPRESS LLC/COMAT,5214 W. 104TH STREET,,LOS ANGELES,CA,90045,310-645-4515,MEHDY NAITAKI,,25111,DHL GLOBAL MAIL/SAN FRANCISCO,30041 AHERN AVE,,UNION CITY,CA,94587,925-890-0212,Charles Vick,,41359.0,HASSETT EXPRESS LLC/COMAT,5214 W. 104TH STREET,,LOS ANGELES,CA,90045,310-645-4515,MEHDY NAITAKI,25111.0,15 PALLETS OF BOXES,,,,WEEKLY DELIVERY WITH PRE-SCHEDULED,TRUCK,,Manuel,SFO,SFO,,,52253204.0,GND,,,,15,5250,0,0,0.0,0.0,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,2025-09-02,,2025-09-02,,,14:00,frank.giles@dhl.com,Hassett_Raw_Data_2025392025-10-02T21:04:43.358...,2025-10-02,2025-10-02 21:09:58.782,2025-12-12 18:25:18.828,GND,,,,25111,HASSETT EXPRESS LLC/COMAT,DHL GLOBAL MAIL/SAN FRANCISCO,UNION CITY,100+,GND,LOCAL,Tuesday,Tuesday,2025-09-06,SFO,SFO,0.0,0.0,0,0,0,0,0,376.34,423.09,4370.77,1365.82,0.0,0.0,0.0,3.27,0.0,0.1,,DEL,,,,,,On Time,,,,Normal,-100.0,,,
2,52691223,5,DHL AVIATION CARGO-SFO,944 NORTH FIELD ROAD,,SAN FRANCISCO,CA,94128,800-225-5345,,,41429,DHL eCOMMERCE,30041 AHERN AVE,,UNION CITY,CA,94587,510-491-2183,,,,"DHL GLOBAL MAIL ""MAX""",P.O BOX 189103,,PLANTATION,FL,33318,678-363-3390,ACCOUNTS PAYABLE,41406.0,AVIATION,09/02/25 ISSUE,DHL AVIATION AWB# 99220183240;,99220183225.0,PICKUP FROM DHL AVIATION AT SFO AIRPORT,AND RUN DIRECTLY TO DHL eCOMMERCE IN,UNION CITY; CA.,MANUEL V,SFO,SFO,ACT2,,52253220.0,BIL,GND,,,2,3962,0,0,815.4,0.0,0.0,755.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.4,0.0,815.4,2025-09-02,2025-09-02,,2025-09-02,,,14:00,frank.giles@dhl.com,Hassett_Raw_Data_2025392025-10-02T21:04:43.358...,2025-10-02,2025-10-02 21:09:58.782,2025-12-12 18:25:18.828,BIL,GND,,,41406,DHL AVIATION CARGO-SFO,DHL eCOMMERCE,UNION CITY,100+,GND,LOCAL,Tuesday,Tuesday,2025-09-06,SFO,SFO,0.0,0.0,0,0,0,0,0,376.34,423.09,4370.77,1365.82,0.0,0.0,0.0,3.27,0.0,0.1,ROLLERBED,P/U,,,,,,On Time,,,,Normal,116.7,,,
3,52692034,3,DHL GLOBAL MAIL OF CHARLOTTE,8475 AUTOMATION DRIVE STE200,,CONCORD,NC,28027,919-434-4943,TODD HAYES,,41433,DHL GLOBAL MAIL OF CHARLOTTE,8475 AUTOMATION DRIVE,SUITE 200,CONCORD,NC,28027,,JEFF OBRIEN,1.0,,DHL GLOBAL MAIL OF ATLANTA,PO BOX 189103,,PLANTATION,FL,33318,678/363-3390,,41356.0,DHLGM,09/02/25 ISSUE,,,DELIVERY BILL,,ST,BILLING ONLY,CLT,CLT,,,52253219.0,BIL,GND,,,1,1,0,0,448.2,0.0,0.0,415.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33.2,0.0,448.2,2025-09-02,2025-09-02,,2025-09-02,,,23:00,frank.giles@dhl.com,Hassett_Raw_Data_2025392025-10-02T21:04:43.358...,2025-10-02,2025-10-02 21:09:58.782,2025-12-12 18:25:18.828,BIL,GND,,,41356,DHL GLOBAL MAIL OF CHARLOTTE,DHL GLOBAL MAIL OF CHARLOTTE,CONCORD,0-10,GND,LOCAL,Tuesday,Tuesday,2025-09-06,CLT,CLT,0.0,0.0,0,0,0,0,0,607.5,166.38,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.17,,DEL,,,,,,On Time,,,,Normal,-26.2,,,
4,52688414,3,DHL GLOBAL MAIL OF CINCINNATI,2300 AIRPORT NORTH DRIVE,,HEBRON,KY,41048,859-640-5166,GILBERT FLEEK,,41360,DHL GLOBAL MAIL OF CINCINNATI,2300 AIRPORT NORTH DRIVE,,HEBRON,KY,41048,,,2.0,,DHL GLOBAL MAIL OF ATLANTA,PO BOX 189103,,PLANTATION,FL,33318,678/363-3390,,41356.0,,08/30/25 ISSUE,,,DELIVERY BILL ST,,,BILLING ONLY,CVG,CVG,,,52253201.0,BIL,GND,,,1,1,0,0,394.2,0.0,0.0,365.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29.2,0.0,394.2,2025-08-31,2025-08-31,,2025-08-31,,,17:45,frank.giles@dhl.com,Hassett_Raw_Data_2025392025-09-30T21:03:46.298...,2025-09-30,2025-09-30 21:10:17.176,2025-12-12 18:25:18.828,BIL,GND,,,41356,DHL GLOBAL MAIL OF CINCINNATI,DHL GLOBAL MAIL OF CINCINNATI,HEBRON,0-10,GND,LOCAL,Sunday,Sunday,2025-09-06,CVG,CVG,0.0,-1.0,0,0,1,0,0,394.2,0.0,1.0,0.0,-1.0,0.0,0.0,1.0,0.0,0.19,,DEL,,,,,,On Time,,,,Normal,0.0,,,


In [11]:
# Cell 10: Load the ODC tier mapping CSV (if present) and summarise it by tier
tier_path = data_dir / 'odc_tier_mapping.csv'
mapping_found = tier_path.exists()

if mapping_found:
    tiers = pd.read_csv(tier_path)
    print("‚úÖ Tier mapping loaded!\n")
    print("üìä ODC Tiers:")
    display(tiers)

    print("\nüìà Tier Summary:")
    # Count the ODC rows and add up total_2024 within each tier, then label
    # the count column accordingly.
    grouped_by_tier = tiers.groupby('tier')
    tier_rollup = grouped_by_tier.agg({
        'ODC': 'count',
        'total_2024': 'sum'
    })
    print(tier_rollup.rename(columns={'ODC': 'count'}))
else:
    # File missing: report the path that was checked.
    print(f"‚ö†Ô∏è  Tier mapping not found at: {tier_path}")

‚úÖ Tier mapping loaded!

üìä ODC Tiers:


Unnamed: 0,ODC,tier,total_2024
0,LAX,Large,119209.0
1,EWR,Large,94554.0
2,IAD,Large,69074.0
3,SLC,Large,63409.0
4,ATL,Medium,38689.0
5,DFW,Medium,38280.0
6,PHX,Medium,31310.0
7,CVG,Medium,27612.0
8,IAH,Medium,26011.0
9,SEA,Medium,23617.0



üìà Tier Summary:
        count  total_2024
tier                     
Large       4   346246.00
Medium      7   206060.00
Small       9    94757.00


In [12]:
# Cell 12: Quick forecasting test on the already-loaded sample (df_hassett);
# no additional query is issued.

# Keep MAX/EXP rows that have a ship date.
is_target_product = df_hassett['ProductType'].isin(['MAX', 'EXP'])
has_ship_date = df_hassett['DATE_SHIP'].notna()
sample_forecast = df_hassett[is_target_product & has_ship_date].copy()

if len(sample_forecast) == 0:
    print("‚ö†Ô∏è  No MAX/EXP products found in sample data")
else:
    print(f"üìä Sample Forecast Data ({len(sample_forecast)} rows):\n")

    # Sum pieces and weight per (ODC, ProductType) pair ...
    totals_by_pair = sample_forecast.groupby(['ODC', 'ProductType']).agg({
        'PIECES': 'sum',
        'WEIGHT': 'sum'
    })
    # ... then keep the ten pairs with the most pieces.
    ranked_pairs = totals_by_pair.reset_index().sort_values('PIECES', ascending=False)
    baseline = ranked_pairs.head(10)

    display(baseline)

    print("\n‚úÖ Forecasting test complete!")

üìä Sample Forecast Data (7900 rows):



Unnamed: 0,ODC,ProductType,PIECES,WEIGHT
18,LAX,MAX,10448,346872
32,SLC,MAX,6981,186087
11,EWR,EXP,6714,245953
29,SFO,EXP,6650,262334
14,IAD,MAX,5808,191553
31,SLC,EXP,5108,191279
10,DFW,MAX,4891,194981
12,EWR,MAX,4659,162615
6,CVG,MAX,4452,165914
25,PHX,EXP,4120,135512



‚úÖ Forecasting test complete!


In [13]:
# Cell 14: Environment Summary
print("="*60)
print("ENVIRONMENT SUMMARY")
print("="*60)

# Look the notebook variables up by name so that a cell which was skipped or
# failed shows up as a failed check below instead of aborting this cell with
# a NameError.
_namespace = globals()
_connection = _namespace.get('conn')
_hassett_df = _namespace.get('df_hassett')
_tier_file = _namespace.get('tier_path')

# Update checks for Databricks setup
checks = [
    ("Python packages", True),   # reaching this cell implies the imports ran
    ("Project paths", True),
    ("Databricks connection", _connection is not None),
    ("Hassett data loaded", _hassett_df is not None and len(_hassett_df) > 0),
    ("Tier mapping", _tier_file is not None and _tier_file.exists()),
]

print("\n‚úÖ Status Check:")
for check, status in checks:
    symbol = "‚úÖ" if status else "‚ùå"
    print(f"  {symbol} {check}")

all_good = all(status for _, status in checks)

if all_good:
    # Report the figures measured in this session rather than hard-coded
    # numbers, which go stale as soon as the table or the mapping changes.
    _total_rows = _namespace.get('total_rows_in_db')
    _tier_table = _namespace.get('tiers')
    _total_rows_text = f"{_total_rows:,}" if _total_rows is not None else "unknown"
    _tier_count_text = str(len(_tier_table)) if _tier_table is not None else "unknown"

    print("\n" + "="*60)
    print("üéâ ALL CHECKS PASSED! You're ready to start forecasting!")
    print("="*60)
    print("\nNext steps:")
    print("  1. Explore the loaded Hassett data (df_hassett)")
    print("  2. Review docs/META_ANALYSIS_100_EXPERIMENTS.md")
    print("  3. Build forecasting models using the sample data")
    print(f"\nData Summary:")
    print(f"  ‚Ä¢ Sample size: {len(_hassett_df):,} rows")
    print(f"  ‚Ä¢ Total in DB: {_total_rows_text} rows")
    print(f"  ‚Ä¢ Columns: {len(_hassett_df.columns)}")
    print(f"  ‚Ä¢ ODC tiers: {_tier_count_text} locations")
else:
    print("\n" + "="*60)
    print("‚ö†Ô∏è  SOME CHECKS FAILED")
    print("="*60)

ENVIRONMENT SUMMARY

‚úÖ Status Check:
  ‚úÖ Python packages
  ‚úÖ Project paths
  ‚úÖ Databricks connection
  ‚úÖ Hassett data loaded
  ‚úÖ Tier mapping

üéâ ALL CHECKS PASSED! You're ready to start forecasting!

Next steps:
  1. Explore the loaded Hassett data (df_hassett)
  2. Review docs/META_ANALYSIS_100_EXPERIMENTS.md
  3. Build forecasting models using the sample data

Data Summary:
  ‚Ä¢ Sample size: 10,000 rows
  ‚Ä¢ Total in DB: 360,296 rows
  ‚Ä¢ Columns: 128
  ‚Ä¢ ODC tiers: 20 locations
