In [1]:
import os
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import pickle

PROJECT_ROOT = r'D:\Supply-Chain-Efficiency-Analytics'  
DATA_DIR = os.path.join(PROJECT_ROOT, 'data', 'raw')

products = pd.read_csv(os.path.join(DATA_DIR, 'products.csv'))
suppliers = pd.read_csv(os.path.join(DATA_DIR, 'suppliers.csv'))
purchase_orders = pd.read_csv(os.path.join(DATA_DIR, 'purchase_orders.csv'))
inbound_records = pd.read_csv(os.path.join(DATA_DIR, 'inbound_records.csv'))
inventory = pd.read_csv(os.path.join(DATA_DIR, 'inventory.csv'))
sales_orders = pd.read_csv(os.path.join(DATA_DIR, 'sales_orders.csv'))

for df, name in zip(
    [products, suppliers, purchase_orders, inbound_records, inventory, sales_orders],
    ['products','suppliers','purchase_orders','inbound_records','inventory','sales_orders']
):
    print(f"{name} info:")
    print(df.info())
    print(df.isna().sum())
    print('-'*40)

# -----------------------------
# 1. 基础指标计算
# -----------------------------

# 库存告急产品
inventory_merged = inventory.merge(
    products[['product_id','product_name','safety_stock']],
    on='product_id'
)
inventory_merged['is_stock_alert'] = inventory_merged['quantity'] < inventory_merged['safety_stock']
stock_alert_products = inventory_merged[inventory_merged['is_stock_alert']]

# 供应商交货及时率
po_with_inbound = purchase_orders.merge(
    inbound_records.groupby('purchase_order_id')['received_date'].max().reset_index(),
    on='purchase_order_id', how='left'
)
po_with_inbound['purchase_order_date'] = pd.to_datetime(po_with_inbound['purchase_order_date'])
po_with_inbound['received_date'] = pd.to_datetime(po_with_inbound['received_date'])
po_with_inbound['delivery_days'] = (po_with_inbound['received_date'] - po_with_inbound['purchase_order_date']).dt.days
po_with_inbound['on_time'] = po_with_inbound['delivery_days'] <= po_with_inbound['lead_time_days']
supplier_delivery_rate = po_with_inbound.groupby('supplier_id')['on_time'].mean().reset_index()
supplier_delivery_rate = supplier_delivery_rate.merge(
    suppliers[['supplier_id','supplier_name']],
    on='supplier_id'
)

# 销售趋势
sales_orders['sales_order_date'] = pd.to_datetime(sales_orders['sales_order_date'])
sales_trend = sales_orders.groupby(pd.Grouper(key='sales_order_date', freq='M'))['quantity_sold'].sum().reset_index()

# -----------------------------
# 2. 可视化（Plotly）
# -----------------------------

# 销售趋势图
fig_sales_trend = px.line(
    sales_trend,
    x='sales_order_date',
    y='quantity_sold',
    title='月度销售趋势'
)

# 库存告急产品
fig_stock_alert = px.bar(
    stock_alert_products,
    x='product_name',
    y='quantity',
    title='库存告急产品'
)

# 供应商交货及时率
fig_supplier_performance = px.bar(
    supplier_delivery_rate,
    x='supplier_name',
    y='on_time',
    title='供应商交货及时率'
)

# -----------------------------
# 3. SKU 积压/畅销热力图（优化版）
# -----------------------------
# 月度销售聚合
monthly_sales = sales_orders.groupby(
    ['product_id', pd.Grouper(key='sales_order_date', freq='M')]
)['quantity_sold'].sum().reset_index()

# 当前库存
inventory_current = inventory[['product_id', 'quantity']]

# 合并计算库存天数
heatmap_df = monthly_sales.merge(inventory_current, on='product_id', how='left')
heatmap_df['daily_avg_sales'] = heatmap_df['quantity_sold'] / 30  # 近似30天
heatmap_df['inventory_days'] = heatmap_df['quantity'] / heatmap_df['daily_avg_sales']
heatmap_df['inventory_days'] = heatmap_df['inventory_days'].replace([np.inf, np.nan], 0)

# pivot_table 聚合
sku_heatmap = heatmap_df.pivot_table(
    index='sales_order_date',
    columns='product_id',
    values='inventory_days',
    aggfunc='mean'
)

# 取销量前 20 的 SKU
top_sku_ids = sales_orders.groupby('product_id')['quantity_sold'].sum().nlargest(20).index
sku_heatmap_filtered = sku_heatmap[top_sku_ids]

# 可视化（浅蓝→红色）
fig_sku_heatmap = px.imshow(
    sku_heatmap_filtered,
    labels=dict(x="SKU", y="月份", color="库存天数"),
    aspect="auto",
    title="前20 SKU 积压/畅销热力图",
    color_continuous_scale=['lightblue', 'red']  # 浅蓝低，红高
)

# -----------------------------
# 4. 订单履约漏斗图
# -----------------------------
# 统计每个阶段的数量
total_orders = len(sales_orders)
delivered_orders = len(sales_orders[sales_orders['status'] == 'Delivered'])
# 这里假设库存可售数量为总订单减已发货数量的近似
in_stock_orders = total_orders - delivered_orders
shipped_orders = delivered_orders  # 简化，假设发货=已签收

funnel_fig = go.Figure(go.Funnel(
    y=['下单总量','库存可售','已发货','已签收'],
    x=[total_orders, in_stock_orders, shipped_orders, delivered_orders]
))

# -----------------------------
# 5. 输出供 Dashboard 调用
# -----------------------------
eda_results = {
    'fig_sales_trend': fig_sales_trend,
    'fig_stock_alert': fig_stock_alert,
    'fig_supplier_performance': fig_supplier_performance,
    'fig_sku_heatmap': fig_sku_heatmap,
    'funnel_fig': funnel_fig,
    'stock_alert_products': stock_alert_products,
    'supplier_delivery_rate': supplier_delivery_rate,
    'sku_heatmap_filtered': sku_heatmap_filtered
}

# 可选：保存 Pickle
pickle_path = os.path.join(PROJECT_ROOT, 'dashboards', 'eda_results.pkl')
os.makedirs(os.path.dirname(pickle_path), exist_ok=True)
with open(pickle_path, 'wb') as f:
    pickle.dump(eda_results, f)
print("EDA 结果已保存到：", pickle_path)


products info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_id    200 non-null    int64 
 1   sku           200 non-null    object
 2   product_name  200 non-null    object
 3   category      200 non-null    object
 4   unit          200 non-null    object
 5   safety_stock  200 non-null    int64 
dtypes: int64(2), object(4)
memory usage: 9.5+ KB
None
product_id      0
sku             0
product_name    0
category        0
unit            0
safety_stock    0
dtype: int64
----------------------------------------
suppliers info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   supplier_id    50 non-null     int64  
 1   supplier_name  50 non-null     object 
 2   location       50 non-null   

  sales_trend = sales_orders.groupby(pd.Grouper(key='sales_order_date', freq='M'))['quantity_sold'].sum().reset_index()

'M' is deprecated and will be removed in a future version, please use 'ME' instead.

