# Q3 销售订单金额异常变动检测

依据PDF提示：
- 计算每个订单的总金额：逐行计算 数量×（单价−行折扣），再按订单汇总；
- 对每个客户的订单总额计算Z分数，|z|≥3判定为异常；
- 可视化Top异常订单与客户的订单金额分布。

为避免超时，读取部分行（例如前20万行）进行开发。

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', 200)
pd.set_option('display.width', 200)

# Development run: only the first 200,000 rows of the sheet are read.
so = pd.read_excel('/workspace/Sample Data.Case 3.xlsx', sheet_name='Sales Orders', nrows=200000)
so2 = so.copy()
# Parse whichever of the known date columns are present; unparseable values become NaT.
for c in ['CONFIRM_DATE','CREATION_DATE','REAL_CONFIRM_DATE','CUSTOMER_DATE','REAL_CUSTOMER_DATE','RECEIPT_DATE','INVOICE_DATE']:
    if c in so2.columns:
        so2[c] = pd.to_datetime(so2[c], errors='coerce')

# Resolve column names, falling back to an alternative (or None) when a column is absent.
qty_col = 'QTY' if 'QTY' in so2.columns else ('CUSTOMER_REAL_QTY' if 'CUSTOMER_REAL_QTY' in so2.columns else None)
price_col = 'PRICE' if 'PRICE' in so2.columns else None
line_disc = 'LINE_DISCOUNT' if 'LINE_DISCOUNT' in so2.columns else None

# An absent column contributes a constant 0; non-numeric cells become NaN.
qty = pd.to_numeric(so2[qty_col], errors='coerce') if qty_col is not None else 0
price = pd.to_numeric(so2[price_col], errors='coerce') if price_col is not None else 0
# A blank line discount is treated as "no discount". Without fillna(0) the NaN
# would propagate into line_total, and the groupby sum below would skip that
# line entirely, understating the order total.
discount = pd.to_numeric(so2[line_disc], errors='coerce').fillna(0) if line_disc is not None else 0
so2['line_total'] = qty * (price - discount)

order_id = 'DELIVERY_CODE' if 'DELIVERY_CODE' in so2.columns else 'INVOICE_CODE'
customer_col = 'CUSTOMER_CODE' if 'CUSTOMER_CODE' in so2.columns else 'CUSTOMER_NAME'

# One row per (customer, order): summed line totals and the latest CONFIRM_DATE.
order_totals = (
    so2.groupby([customer_col, order_id])
    .agg(total_amount=('line_total','sum'), order_date=('CONFIRM_DATE','max'))
    .reset_index()
)
# Per-customer Z-score using the population std (ddof=0); the 1e-9 term avoids
# division by zero when all of a customer's orders have the same total.
# NOTE: with ddof=0, |z| cannot exceed sqrt(n-1) for a group of n orders, so a
# customer with fewer than 10 orders can never reach the |z| >= 3 threshold.
order_totals['z'] = order_totals.groupby(customer_col)['total_amount'].transform(
    lambda s: (s - s.mean()) / (s.std(ddof=0) + 1e-9)
)
anom_so = order_totals[order_totals['z'].abs()>=3].sort_values('z', ascending=False)
print('异常订单数（子集）:', anom_so.shape[0])
anom_so.head(10)

In [None]:
# Horizontal bar chart of the ten highest-z rows of anom_so.
top_anomalies = anom_so.head(10).copy()
# Cast the order id to str so seaborn always treats it as a categorical y axis
# (kept in z order). With a numeric id column both axes would be numeric and
# the bar orientation would be inferred rather than guaranteed.
top_anomalies[order_id] = top_anomalies[order_id].astype(str)
plt.figure(figsize=(10,4))
sns.barplot(x='z', y=order_id, data=top_anomalies, color='purple')
plt.title('Top10 异常订单的Z分数（子集）')
plt.xlabel('Z分数')
plt.ylabel('订单号')
plt.tight_layout()
plt.show()

### 客户订单金额时间序列可视化（示例客户）

In [None]:
import matplotlib.dates as mdates  # local import: only this cell formats date ticks

# Up to five distinct customers, in the order they appear in anom_so.
sample_customers = anom_so[customer_col].drop_duplicates().head(5)
plt.figure(figsize=(12,6))
for c in sample_customers:
    s = order_totals[order_totals[customer_col]==c]
    # Orders without a date cannot be placed on a time axis; the rest are
    # sorted so each line is drawn in chronological order.
    s = s.dropna(subset=['order_date']).sort_values('order_date')
    # Plot on a real datetime axis (first day of the order's month). Plotting
    # month strings would make the axis categorical, ordered by first
    # appearance instead of by time.
    month_start = s['order_date'].dt.to_period('M').dt.to_timestamp()
    plt.plot(month_start, s['total_amount'], marker='o', label=str(c))
# Keep the year-month tick labels of the string-based version.
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
plt.xticks(rotation=45, ha='right')
plt.legend()
plt.title('部分客户订单金额时间序列（子集）')
plt.ylabel('订单金额')
plt.xlabel('月份')
plt.tight_layout()
plt.show()

### 结果与审计建议
- 异常大额订单可能涉及异常定价、折扣政策偏离或虚构交易，建议结合合同、折扣审批、发货签收与发票记账核查。
- 可进一步按产品维度与销售人员维度做分解分析，并检查同日/同客户的集中开单与冲单现象。

## 附加：全量运行与稳健异常检测（MAD/IQR）

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
outputs_path = Path('/workspace/KPMG_HW1/outputs')
outputs_path.mkdir(parents=True, exist_ok=True)

# Full read of the sheet (no row limit).
so_all = pd.read_excel('/workspace/Sample Data.Case 3.xlsx', sheet_name='Sales Orders')
so2 = so_all.copy()
# Parse whichever of the known date columns are present; unparseable values become NaT.
for c in ['CONFIRM_DATE','CREATION_DATE','REAL_CONFIRM_DATE','CUSTOMER_DATE','REAL_CUSTOMER_DATE','RECEIPT_DATE','INVOICE_DATE']:
    if c in so2.columns:
        so2[c] = pd.to_datetime(so2[c], errors='coerce')

# Resolve column names, falling back to an alternative (or None) when a column is absent.
qty_col = 'QTY' if 'QTY' in so2.columns else ('CUSTOMER_REAL_QTY' if 'CUSTOMER_REAL_QTY' in so2.columns else None)
price_col = 'PRICE' if 'PRICE' in so2.columns else None
line_disc = 'LINE_DISCOUNT' if 'LINE_DISCOUNT' in so2.columns else None

# An absent column contributes a constant 0; non-numeric cells become NaN.
qty = pd.to_numeric(so2[qty_col], errors='coerce') if qty_col is not None else 0
price = pd.to_numeric(so2[price_col], errors='coerce') if price_col is not None else 0
# A blank line discount is treated as "no discount". Without fillna(0) the NaN
# would propagate into line_total, and the groupby sum below would skip that
# line entirely, understating the order total.
discount = pd.to_numeric(so2[line_disc], errors='coerce').fillna(0) if line_disc is not None else 0
so2['line_total'] = qty * (price - discount)

order_id = 'DELIVERY_CODE' if 'DELIVERY_CODE' in so2.columns else 'INVOICE_CODE'
customer_col = 'CUSTOMER_CODE' if 'CUSTOMER_CODE' in so2.columns else 'CUSTOMER_NAME'
# One row per (customer, order): summed line totals and the latest CONFIRM_DATE.
order_totals = (
    so2.groupby([customer_col, order_id])
    .agg(total_amount=('line_total','sum'), order_date=('CONFIRM_DATE','max'))
    .reset_index()
)

# Z-score anomalies: per-customer Z-score using the population std (ddof=0);
# the 1e-9 term avoids division by zero when all of a customer's orders have
# the same total.
# NOTE: with ddof=0, |z| cannot exceed sqrt(n-1) for a group of n orders, so a
# customer with fewer than 10 orders can never reach the |z| >= 3 threshold.
order_totals['z'] = order_totals.groupby(customer_col)['total_amount'].transform(
    lambda s: (s - s.mean()) / (s.std(ddof=0) + 1e-9)
)
anom_z = order_totals[order_totals['z'].abs()>=3].copy()

# MAD稳健异常
def mad_outliers(group, k=5):
    """Flag rows whose ``total_amount`` lies far from the group median.

    The distance of each value from the median is divided by the median
    absolute deviation (MAD) of the group; no normal-consistency scaling
    constant is applied.

    Args:
        group: Object indexable by ``'total_amount'`` that yields a numeric
            pandas Series (e.g. one customer's slice of the order table).
        k: Threshold on the absolute MAD-scaled deviation.

    Returns:
        Boolean Series, indexed like ``group['total_amount']``, that is True
        where the absolute scaled deviation is at least ``k``.
    """
    amounts = group['total_amount']
    centered = amounts - amounts.median()
    # The epsilon keeps the division defined when the MAD is zero; in that
    # case any value differing from the median by 5e-9 or more (for k=5)
    # ends up flagged.
    scale = centered.abs().median() + 1e-9
    return (centered / scale).abs() >= k
# Build the per-order MAD mask with transform, which always returns a Series
# aligned to order_totals' index. groupby.apply can instead return a wide
# DataFrame when there is only a single customer group, which breaks the
# boolean indexing below.
# The Series handed to the lambda is not guaranteed to be named
# 'total_amount', so the column name mad_outliers expects is set explicitly.
mask_mad = (
    order_totals.groupby(customer_col)['total_amount']
    .transform(lambda s: mad_outliers(s.to_frame('total_amount'), k=5))
    .astype(bool)
)
anom_mad = order_totals[mask_mad].copy()

# IQR异常
def iqr_bounds(group, k=1.5):
    """Flag rows whose ``total_amount`` falls outside the IQR fences.

    Despite the name, this returns the outlier mask, not the bounds.

    Args:
        group: Object indexable by ``'total_amount'`` that yields a numeric
            pandas Series (e.g. one customer's slice of the order table).
        k: Multiple of the interquartile range added beyond each quartile.

    Returns:
        Boolean Series, indexed like ``group['total_amount']``, that is True
        where a value is strictly below ``Q1 - k*IQR`` or strictly above
        ``Q3 + k*IQR``.
    """
    amounts = group['total_amount']
    lower_q, upper_q = amounts.quantile(0.25), amounts.quantile(0.75)
    spread = upper_q - lower_q
    below = amounts < lower_q - k*spread
    above = amounts > upper_q + k*spread
    return below | above
# Build the per-order IQR mask with transform, which always returns a Series
# aligned to order_totals' index. groupby.apply can instead return a wide
# DataFrame when there is only a single customer group, which breaks the
# boolean indexing below.
# The Series handed to the lambda is not guaranteed to be named
# 'total_amount', so the column name iqr_bounds expects is set explicitly.
mask_iqr = (
    order_totals.groupby(customer_col)['total_amount']
    .transform(lambda s: iqr_bounds(s.to_frame('total_amount')))
    .astype(bool)
)
anom_iqr = order_totals[mask_iqr].copy()

# Export: write each anomaly table to its own CSV under outputs_path
# (row index omitted), then print the three counts.
exports = {
    'Q3_abnormal_orders_z.csv': anom_z,
    'Q3_abnormal_orders_mad.csv': anom_mad,
    'Q3_abnormal_orders_iqr.csv': anom_iqr,
}
for file_name, frame in exports.items():
    frame.to_csv(outputs_path/file_name, index=False)
print('Z异常数：', len(anom_z), 'MAD异常数：', len(anom_mad), 'IQR异常数：', len(anom_iqr))