In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
    
import openpyxl

from scipy import stats

In [3]:
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
sns.set_style("whitegrid")

In [4]:
def P_false(df, group):
    """
    计算每个组别在不同孕周t的条件false比例
    P_false(t) = (截至t周的 y染色体浓度<0.04 样本数) / (截至t周的总样本数)
    """
    group_data = df[df['组别'] == int(group)].copy()
    group_data = group_data.sort_values('week')  
    
    t_values = np.arange(70, 28*7, 7)
    results = {}
    
    for t in t_values:
        # 截至t周的所有样本
        samples_up_to_t = group_data[group_data['week'] <= t]
        total_up_to_t = len(samples_up_to_t)
        
        if total_up_to_t == 0:
            results[t] = 1  # 如果没有样本，设为1,相当于排除掉这个t
        else:
            # 截至t周的false样本（Y<0.04）
            false_t = len(samples_up_to_t[samples_up_to_t['Y染色体浓度'] < 0.04])
            results[t] = false_t / total_up_to_t  

            results[t] = np.where(results[t]>0.1, results[t], 0.1) # 样本数过少时，拟合的结果相当于增加了隐性风险
    
    return results


In [5]:
def P_late(t):
    if t <= 12*7: 
        return 0
    elif t > 12*7 and t < 7*28:
        return (t - 12*7) / (7*(28 - 12))
    else:
        return 1 # 28之后已经有很高风险，直接增加大的惩罚
# 我们认为，在早期之后，随着时间增加风险线性增加

In [6]:
def risk(P_false_curves, w1=0.7, w2=0.3):
    """
    计算每个组别的风险函数并找出最优检测时间
    Risk(t) = w1 * P_false(t) + w2 * P_late(t)
    """
    risk_results = {}
    optimal_times = {}
    
    for group, p_false_curve in P_false_curves.items():
        group_risk = {}
        
        # 计算每个t的风险值
        for t, p_false in p_false_curve.items():
            risk = w1 * p_false + w2 * P_late(t)
            group_risk[t] = risk
        
        # 找到风险最小的t
        min_risk_t = min(group_risk.items(), key=lambda x: x[1])[0]
        min_risk_value = group_risk[min_risk_t]
        
        risk_results[group] = group_risk
        optimal_times[group] = (min_risk_t, min_risk_value)
    
    return risk_results, optimal_times

In [7]:
df = pd.read_excel('附件_分组_XY.xlsx')
df['week'] = df['检测孕周_天数'] 

# 确认week的范围
print(max(df['week']))
print(min(df['week']))

# 计算每个组别的P_false曲线
P_false_curves = {}
for group in df['组别'].unique():
    P_false_curves[group] = P_false(df, group)

P_false_curves

203
0


{np.int64(2): {np.int64(70): 1,
  np.int64(77): array(0.1),
  np.int64(84): array(0.16216216),
  np.int64(91): array(0.13265306),
  np.int64(98): array(0.11971831),
  np.int64(105): array(0.10714286),
  np.int64(112): array(0.11538462),
  np.int64(119): array(0.132),
  np.int64(126): array(0.12790698),
  np.int64(133): array(0.13207547),
  np.int64(140): array(0.13103448),
  np.int64(147): array(0.12944984),
  np.int64(154): array(0.12658228),
  np.int64(161): array(0.12578616),
  np.int64(168): array(0.11904762),
  np.int64(175): array(0.11396011),
  np.int64(182): array(0.11267606),
  np.int64(189): array(0.11204482)},
 np.int64(3): {np.int64(70): array(0.1),
  np.int64(77): array(0.1),
  np.int64(84): array(0.11764706),
  np.int64(91): array(0.15929204),
  np.int64(98): array(0.11675127),
  np.int64(105): array(0.1),
  np.int64(112): array(0.1),
  np.int64(119): array(0.11082474),
  np.int64(126): array(0.1092233),
  np.int64(133): array(0.10926366),
  np.int64(140): array(0.1111111

In [8]:
def calculate_all_optimal_results(P_false_curves, weight_combinations):
    """
    计算所有权重组合在各个组的最小risk和最佳检测时间
    """
    results = []
    
    for w1, w2 in weight_combinations:
        _, optimal_times = risk(P_false_curves, w1, w2)
        
        for group, (best_t, min_risk) in optimal_times.items():
            results.append({
                '权重组合': f'w1={w1}, w2={w2}',
                '组别': group,
                '最佳检测时间(天)': best_t,
                '最小风险值': min_risk,
                'w1': w1,
                'w2': w2
            })
    
    # 转换为DataFrame
    results_df = pd.DataFrame(results)
    
    return results_df

In [10]:
weight_combinations = [
    (0.8, 0.2),  
    (0.7, 0.3),  
    (0.6, 0.4),  
    (0.5, 0.5),  
    (0.9, 0.1),  
    (0.4, 0.6)   
]

results_df = calculate_all_optimal_results(P_false_curves, weight_combinations)

excel_filename = '最优检测时间结果_use_1.xlsx'
with pd.ExcelWriter(excel_filename, engine='openpyxl') as writer:
    results_df.to_excel(writer, sheet_name='完整结果', index=False)
    
    for weight_combo in results_df['权重组合'].unique():
        combo_df = results_df[results_df['权重组合'] == weight_combo]
        sheet_name = f"权重{weight_combo.replace('w1=', '').replace('w2=', '').replace(', ', '_')}"
        combo_df.to_excel(writer, sheet_name=sheet_name, index=False)
    
    summary_df = results_df.pivot_table(
        index='组别', 
        columns='权重组合', 
        values=['最佳检测时间(天)', '最小风险值'],
        aggfunc='first'
    )
    summary_df.to_excel(writer, sheet_name='汇总表')

print(results_df.head(10))

             权重组合  组别  最佳检测时间(天)     最小风险值   w1   w2
0  w1=0.8, w2=0.2   2         77  0.080000  0.8  0.2
1  w1=0.8, w2=0.2   3         70  0.080000  0.8  0.2
2  w1=0.8, w2=0.2   4         84  0.200000  0.8  0.2
3  w1=0.8, w2=0.2   5        175  0.470192  0.8  0.2
4  w1=0.8, w2=0.2   1        119  0.462500  0.8  0.2
5  w1=0.7, w2=0.3   2         77  0.070000  0.7  0.3
6  w1=0.7, w2=0.3   3         70  0.070000  0.7  0.3
7  w1=0.7, w2=0.3   4         84  0.175000  0.7  0.3
8  w1=0.7, w2=0.3   5        175  0.512981  0.7  0.3
9  w1=0.7, w2=0.3   1        119  0.443750  0.7  0.3


In [11]:
import pandas as pd
import numpy as np

def pfalse_plate(df, P_false_curves):
    groups = sorted(df['组别'].unique())
    t_values = np.arange(10, 28)
    
    results = []
    
    for group in groups:
        for t in t_values:
            # 计算P_false
            p_false = P_false_curves.get(group, {}).get(t, np.nan)
            
            # 计算P_late
            p_late = P_late(t)
            
            results.append({
                '组别': group,
                '孕周t(天)': t,
                'P_false': p_false,
                'P_late': p_late
            })
    
    results_df = pd.DataFrame(results)
    
    return results_df

all_results_df = pfalse_plate(df, P_false_curves)

excel_filename = '各组别_P_false_P_late_详细数据_use_1.xlsx'
with pd.ExcelWriter(excel_filename, engine='openpyxl') as writer:
    all_results_df.to_excel(writer, sheet_name='完整数据', index=False)

    for group in all_results_df['组别'].unique():
        group_df = all_results_df[all_results_df['组别'] == group]
        sheet_name = f'组别{group}'
        group_df.to_excel(writer, sheet_name=sheet_name, index=False)
    
    pivot_false = all_results_df.pivot_table(
        index='孕周t(天)', 
        columns='组别', 
        values='P_false',
        aggfunc='first'
    )
    pivot_late = all_results_df.pivot_table(
        index='孕周t(天)', 
        columns='组别', 
        values='P_late',
        aggfunc='first'
    )
    
    pivot_false.to_excel(writer, sheet_name='P_false_汇总')
    pivot_late.to_excel(writer, sheet_name='P_late_汇总')

print(all_results_df.head(10))

for group in all_results_df['组别'].unique():
    group_data = all_results_df[all_results_df['组别'] == group]
    avg_p_false = group_data['P_false'].mean()
    max_p_false = group_data['P_false'].max()
    min_p_false = group_data['P_false'].min()
    print(f"组别 {group}: 平均P_false = {avg_p_false:.4f}, 最大值 = {max_p_false:.4f}, 最小值 = {min_p_false:.4f}")

   组别  孕周t(天)  P_false  P_late
0   1      10      NaN       0
1   1      11      NaN       0
2   1      12      NaN       0
3   1      13      NaN       0
4   1      14      NaN       0
5   1      15      NaN       0
6   1      16      NaN       0
7   1      17      NaN       0
8   1      18      NaN       0
9   1      19      NaN       0
组别 1: 平均P_false = nan, 最大值 = nan, 最小值 = nan
组别 2: 平均P_false = nan, 最大值 = nan, 最小值 = nan
组别 3: 平均P_false = nan, 最大值 = nan, 最小值 = nan
组别 4: 平均P_false = nan, 最大值 = nan, 最小值 = nan
组别 5: 平均P_false = nan, 最大值 = nan, 最小值 = nan


## 使用熵权法所得的权重进行计算

In [14]:
def risk(P_false_curves, weights_list):
    risk_results = {}
    optimal_times = {}
    
    for i, (group, p_false_curve) in enumerate(P_false_curves.items()):
        if i >= 5:  
            break
            
        w1, w2 = weights_list[i]
        group_risk = {}
        
        for t, p_false in p_false_curve.items():
            risk_value = w1 * p_false + w2 * P_late(t)
            group_risk[t] = risk_value
    
        min_risk_t = min(group_risk.items(), key=lambda x: x[1])[0]
        min_risk_value = group_risk[min_risk_t]
        
        risk_results[group] = {
            'risk_curve': group_risk,
            'weights': (w1, w2),
            'optimal_time': min_risk_t,
            'min_risk': min_risk_value
        }
        optimal_times[group] = (min_risk_t, min_risk_value)
    
    return risk_results, optimal_times

weights_list = [
    (0.6320, 0.3680),  
    (0.8080, 0.1920),  
    (0.5535, 0.4465),  
    (0.6344, 0.3656),  
    (0.4721, 0.5279)   
]

risk_results, optimal_times = risk(P_false_curves, weights_list)

print("各组最优检测时间和最小风险:")
print("=" * 50)
for group, (optimal_time, min_risk) in optimal_times.items():
    w1, w2 = risk_results[group]['weights']
    print(f"{group}: 最优检测时间 = {optimal_time}天({optimal_time/7}周), 最小风险 = {min_risk:.4f}")
    print(f"    权重: w1 = {w1:.4f}, w2 = {w2:.4f}")
    print("-" * 50)


各组最优检测时间和最小风险:
2: 最优检测时间 = 77天(11.0周), 最小风险 = 0.0632
    权重: w1 = 0.6320, w2 = 0.3680
--------------------------------------------------
3: 最优检测时间 = 70天(10.0周), 最小风险 = 0.0808
    权重: w1 = 0.8080, w2 = 0.1920
--------------------------------------------------
4: 最优检测时间 = 84天(12.0周), 最小风险 = 0.1384
    权重: w1 = 0.5535, w2 = 0.4465
--------------------------------------------------
5: 最优检测时间 = 168天(24.0周), 最小风险 = 0.5385
    权重: w1 = 0.6344, w2 = 0.3656
--------------------------------------------------
1: 最优检测时间 = 119天(17.0周), 最小风险 = 0.4010
    权重: w1 = 0.4721, w2 = 0.5279
--------------------------------------------------
