# In [3]:
import os

import numpy as np
import pandas as pd

# 1. Load the two worksheets of rawdata.xlsx.
# Passing a list of sheet names makes read_excel return a dict keyed by
# sheet name, so the workbook is opened once for both sheets.
file_path = 'rawdata.xlsx'
sheets = pd.read_excel(file_path, sheet_name=['ADNI Org.', 'CSF Biomarker'])
adni_org_df = sheets['ADNI Org.']
csf_biomarker_df = sheets['CSF Biomarker']

# 2. Build the initial result table from the 'ADNI Org.' sheet.
#
# Only visits whose RID also occurs in the 'CSF Biomarker' sheet are kept;
# EXAMDATE, AGE, N and C are carried over as they are.  The rows are selected
# with a single vectorised membership test instead of being appended one at a
# time with pd.concat, which re-copied the whole table on every iteration and
# started from an empty frame.
has_csf_record = (
    adni_org_df['RID'].notna()  # a missing RID never counts as a match
    & adni_org_df['RID'].isin(csf_biomarker_df['RID'])
)

# 3. ABETA and TAU start out empty (None); they are filled in later from the
# 'CSF Biomarker' sheet.  reset_index gives the table a fresh 0..n-1 index.
df = (
    adni_org_df.loc[has_csf_record, ['RID', 'EXAMDATE', 'AGE', 'N', 'C']]
    .assign(ABETA=None, TAU=None)
    .reset_index(drop=True)
)
df = df[['RID', 'EXAMDATE', 'AGE', 'ABETA', 'TAU', 'N', 'C']]

# 4. Fold the 'CSF Biomarker' sheet into the result table, one row at a time.
# Rows are processed in sheet order because a row appended here can be
# matched (and overwritten) by a later CSF row with the same RID and date.
org_rids = adni_org_df['RID'].values

for index, row in csf_biomarker_df.iterrows():
    rid = row['RID']
    drwdte = row['DRWDTE']

    # Ignore CSF rows whose RID does not occur in 'ADNI Org.'.
    if rid not in org_rids:
        continue

    # Rows already in the table with this RID and EXAMDATE equal to DRWDTE.
    same_visit = (df['RID'] == rid) & (df['EXAMDATE'] == drwdte)
    match = df[same_visit]

    if match.empty:
        # No such row yet: append one that carries only the CSF values and
        # uses DRWDTE as its EXAMDATE.
        new_row = pd.DataFrame({
            'RID': [rid],
            'EXAMDATE': [drwdte],
            'AGE': [None],
            'C': [None],
            'N': [None],
            'ABETA': [row['ABETA']],
            'TAU': [row['TAU']]
        })
        df = pd.concat([df, new_row], ignore_index=True)
    else:
        # Matching row(s) found: overwrite their ABETA and TAU.
        df.loc[match.index, 'ABETA'] = row['ABETA']
        df.loc[match.index, 'TAU'] = row['TAU']

# 5. Put the columns into their final order (rows are not sorted here).
df = df[['RID', 'EXAMDATE', 'AGE', 'ABETA', 'TAU', 'N', 'C']]

# Drop every row in which ABETA, TAU, C and N are each either missing or 0.
measurements = df[['ABETA', 'TAU', 'C', 'N']]
condition = (measurements.isna() | measurements.eq(0)).all(axis=1)
df = df.loc[~condition]

# 3. Recompute AGE for every RID from one reference row.
#
# Within each RID the rows are ordered by EXAMDATE.  The earliest row that
# actually has an AGE is the reference: it keeps its recorded AGE, and every
# other row gets reference AGE + elapsed years (days / 365, rounded to one
# decimal).  Rows dated before the reference get a correspondingly smaller
# AGE.  Previously the reference was always the first row, so a first row
# without an AGE (e.g. one created from the CSF sheet only) made the
# computation fail or left the whole RID with missing ages.
grouped = df.groupby('RID')

# Rows are collected here and turned back into a DataFrame at the end.
updated_rows = []

for rid, group in grouped:
    group = group.sort_values(by='EXAMDATE')

    has_age = group['AGE'].notna().to_numpy()
    if not has_age.any():
        # No row of this RID has an AGE, so there is nothing to extrapolate
        # from; keep the rows unchanged.
        updated_rows.extend(row for _, row in group.iterrows())
        continue

    ref_pos = int(has_age.argmax())  # position of the first row with an AGE
    first_age = group['AGE'].iloc[ref_pos]
    first_date = group['EXAMDATE'].iloc[ref_pos]

    for pos, (_, row) in enumerate(group.iterrows()):
        if pos != ref_pos:
            date_diff = (row['EXAMDATE'] - first_date).days / 365
            row['AGE'] = round(first_age + date_diff, 1)
        updated_rows.append(row)

# Passing the column list keeps the columns even when no rows are left.
df = pd.DataFrame(updated_rows, columns=df.columns)

# Turn every 0 in the table (all columns) into a missing value, then discard
# the rows that have no EXAMDATE.
df = df.replace(0, np.nan).dropna(subset=['EXAMDATE'])

# 4. Collapse all rows that share the same RID and AGE into a single row.
# NOTE(review): groupby's default dropna=True means rows with a missing RID
# or AGE are not carried into the result - confirm this is intended.
grouped = df.groupby(['RID', 'AGE'])

# One dict per (RID, AGE) group; converted to a DataFrame at the end.
merged_rows = []

for (rid, age), group in grouped:
    # Visit the rows in EXAMDATE order so that later rows win.
    ordered = group.sort_values(by='EXAMDATE')

    abeta = tau = c_value = n_value = np.nan
    for _, rec in ordered.iterrows():
        # ABETA and TAU are taken together from any row that has at least one
        # of them; the same holds for the C / N pair.
        if pd.notna(rec['ABETA']) or pd.notna(rec['TAU']):
            abeta, tau = rec['ABETA'], rec['TAU']
        if pd.notna(rec['C']) or pd.notna(rec['N']):
            c_value, n_value = rec['C'], rec['N']

    merged_rows.append({
        'RID': rid,
        'EXAMDATE': ordered['EXAMDATE'].iloc[0],  # earliest date in the group
        'AGE': age,
        'ABETA': abeta,
        'TAU': tau,
        'C': c_value,
        'N': n_value,
    })

# Rebuild the table from the merged rows.
df = pd.DataFrame(merged_rows)

    
# Scale the biomarker columns by their 5th and 95th percentiles.
# `name` keeps the identifying columns so they can be re-attached unscaled.
name = df[['RID', 'AGE']]
df = df[['ABETA', 'TAU', 'N', 'C']]
q5 = df.quantile(0.05)
q95 = df.quantile(0.95)
# Per column: the 5th percentile maps to 0 and the 95th percentile to 1;
# values outside that range end up below 0 or above 1.
df = (df - q5) / (q95 - q5)
df = pd.concat([name, df], axis=1)

# Write the result to 'Sheet1' of data.xlsx.  Append mode (which replaces only
# that sheet and keeps the others) needs an existing workbook, so fall back to
# plain write mode when data.xlsx is not there yet; if_sheet_exists is only
# accepted in append mode and is therefore left out in that case.
output_path = 'data.xlsx'
if os.path.exists(output_path):
    writer_options = {'mode': 'a', 'if_sheet_exists': 'replace'}
else:
    writer_options = {'mode': 'w'}
with pd.ExcelWriter(output_path, engine='openpyxl', **writer_options) as writer:
    df.to_excel(writer, sheet_name='Sheet1', index=False)


# Save the percentile bounds to quantile.npy as a two-row array:
# row 0 holds the 5th percentiles, row 1 the 95th percentiles.
q5_df, q95_df = (np.array(bound.values) for bound in (q5, q95))
q = np.vstack([q5_df, q95_df])
np.save('quantile.npy', q)