In [1]:
import pandas as pd
import numpy as np
import os

# Load the MIMIC and eICU cohort tables; both paths are relative to the
# notebook's working directory.
mimic_data, eicu_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../data/mmicrodeficiency/mimic_data.csv",
        "../../data/mmicrodeficiency/eicu_data.csv",
    )
)

In [None]:
# Text-only EDA of `mimic_data`: every section below is appended to a single
# UTF-8 report file.

# 1. Create the directory that will hold the report (no-op if it exists).
output_dir = '../../res/mimic_eda'
os.makedirs(output_dir, exist_ok=True)

# 2. Path of the report file.
output_file = os.path.join(output_dir, 'mimic_eda_results.txt')

# Longest bar (in characters) of the text histograms. Bins with more samples
# than this are scaled down proportionally; the exact count is always printed
# next to the bar, so no information is lost.
MAX_BAR_WIDTH = 60

# Column subsets reused by several sections.
numeric_data = mimic_data.select_dtypes(include=[np.number])
categorical_cols = mimic_data.select_dtypes(include=['O']).columns

# 3. Run the analysis and write everything into the same file.
# The encoding is explicit because the report contains Chinese text and the
# platform default encoding is not guaranteed to be UTF-8.
with open(output_file, 'w', encoding='utf-8') as f:

    # 3.1 General information
    f.write("---------- 总体信息 ----------\n\n")
    f.write(f"数据形状：{mimic_data.shape}\n")
    f.write(f"列名：{', '.join(mimic_data.columns)}\n")
    f.write("\n数据类型：\n")
    f.write(str(mimic_data.dtypes) + "\n")
    f.write("\n前5行数据：\n")
    f.write(str(mimic_data.head()) + "\n")

    # 3.2 Missing-value analysis
    f.write("\n---------- 缺失值分析 ----------\n\n")
    f.write("每个特征的缺失数量：\n")
    f.write(str(mimic_data.isnull().sum()) + "\n")
    f.write("\n每个特征的缺失比例：\n")
    f.write(str(mimic_data.isnull().mean()) + "\n")

    # Character "plot" of the missingness pattern (1 = missing, 0 = present).
    # NOTE: this renders every row of the frame, so the report grows with it.
    f.write("\n缺失值可视化 (字符图):\n")
    missing_matrix = mimic_data.isnull().astype(int).to_string()
    f.write(missing_matrix + "\n")
    f.write("  (1 表示缺失, 0 表示非缺失)\n")

    # 3.3 Descriptive statistics
    f.write("\n---------- 描述性统计 ----------\n\n")
    f.write("数值型特征：\n")
    f.write(str(mimic_data.describe(include=[np.number])) + "\n")

    # Object-dtype columns are only described when at least one exists.
    if len(categorical_cols) > 0:
        f.write("\n类别型特征：\n")
        f.write(str(mimic_data.describe(include=['O'])) + "\n")  # 'O' for object
    else:
        f.write("\n没有类别型特征。\n")

    # 3.4 Frequency table of every categorical feature
    f.write("\n---------- 类别型特征频数分布 ----------\n\n")
    if len(categorical_cols) > 0:
        for col in categorical_cols:
            f.write(f"特征 '{col}' 的频数分布：\n")
            f.write(str(mimic_data[col].value_counts()) + "\n")
            f.write("\n")
    else:
        f.write("没有类别型特征。\n")

    # 3.5 Distribution of every numeric feature (quantiles + text histogram)
    f.write("\n---------- 数值型特征分布 ----------\n\n")
    for col in numeric_data.columns:
        f.write(f"特征 '{col}' 的分布：\n")
        # Quantiles
        quantiles = numeric_data[col].quantile([0, 0.25, 0.5, 0.75, 1]).to_string()
        f.write(f"分位数：\n{quantiles}\n")

        # Text histogram built from numpy bin counts.
        hist, bin_edges = np.histogram(numeric_data[col].dropna(), bins=10)
        peak = hist.max()
        # One star per sample while the bars fit; otherwise scale them so the
        # tallest bar is MAX_BAR_WIDTH characters wide.
        scale = MAX_BAR_WIDTH / peak if peak > MAX_BAR_WIDTH else 1
        last_bin = len(hist) - 1
        f.write("\n直方图 (文本形式):\n")
        for i, count in enumerate(hist):
            # ceil keeps every non-empty bin visible after scaling.
            bar_len = min(MAX_BAR_WIDTH, int(np.ceil(count * scale))) if scale != 1 else int(count)
            # numpy's last bin is closed on the right; all others are half-open.
            closing = ']' if i == last_bin else ')'
            f.write(f"[{bin_edges[i]:.2f}, {bin_edges[i+1]:.2f}{closing}: {'*' * bar_len} ({count})\n")
        f.write("\n")

    # 3.6 (Optional) correlation analysis of the numeric features.
    # Restricting to numeric columns explicitly avoids the error that
    # DataFrame.corr() raises on non-numeric columns in recent pandas versions.
    f.write("\n---------- 相关性分析 (数值型特征) ----------\n\n")
    correlation_matrix = numeric_data.corr()
    f.write(str(correlation_matrix) + "\n")

    # Text heat map: "++"/"--" for |r| >= 0.8, "+"/"-" for |r| >= 0.5,
    # blank otherwise (NaN correlations also end up blank).
    f.write("\n相关性热图 (文本形式):\n")
    for corr_row in correlation_matrix.to_numpy():
        row_str = ""
        for corr_val in corr_row:
            if abs(corr_val) >= 0.8:
                symbol = "++" if corr_val > 0 else "--"
            elif abs(corr_val) >= 0.5:
                symbol = "+" if corr_val > 0 else "-"
            else:
                symbol = "  "
            row_str += f"{symbol} "
        f.write(row_str + '\n')
print("EDA 结果已保存至 '../../res/mimic_eda/mimic_eda_results.txt' 文件中。")

EDA 结果已保存至 '../../res/mimic_eda/mimic_eda_results.txt' 文件中。


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os


# missingno is optional: when it cannot be imported, the missing-value plots
# are skipped and a warning is printed instead.
try:
    import missingno as msno
except ImportError:
    HAS_MISSINGNO = False
    print("Warning: missingno library is not installed. Missing value visualization will be skipped.")
else:
    HAS_MISSINGNO = True

# Use a CJK-capable font for the Chinese plot labels and keep the minus sign
# rendering correctly with that font.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

def _save_text(path, text):
    """Write ``text`` to ``path`` as UTF-8 and return ``text`` unchanged."""
    with open(path, 'w', encoding='utf-8') as handle:
        handle.write(text)
    return text


def _numeric_summary_text(series):
    """Return the mean / median / skewness / range insight lines of a numeric series."""
    return (
        f"  - 均值 (Mean): {series.mean():.2f}\n"
        f"  - 中位数 (Median): {series.median():.2f}\n"
        f"  - 偏度 (Skewness): {series.skew():.2f}\n"
        f"  - 数据值大致范围: 从 {series.min():.2f} 到 {series.max():.2f}\n"
    )


def _boxplot_summary_text(series, boxplot_file):
    """Return the box-plot insight lines (quartiles, whiskers, 1.5*IQR outlier count)."""
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr
    # Whiskers are the most extreme observations still inside the fences.
    lower_whisker = series[series >= lower_fence].min()
    upper_whisker = series[series <= upper_fence].max()
    outliers = series[(series < lower_fence) | (series > upper_fence)]
    return (
        f"  - 中位数 (Median): {series.median():.2f}\n"
        f"  - 四分位数范围 (IQR): {iqr:.2f}\n"
        f"  - 异常值情况: (请查看箱线图 {boxplot_file} )\n"
        f"  - 下四分位数 (Q1): {q1:.2f}\n"
        f"  - 上四分位数 (Q3): {q3:.2f}\n"
        f"  - 下须 (Lower Whisker): {lower_whisker:.2f}\n"
        f"  - 上须 (Upper Whisker): {upper_whisker:.2f}\n"
        f"  - 异常值数量 (Outliers, 1.5*IQR rule): {len(outliers)}\n"
        f"  - 数据分布的对称性/偏斜程度: (根据中位数和箱体位置、须的长度判断).\n"
    )


def _category_summary_text(series):
    """Return the count / percentage / most-common / rare-category insight lines."""
    category_counts = series.value_counts()
    total_count = len(series)
    category_percentages = category_counts / total_count * 100
    text = "  - 各类别计数:\n"
    text += str(category_counts) + "\n"
    text += "  - 各类别百分比:\n"
    text += str(category_percentages.round(2)) + "%\n"
    text += "  - 类别分布是否均衡: (根据各类别计数/比例判断).\n"
    if category_counts.empty:
        # No non-null value at all: there is no "most common" category to report.
        text += "  - 常见类别: 无 (该列没有非缺失值)\n"
    else:
        text += f"  - 常见类别: {category_counts.index[0]} (计数: {category_counts.iloc[0]})\n"
    rare_categories = category_counts[category_counts < total_count * 0.01]
    if not rare_categories.empty:
        # Categories may be non-string (e.g. bool), so stringify before joining.
        text += f"  - 稀有类别 (少于1%): {', '.join(map(str, rare_categories.index))}\n"
    else:
        text += "  - 无明显稀有类别 (少于1%).\n"
    return text


def perform_eda(data, label_col='label', output_dir='eda_results', delirium_labels=(0, 1, 2)):
    """
    Run an exploratory data analysis (EDA) on a DataFrame and save the results.

    The analysis covers the whole data set, every individual label value and a
    binary "Delirium vs. No Delirium" grouping. Plots are written as PNG files,
    tables as CSV files, and a textual "insights" file accompanies every plot;
    all insight texts are also concatenated into one summary file.

    Parameters:
    data (pd.DataFrame): Input frame containing the label column. It is not
        modified; the function works on an internal copy.
    label_col (str): Name of the label column, 'label' by default.
    output_dir (str): Directory that receives all results, 'eda_results' by default.
    delirium_labels (iterable): Label values mapped to the 'Delirium' group;
        every other value is mapped to 'No Delirium'. Defaults to (0, 1, 2).

    Returns:
    None. All results are written below ``output_dir``.
    """
    # Work on a copy: step 4 adds a 'label_group' column, which must not leak
    # into the caller's frame (it would change the results of a second run).
    data = data.copy()
    delirium_labels = tuple(delirium_labels)

    # Make sure the output directories exist.
    os.makedirs(output_dir, exist_ok=True)
    insights_dir = os.path.join(output_dir, 'insights')  # holds every insights text file
    os.makedirs(insights_dir, exist_ok=True)

    summary_insights_file = os.path.join(insights_dir, 'eda_insights_summary.txt')

    print(f"开始 EDA 分析，结果将保存到目录: {output_dir}")

    # Accumulates the content of the overall summary file.
    summary_content = "EDA 总逻辑总结\n===================\n\n"

    # 1. Data overview - whole data set
    print("\n1. 数据概览 (Data Overview) - 整体数据...")
    overall_dir = os.path.join(output_dir, 'overall')
    os.makedirs(overall_dir, exist_ok=True)
    data_overview_file = os.path.join(overall_dir, 'data_overview.txt')
    overview_text = (
        "数据形状 (Shape):\n"
        + str(data.shape) + "\n\n"
        + "数据类型 (Data Types):\n"
        + str(data.dtypes) + "\n\n"
        + "缺失值统计 (Missing Values):\n"
        + str(data.isnull().sum()) + "\n\n"
        + "前5行数据 (Head):\n"
        + str(data.head()) + "\n"
    )
    _save_text(data_overview_file, overview_text)
    print(f"  - 整体数据概览信息已保存到: {data_overview_file}")
    summary_content += f"1. 数据概览 (Data Overview) - 整体数据 (详细信息见 {data_overview_file}):\n"
    summary_content += overview_text + "\n\n"

    # 2. Descriptive statistics - whole data set
    print("\n2. 描述性统计 (Descriptive Statistics) - 整体数据...")
    desc_stats = data.describe(include='all').T  # transposed: one row per column
    desc_stats_file = os.path.join(overall_dir, 'descriptive_statistics.csv')
    desc_stats.to_csv(desc_stats_file, encoding='utf-8')
    print(f"  - 整体数据描述性统计信息已保存到: {desc_stats_file}")
    summary_content += f"2. 描述性统计 (Descriptive Statistics) - 整体数据 (详细信息见 {desc_stats_file}):\n"
    summary_content += f"  - (请查看 {desc_stats_file} 文件获取详细的描述性统计信息，例如均值、中位数、标准差、分位数等)\n\n"

    # 3. Univariate analysis per label value
    label_values = sorted(data[label_col].unique())
    for label_value in label_values:
        label_data = data[data[label_col] == label_value]
        label_dir = os.path.join(output_dir, f'label_{label_value}')
        os.makedirs(label_dir, exist_ok=True)
        print(f"\n3. Label {label_value} 单变量分析...")
        summary_content += f"\n3. Label {label_value} 单变量分析:\n"

        numerical_features = label_data.select_dtypes(include=np.number).columns.tolist()
        categorical_features = label_data.select_dtypes(exclude=np.number).columns.tolist()

        # Numeric features: histogram + box plot per column
        print("  - 数值特征单变量分析...")
        for col in numerical_features:
            if col == label_col:
                continue
            series = label_data[col]

            plt.figure(figsize=(8, 6))
            counts, bins, _ = plt.hist(series.dropna(), bins=30)
            plt.title(f'Label {label_value} - 数值特征分布 - {col}')
            plt.xlabel(col)
            plt.ylabel('频率')
            hist_file = os.path.join(label_dir, f'hist_{col}.png')
            plt.savefig(hist_file)
            plt.close()
            print(f"    - {col} 直方图已保存: {hist_file}")

            # Histogram insights; the peak is the centre of the fullest bin.
            hist_insights_file = os.path.join(insights_dir, f'label_{label_value}_hist_insights_{col}.txt')
            peak_bin_index = np.argmax(counts)
            peak_bin_center = (bins[peak_bin_index] + bins[peak_bin_index + 1]) / 2
            hist_text = _save_text(
                hist_insights_file,
                f"Label {label_value} - 数值特征 {col} 直方图分析:\n"
                f"  - 数据分布大致形状:  (请查看直方图 {hist_file} )\n"
                f"  - 峰值大致位置 (众数): {peak_bin_center:.2f}\n"
                f"  - 可能的分布类型: (需要人工判断).\n"
                + _numeric_summary_text(series),
            )
            print(f"    - {col} 直方图逻辑已保存: {hist_insights_file}")
            summary_content += f"  3.1. 数值特征单变量分析 - {col} (Label {label_value}, 详细逻辑见 {hist_insights_file}, 图表见 {hist_file}):\n"
            summary_content += hist_text

            plt.figure(figsize=(8, 6))
            sns.boxplot(y=series.dropna())
            plt.title(f'Label {label_value} - 数值特征箱线图 - {col}')
            plt.ylabel(col)
            boxplot_file = os.path.join(label_dir, f'boxplot_{col}.png')
            plt.savefig(boxplot_file)
            plt.close()
            print(f"    - {col} 箱线图已保存: {boxplot_file}")

            # Box-plot insights
            boxplot_insights_file = os.path.join(insights_dir, f'label_{label_value}_boxplot_insights_{col}.txt')
            boxplot_text = _save_text(
                boxplot_insights_file,
                f"Label {label_value} - 数值特征 {col} 箱线图分析:\n"
                + _boxplot_summary_text(series, boxplot_file),
            )
            print(f"    - {col} 箱线图逻辑已保存: {boxplot_insights_file}")
            summary_content += f"  3.1. 数值特征单变量分析 - {col} (Label {label_value}, 详细逻辑见 {boxplot_insights_file}, 图表见 {boxplot_file}):\n"
            summary_content += boxplot_text

        # Categorical features: count plot per column
        print("  - 类别特征单变量分析...")
        for col in categorical_features:
            if col == label_col:
                continue
            plt.figure(figsize=(8, 6))
            sns.countplot(x=col, data=label_data)
            plt.title(f'Label {label_value} - 类别特征分布 - {col}')
            plt.xlabel(col)
            plt.ylabel('计数')
            plt.xticks(rotation=45, ha='right')
            plt.tight_layout()
            countplot_file = os.path.join(label_dir, f'countplot_{col}.png')
            plt.savefig(countplot_file)
            plt.close()
            print(f"    - {col} 柱状图已保存: {countplot_file}")

            # Count-plot insights
            countplot_insights_file = os.path.join(insights_dir, f'label_{label_value}_countplot_insights_{col}.txt')
            countplot_text = _save_text(
                countplot_insights_file,
                f"Label {label_value} - 类别特征 {col} 柱状图分析:\n"
                + _category_summary_text(label_data[col]),
            )
            print(f"    - {col} 柱状图逻辑已保存: {countplot_insights_file}")
            summary_content += f"  3.2. 类别特征单变量分析 - {col} (Label {label_value}, 详细逻辑见 {countplot_insights_file}, 图表见 {countplot_file}):\n"
            summary_content += countplot_text

    # 4. Build the "Delirium vs. No Delirium" grouping
    data['label_group'] = data[label_col].apply(
        lambda x: 'Delirium' if x in delirium_labels else 'No Delirium'
    )
    group_label_col = 'label_group'
    group_dir = os.path.join(output_dir, 'delirium_vs_no_delirium')
    os.makedirs(group_dir, exist_ok=True)
    print(f"\n4. \"谵妄 vs. 非谵妄\" 分组 EDA 分析...")
    summary_content += f"\n4. \"谵妄 vs. 非谵妄\" 分组 EDA 分析:\n"

    # 5. Univariate analysis of the grouped data
    print("  - 分组数据单变量分析...")
    group_numerical_features = data.select_dtypes(include=np.number).columns.tolist()
    group_categorical_features = data.select_dtypes(exclude=np.number).columns.tolist()
    # Neither the raw label nor the derived group label is analysed as a feature.
    excluded_cols = [label_col, group_label_col]

    # Numeric features
    print("    - 数值特征单变量分析...")
    for col in group_numerical_features:
        if col in excluded_cols:
            continue
        series = data[col]

        plt.figure(figsize=(8, 6))
        plt.hist(series.dropna(), bins=30)
        plt.title(f'分组 - 数值特征分布 - {col}')
        plt.xlabel(col)
        plt.ylabel('频率')
        hist_file = os.path.join(group_dir, f'hist_{col}.png')
        plt.savefig(hist_file)
        plt.close()
        print(f"      - {col} 直方图已保存: {hist_file}")

        # Histogram insights
        hist_insights_file = os.path.join(insights_dir, f'group_hist_insights_{col}.txt')
        hist_text = _save_text(
            hist_insights_file,
            f"分组 - 数值特征 {col} 直方图分析:\n"
            f"  - 数据分布大致形状:  (请查看直方图 {hist_file} )\n"
            + _numeric_summary_text(series),
        )
        print(f"      - {col} 直方图逻辑已保存: {hist_insights_file}")
        summary_content += f"  4.1. 分组数据单变量分析 - 数值特征 {col} (详细逻辑见 {hist_insights_file}, 图表见 {hist_file}):\n"
        summary_content += hist_text

        plt.figure(figsize=(8, 6))
        sns.boxplot(y=series.dropna())
        plt.title(f'分组 - 数值特征箱线图 - {col}')
        plt.ylabel(col)
        boxplot_file = os.path.join(group_dir, f'boxplot_{col}.png')
        plt.savefig(boxplot_file)
        plt.close()
        print(f"      - {col} 箱线图已保存: {boxplot_file}")

        # Box-plot insights
        boxplot_insights_file = os.path.join(insights_dir, f'group_boxplot_insights_{col}.txt')
        boxplot_text = _save_text(
            boxplot_insights_file,
            f"分组 - 数值特征 {col} 箱线图分析:\n"
            + _boxplot_summary_text(series, boxplot_file),
        )
        print(f"      - {col} 箱线图逻辑已保存: {boxplot_insights_file}")
        summary_content += f"  4.1. 分组数据单变量分析 - 数值特征 {col} (详细逻辑见 {boxplot_insights_file}, 图表见 {boxplot_file}):\n"
        summary_content += boxplot_text

    # Categorical features. This loop runs once, after the numeric loop, so
    # each categorical column is analysed and summarised exactly one time.
    print("    - 类别特征单变量分析...")
    for col in group_categorical_features:
        if col in excluded_cols:
            continue
        plt.figure(figsize=(8, 6))
        sns.countplot(x=col, data=data)
        plt.title(f'分组 - 类别特征分布 - {col}')
        plt.xlabel(col)
        plt.ylabel('计数')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        countplot_file = os.path.join(group_dir, f'countplot_{col}.png')
        plt.savefig(countplot_file)
        plt.close()
        print(f"      - {col} 柱状图已保存: {countplot_file}")

        # Count-plot insights
        countplot_insights_file = os.path.join(insights_dir, f'group_countplot_insights_{col}.txt')
        countplot_text = _save_text(
            countplot_insights_file,
            f"分组 - 类别特征 {col} 柱状图分析:\n"
            + _category_summary_text(data[col]),
        )
        print(f"      - {col} 柱状图逻辑已保存: {countplot_insights_file}")
        summary_content += f"  4.2. 分组数据单变量分析 - 类别特征 {col} (详细逻辑见 {countplot_insights_file}, 图表见 {countplot_file}):\n"
        summary_content += countplot_text

    # 6. Bivariate analysis of the grouped data (feature vs. label_group)
    print("  - 分组数据的双变量分析 (vs. label_group)...")
    group_values = sorted(data[group_label_col].unique())

    # Numeric feature vs. label_group
    print("    - 数值特征 vs. label_group...")
    for col in group_numerical_features:
        if col in excluded_cols:
            continue
        plt.figure(figsize=(8, 6))
        sns.boxplot(data=data, x=group_label_col, y=col)
        plt.title(f'分组 - 数值特征 vs. 分组标签 - {col} vs. {group_label_col}')
        plt.xlabel(group_label_col)
        plt.ylabel(col)
        boxplot_vs_group_file = os.path.join(group_dir, f'boxplot_{col}_vs_group_label.png')
        plt.savefig(boxplot_vs_group_file)
        plt.close()
        print(f"      - {col} vs. {group_label_col} 箱线图已保存: {boxplot_vs_group_file}")

        # Insights: median and IQR of the feature inside every group
        boxplot_vs_group_insights_file = os.path.join(insights_dir, f'group_boxplot_vs_label_group_insights_{col}.txt')
        vs_text = f"分组 - 数值特征 {col} vs. 分组标签 {group_label_col} 箱线图分析:\n"
        vs_text += f"  - 不同分组标签下 {col} 的分布差异: (请查看箱线图 {boxplot_vs_group_file} )\n"
        for group_value in group_values:
            group_data = data[data[group_label_col] == group_value][col].dropna()
            median_val = group_data.median()
            iqr_val = group_data.quantile(0.75) - group_data.quantile(0.25)
            vs_text += f"  - 分组标签 {group_value}: 中位数 (Median) = {median_val:.2f}, 四分位数范围 (IQR) = {iqr_val:.2f}\n"
        vs_text += f"  - 比较不同分组标签下 {col} 的中位数、IQR、异常值等，判断 {col} 是否在不同分组间有显著差异。\n"
        _save_text(boxplot_vs_group_insights_file, vs_text)
        print(f"      - {col} vs. {group_label_col} 箱线图逻辑已保存: {boxplot_vs_group_insights_file}")
        summary_content += f"\n4.3. 分组数据双变量分析 - 数值特征 vs. 分组标签 - {col} vs. {group_label_col} (详细逻辑见 {boxplot_vs_group_insights_file}, 图表见 {boxplot_vs_group_file}):\n"
        summary_content += vs_text

    # Categorical feature vs. label_group (again run once, outside the numeric loop)
    print("    - 类别特征 vs. label_group...")
    for col in group_categorical_features:
        if col in excluded_cols:
            continue
        plt.figure(figsize=(8, 6))
        sns.countplot(data=data, x=col, hue=group_label_col)
        plt.title(f'分组 - 类别特征 vs. 分组标签 - {col} vs. {group_label_col}')
        plt.xlabel(col)
        plt.ylabel('计数')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        countplot_vs_group_file = os.path.join(group_dir, f'countplot_{col}_vs_group_label.png')
        plt.savefig(countplot_vs_group_file)
        plt.close()
        print(f"      - {col} vs. {group_label_col} 柱状图已保存: {countplot_vs_group_file}")

        # Insights: category percentages of the feature inside every group
        countplot_vs_group_insights_file = os.path.join(insights_dir, f'group_countplot_vs_label_group_insights_{col}.txt')
        vs_text = f"分组 - 类别特征 {col} vs. 分组标签 {group_label_col} 柱状图分析:\n"
        vs_text += f"  - 不同分组标签下 {col} 的类别分布差异: (请查看柱状图 {countplot_vs_group_file} )\n"
        for group_value in group_values:
            vs_text += f"  - 分组标签 {group_value} 中 {col} 的类别分布:\n"
            group_category_counts = data[data[group_label_col] == group_value][col].value_counts(normalize=True) * 100
            vs_text += str(group_category_counts.round(2)) + "%\n"
        vs_text += f"  - 观察不同分组标签中，{col} 各类别的比例差异，判断 {col} 是否在不同分组间有分布差异。\n"
        _save_text(countplot_vs_group_insights_file, vs_text)
        print(f"      - {col} vs. {group_label_col} 柱状图逻辑已保存: {countplot_vs_group_insights_file}")
        summary_content += f"\n4.4. 分组数据双变量分析 - 类别特征 vs. 分组标签 - {col} vs. {group_label_col} (详细逻辑见 {countplot_vs_group_insights_file}, 图表见 {countplot_vs_group_file}):\n"
        summary_content += vs_text

    # 7. Missing-value analysis - whole data set (reported to the user as step 5)
    print("\n5. 缺失值分析 (Missing Value Analysis) - 整体数据...")
    missing_value_counts = data.isnull().sum().sort_values(ascending=False)
    missing_value_percentages = (data.isnull().sum() / len(data)).sort_values(ascending=False)
    missing_info = pd.concat([missing_value_counts, missing_value_percentages], axis=1, keys=['缺失值计数', '缺失值比例'])
    missing_value_analysis_file = os.path.join(overall_dir, 'missing_value_analysis.csv')
    missing_info.to_csv(missing_value_analysis_file, encoding='utf-8')
    print(f"  - 整体数据缺失值统计已保存到: {missing_value_analysis_file}")

    # Missing-value insights
    missing_insights_file = os.path.join(insights_dir, 'missing_value_insights.txt')
    missing_text = "缺失值分析 - 整体数据:\n"
    missing_text += "  - 各列缺失值计数和比例 (详细信息见 missing_value_analysis.csv):\n"
    missing_text += str(missing_info) + "\n"
    top_missing_cols = missing_info[missing_info['缺失值比例'] > 0.1]  # columns with more than 10% missing
    if not top_missing_cols.empty:
        # The label matches the strict comparison used above.
        missing_text += "  - 缺失值比例较高的列 (> 10%):\n"
        missing_text += str(top_missing_cols) + "\n"
    else:
        missing_text += "  - 无列缺失值比例高于 10%.\n"
    missing_text += "  - 缺失值模式: (如果绘制了 missing_value_matrix.png，请查看矩阵了解缺失值是否呈现特定模式).\n"
    _save_text(missing_insights_file, missing_text)
    print(f"  - 整体数据缺失值逻辑已保存到: {missing_insights_file}")
    summary_content += f"\n5. 缺失值分析 - 整体数据 (详细逻辑见 {missing_insights_file}, 统计信息见 {missing_value_analysis_file}):\n"
    summary_content += missing_text

    if HAS_MISSINGNO:
        print("  - 缺失值可视化 (如果 missingno 已安装)...")
        # missingno may draw on a figure of its own rather than on this one, so
        # the pre-created figure is closed explicitly as well (closing an
        # already-closed figure is a no-op) to avoid leaking open figures.
        placeholder_fig = plt.figure(figsize=(10, 6))
        msno.matrix(data)
        plt.title('缺失值模式矩阵')
        missing_value_matrix_file = os.path.join(overall_dir, 'missing_value_matrix.png')
        plt.savefig(missing_value_matrix_file)
        plt.close()
        plt.close(placeholder_fig)
        print(f"    - 缺失值矩阵可视化已保存: {missing_value_matrix_file}")

        placeholder_fig = plt.figure(figsize=(10, 6))
        msno.bar(data)
        plt.title('各列缺失值柱状图')
        missing_value_bar_file = os.path.join(overall_dir, 'missing_value_bar.png')
        plt.savefig(missing_value_bar_file)
        plt.close()
        plt.close(placeholder_fig)
        print(f"    - 缺失值柱状图可视化已保存: {missing_value_bar_file}")
        summary_content += f"\n5.1. 缺失值可视化 - 整体数据 (图表见 {missing_value_matrix_file}, {missing_value_bar_file}):\n"
        summary_content += f"  - (请查看 {missing_value_matrix_file} 和 {missing_value_bar_file} 图片了解缺失值可视化信息)\n"
    else:
        print("  - 跳过缺失值可视化，因为 missingno 库未安装。")
        summary_content += "\n5.1. 缺失值可视化 - 整体数据: 跳过缺失值可视化，因为 missingno 库未安装。\n"

    # Write the concatenated summary of every insights text.
    summary_content += "\n===================\nEDA 分析完成，详细结果已保存到目录: " + output_dir
    _save_text(summary_insights_file, summary_content)
    print(f"\nEDA 分析完成，结果已保存到目录: {output_dir}")
    print(f"  - 图片逻辑文本 insights 已保存到目录: {insights_dir}")
    print(f"  - **总的文本逻辑总结已保存到: {summary_insights_file}**")

# Run the full EDA for both cohorts, eICU first, each into its own directory.
for cohort_frame, cohort_dir in (
    (eicu_data, 'eicu_eda_results'),
    (mimic_data, 'mimic_eda_results'),
):
    perform_eda(cohort_frame, label_col='label', output_dir=cohort_dir)

开始 EDA 分析，结果将保存到目录: eicu_eda_results

1. 数据概览 (Data Overview) - 整体数据...
  - 整体数据概览信息已保存到: eicu_eda_results\overall\data_overview.txt

2. 描述性统计 (Descriptive Statistics) - 整体数据...
  - 整体数据描述性统计信息已保存到: eicu_eda_results\overall\descriptive_statistics.csv

3. Label 0 单变量分析...
  - 数值特征单变量分析...
    - age 直方图已保存: eicu_eda_results\label_0\hist_age.png
    - age 直方图逻辑已保存: eicu_eda_results\insights\label_0_hist_insights_age.txt
    - age 箱线图已保存: eicu_eda_results\label_0\boxplot_age.png
    - age 箱线图逻辑已保存: eicu_eda_results\insights\label_0_boxplot_insights_age.txt
    - gender_Male 直方图已保存: eicu_eda_results\label_0\hist_gender_Male.png
    - gender_Male 直方图逻辑已保存: eicu_eda_results\insights\label_0_hist_insights_gender_Male.txt
    - gender_Male 箱线图已保存: eicu_eda_results\label_0\boxplot_gender_Male.png
    - gender_Male 箱线图逻辑已保存: eicu_eda_results\insights\label_0_boxplot_insights_gender_Male.txt
    - heartrate 直方图已保存: eicu_eda_results\label_0\hist_heartrate.png
    - heartrate 直方图逻辑已保存: eicu_eda_resu