In [2]:
import pandas as pd
from scipy.stats import mannwhitneyu, chi2_contingency

In [63]:
data = "./20240820 对数加一训练集和测试集.xlsx"
# 载入数据集
dataset = pd.DataFrame(pd.read_excel(data, sheet_name="训练集"))
x_train = dataset[dataset.columns[1:]]
y_train = dataset[['Group']].values.ravel()

dataset_test = pd.DataFrame(pd.read_excel(data, sheet_name="测试集"))
x_test = dataset_test[dataset_test.columns[1:]]
y_test = dataset_test[['Group']].values.ravel()

print(x_train.shape)

(300, 32)


In [81]:
df1 = x_train
df2 = x_test
# 添加新列标识来源
df1.loc[:, 'source'] = 1  # 来自df1的行标记为1
df2.loc[:, 'source'] = 2  # 来自df2的行标记为2

# 合并两个DataFrame
merged_df = pd.concat([df1, df2], ignore_index=True)

# 获取特征列
features = df1.columns

# 创建一个空的结果列表，用于存储统计检验结果
results = []

# 遍历每个特征进行统计检验
for feature in features:
    # 判断特征是否为分类变量
    if df1[feature].dtype == 'object' or len(df1[feature].unique()) < 10:
        # 对于分类变量，进行卡方检验
        contingency_table = pd.crosstab(merged_df['source'], merged_df[feature])
        chi2, p, _, _ = chi2_contingency(contingency_table)
        test = "Chi-Square"
    else:
        # 对于连续性变量，进行Mann-Whitney U检验
        stat, p = mannwhitneyu(df1[feature], df2[feature], alternative='two-sided')
        test = "Mann-Whitney U"

    # 判断显著性水平
    significant = "Yes" if p < 0.05 else "No"
    
    # 将结果加入列表
    results.append({"Feature": feature, "Test": test, "p-value": p, "Significant": significant})

# 将结果转换为DataFrame并展示
results_df = pd.DataFrame(results)
print(results_df)

    Feature            Test        p-value Significant
0    Gender      Chi-Square   8.035581e-02          No
1     Fever      Chi-Square   9.736668e-01          No
2     Cough      Chi-Square   3.928736e-01          No
3      Rash      Chi-Square   5.255603e-04         Yes
4       Age  Mann-Whitney U   2.429202e-04         Yes
5       WBC  Mann-Whitney U   3.006571e-02         Yes
6       NEU  Mann-Whitney U   9.029052e-02          No
7       LYM  Mann-Whitney U   3.440809e-03         Yes
8       MON  Mann-Whitney U   4.276373e-02         Yes
9       EOS  Mann-Whitney U   5.869269e-02          No
10      BAS  Mann-Whitney U   8.058018e-01          No
11       HB  Mann-Whitney U   7.364868e-02          No
12      RBC  Mann-Whitney U   1.707256e-01          No
13      PLT  Mann-Whitney U   8.278980e-01          No
14      ALT  Mann-Whitney U   6.875823e-01          No
15      AST  Mann-Whitney U   5.980592e-01          No
16      CD4  Mann-Whitney U   2.694428e-08         Yes
17      LD

In [83]:
results_df.to_excel('./组间比较(十六支队).xlsx', index=False)

# 组内

In [64]:
import pandas as pd
from scipy.stats import mannwhitneyu, chi2_contingency

# 读取Excel文件
df1 = dataset
df2 = dataset_test

# 定义一个函数来对每个数据集的 Group=0 和 Group=1 组进行比较
def compare_groups(df):
    # 根据 Group 列将数据分成两组,用于进行Mann-Whitney U检验
    group0 = df[df['Group'] == 0].drop(columns=["Group"])
    group1 = df[df['Group'] == 1].drop(columns=["Group"])
    
    # 创建一个空的结果列表，用于存储统计检验结果
    results = []

    # 遍历每个特征列
    for feature in df.columns:
        # 判断特征是否为分类变量
        if df[feature].dtype == 'object' or len(df[feature].unique()) < 10:
            # 对于分类变量，进行卡方检验
            contingency_table = pd.crosstab(df['Group'], df[feature])
            chi2, p, _, _ = chi2_contingency(contingency_table)
            test = "Chi-Square"
        else:
            # 对于连续性变量，进行Mann-Whitney U检验
            stat, p = mannwhitneyu(group0[feature], group1[feature], alternative='two-sided')
            test = "Mann-Whitney U"
        
        # 判断显著性水平
        significant = "Yes" if p < 0.05 else "No"
        
        # 将结果加入列表
        results.append({"Feature": feature, "Test": test, "p-value": p, "Significant": significant})

    # 将结果转换为DataFrame返回
    return pd.DataFrame(results)

# 分别对两个数据集的 Group=0 和 Group=1 组进行比较
results_df1 = compare_groups(df1)
results_df2 = compare_groups(df2)

# 打印结果
print("Dataset 1 Comparison:")
print(results_df1)
print("\nDataset 2 Comparison:")
print(results_df2)

Dataset 1 Comparison:
    Feature            Test       p-value Significant
0     Group      Chi-Square  5.685911e-66         Yes
1    Gender      Chi-Square  8.224273e-02          No
2     Fever      Chi-Square  5.756183e-01          No
3     Cough      Chi-Square  2.140542e-02         Yes
4      Rash      Chi-Square  4.423077e-02         Yes
5       Age  Mann-Whitney U  6.863003e-01          No
6       WBC  Mann-Whitney U  3.405992e-02         Yes
7       NEU  Mann-Whitney U  8.506917e-02          No
8       LYM  Mann-Whitney U  6.682651e-01          No
9       MON  Mann-Whitney U  3.541727e-02         Yes
10      EOS  Mann-Whitney U  4.665225e-01          No
11      BAS  Mann-Whitney U  3.180265e-01          No
12       HB  Mann-Whitney U  1.935023e-02         Yes
13      RBC  Mann-Whitney U  1.471122e-02         Yes
14      PLT  Mann-Whitney U  6.780992e-06         Yes
15      ALT  Mann-Whitney U  2.056083e-02         Yes
16      AST  Mann-Whitney U  1.826453e-04         Yes
17    

In [67]:
results_df1.to_excel('./组内比较trian(十六支队).xlsx', index=False)
results_df2.to_excel('./组内比较test(十六支队).xlsx', index=False)