使用t_xsjbxxb中筛选得到的计算机学生名单，在t_cj_ben提取cs15-22的所有成绩，保存到t_cj_cs15-22
```sql
-- Step 1: list the students whose class name (bjmc) contains "计算机",
-- matched case-insensitively.
-- NOTE(review): step 2 reads liwenhao.t_jsj_xsjbxxb, which is presumably the
-- saved result of this query -- confirm the table name.
SELECT *
FROM liwenhao.t_xsjbxxb
WHERE bjmc ILIKE '%计算机%';

-- Step 2: fetch every row of t_cj_ben whose student id (xh) is in that list.
SELECT *
FROM liwenhao.t_cj_ben
WHERE xh IN (SELECT xh FROM liwenhao.t_jsj_xsjbxxb);
```
简单预处理

In [None]:
import pandas as pd
# Variant spellings of course names mapped to the canonical spelling.
# Entries are applied in this order as literal substring replacements.
COURSE_NAME_FIXES = {
    '高等数学I1': '高等数学I-1',
    '高等数学I2': '高等数学I-2',
    '线性代数与解析几何II': '线性代数与解析几何',
    '金工实习Ⅰ': '金工实习',
    '电工实习Ⅰ': '电工实习',
    '电子技术实验-1': '电子技术实验1',
    '电子技术实验-2': '电子技术实验2',
    '大学物理I1': '大学物理I-1',
    '大学物理I2': '大学物理I-2',
    '大学物理II1': '大学物理II-1',
    '大学物理II2': '大学物理II-2',
    '大学物理实验I1': '大学物理实验I-1',
    '大学物理实验I2': '大学物理实验I-2',
    '工程制图III': '工程制图',
}


def normalize_course_names(names):
    """Return the course names stripped and with known variants unified.

    Args:
        names: pandas Series of course-name strings (the ``kcm`` column).

    Returns:
        A new Series; leading/trailing whitespace is removed and every
        key of ``COURSE_NAME_FIXES`` is replaced by its value.
    """
    cleaned = names.str.strip()
    for variant, canonical in COURSE_NAME_FIXES.items():
        # regex=False keeps the match literal on every pandas version
        # (the default of this parameter changed in pandas 2.0).
        cleaned = cleaned.str.replace(variant, canonical, regex=False)
    return cleaned


# The guard still runs inside a notebook/script (where __name__ is
# "__main__") and keeps `data` a module-level name for the following cells.
if __name__ == "__main__":
    data = pd.read_csv('t_cj_cs15-22_origin.csv')

    data['kcm'] = normalize_course_names(data['kcm'])

    # Drop rows whose zcj is missing, then drop exact duplicate rows.
    data.dropna(subset=['zcj'], inplace=True)
    data.drop_duplicates(inplace=True)

    # Save the cleaned data set to a new CSV file.
    data.to_csv('t_cj_cs15-22.csv', index=False)

转化为学号-成绩表格，有多个成绩（补考）的取最高值

In [None]:
# Order the records so that, for each student, higher zcj values come first.
data = data.sort_values(by=['xh', 'zcj'], ascending=[True, False])

# Collapse to one row per (student, course). groupby keeps the sorted order
# inside each group, so first() picks the highest zcj of that pair.
data = data.groupby(['xh', 'kcm']).first().reset_index()

# Reshape to one row per student and one column per course;
# a course with no record for a student is filled with 0.
pivot_table = data.pivot_table(
    values='zcj',
    index='xh',
    columns='kcm',
    aggfunc='sum',
    fill_value=0,
)

# Write the student-by-course matrix to disk.
pivot_table.to_csv('cs15-22_all.csv')

筛选30%选课率以上的必修课程

In [None]:
import csv

# Input: student-by-course matrix; output: the same matrix restricted to the
# courses that pass the enrollment threshold.
input_file = 'cs15-22_all.csv'
output_file = 'cs15-22_compulsory.csv'

# Threshold in percent (30%).
threshold = 30


def filter_courses_by_enrollment(src_path, dst_path, min_enrollment_pct,
                                 id_column="xh"):
    """Copy a CSV, keeping only columns that are non-zero often enough.

    A cell counts as "not enrolled" when it is blank or parses to 0.0.
    A column is kept when its share of such cells, in percent, is strictly
    below ``100 - min_enrollment_pct``. The ``id_column`` is always kept
    and written first; the remaining kept columns follow in header order.

    Args:
        src_path: path of the input CSV (UTF-8, with a header row).
        dst_path: path of the CSV to write.
        min_enrollment_pct: threshold in percent.
        id_column: header name of the identifier column.

    Returns:
        The list of header names written to ``dst_path``.

    Raises:
        ValueError: if the input has no header row, ``id_column`` is not in
            the header, or a non-blank cell is not numeric.
    """
    with open(src_path, 'r', newline='', encoding='utf-8') as infile:
        reader = csv.reader(infile)
        header = next(reader, None)
        if header is None:
            raise ValueError("'{}' has no header row".format(src_path))
        id_index = header.index(id_column)
        # csv.reader yields [] for blank lines; they carry no data.
        rows = [row for row in reader if row]

    # Count the "not enrolled" cells of every non-id column.
    zero_counts = [0] * len(header)
    for row in rows:
        for col_index, value in enumerate(row):
            if col_index == id_index:
                continue
            if not value.strip() or float(value) == 0.0:
                zero_counts[col_index] += 1

    # Select columns by position, so duplicate header names cannot be
    # confused with one another. With no data rows the share is undefined,
    # so only the id column is kept.
    total_rows = len(rows)
    kept_indices = [id_index]
    if total_rows:
        for col_index, zero_count in enumerate(zero_counts):
            if col_index == id_index:
                continue
            zero_percentage = (zero_count / total_rows) * 100
            if zero_percentage < 100 - min_enrollment_pct:
                kept_indices.append(col_index)

    kept_names = [header[i] for i in kept_indices]
    with open(dst_path, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(kept_names)
        writer.writerows([row[i] for i in kept_indices] for row in rows)
    return kept_names


# The guard still runs inside a notebook/script, where __name__ is "__main__".
if __name__ == "__main__":
    filter_courses_by_enrollment(input_file, output_file, threshold)
    print("处理完成，结果保存在 '{}' 文件中".format(output_file))

手动剔除体育类、政治类等课程，保存在cs15-22_main.csv中

正态性检验及处理

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import probplot, shapiro, kstest, anderson

def find_non_normal_columns(frame, w_threshold=0.9):
    """Return the columns whose Shapiro-Wilk W statistic is below a threshold.

    Missing values are dropped per column before testing. The decision uses
    the W statistic itself, not the p-value.

    Args:
        frame: pandas DataFrame with numeric columns (NaN allowed).
        w_threshold: columns with W strictly below this value are reported.

    Returns:
        List of column names, in the frame's column order.
    """
    flagged = []
    for column_name in frame.columns:
        column_data = frame[column_name].dropna()
        statistic, _p_value = shapiro(column_data)
        if statistic < w_threshold:
            flagged.append(column_name)
    return flagged


# The guard still runs inside a notebook/script (where __name__ is
# "__main__") and keeps `data` / `non_normal` module-level names.
if __name__ == "__main__":
    # Use the SimSun font for matplotlib text.
    plt.rcParams['font.sans-serif'] = ['SimSun']

    data = pd.read_csv('cs15-22_main.csv')

    # Turn zero cells into NaN. A float NaN keeps the columns numeric, so no
    # save-and-reload round trip is needed before the tests; the intermediate
    # file is still written for reference.
    data = data.replace(0, float('nan'))
    data.to_csv('cs15-22_main_NULL.csv', index=False)

    # NOTE(review): every column is tested, including an identifier column
    # such as xh if cs15-22_main.csv still contains one -- confirm.
    non_normal = find_non_normal_columns(data)

    data = data.astype(float)

    # Columns not flagged above: kept as continuous variables.
    # 'cs15-22_continuous.csv' is the file the later R and IAMB steps read.
    data.drop(columns=non_normal).to_csv('cs15-22_continuous.csv', index=False)

    # Flagged columns: written separately, to be discretized.
    data = data.filter(items=non_normal)
    data.to_csv('cs15-22_discrete.csv', index=False)

```r
# Discretize every column into 5 categories using bnlearn's "interval" method.
library(bnlearn)
data <- read.csv("cs15-22_discrete.csv")
discretized_data <- discretize(data, method = "interval", breaks = 5)
write.csv(discretized_data, file = "cs15-22_discretized_data.csv", row.names = FALSE)

# Nonparanormal (npn) transformation to make the continuous columns closer to Gaussian.
library(huge)
data <- read.csv("cs15-22_continuous.csv")
data_normalized <- huge.npn(data)
write.csv(data_normalized, file = "cs15-22_continuous_normalized.csv", row.names = FALSE)
```

检验npn后的高斯性

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import probplot, shapiro, kstest, anderson

def ks_normality_check(values, alpha=0.05):
    """Run a one-sample Kolmogorov-Smirnov test against the normal 'norm'.

    The KS statistic is a distance (smaller means a closer fit), so the
    decision is taken on the p-value rather than on the statistic.

    Args:
        values: 1-D sequence of numbers without missing values.
        alpha: significance level; normality is rejected when p < alpha.

    Returns:
        Tuple ``(ks_statistic, p_value, rejected)`` where ``rejected`` is a
        plain bool.
    """
    # NOTE(review): 'norm' without extra arguments is the standard normal;
    # this assumes the npn output is already on that scale -- confirm.
    ks_statistic, p_value = kstest(values, 'norm')
    return ks_statistic, p_value, bool(p_value < alpha)


# The guard still runs inside a notebook/script, where __name__ is "__main__".
if __name__ == "__main__":
    # Use the SimSun font for matplotlib text.
    plt.rcParams['font.sans-serif'] = ['SimSun']
    data = pd.read_csv('cs15-22_continuous_normalized.csv')

    # Columns rejected by the test in this run (starts empty so the result
    # does not depend on any earlier cell).
    non_normal = []

    for column_name in data.columns:
        # Values of the current column, without missing entries.
        column_data = data[column_name].dropna()

        # Q-Q plot against the normal distribution.
        probplot(column_data, dist="norm", plot=plt)
        plt.title(f"{column_name}的Q-Q图")
        plt.show()

        # Kolmogorov-Smirnov test and report.
        ks_statistic, p_value, rejected = ks_normality_check(column_data)
        print(f'{column_name}:')
        print('  Kolmogorov-Smirnov统计量：', ks_statistic)
        print('  p-value：', p_value)
        if rejected:
            print("  不符合正态分布")
            non_normal.append(column_name)
        else:
            print("  符合正态分布")

测试因果图：离散化数据（PC）、原始连续数据（IAMB）和npn数据（IAMB）

In [None]:
import pandas as pd
import networkx as nx
import cdt.causality.graph as graph
import os

# Load the discretized table, fill missing cells with the string '0',
# and write the result back over the same file.
discretized_csv = 'cs15-22_discretized_data.csv'
data = pd.read_csv(discretized_csv).fillna('0')
data.to_csv(discretized_csv, index=False)

# Run the PC algorithm at significance level 0.05.
# NOTE(review): the input here is discretized, yet CItest is "gaussian" --
# confirm this is the intended conditional-independence test.
model = graph.PC(CItest="gaussian", alpha=0.05)
output = model.predict(data)

# Export the predicted graph as DOT, then render it to SVG with Graphviz.
nx.drawing.nx_agraph.write_dot(output, 'cs15_22_PC.dot')
os.system('dot -Tsvg cs15_22_PC.dot -o cs15_22_PC.svg')

In [10]:
import pandas as pd
import networkx as nx
import cdt.causality.graph as graph
import os

# Load the continuous (untransformed) columns as floats.
data = pd.read_csv('cs15-22_continuous.csv').astype(float)

# Run IAMB at significance level 0.05.
model = graph.IAMB(alpha=0.05)
output = model.predict(data)

# Export the predicted graph as DOT, then render it to SVG with Graphviz.
nx.drawing.nx_agraph.write_dot(output, 'cs15_22_IAMB.dot')
os.system('dot -Tsvg cs15_22_IAMB.dot -o cs15_22_IAMB.svg')

0

In [11]:
import pandas as pd
import networkx as nx
import cdt.causality.graph as graph
import os

# Load the npn-transformed columns.
data = pd.read_csv('cs15-22_continuous_normalized.csv')

# Run IAMB with the library's default settings.
model = graph.IAMB()
output = model.predict(data)

# Export the predicted graph as DOT, then render it to SVG with Graphviz.
nx.drawing.nx_agraph.write_dot(output, 'cs15_22_IAMB_normalized.dot')
os.system('dot -Tsvg cs15_22_IAMB_normalized.dot -o cs15_22_IAMB_normalized.svg')

0

混合离散和npn数据