### 检查嵌入分布

In [None]:
import numpy as np
import matplotlib.pyplot as plt


# Load the exercise embedding matrix from disk and report its shape.
arr = np.load('data/NIPS34/all/exer_embeds.npy')
print(arr.shape)

# L2 norm taken along axis 1, i.e. one value per row of `arr`.
row_norms = np.linalg.norm(arr, axis=1)

# Histogram of the per-row norms, drawn through the Axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(
    row_norms,
    bins=50,
    color='skyblue',
    edgecolor='black',
    alpha=0.7,
)
ax.set_title('Row-wise L2 Norm Distribution')
ax.set_xlabel('Norm Value')
ax.set_ylabel('Frequency')
ax.grid(axis='y', alpha=0.3)  # faint horizontal guide lines only
plt.show()

# Print the first five norms as a quick sanity check.
print("前5行的范数值：")
print(row_norms[:5])

### 将csv格式的数据转换为NCDM标准输入格式

In [5]:
import pandas as pd
import json
import ast


# Scenario = sub-directory of data/NIPS34 to convert.
# Other values previously used here: 'Algebra', 'GeometryandMeasure', 'Number'.
senario = 'Algebra_cold'

# Split to convert. Other values previously used here: 'train', 'val'.
file_name = 'test'

# The JSON output is written next to the CSV input, with the same stem.
root_dir = f'data/NIPS34/{senario}'
file_in = f'{root_dir}/{file_name}.csv'
file_out = f'{root_dir}/{file_name}.json'

# Read the CSV, rename the columns to the target field names and drop the
# columns that are not carried over into the JSON records.
df = (
    pd.read_csv(file_in)
    .rename(columns={
        'QuestionId': 'exer_id',
        'UserId': 'user_id',
        'IsCorrect': 'score',
        'Kc': 'knowledge_code',
    })
    .drop(columns=['Time', 'AnswerValue', 'CorrectAnswer', 'AnswerId'])
)

# One dict per row. 'knowledge_code' is held in the CSV as the text of a
# Python literal, so it is parsed back into a Python object here
# (presumably a list of knowledge-concept ids -- confirm against the data).
dict_list = [
    {**record, 'knowledge_code': ast.literal_eval(record['knowledge_code'])}
    for record in df.to_dict(orient='records')
]

# Write the records as pretty-printed UTF-8 JSON without ASCII escaping.
with open(file_out, 'w', encoding='utf-8') as json_file:
    json.dump(dict_list, json_file, indent=4, ensure_ascii=False)


### 逆操作：从json转csv

In [8]:
import pandas as pd
import json


# Scenario = sub-directory of data/NIPS34 to convert.
senario = 'longtail'

# Split to convert. Other value previously used here: 'train'.
file_name = 'test'

# The CSV output is written next to the JSON input, with the same stem.
root_dir = f'data/NIPS34/{senario}'
file_in = f'{root_dir}/{file_name}.json'
file_out = f'{root_dir}/{file_name}.csv'

# Load the list of response records from the JSON file.
with open(file_in, 'r', encoding='utf-8') as f:
    dict_list = json.load(f)

df = pd.DataFrame(dict_list)

# Only 'exer_id' changes name; 'user_id' and 'score' are kept as they are.
df = df.rename(columns={'exer_id': 'item_id'})

# Columns written to the CSV, in output order. Every other field of the JSON
# records (including 'knowledge_code') is intentionally left out, so there is
# no need to convert it beforehand; the columns removed by the CSV -> JSON
# step (Time, AnswerValue, CorrectAnswer, AnswerId) are not restored either.
column_order = [
    'user_id',
    'item_id',
    'score',
]
df = df[column_order]

# Save as UTF-8 CSV without the DataFrame index.
df.to_csv(file_out, index=False, encoding='utf-8')

### 统计高频和长尾KC

In [9]:
import pandas as pd


# Input and output locations for the selected scenario / split.
senario = 'longtail'
file_name = 'test'
root_dir = f'data/NIPS34/{senario}'
csv_train = f'{root_dir}/train.csv'
csv_test = f'{root_dir}/{file_name}.csv'
output_high = f'{root_dir}/{file_name}_highfreq.csv'  # rows on high-frequency items
output_low = f'{root_dir}/{file_name}_longtail.csv'   # rows on low-frequency items

# Frequencies are measured on the training split; the evaluation split is
# the one that gets partitioned.
df = pd.read_csv(csv_train)
df_test = pd.read_csv(csv_test)

# Number of training rows per item_id.
question_counts = df['item_id'].value_counts()

# Item ids grouped by training frequency: more than 10 rows vs. at most 3.
# NOTE(review): items that occur in the evaluation split but never in the
# training split have no entry in `question_counts`, so they end up in
# neither list -- confirm that this is intended for the long-tail subset.
high_freq = question_counts.loc[question_counts.gt(10)].index.tolist()
low_freq = question_counts.loc[question_counts.le(3)].index.tolist()

print(f'[高频题目] 出现超过10次的QuestionId：{len(high_freq)}个')
print(high_freq)

print(f'\n[低频题目] 出现不超过3次的QuestionId：{len(low_freq)}个')
print(low_freq)

# Evaluation rows whose item belongs to each frequency group.
is_high = df_test['item_id'].isin(high_freq)
is_low = df_test['item_id'].isin(low_freq)
high_df = df_test.loc[is_high]
low_df = df_test.loc[is_low]

# Write both subsets with the same columns as the evaluation CSV.
high_df.to_csv(output_high, index=False)
low_df.to_csv(output_low, index=False)

print(f'高频数据已保存至：{output_high}（共 {len(high_df)} 行）')
print(f'低频数据已保存至：{output_low}（共 {len(low_df)} 行）')


[高频题目] 出现超过10次的QuestionId：594个
[199, 911, 625, 855, 520, 83, 528, 547, 533, 676, 856, 460, 50, 815, 47, 727, 178, 761, 502, 634, 599, 836, 494, 421, 312, 91, 22, 670, 639, 449, 391, 862, 409, 844, 342, 635, 941, 209, 463, 185, 290, 939, 236, 943, 446, 372, 283, 527, 749, 887, 337, 592, 638, 461, 293, 134, 831, 596, 637, 40, 813, 278, 265, 295, 45, 583, 614, 525, 664, 75, 745, 605, 333, 349, 311, 49, 39, 56, 101, 263, 885, 190, 587, 84, 360, 183, 335, 282, 457, 195, 626, 150, 868, 325, 585, 138, 850, 148, 808, 184, 417, 118, 249, 177, 923, 210, 38, 858, 297, 740, 751, 830, 790, 119, 8, 601, 932, 52, 383, 90, 212, 475, 804, 673, 82, 889, 732, 644, 834, 480, 522, 472, 787, 260, 852, 376, 945, 92, 629, 292, 193, 133, 882, 506, 451, 789, 53, 186, 24, 495, 154, 924, 767, 129, 645, 213, 816, 896, 439, 34, 611, 76, 361, 368, 402, 31, 16, 539, 707, 559, 88, 624, 615, 435, 304, 37, 211, 493, 800, 365, 327, 328, 158, 145, 554, 805, 369, 160, 423, 579, 908, 485, 403, 392, 147, 513, 86, 838, 116, 6

### check冷启动情景

In [2]:
import pandas as pd


# Locations of the train / test splits for the selected scenario.
senario = 'student_all'
root_dir = f'data/NIPS34/{senario}'
csv_train = f'{root_dir}/train.csv'
csv_test = f'{root_dir}/test.csv'

df = pd.read_csv(csv_train)
df_test = pd.read_csv(csv_test)

# Every distinct QuestionId of the training split (no frequency threshold is
# applied; value_counts is only used to enumerate the distinct ids).
pid_train = df['QuestionId'].value_counts().index.tolist()

# Same for the test split; `question_counts` is left holding the test counts.
question_counts = df_test['QuestionId'].value_counts()
pid_test = question_counts.index.tolist()

# Question ids present in both splits. An empty set means the two splits
# share no question id.
print(set(pid_train).intersection(set(pid_test)))


set()
