### 检查嵌入分布

In [None]:
import numpy as np
import matplotlib.pyplot as plt


# 读取数据
arr = np.load('data/NIPS34/all/exer_embeds.npy')
print(arr.shape)

# 计算每一行的L2范数
row_norms = np.linalg.norm(arr, axis=1)

# 绘制直方图
plt.figure(figsize=(10, 6))
plt.hist(row_norms, bins=50, color='skyblue', edgecolor='black', alpha=0.7)
plt.title('Row-wise L2 Norm Distribution')
plt.xlabel('Norm Value')
plt.ylabel('Frequency')
plt.grid(axis='y', alpha=0.3)
plt.show()

# 返回计算结果（前5行示例）
print("前5行的范数值：")
print(row_norms[:5])

### 将csv格式的数据转换为NCDM标准输入格式

In [5]:
import pandas as pd
import json
import ast


# 指定CSV文件的路径
# senario = 'Algebra'
# senario = 'GeometryandMeasure'
# senario = 'Number'
senario = 'Algebra_cold'

# file_name = 'train'
# file_name = 'val'
file_name = 'test'

root_dir = f'data/NIPS34/{senario}'
file_in = f'{root_dir}/{file_name}.csv'
file_out = f'{root_dir}/{file_name}.json'

# 使用pandas的read_csv方法读取CSV文件
df = pd.read_csv(file_in)
# 重命名指定的列
df.rename(columns={'QuestionId':'exer_id', 'UserId':'user_id', 'IsCorrect':'score', 'Kc':'knowledge_code'}, inplace=True)
# 删除多余的列
df.drop(columns=['Time', 'AnswerValue', 'CorrectAnswer', 'AnswerId'], inplace=True)
# 将每一行转换为字典，并存储在列表中
dict_list = df.to_dict(orient='records')

# 格式规范化
for elem in dict_list:
    elem['knowledge_code'] = ast.literal_eval(elem['knowledge_code'])

# 保存为json
with open(file_out, 'w', encoding='utf-8') as json_file:
    json.dump(dict_list, json_file, indent=4, ensure_ascii=False)


### 逆操作：从json转csv

In [8]:
import pandas as pd
import json


# 指定场景参数
senario = 'longtail'

# file_name = 'train'
file_name = 'test'

root_dir = f'data/NIPS34/{senario}'
file_in = f'{root_dir}/{file_name}.json'
file_out = f'{root_dir}/{file_name}.csv'

# 读取JSON文件
with open(file_in, 'r', encoding='utf-8') as f:
    dict_list = json.load(f)

# 转换为DataFrame
df = pd.DataFrame(dict_list)

# 将列表转换回字符串格式
df['knowledge_code'] = df['knowledge_code'].astype(str)

# 列名逆向映射恢复
df.rename(columns={
    'exer_id': 'item_id',
    'user_id': 'user_id',
    'score': 'score'
    # 'knowledge_code': 'Kc'
}, inplace=True)

# # 添加原始被删除的列（用空值填充）
# for col in ['Time', 'AnswerValue', 'CorrectAnswer', 'AnswerId']:
#     df[col] = pd.NA  # 使用pandas的缺失值标记

# 按原始列顺序排序（假设原始列顺序如下）
column_order = [
    'user_id', 
    'item_id', 
    'score'
]
df = df[column_order]

# 保存为CSV
df.to_csv(file_out, index=False, encoding='utf-8')

### 统计高频和长尾KC

In [9]:
import pandas as pd


# 文件路径
senario = 'longtail'
file_name = 'test'
root_dir = f'data/NIPS34/{senario}'
csv_train = f'{root_dir}/train.csv'
csv_test = f'{root_dir}/{file_name}.csv'
output_high = f'{root_dir}/{file_name}_highfreq.csv'  # 高频结果文件
output_low = f'{root_dir}/{file_name}_longtail.csv'   # 低频结果文件

# 读取CSV文件
df = pd.read_csv(csv_train)
df_test = pd.read_csv(csv_test)

# 统计QuestionId出现次数
question_counts = df['item_id'].value_counts()

# 获取不同频次的题目ID列表
high_freq = question_counts[question_counts > 10].index.tolist()   # 计数>10的ID
low_freq = question_counts[question_counts <= 3].index.tolist()    # 计数≤3的ID

print(f'[高频题目] 出现超过10次的QuestionId：{len(high_freq)}个')
print(high_freq)

print(f'\n[低频题目] 出现不超过3次的QuestionId：{len(low_freq)}个')
print(low_freq)

# 提取高频题目数据
high_df = df_test[df_test['item_id'].isin(high_freq)]
# 提取低频题目数据
low_df = df_test[df_test['item_id'].isin(low_freq)]

# 保存结果（保留原始列结构）
high_df.to_csv(output_high, index=False)
low_df.to_csv(output_low, index=False)

print(f'高频数据已保存至：{output_high}（共 {len(high_df)} 行）')
print(f'低频数据已保存至：{output_low}（共 {len(low_df)} 行）')


[高频题目] 出现超过10次的QuestionId：594个
[199, 911, 625, 855, 520, 83, 528, 547, 533, 676, 856, 460, 50, 815, 47, 727, 178, 761, 502, 634, 599, 836, 494, 421, 312, 91, 22, 670, 639, 449, 391, 862, 409, 844, 342, 635, 941, 209, 463, 185, 290, 939, 236, 943, 446, 372, 283, 527, 749, 887, 337, 592, 638, 461, 293, 134, 831, 596, 637, 40, 813, 278, 265, 295, 45, 583, 614, 525, 664, 75, 745, 605, 333, 349, 311, 49, 39, 56, 101, 263, 885, 190, 587, 84, 360, 183, 335, 282, 457, 195, 626, 150, 868, 325, 585, 138, 850, 148, 808, 184, 417, 118, 249, 177, 923, 210, 38, 858, 297, 740, 751, 830, 790, 119, 8, 601, 932, 52, 383, 90, 212, 475, 804, 673, 82, 889, 732, 644, 834, 480, 522, 472, 787, 260, 852, 376, 945, 92, 629, 292, 193, 133, 882, 506, 451, 789, 53, 186, 24, 495, 154, 924, 767, 129, 645, 213, 816, 896, 439, 34, 611, 76, 361, 368, 402, 31, 16, 539, 707, 559, 88, 624, 615, 435, 304, 37, 211, 493, 800, 365, 327, 328, 158, 145, 554, 805, 369, 160, 423, 579, 908, 485, 403, 392, 147, 513, 86, 838, 116, 6

### check冷启动情景

In [2]:
import pandas as pd


# 文件路径
senario = 'student_all'
root_dir = f'data/NIPS34/{senario}'
csv_train = f'{root_dir}/train.csv'
csv_test = f'{root_dir}/test.csv'

# 读取CSV文件
df = pd.read_csv(csv_train)
df_test = pd.read_csv(csv_test)

# 统计QuestionId出现次数
question_counts = df['QuestionId'].value_counts()
# 获取不同频次的题目ID列表
pid_train = question_counts.index.tolist()   # 计数>10的ID

# 统计QuestionId出现次数
question_counts = df_test['QuestionId'].value_counts()
# 获取不同频次的题目ID列表
pid_test = question_counts.index.tolist()   # 计数>10的ID

print(set(pid_train) & set(pid_test))


set()


### 精简版的BERT-embedder流程

In [None]:
import json
import torch
import numpy as np
from transformers import BertTokenizer, BertModel


# nips34，kc和文本（英文）
with open(r'/mnt/new_pfs/liming_team/auroraX/songchentao/MyCDM/data/nips34_short_kc.json', 'r', encoding='utf-8') as file:
    questions = json.load(file)

# 输出文件名
file_emb = r'/mnt/new_pfs/liming_team/auroraX/songchentao/MyCDM/data/nips34_short_kc_exer_embeds_bert.npy'
file_token = r'/mnt/new_pfs/liming_team/auroraX/songchentao/MyCDM/data/nips34_short_kc_exer_tokens_bert.json'

# 整理为list（按照pid升序排列）
contents = []
for ind in range(len(questions)):
    contents.append(questions[str(ind)])

# 批量处理文本
tokenizer = BertTokenizer.from_pretrained('/mnt/new_pfs/liming_team/auroraX/songchentao/llama/bert-base-uncased')
model = BertModel.from_pretrained('/mnt/new_pfs/liming_team/auroraX/songchentao/llama/bert-base-uncased')
model.eval()
with torch.no_grad():
    # 分词
    exer_tokenized = tokenizer(contents, padding=True, truncation=True, max_length=512, return_tensors='pt')
#     # 嵌入
#     bert_output = model(**exer_tokenized)
#     exer_emb = bert_output.last_hidden_state[:, 0, :]
#     print(exer_emb.shape)

# # 保存嵌入结果
# np.save(file_emb, exer_emb.detach().numpy())

# check
print(type(exer_tokenized))                                    # <class 'transformers.tokenization_utils_base.BatchEncoding'>
print(exer_tokenized.keys())                                   # dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
print(exer_tokenized['input_ids'].shape)                       # [948, 512]
print(len(exer_tokenized['attention_mask'].numpy().tolist()))  # 948
print(exer_tokenized['token_type_ids'])                        # 无分句，全0

print(exer_tokenized['input_ids'].dtype)                       # 均为torch.int64
print(exer_tokenized['token_type_ids'].dtype)
print(exer_tokenized['attention_mask'].dtype)

# 保存分词结果
output = {}
for key, value in exer_tokenized.items():
    output[key] = value.numpy().tolist()

print(output)

with open(file_token, 'w', encoding='utf-8') as file:
    json.dump(output, file, ensure_ascii=False)  # , indent=4


<class 'transformers.tokenization_utils_base.BatchEncoding'>
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
torch.Size([948, 145])
948
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
torch.int64
torch.int64
torch.int64
{'input_ids': [[101, 1031, 3716, 2685, 1033, 8785, 2015, 1011, 1028, 2193, 1011, 1028, 29299, 1010, 4204, 1998, 6147, 1011, 1028, 14320, 1010, 14291, 2015, 1010, 4385, 1031, 3160, 2592, 1033, 3160, 1024, 2065, 2017, 4800, 22086, 1037, 2675, 2193, 2011, 1023, 1010, 2017, 2131, 1037, 2675, 2193, 1012, 2003, 2023, 4861, 2467, 2995, 1010, 2823, 2995, 1010, 2196, 2995, 1010, 2030, 5263, 2000, 2360, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [20]:
# 检查token长度情况

# 单独嵌入
temp = []
with torch.no_grad():
    for elem in contents:
        # 分词
        _temp = tokenizer(elem, padding=True, truncation=True, return_tensors='pt')    # , max_length=512
        temp.append(_temp)

len_list = []
for elem in temp:
    len_list.append(len(elem['input_ids'].numpy().tolist()[0]))
print(max(len_list))


# 批量嵌入
with torch.no_grad():
    # 分词
    temp = tokenizer(contents, padding=True, max_length=512, truncation=True, return_tensors='pt')    # 

output = {}
for key, value in temp.items():
    output[key] = value.numpy().tolist()

print(len(output['input_ids'][1]))


145


### 精简版的BGE-embedder流程

In [None]:
# 导入必要的库
from transformers import AutoTokenizer, AutoModel
import torch
import json
import numpy as np


# 读取问题内容：nips34，kc和文本（英文）
with open(r'/mnt/new_pfs/liming_team/auroraX/songchentao/MyCDM/data/nips34_short_kc.json', 'r', encoding='utf-8') as file:
    questions = json.load(file)

# 整理为list（按照pid升序排列）
exer_ids = list(range(len(questions)))
contents = []
for ind in range(len(questions)):
    contents.append(questions[str(ind)])

# 输出文件名
file_emb = r'/mnt/new_pfs/liming_team/auroraX/songchentao/MyCDM/data/nips34_short_kc_exer_embeds_bge.npy'
file_token = r'/mnt/new_pfs/liming_team/auroraX/songchentao/MyCDM/data/nips34_short_kc_exer_tokens_bge.json'

# 加载BGE模型和分词器
model_name = "BAAI/bge-large-zh-v1.5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()  # 设置为评估模式

# 生成嵌入
embeddings = []
inputs_all = []
batch_size = 32  # 根据GPU内存调整批量大小

with torch.no_grad():
    for i in range(0, len(contents), batch_size):
        batch_texts = contents[i:i+batch_size]
        # 使用transformers的tokenizer和model
        inputs = tokenizer(batch_texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
        inputs_all.append(inputs)
        # outputs = model(**inputs)
        # # 使用[CLS]标记的输出作为文本嵌入
        # batch_embeddings = outputs.last_hidden_state[:, 0]
        # # 标准化嵌入向量
        # batch_embeddings = torch.nn.functional.normalize(batch_embeddings, p=2, dim=1)
        # embeddings.append(batch_embeddings)

# 合并所有批次的嵌入
# all_embeddings = torch.cat(embeddings, dim=0)

# # 创建嵌入字典
# embedding_dict = {}
# for i, exer_id in enumerate(exer_ids):
#     embedding_dict[exer_id] = all_embeddings[i].numpy().tolist()

# # 保存嵌入结果
# file_embedding = f'{root_path}/bge_embeddings.json'
# with open(file_embedding, 'w', encoding='utf-8') as file:
#     json.dump(embedding_dict, file, ensure_ascii=False)

# print(f"BGE嵌入已保存到 {file_embedding}")
# print(f"嵌入维度: {all_embeddings.shape}")
