In [None]:
import ast
import json
import os
import pickle
import re
import sys
from collections import defaultdict

import numpy as np
import pandas as pd
import sklearn.metrics.pairwise as skl
from sklearn.model_selection import train_test_split

In [None]:
# Load the user-course relation file.
def read_user_course(file_path):
    """Read a headerless, tab-separated file into a DataFrame.

    Args:
        file_path: Path of the file to read.

    Returns:
        A DataFrame whose two columns are named ``user_id`` and ``course_id``.
    """
    column_names = ["user_id", "course_id"]
    interactions = pd.read_csv(
        file_path,
        sep="\t",
        header=None,
        names=column_names,
    )
    return interactions

# Load the course-concept relation file.
def read_course_concept(file_path):
    """Read a headerless, tab-separated file into a DataFrame.

    Args:
        file_path: Path of the file to read.

    Returns:
        A DataFrame whose two columns are named ``course_id`` and ``concept``.
    """
    read_options = {
        "sep": "\t",
        "header": None,
        "names": ["course_id", "concept"],
    }
    return pd.read_csv(file_path, **read_options)

# Load the course entity file (one JSON object per line).
def read_course_json(file_path):
    """Read a JSON-lines course file into a DataFrame.

    Each non-blank line is parsed as a JSON object; its ``id`` and ``name``
    fields become the ``course_id`` and ``course_name`` columns.

    Args:
        file_path: Path of the UTF-8 encoded JSON-lines file.

    Returns:
        A DataFrame with columns ``course_id`` and ``course_name``. The columns
        are present even when the file contains no records, so downstream
        column selection and merges do not fail on an empty input.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the ``id`` or ``name`` field.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            course_data = json.loads(line)
            records.append({'course_id': course_data['id'], 'course_name': course_data['name']})
    # Naming the columns explicitly keeps the schema stable for empty inputs.
    return pd.DataFrame(records, columns=['course_id', 'course_name'])


# Load the concept-field relation file.
def read_concept_field(file_path):
    """Read a headerless, tab-separated file into a DataFrame.

    Args:
        file_path: Path of the file to read.

    Returns:
        A DataFrame whose two columns are named ``concept`` and ``type``.
    """
    concept_fields = pd.read_csv(file_path, sep="\t", header=None, names=["concept", "type"])
    return concept_fields


def parse_explanation(explanation):
    """Split an explanation string into a ``{key: value}`` dictionary.

    The text is scanned for ``key：value`` pairs, where the separator is the
    full-width colon and ``key`` is a run of word characters. A value ends at
    a whitespace character that is followed by the next ``key：`` or by the end
    of the string, or at the end of the string itself.

    Args:
        explanation: The text to parse. Non-string input yields an empty dict.

    Returns:
        A dict mapping each key to its value; for repeated keys the last
        occurrence wins.
    """
    if not isinstance(explanation, str):
        return {}
    # The trailing ``|$`` alternative lets the final pair match when the text
    # does not end with whitespace; without it that pair was silently dropped.
    pattern = re.compile(r'(\w+)：(.*?)(?:\s(?=\w+：|$)|$)')
    matches = pattern.findall(explanation)
    return dict(matches)

def read_concept_json(file_path):
    """Extract each concept's '学科' entry from a JSON-lines concept file.

    Every line is parsed as JSON. When the record has a string ``explanation``
    field, it is parsed with ``parse_explanation`` and, if the result contains
    the key ``'学科'``, a row ``(concept id, that value)`` is collected.

    Args:
        file_path: Path of the UTF-8 encoded JSON-lines file.

    Returns:
        A DataFrame with columns ``concept`` and ``subtype``. The columns are
        present even when no row qualifies, so a later merge on ``concept``
        does not fail.

    Raises:
        KeyError: If a qualifying record has no ``id`` field.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            try:
                concept_data = json.loads(line)
            except json.JSONDecodeError:
                continue  # skip lines that are not valid JSON (including blank lines)
            if not isinstance(concept_data, dict):
                continue
            explanation = concept_data.get('explanation')
            if not isinstance(explanation, str):
                # Missing or null explanations cannot contain a '学科' entry.
                continue
            explanation_dict = parse_explanation(explanation)
            if '学科' in explanation_dict:
                records.append({'concept': concept_data['id'], 'subtype': explanation_dict['学科']})
    # Naming the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(records, columns=['concept', 'subtype'])

# Locations of the raw input files, relative to the working directory.
# Entity files:
course_json_path = "MOOCCube/entities/course.json"
concept_json_path = "MOOCCube/entities/concept.json"
# Relation files:
user_course_path = "MOOCCube/relations/user-course.json"
course_concept_path = "MOOCCube/relations/course-concept.json"
concept_field_path = "MOOCCube/relations/concept-field.json"

# Load every source file into its own DataFrame.
course_df = read_course_json(course_json_path)
concept_json_df = read_concept_json(concept_json_path)
user_course_df = read_user_course(user_course_path)
course_concept_df = read_course_concept(course_concept_path)
concept_field_df = read_concept_field(concept_field_path)

In [None]:

# Data integration: join the loaded tables into one row per (user, course)
# and export the result as a CSV file.


def _unique_values(values):
    """Return the distinct non-null entries of a group in a stable order.

    Nulls are dropped because the left merges below introduce NaN for
    concepts without a match; a NaN inside the list would be written to the
    CSV as ``nan`` and make the stringified list unparsable by
    ``ast.literal_eval``. Sorting makes the exported file reproducible, since
    set iteration order is not stable between runs.
    """
    return sorted(set(values.dropna()), key=str)


# Attach every concept of a course to each user-course interaction.
merged_user_course_concept = pd.merge(user_course_df, course_concept_df, on="course_id")

# Collect the concepts of each (user_id, course_id) pair into one list.
grouped_user_course_concept = (
    merged_user_course_concept
    .groupby(['user_id', 'course_id'])['concept']
    .apply(_unique_values)
    .reset_index()
)

# Look up the field ("type") of every course concept; unmatched concepts get NaN.
merged_concept_type = pd.merge(course_concept_df, concept_field_df, on="concept", how="left")
# Collect the types of each course into one list.
grouped_course_type = (
    merged_concept_type
    .groupby('course_id')['type']
    .apply(_unique_values)
    .reset_index()
)

# Look up the subtype of every course concept; unmatched concepts get NaN.
merged_concept_subtype = pd.merge(course_concept_df, concept_json_df, on="concept", how="left")
# Collect the subtypes of each course into one list.
grouped_course_subtype = (
    merged_concept_subtype
    .groupby('course_id')['subtype']
    .apply(_unique_values)
    .reset_index()
)

# Combine everything on course_id; the course name is optional at this point
# (left join) and rows without one are removed just below.
final_merged = pd.merge(grouped_user_course_concept, grouped_course_type, on="course_id")
final_merged = pd.merge(final_merged, grouped_course_subtype, on="course_id")
final_merged = pd.merge(final_merged, course_df[['course_id', 'course_name']], on="course_id", how="left")

# Keep only rows where concept, user_id, course_id and course_name are present.
final_merged = final_merged.dropna(subset=['concept', 'user_id', 'course_id', 'course_name'])

# Export the integrated table.
final_csv_path = "MOOCCube/data_full.csv"
final_merged.to_csv(final_csv_path, index=False)

print("CSV 文件已生成:", final_csv_path)


In [None]:
csv_df = pd.read_csv('MOOCCube/data_full.csv')


def _parse_list_cell(cell):
    """Turn one CSV cell holding a collection of labels into a clean list.

    The cell is first parsed as a Python literal (the form produced when a
    list column is written by ``DataFrame.to_csv``). If that fails, e.g.
    because the text contains a bare ``nan`` or is a plain space/comma
    separated string, the text is split on commas and whitespace instead.
    Empty entries and NaN markers are removed. Non-string cells (such as a
    missing value read back as float NaN) yield an empty list.
    """
    if not isinstance(cell, str):
        return []
    try:
        parsed = ast.literal_eval(cell)
    except (ValueError, SyntaxError):
        tokens = re.split(r"[,\s]+", cell.strip().strip("[]"))
        parsed = [token.strip("'\"") for token in tokens]
    else:
        if not isinstance(parsed, (list, tuple, set)):
            parsed = [parsed]
    cleaned = []
    for value in parsed:
        if isinstance(value, float) and value != value:
            continue  # float NaN
        if isinstance(value, str) and value.strip() in ("", "nan"):
            continue
        cleaned.append(value)
    return cleaned


# The CSV written by the integration cell names its columns `course_id` and
# `subtype`; the names `course_index` / `sub_type` that this cell originally
# used are still honoured when present.
# NOTE(review): presumably the old names come from an earlier export format - confirm.
item_col = 'course_index' if 'course_index' in csv_df.columns else 'course_id'
subtype_col = 'sub_type' if 'sub_type' in csv_df.columns else 'subtype'

# Knowledge-graph mapping: item -> set of its type and subtype labels.
itemMap = {}
per_item_rows = csv_df.drop_duplicates(subset=item_col)
for item, type_cell, subtype_cell in zip(
    per_item_rows[item_col], per_item_rows['type'], per_item_rows[subtype_col]
):
    itemMap[item] = set(_parse_list_cell(type_cell)) | set(_parse_list_cell(subtype_cell))

# Total number of (item, label) pairs in the mapping. The original code
# accumulated this into `count` and then reset it before use; it is kept
# under its own name so both totals stay available for inspection.
kg_feature_count = sum(len(labels) for labels in itemMap.values())

# Number of type entries summed over distinct courses (each course counted once).
per_course_rows = csv_df.drop_duplicates(subset='course_id')
processed_course_ids = set(per_course_rows['course_id'])
count = sum(len(_parse_list_cell(cell)) for cell in per_course_rows['type'])

# Export the CSV file.
# NOTE(review): this writes the in-memory `final_merged` from the previous
# cell to the same path again; it looks redundant - confirm before removing.
final_csv_path = "MOOCCube/data_full.csv"
final_merged.to_csv(final_csv_path, index=False)

print("CSV 文件已生成:", final_csv_path)

In [None]:
csv_df = pd.read_csv('MOOCCube/data_full.csv')

# Keep users with at least 5 interactions (rows).
user_interactions = csv_df['user_id'].value_counts()
users_with_5_or_more_interactions = user_interactions[user_interactions >= 5].index
filtered_by_user_interactions = csv_df[csv_df['user_id'].isin(users_with_5_or_more_interactions)]

# Among the remaining rows, keep courses that appear at least 10 times.
# NOTE(review): the filters are applied once, in this order, so a user may end
# up with fewer than 5 rows after the course filter - confirm this is intended.
course_interactions = filtered_by_user_interactions['course_id'].value_counts()
courses_with_10_or_more_users = course_interactions[course_interactions >= 10].index
filtered_by_course_interactions = filtered_by_user_interactions[
    filtered_by_user_interactions['course_id'].isin(courses_with_10_or_more_users)
]

# Drop rows without a user_id or course_id. The explicit copy makes the frame
# independent of `csv_df`, so the column assignments below do not trigger
# pandas' SettingWithCopyWarning.
cleaned_df = filtered_by_course_interactions.dropna(subset=['user_id', 'course_id']).copy()
# Re-encode user_id and course_id as consecutive integer codes starting at 0.
cleaned_df['user_id'] = pd.factorize(cleaned_df['user_id'])[0]
cleaned_df['course_id'] = pd.factorize(cleaned_df['course_id'])[0]

# Reset the row index after filtering.
cleaned_df = cleaned_df.reset_index(drop=True)

# Count distinct users and courses.
unique_users = cleaned_df['user_id'].nunique()
unique_courses = cleaned_df['course_id'].nunique()

# Report the counts.
print(f"筛选后的用户数量: {unique_users}")
print(f"筛选后的课程数量: {unique_courses}")

final_csv_path = "data_KEAM.csv"
cleaned_df.to_csv(final_csv_path, index=False)

print("CSV 文件已生成:", final_csv_path)

# Work on a copy named `df` (the original code used `df` without defining it)
# and add a constant interaction column with value 1.
df = cleaned_df.copy()
df['interaction'] = 1

# Split into training and test sets at a 9:1 ratio with a fixed seed.
train_data, test_data = train_test_split(df, test_size=0.1, random_state=42)

# Keep only the user_id, course_id and interaction columns.
train_data = train_data[['user_id', 'course_id', 'interaction']]
test_data = test_data[['user_id', 'course_id', 'interaction']]

# Save the training set as a tab-separated file without a header row.
train_data.to_csv("train_data.tsv", sep='\t', index=False, header=False)

# Save the test set as a tab-separated file without a header row.
test_data.to_csv("test_data.tsv", sep='\t', index=False, header=False)

# One row per distinct (course_id, course_name) combination. Stored under its
# own name so `unique_courses` keeps holding the course count printed above.
course_map = df[['course_id', 'course_name']].drop_duplicates().reset_index(drop=True)

# Save the mapping as a .tsv file.
course_map.to_csv('course_map.tsv', sep='\t', index=False)