In [None]:
import pandas as pd
from datetime import datetime

import numpy as np
import os
import random
from math import radians, cos, sin, asin, sqrt
from scipy.sparse import dok_matrix,lil_matrix, save_npz
import torch 
from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse as sp
from scipy.sparse import load_npz
from torch.nn.utils.rnn import pad_sequence
import tqdm

In [None]:
# Column names kept for reference.
# NOTE(review): KEYS is not referenced anywhere else in the visible notebook — confirm before relying on it.
KEYS = ["Anon Student Id", "KC(Default)", "Questions"]
# Raw train/test exports; both are read with pd.read_table further down.
read_trainfile = "raw/algebra_2005_2006_train.txt"
read_testfile = "raw/algebra_2005_2006_test.txt"
#write_file = "raw/algebra_05_data.csv"

In [None]:
def replace_text(text):
    """Escape characters in a name field.

    Every "_" becomes "####" and every "," becomes "@@@@", applied in that order.
    Returns the escaped string.
    """
    substitutions = (("_", "####"), (",", "@@@@"))
    for needle, marker in substitutions:
        text = text.replace(needle, marker)
    return text

In [None]:
# Load the training split and escape "_" / "," in the two name columns.
df_train = pd.read_table(read_trainfile, encoding="utf-8", low_memory=False)
df_train = df_train.assign(
    **{
        name_column: df_train[name_column].apply(replace_text)
        for name_column in ("Problem Name", "Step Name")
    }
)
#df_train["Questions"] = df_train.apply(lambda x:f"{x['Problem Name']}----{x['Step Name']}",axis=1)

In [None]:
# Load the test split and escape "_" / "," in the two name columns.
df_test = pd.read_table(read_testfile, encoding="utf-8", low_memory=False)
df_test = df_test.assign(
    **{
        name_column: df_test[name_column].apply(replace_text)
        for name_column in ("Problem Name", "Step Name")
    }
)

In [None]:
# Quick look at the training split: (rows, columns) and the column names.
print(df_train.shape) 
print(df_train.columns)

In [None]:
# Quick look at the test split: (rows, columns) and the column names.
print(df_test.shape) 
print(df_test.columns)

In [None]:

# Stack train and test rows into one frame with a fresh 0..n-1 index.
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)

# Persist the combined table; a later cell re-reads this file.
output_file = "raw/algebra_05_data_all.csv"
data.to_csv(output_file, index=False, encoding="utf-8")

print(f"The data has been successfully saved to {output_file}")

In [None]:
# Preview the first two rows of the combined train+test frame.
print(data.head(2))

In [None]:
# In our algebra05 dataset, First Transaction Time plays the role of order_id.
# Raw column name -> canonical name used by the rest of the notebook.
column_renames = {
    'Anon Student Id': 'user_id',
    'Problem Name': 'problem_id',
    'KC(Default)': 'skill_id',
    'First Transaction Time': 'order_id',
    'Correct First Attempt': 'correct',
}
raw_columns = list(column_renames)

# Read only the needed columns and drop rows that miss any of them.
data = pd.read_csv(
    'raw/algebra_05_data_all.csv',
    usecols=raw_columns,
).dropna(subset=raw_columns)

# Rename all five columns in a single pass.
data = data.rename(columns=column_renames)

In [None]:
# Confirm the renamed column set.
print(data.columns)

In [None]:
# Per-column count of missing values (displayed as the cell output).
data.isnull().sum() 

In [None]:
# Shape before the validity filter in the next cell.
print(data.shape)

In [None]:
# Keep only rows that have every required field and a binary correctness label.
required_columns = ['order_id', 'user_id', 'problem_id', 'skill_id', 'correct']
data = data.dropna(subset=required_columns)
has_binary_label = data["correct"].isin([0, 1])
data = data[has_binary_label]

In [None]:
# Shape after the validity filter.
print(data.shape)

In [None]:
# Convert a textual transaction time (e.g. "2005-09-09 12:24:49.0" or
# "2005/9/9 12:24:49") into integer milliseconds.
def change2timestamp(t, date_format="%Y-%m-%d %H:%M:%S.%f"):
    """Parse a date-time string and return it as integer milliseconds.

    Args:
        t: Date-time text. Surrounding whitespace is ignored.
        date_format: Preferred ``strptime`` format, tried first. If it does not
            match, a few common dash- and slash-separated layouts (with and
            without fractional seconds) are tried as fallbacks.

    Returns:
        int: ``datetime.timestamp() * 1000`` truncated to an integer.

    Raises:
        ValueError: If none of the candidate formats matches ``t``.

    Note:
        The parsed datetime is naive, so ``timestamp()`` interprets it in the
        machine's local timezone. The absolute value is therefore
        machine-dependent; it is only used for ordering in this notebook.
    """
    candidate_formats = (
        date_format,
        "%Y-%m-%d %H:%M:%S.%f",
        "%Y-%m-%d %H:%M:%S",
        "%Y/%m/%d %H:%M:%S.%f",
        "%Y/%m/%d %H:%M:%S",
    )
    text = t.strip()
    # dict.fromkeys removes duplicate formats while keeping the caller's one first.
    for fmt in dict.fromkeys(candidate_formats):
        try:
            parsed = datetime.strptime(text, fmt)
        except ValueError:
            continue
        timeStamp = parsed.timestamp() * 1000
        return int(timeStamp)
    raise ValueError(f"time data {t!r} does not match any supported format")

# Replace the textual order_id with its integer-millisecond form, element by element.
data["order_id"] = data["order_id"].apply(change2timestamp)

In [None]:
# Preview after the order_id conversion.
print(data.head(3))

In [None]:
# Cast the correctness label to int (marked as important by the original author; the reason is not shown here).
data['correct']= data['correct'].astype(int)

In [None]:
# Preview after casting `correct` to int.
print(data.head(3))

In [None]:
# Normalise a skill label so that equal skills compare equal.
def standardize_skill(skill):
    """Return `skill` with leading and trailing whitespace removed."""
    cleaned = skill.strip()
    return cleaned

# Work on the distinct raw labels only. unique() keeps first-appearance order,
# so the ids handed out below are the same as when scanning every row.
unique_skill_labels = data['skill_id'].unique()

# Extract all individual skills and sub-skills ('~~' separates the parts of a combined skill)
all_single_skills = set()
for skill in unique_skill_labels:
    components = skill.split('~~')
    all_single_skills.update(standardize_skill(comp) for comp in components)

# skill_mapping:   label (single, or sorted '~~'-joined combination) -> integer id
# inverse_mapping: integer id -> list of the single-skill ids it is made of
skill_mapping = {}
inverse_mapping = {}
counter = 0

# Single skills first; sorting makes the numbering independent of row order
for single_skill in sorted(all_single_skills):
    skill_mapping[single_skill] = counter
    inverse_mapping[counter] = [counter]  # a single skill consists of itself
    counter += 1

# Then combined skills, numbered after all single skills in order of first appearance
for skill in unique_skill_labels:
    skill = standardize_skill(skill)
    if '~~' in skill:
        # Sort the parts so the same combination always yields the same key
        components = sorted(standardize_skill(comp) for comp in skill.split('~~'))
        combined_key = '~~'.join(components)
        if combined_key not in skill_mapping:
            component_ids = [skill_mapping[comp] for comp in components]
            skill_mapping[combined_key] = counter
            inverse_mapping[counter] = component_ids
            counter += 1

# Translate a raw skill label into its integer id
def map_skill_to_id(skill):
    """Look up the integer id of a raw skill label in `skill_mapping`.

    Labels containing '~~' are treated as combinations: the parts are
    standardised, sorted and re-joined before the lookup. Other labels are
    standardised and looked up directly.

    Raises:
        KeyError: If the resulting key is not present in `skill_mapping`.
    """
    if '~~' not in skill:
        single_key = standardize_skill(skill)
        if single_key not in skill_mapping:
            raise KeyError(f"The single skill {single_key} was not found in skill_mapping.")
        return skill_mapping[single_key]

    parts = sorted(standardize_skill(part) for part in skill.split('~~'))
    combined_key = '~~'.join(parts)
    if combined_key not in skill_mapping:
        raise KeyError(f"The combination skill {combined_key} was not found in skill_mapping.")
    return skill_mapping[combined_key]

# Apply the mapping: add a column with the integer id of each row's skill label
data['skill_id_mapped'] = data['skill_id'].apply(map_skill_to_id)

In [None]:
# The header announces the first three entries, so print exactly those
# (previously the entire mapping was dumped).
print("Skill Number Mapping (First 3)：")
print(dict(list(skill_mapping.items())[:3]))

In [None]:
# inverse_mapping records which single concepts make up each concept id; it is used later to add edges to the concept graph
print("\nReverse mapping (which small concepts make up a new concept):")
inverse_mapping_snapshot = dict(inverse_mapping)
print(inverse_mapping_snapshot)

print("\nMapped data (first 3 lines):")
preview_rows = data.head(3)
print(preview_rows)

In [None]:
# Save the processed frame (including the skill_id_mapped column) as CSV
output_file = "raw/algebra_05_data.csv"
data.to_csv(output_file, index=False, encoding="utf-8")

In [None]:
# Largest skill id that actually occurs in the data
max_value = data['skill_id_mapped'].max()
# Print it
print(f"The maximum value of the skill_id_mapped column is:{max_value}")

# inverse_mapping has one entry per concept id, so its length is the number of concepts;
# per the original note, it should determine the matrix size of the second channel
print(f"The length of inverte_mapping is:{len(inverse_mapping)}")  

In [None]:
# Distinct skill ids that occur in the data, ascending
mapped_values = sorted(data['skill_id_mapped'].unique())

# They are "sequential" when they form an unbroken run from the smallest to the largest id
unbroken_run = list(range(mapped_values[0], mapped_values[-1] + 1))
is_sequential = mapped_values == unbroken_run

# Report
sequential_answer = 'yes' if is_sequential else 'no'
print(f"skill_id_mapped column sorted values:：{mapped_values}")
print(f"Is the skill_id_mapped column arranged in order：{sequential_answer}")

In [None]:
skills_len = None
problems_len = None
# Summarise the current state of `data` and refresh the two module-level sizes
def dataShow():
    """Print dataset statistics and update `skills_len` / `problems_len`.

    Reads the module-level `data` frame and `inverse_mapping`. The skill count
    is the number of entries in `inverse_mapping`, not the number of distinct
    values in the skill column.
    """
    global skills_len, problems_len
    print(data.shape)

    student_count = len(data.user_id.unique().tolist())
    print('Total number of students：', student_count)

    problems_len = len(data.problem_id.unique().tolist())
    print('Total number of questions：', problems_len)

    # The number of skills is the total length of the concept mapping inverse_mapping
    skills_len = len(inverse_mapping)
    print('Total number of skills：', skills_len)

dataShow()
print(skills_len)
print(problems_len)

# Normal data numbering changes

In [None]:
# Keep only students with more than two interaction rows
rows_per_user = data.groupby('user_id')['user_id'].transform('size')
data = data[rows_per_user > 2]

In [None]:
# Re-print the statistics and refresh skills_len / problems_len after the per-student filter.
dataShow()

In [None]:
# Number the problems 0..n-1 in sorted order, so matrices built later stay compact and a problem can be found directly by its index
raw_problem = sorted(data.problem_id.unique().tolist())
num_problem = len(raw_problem)
problems = dict(zip(raw_problem, range(num_problem)))
print("number of problems: %d" % num_problem)

print(type(problems))
# Print the first five key/value pairs of a dictionary
def print_first_five_items(data):
    """Print up to the first five `key: value` pairs of the mapping `data`."""
    leading_pairs = list(data.items())[:5]
    for key, value in leading_pairs:
        print(f'{key}: {value}')
print_first_five_items(problems)

# Replace each problem_id with its index from the `problems` mapping
data['problem_id'] = data['problem_id'].map(problems)

In [None]:
#Using the assist09 and the assist17 dataset
# ##Establish skill mapping, numbered from 0 to n
# raw_question = data.skill_id.unique().tolist()  
# num_skill = len(raw_question)
# skills = { p: i for i, p in enumerate(raw_question) }  
# print("number of skills: %d" % num_skill)
# #np.save('data/'+dataset+'/map/cMap.npy',skills)

# print(type(skills))

# ##Output the first 5 key value pairs of the dictionary
# def print_first_five_items(data):
#     for i, (key, value) in enumerate(data.items()):
#         print(f'{key}: {value}')
#         if i >= 4:
#             break

# print_first_five_items(skills)

# data['skill_id']=data['skill_id'].map(skills)

In [None]:
# Number the students 0..n-1 in order of first appearance, so a student can be found directly by index after grouping
raw_student = data.user_id.unique().tolist()
num_student = len(raw_student)
students = {s: i for i, s in enumerate(raw_student)}
print("number of students: %d" % num_student)
#np.save('data/'+dataset+'/map/sMap.npy',students)

print(type(students))
# print_first_five_items is already defined in the problem-mapping cell; the
# identical second definition that used to live here has been removed.
print_first_five_items(students)

# Replace each user_id with its index from the `students` mapping
data['user_id'] = data['user_id'].map(students)

# Input data into the prediction network

In [None]:
# Show the frame after problem and student ids have been remapped.
print(data)

In [None]:
# Turn every student's interactions into one or more bounded-length sequences
def parse_all_seq(students, max_len=200, min_len=3):
    """Build per-student answer sequences, splitting long ones into chunks.

    For each id in `students`, the rows of the module-level `data` frame with
    that `user_id` are ordered by `parse_student_seq`. A sequence longer than
    `max_len` is cut into consecutive chunks of at most `max_len` items; chunks
    and whole sequences shorter than `min_len` are dropped.

    Args:
        students: Iterable of user ids to process.
        max_len: Maximum number of interactions per emitted sequence.
        min_len: Minimum number of interactions for a sequence to be kept.

    Returns:
        tuple: `(all_sequences, split_count)`. `all_sequences` is a list of
        `(problem_ids, skill_ids, corrects)` array triples. `split_count` is the
        number of chunks kept from sequences that had to be split.
    """
    all_sequences = []
    split_count = 0  # number of kept chunks that came from over-long sequences
    for student_id in tqdm.tqdm(students, 'parse student sequence:\t'):
        student_sequence = parse_student_seq(data[data.user_id == student_id])
        problem_ids, skill_ids, corrects = student_sequence

        if len(problem_ids) > max_len:
            # The arrays are already ordered by order_id, so they are sliced
            # directly instead of filtering and sorting the frame a second time.
            for start in range(0, len(problem_ids), max_len):
                stop = start + max_len
                sliced_sequence = (
                    problem_ids[start:stop],
                    skill_ids[start:stop],
                    corrects[start:stop],
                )

                # Discard a trailing chunk that is too short
                if len(sliced_sequence[0]) < min_len:
                    continue

                all_sequences.append(sliced_sequence)
                split_count += 1
        elif len(problem_ids) >= min_len:
            # Short enough to keep whole
            all_sequences.append(student_sequence)

    return all_sequences, split_count

def parse_student_seq(student):
    """Return one student's interactions ordered by `order_id`.

    Args:
        student: DataFrame with `order_id`, `problem_id`, `skill_id_mapped`
            and `correct` columns.

    Returns:
        tuple: `(problem_ids, skill_ids, corrects)` arrays in ascending
        `order_id` order.
    """
    ordered = student.sort_values('order_id')
    problem_ids = ordered['problem_id'].values
    skill_ids = ordered['skill_id_mapped'].values
    corrects = ordered['correct'].values
    return problem_ids, skill_ids, corrects

In [None]:
# Build the sequences for all students, visiting user ids in ascending order.
sequences, split_count = parse_all_seq(sorted(data.user_id.unique())) 
# split_count is the number of chunks produced by splitting, not the number of extra sequences:
# a sequence of 400 yields two chunks, which is only one more sequence than before.
print(f"Number of student sequence splits: {split_count}")

In [None]:
# Container type and the first n (problem_ids, skill_ids, corrects) triples.
print(type(sequences))
n = 5
print(sequences[:n])

In [None]:
# len(sequences) counts sequences after splitting, which can exceed the number
# of students, so the label says "sequences" (it previously said "students").
print('Total number of sequences：',len(sequences))
print(sequences[0][0])
# The three arrays of one sequence must have the same length
print(len(sequences[0][0])) # problem ids
print(len(sequences[0][1])) # mapped skill ids
print(len(sequences[0][2])) # correctness labels

In [None]:
# Full first sequence: (problem_ids, skill_ids, corrects).
print(sequences[0])

In [None]:
# Longest sequence, measured on the problem-id array of each triple
max_length = max(len(seq[0]) for seq in sequences) 

print(f"The maximum sequence length is: {max_length}")

In [None]:
# Wrap the list of triples in an object-dtype array so it can be shuffled, split and saved with NumPy.
# NOTE(review): the resulting shape depends on whether all sequences have the same length — check the printed shape.
sequences = np.array(sequences, dtype=object)
print(sequences.shape)
print(len(sequences))

In [None]:
# Sanity check: no sequence may be missing or contain an empty component.
def _is_incomplete(seq):
    """True when `seq` is None or any of its parts has length zero."""
    if seq is None:
        return True
    return any(len(part) == 0 for part in seq)

has_empty = any(_is_incomplete(seq) for seq in sequences)

message = (
    "There are empty values or partially empty sequences in the sequences."
    if has_empty
    else "There are no empty values in sequences."
)
print(message)

In [None]:
# Shuffle reproducibly, then cut into 60% train / 20% validation / remainder test.
np.random.seed(27)
np.random.shuffle(sequences)

train_ratio = 0.6
val_ratio = 0.2
test_ratio = 0.2  # informational only: the test set is whatever remains after val_end
data_size = len(sequences)

train_end = int(train_ratio * data_size)
val_end = int((train_ratio + val_ratio) * data_size)

train_set = sequences[:train_end]
valid_set = sequences[train_end:val_end]
test_set = sequences[val_end:]

In [None]:
# Shapes of the three splits.
print(train_set.shape,valid_set.shape,test_set.shape)

In [None]:
# Make sure the output directory exists so a fresh checkout can run this cell.
os.makedirs('lstm_data', exist_ok=True)
# Persist the three splits.
np.save('lstm_data/train_set.npy',train_set)
np.save('lstm_data/valid_set.npy',valid_set)
np.save('lstm_data/test_set.npy',test_set)

# Continuing with the preparation of data for each channel below

In [None]:
# Show the frame that the channel matrices below are built from.
print(data)

In [None]:
# Collect, for every problem, the set of mapped skill ids it occurs with
def problemid_skillid_To_key_value(data):
    """Map each problem id to the distinct skill ids seen with it.

    Args:
        data: DataFrame with `problem_id` and `skill_id_mapped` columns.

    Returns:
        tuple: `(result, count_dict)` where `result[problem_id]` is a list of
        distinct skill ids (in set iteration order) and
        `count_dict[problem_id]` is the length of that list. Rows whose skill
        id is missing or the empty string are ignored.
    """
    skills_by_problem = {}

    # Iterate the two columns directly; iterrows() would build a Series per row.
    for problem_id, skill_id in zip(data['problem_id'], data['skill_id_mapped']):
        if pd.notna(skill_id) and skill_id != '':
            skills_by_problem.setdefault(problem_id, set()).add(skill_id)

    result = {k: list(v) for k, v in skills_by_problem.items()}
    count_dict = {k: len(v) for k, v in result.items()}

    return result, count_dict

In [None]:
# Build the problem -> skills dictionary and its per-problem counts, then order both by problem id.
problemid_skillid_key_value, problemid_skillid_key_value_count_dict = problemid_skillid_To_key_value(data)

problemid_skillid_key_value = {
    pid: problemid_skillid_key_value[pid] for pid in sorted(problemid_skillid_key_value)
}
problemid_skillid_key_value_count_dict = {
    pid: problemid_skillid_key_value_count_dict[pid]
    for pid in sorted(problemid_skillid_key_value_count_dict)
}

# Preview the first n problem -> skills pairs.
my_dict = problemid_skillid_key_value
n = 10
first_n_pairs = dict(list(my_dict.items())[:n])
print(first_n_pairs)

In [None]:
def get_user_reverse_traj(users_trajs_dict):
    """Return a new dict that maps every key to its sequence in reverse order.

    The input dictionary and its sequences are left untouched.
    """
    return {userID: traj[::-1] for userID, traj in users_trajs_dict.items()}

In [None]:
# Same dictionary with every problem's skill list reversed.
problemid_skillid_rev_key_value = get_user_reverse_traj(problemid_skillid_key_value)

# Preview the first n entries.
my_dict = problemid_skillid_rev_key_value
n = 10

first_n_pairs = dict(list(my_dict.items())[:n])
print(first_n_pairs)

# The third channel

In [None]:
print(skills_len)

# 1.  Collect every skill id that occurs for at least one problem, ascending
all_skills = sorted({skill for skills in problemid_skillid_key_value.values() for skill in skills})
print(len(all_skills))

# 2.  Obtain the total number of questions
num_problems = problems_len
print(num_problems)

# 3.  One row per problem, skills_len columns, all zero to start with
problem_vectors = np.zeros((num_problems, skills_len), dtype=int)

# 4.  Set a 1 for every skill of every problem.
#     The column is the skill's position in all_skills (not the skill id itself);
#     a dict gives that position in O(1) instead of list.index() in the inner loop.
skill_column = {skill: column for column, skill in enumerate(all_skills)}
for idx, (problem_id, skills) in enumerate(problemid_skillid_key_value.items()):
    for skill in skills:
        problem_vectors[idx, skill_column[skill]] = 1

# 5. Output the vector corresponding to each question
# for idx, (problem_id, _) in enumerate(problemid_skillid_key_value.items()):
    # print(f"Problem {problem_id}: {problem_vectors[idx]}")

In [None]:
# Cosine similarity between the problem vectors
num_questions = problems_len
vector_dim = skills_len
vectors = problem_vectors

# Sparse matrix that receives the similarities
similarity_matrix = lil_matrix((num_questions, num_questions))

# Fill the strict upper triangle row by row.
# NOTE(review): entries below the diagonal are never written, so the matrix is
# not symmetric although cosine similarity is — confirm this is intended.
for i in range(num_questions):
    if i % 1000 == 0:
        print(f"Processing question {i}/{num_questions}")

    # Similarity between problem i and all later problems (skip when none are left)
    if i + 1 < num_questions:
        similarities = cosine_similarity(vectors[i].reshape(1, -1), vectors[i+1:])[0]

        # Store all positive similarities of this row in one assignment;
        # offsets within `similarities` are shifted by i + 1 to get column indices.
        positive = np.flatnonzero(similarities > 0)
        if positive.size:
            similarity_matrix[i, positive + i + 1] = similarities[positive]

similarity_matrix.setdiag(1)
# lil_matrix is convenient for incremental writes; convert to csr_matrix for later use
similarity_matrix = similarity_matrix.tocsr()

# Save the matrix (diagonal already set to 1); create the directory if needed
os.makedirs('similar', exist_ok=True)
save_npz('similar/cosine_similarity_matrix.npz', similarity_matrix)

print("Similarity calculation completed, matrix saved.")

In [None]:
# Size, number of stored non-zero similarities, and the top-left 10x10 corner as a dense array.
print(f"Matrix shape: {similarity_matrix.shape}")
print(f"Non-zero elements: {similarity_matrix.count_nonzero()}")

print(similarity_matrix[:10, :10].toarray())  

In [None]:
# Build HG_similarity_matrix: a 0/1 matrix marking problem pairs whose similarity
# exceeds the threshold; it is used to construct the problem hypergraph of the third channel
SIMILARITY_THRESHOLD = 0.8

# Select the qualifying entries in one vectorised pass over the stored values
# instead of indexing the sparse matrix once per non-zero element.
similarity_coo = similarity_matrix.tocoo()
above_threshold = similarity_coo.data > SIMILARITY_THRESHOLD
HG_similarity_matrix = sp.csr_matrix(
    (
        np.ones(int(above_threshold.sum())),
        (similarity_coo.row[above_threshold], similarity_coo.col[above_threshold]),
    ),
    shape=similarity_matrix.shape,
)

os.makedirs('similar', exist_ok=True)
save_npz('similar/HG_cosine_similarity_matrix.npz', HG_similarity_matrix)

print("Positions with values greater than 0.8 have been set to 1 and a new matrix has been saved")

In [None]:
# Size and number of entries that passed the threshold.
print(f"Matrix shape: {HG_similarity_matrix.shape}")
print(f"Non-zero elements: {HG_similarity_matrix.count_nonzero()}")

In [None]:
# Reload the thresholded matrix from disk and show its top-left 10x10 corner.
HG_similarity_matrix = load_npz('similar/HG_cosine_similarity_matrix.npz')
print(f"Matrix shape: {HG_similarity_matrix.shape}")

Q_similarity_matrix = HG_similarity_matrix[:10, :10].todense()
print(Q_similarity_matrix)

In [None]:
def normalized_adj(adj, is_symmetric=True):
    """Scale an adjacency matrix by its row sums.

    With `is_symmetric=True` the result is `D^-1/2 * adj * D^-1/2`; otherwise it
    is `D^-1 * adj`, where `D` holds the row sums of `adj` plus 1e-8.

    Args:
        adj: Sparse adjacency matrix.
        is_symmetric: Selects the two-sided (True) or row-only (False) scaling.

    Returns:
        The scaled sparse matrix.
    """
    exponent = -1/2 if is_symmetric else -1
    degree = np.array(adj.sum(1))
    scale = np.power(degree + 1e-8, exponent).flatten()
    scale[np.isinf(scale)] = 0.
    scale_mat = sp.diags(scale)

    row_scaled = scale_mat * adj
    if not is_symmetric:
        return row_scaled
    return row_scaled * scale_mat

In [None]:
def transform_csr_matrix_to_tensor(csr_matrix):
    """Convert a SciPy sparse matrix into a torch sparse COO tensor.

    Args:
        csr_matrix: SciPy sparse matrix; anything providing `.tocoo()` works.

    Returns:
        torch.Tensor: Sparse COO tensor with int64 indices, float32 values and
        the same shape as the input.
    """
    coo = csr_matrix.tocoo()
    # Row 0 holds the row indices, row 1 the column indices.
    indices = torch.as_tensor(np.vstack((coo.row, coo.col)), dtype=torch.long)
    values = torch.as_tensor(coo.data, dtype=torch.float32)

    # torch.sparse_coo_tensor replaces the deprecated torch.sparse.FloatTensor constructor.
    sp_tensor = torch.sparse_coo_tensor(indices, values, torch.Size(coo.shape))

    return sp_tensor

In [None]:
# Row-normalise the thresholded similarity matrix (is_symmetric=False selects D^-1 * A) ...
HG_question_similarity_graph_matrix = normalized_adj(adj=HG_similarity_matrix, is_symmetric=False)
# ... and convert the result into a torch sparse tensor.
HG_question_similarity_graph = transform_csr_matrix_to_tensor(HG_question_similarity_graph_matrix) 

In [None]:
# Persist the normalised question-similarity graph tensor.
torch.save(HG_question_similarity_graph, 'similar/HG_question_similarity_graph.pt')

# The first channel

In [None]:
def gen_sparse_H_user(sessions_dict, num_pois, num_users):
    """Generate sparse incidence matrix for hypergraph.

    Entry `[poi, userID]` is 1 when `poi` appears in `sessions_dict[userID]`,
    and 0 otherwise.

    Args:
        sessions_dict: Mapping from a column id to an iterable of row ids.
        num_pois: Number of rows of the matrix.
        num_users: Number of columns of the matrix.

    Returns:
        scipy.sparse.csr_matrix: float64 matrix of shape `(num_pois, num_users)`.

    Raises:
        IndexError: If an id is negative or not smaller than the matching dimension.
    """
    # Collect coordinates in a set: repeated pairs collapse to a single 1,
    # and no dense (num_pois x num_users) array is ever allocated.
    pairs = {
        (int(poi), int(userID))
        for userID, sessions in sessions_dict.items()
        for poi in sessions
    }
    coords = np.array(sorted(pairs), dtype=np.int64).reshape(-1, 2)
    rows, cols = coords[:, 0], coords[:, 1]

    if len(coords) and (
        rows.min() < 0 or cols.min() < 0
        or rows.max() >= num_pois or cols.max() >= num_users
    ):
        raise IndexError(
            f"incidence index out of bounds for shape ({num_pois}, {num_users})"
        )

    H = sp.csr_matrix((np.ones(len(coords)), (rows, cols)), shape=(num_pois, num_users))

    return H

In [None]:
# Matrix dimensions used for the first channel.
print(skills_len)
print(problems_len)

In [None]:
num_skills = skills_len
num_problems = problems_len
# keep_rate == 1 makes csr_matrix_drop_edge return its input unchanged.
keep_rate = 1
# Concept-by-question incidence matrix: rows are skill ids, columns are problem ids.
H_cq = gen_sparse_H_user(problemid_skillid_key_value, num_skills, num_problems)  # [C, Q]  

In [None]:
def csr_matrix_drop_edge(csr_adj_matrix, keep_rate):
    """Drop edge on scipy.sparse.csr_matrix.

    Each stored entry is kept independently with probability `keep_rate`, using
    NumPy's global random state. Kept entries get the value 1.0.

    Args:
        csr_adj_matrix: Sparse matrix whose stored entries are the edges.
        keep_rate: Probability of keeping an edge. With a value equal to 1 the
            input object itself is returned.

    Returns:
        scipy.sparse.csr_matrix: Matrix of the same shape with the surviving edges.
    """
    if keep_rate == 1.0:
        return csr_adj_matrix

    coo = csr_adj_matrix.tocoo()
    row = coo.row
    col = coo.col
    edgeNum = row.shape[0]

    # generate edge mask: floor(u + keep_rate) is 1 exactly when u >= 1 - keep_rate
    mask = np.floor(np.random.rand(edgeNum) + keep_rate).astype(np.bool_)

    # get new values and indices
    new_row = row[mask]
    new_col = col[mask]
    new_edgeNum = new_row.shape[0]
    # np.float was removed in NumPy 1.24; np.float64 is the type it aliased.
    new_values = np.ones(new_edgeNum, dtype=np.float64)

    drop_adj_matrix = sp.csr_matrix((new_values, (new_row, new_col)), shape=coo.shape)

    return drop_adj_matrix

In [None]:
# Optional edge dropout on the incidence matrix; with keep_rate == 1 the matrix is returned as is.
H_cq = csr_matrix_drop_edge(H_cq, keep_rate)

In [None]:
def get_hyper_deg(incidence_matrix):
    """Return the diagonal matrix of inverse row sums of `incidence_matrix`.

    Rows that sum to zero get 0 on the diagonal instead of infinity.

    Args:
        incidence_matrix: Sparse matrix; integer and float dtypes are accepted.

    Returns:
        Sparse diagonal matrix `D` with `D[i, i] = 1 / sum(incidence_matrix[i, :])`.
    """
    # Shapes, for reference:
    # H  = [num_node, num_edge]
    # DV = [num_node, num_node]
    # DV * H = [num_node, num_edge]

    # HT = [num_edge, num_node]
    # DE = [num_edge, num_edge]
    # DE * HT = [num_edge, num_node]

    # Work in float64: np.power(int_array, -1) raises for integer input.
    rowsum = np.asarray(incidence_matrix.sum(1), dtype=np.float64).flatten()

    # Invert only the non-zero sums, so no divide-by-zero warning is emitted
    # and empty rows map to 0.
    d_inv = np.zeros_like(rowsum)
    non_empty = rowsum != 0
    d_inv[non_empty] = 1.0 / rowsum[non_empty]
    d_mat_inv = sp.diags(d_inv)

    return d_mat_inv

In [None]:
# Hypergraph convolution normalisation: scale every row of H_cq by the inverse of its row sum,
# then convert the result into a torch sparse tensor.
Deg_H_cq = get_hyper_deg(H_cq)  
HG_cq = Deg_H_cq * H_cq  
HG_cq = transform_csr_matrix_to_tensor(HG_cq)  

In [None]:
# Same normalisation for the transposed incidence matrix (rows are now problems).
H_qc = H_cq.T  
Deg_H_qc = get_hyper_deg(H_qc)  
HG_qc = Deg_H_qc * H_qc  
HG_qc = transform_csr_matrix_to_tensor(HG_qc)  

In [None]:
# Make sure the output directory exists so a fresh checkout can run this cell.
os.makedirs('HG1', exist_ok=True)
# Persist both normalised incidence tensors.
torch.save(HG_cq, 'HG1/HG_cq.pt')
torch.save(HG_qc, 'HG1/HG_qc.pt')

In [None]:
# Shapes of the two tensors: HG_cq first, then its transposed counterpart HG_qc.
print(f"Matrix shape: {HG_cq.shape}")
print(f"Matrix shape: {HG_qc.shape}")

# The second channel

In [None]:
# Number of concepts; it sets the side length of the co-occurrence matrix below.
print(skills_len)

In [None]:
# Square matrix counting how often two concepts are attached to the same problem.
num_concepts = skills_len
cooccurrence_matrix = np.zeros((num_concepts, num_concepts), dtype=int)

# Example data.
# NOTE(review): data_gx is not used by the loop below or anywhere else in the visible notebook.
data_gx = {
    1: [40],
    2: [50],
    3: [32],
    4: [32, 40, 50],
    5: [40],
    6: [50],
    7: [32],
    8: [32, 40, 50, 45],
    9: [45],
    10: [45]
}

# For every problem, add 1 in both directions for each unordered pair of its concepts.
# Problems with fewer than two concepts contribute nothing.
for values in problemid_skillid_key_value.values():
    concept_ids = [int(concept_id) for concept_id in values]
    for position, first in enumerate(concept_ids):
        for second in concept_ids[position + 1:]:
            cooccurrence_matrix[first, second] += 1
            cooccurrence_matrix[second, first] += 1


In [None]:
# Shape and number of cells with at least one co-occurrence (before the mapping-based edges are added).
print(cooccurrence_matrix.shape)
num_ones = np.sum(cooccurrence_matrix > 0)
print(num_ones)

In [None]:
# In addition to problem-level co-occurrence, connect every combined concept to its
# sub-concepts, and connect any two concepts whose sub-concept lists intersect.
# inverse_mapping records which sub-concepts make up each concept,
# e.g. {0:[0],1:[1],...,112:[81,86],...}
for concept, sub_concepts in inverse_mapping.items():
    sub_concept_set = set(sub_concepts)

    # 1. Combined concept <-> each of its sub-concepts.
    #    A single concept maps to itself; skipping that pair keeps the diagonal
    #    at zero, which the later GCN export relies on.
    for sub_concept in sub_concepts:
        if sub_concept == concept:
            continue
        cooccurrence_matrix[concept, sub_concept] += 1
        cooccurrence_matrix[sub_concept, concept] += 1

    # 2. Concept <-> any other concept sharing at least one sub-concept.
    # NOTE(review): every unordered pair is visited from both sides and each visit
    # increments both directions, so each such pair gains 2 per direction —
    # confirm this weighting is intended before changing it.
    for other_concept, other_sub_concepts in inverse_mapping.items():
        if concept != other_concept:
            if sub_concept_set & set(other_sub_concepts):
                cooccurrence_matrix[concept, other_concept] += 1
                cooccurrence_matrix[other_concept, concept] += 1

In [None]:
# Number of non-zero cells after the mapping-based edges were added.
num_ones = np.sum(cooccurrence_matrix > 0)
print(num_ones)

In [None]:
# Make sure the output directory exists so a fresh checkout can run this cell.
os.makedirs('concept_cooccurrence', exist_ok=True)
# Persist the raw co-occurrence counts.
np.save('concept_cooccurrence/concept_cooccurrence_matrix.npy', cooccurrence_matrix)

In [None]:
# Reload the saved counts to check the round trip.
concept_cooccurrence_matrix = np.load('concept_cooccurrence/concept_cooccurrence_matrix.npy')
print("Matrix loaded:")
print(concept_cooccurrence_matrix)

In [None]:
# Size and number of non-zero cells of the reloaded count matrix.
print(f"Matrix shape: {concept_cooccurrence_matrix.shape}")
print(f"Non-zero elements: {np.count_nonzero(concept_cooccurrence_matrix)}")

In [None]:
# List every non-zero cell of the count matrix together with its value.
non_zero_indices = np.nonzero(concept_cooccurrence_matrix)

rows, cols = non_zero_indices
cell_values = concept_cooccurrence_matrix[rows, cols]
for r, c, value in zip(rows, cols, cell_values):
    print(f"Non zero elements: row {r}, Column {c}, Value {value}")

In [None]:
# Weight matrix for concept co-occurrence: every count divided by the total number of concepts,
# rounded to three decimals. Saved as W_cooccurrence_matrix for possible later use.
divisor = skills_len
W_cooccurrence_matrix = np.round(cooccurrence_matrix / divisor, 3)
np.save('concept_cooccurrence/concept_W_cooccurrence_matrix.npy', W_cooccurrence_matrix)

In [None]:
# Reload the weight matrix from disk.
W_cooccurrence_matrix = np.load('concept_cooccurrence/concept_W_cooccurrence_matrix.npy')

In [None]:
# Start the binary adjacency matrix from a fresh copy of the saved counts.
concept_cooccurrence_adjacent_matrix = np.load('concept_cooccurrence/concept_cooccurrence_matrix.npy')
print("Matrix loaded:")
print(concept_cooccurrence_adjacent_matrix)

In [None]:
# Binarise: every count >= 1 becomes 1. The mask is taken from the matrix itself,
# so this cell no longer depends on a second variable loaded in an earlier cell.
concept_cooccurrence_adjacent_matrix[concept_cooccurrence_adjacent_matrix >= 1] = 1
np.save('concept_cooccurrence/concept_cooccurrence_adjacent_matrix.npy', concept_cooccurrence_adjacent_matrix)

print("Matrix modification completed and saved as: concept_cooccurrence_adjacent_matrix.npy")

In [None]:
# Size and number of edges (non-zero cells) of the binary adjacency matrix.
print(f"Matrix shape: {concept_cooccurrence_adjacent_matrix.shape}")
print(f"Non-zero elements: {np.count_nonzero(concept_cooccurrence_adjacent_matrix)}")

In [None]:
# List every non-zero cell of the binary adjacency matrix together with its value.
non_zero_indices = np.nonzero(concept_cooccurrence_adjacent_matrix)

rows, cols = non_zero_indices
cell_values = concept_cooccurrence_adjacent_matrix[rows, cols]
for r, c, value in zip(rows, cols, cell_values):
    print(f"Non zero elements: row {r}, Column {c}, Value {value}")

In [None]:
# Print the top-left rows x cols corner of the adjacency matrix
# (the header string is Chinese for "the first {} rows and first {} columns of the matrix").
rows = 10  
cols = 10  
sub_matrix = concept_cooccurrence_adjacent_matrix[:rows, :cols]
print("矩阵的前 {} 行和前 {} 列：".format(rows, cols))
print(sub_matrix)

# Save as GCN structure

In [None]:
# Read the concept co-occurrence adjacency matrix. Per the original note, its diagonal is expected
# to be all 0 because PyG's GCN adds the self-loops itself.
# NOTE(review): that only holds if the matrix was built without concept-to-itself entries — verify against the construction cell.
concept_cooccurrence_adjacent_matrix = np.load('concept_cooccurrence/concept_cooccurrence_adjacent_matrix.npy')

In [None]:
# Total of all entries; for a 0/1 matrix this is the number of directed edges.
total_entries = concept_cooccurrence_adjacent_matrix.sum()
print(total_entries)

In [None]:
# Convert the NumPy adjacency matrix into a torch tensor.
concept_cooccurrence_adjacent_matrix = torch.tensor(concept_cooccurrence_adjacent_matrix)

In [None]:
# Indices of the non-zero entries, transposed so that row 0 holds the row ids and row 1 the column ids
edge_index = concept_cooccurrence_adjacent_matrix.nonzero(as_tuple=False).t()

In [None]:
# Expected shape: (2, number_of_edges).
print(edge_index.shape)

In [None]:
# Persist the edge list of the concept graph.
torch.save(edge_index, 'concept_cooccurrence/edge_index.pt')