# In [None]:
import pandas as pd
import dgl
import torch
import time
import os

# Absolute location of the OULAD assessments table. The literal is split over
# two lines for readability only; the resulting string is unchanged.
file_path = (
    "C:/Users/suman/OneDrive/Bureau/Case_Study/prj_Graphtransformers/"
    "prj_Graphtransformers/data/raw/assessments.csv"
)
# Report up front whether the configured CSV can be found on this machine.
print(f"File exists: {os.path.exists(file_path)}")

def load_data(data_dir=None):
    """
    Load the OULAD dataset tables from CSV files.

    Args:
        data_dir: Directory that contains the seven OULAD CSV files. When
            omitted, the directory of the module-level ``file_path`` is used,
            which is the location that used to be hard-coded for every table.

    Returns:
        A tuple of DataFrames in the order ``(assessments, courses,
        student_assessments, student_info, student_registration,
        student_vle, vle)``.
    """
    if data_dir is None:
        data_dir = os.path.dirname(file_path)

    # File names listed in the same order as the returned tuple.
    table_files = (
        "assessments.csv",
        "courses.csv",
        "studentAssessment.csv",
        "studentInfo.csv",
        "studentRegistration.csv",
        "studentVle.csv",
        "vle.csv",
    )

    print("Loading data...")
    start_time = time.time()
    tables = tuple(
        pd.read_csv(os.path.join(data_dir, name)) for name in table_files
    )
    print(f"Data loaded in {time.time() - start_time:.2f} seconds.")
    return tables

def _index_ids(*columns):
    """Map every distinct ID found in ``columns`` to a contiguous index.

    IDs are numbered in ascending order of their original value, so the
    result is deterministic for a given set of inputs.
    """
    distinct = pd.concat(columns, ignore_index=True).unique().tolist()
    return {original: idx for idx, original in enumerate(sorted(distinct))}


def _encode_ids(column, mapping, label):
    """Translate ``column`` into an int64 tensor of node indices.

    Raises:
        KeyError: If ``column`` holds a value that is absent from ``mapping``.
    """
    encoded = column.map(mapping)
    unknown = encoded.isna()
    if unknown.any():
        examples = column[unknown].unique()[:5].tolist()
        raise KeyError(f"{label} value(s) missing from mapping: {examples}")
    return torch.tensor(encoded.to_numpy(dtype="int64"), dtype=torch.int64)


def preprocess_data():
    """
    Build a DGL heterogeneous graph from the OULAD tables.

    Node types are ``student``, ``module``, ``assessment`` and ``material``.
    Edge types are ``registered_in``, ``submitted``, ``interacted_with``,
    ``includes`` and ``uses``. No edge features are attached.

    Every node type uses contiguous indices starting at 0. The original
    ``id_student`` / ``id_assessment`` / ``id_site`` values are stored as the
    ``'orig_id'`` node feature of the corresponding node type so they can be
    recovered from the graph.

    Returns:
        The constructed ``dgl`` heterograph.

    Raises:
        KeyError: If an edge table references an ID or module code that has
            no entry in the corresponding mapping.
    """
    assessments, courses, student_assessments, student_info, student_registration, student_vle, vle = load_data()

    print("Cleaning and converting data...")
    student_info['id_student'] = student_info['id_student'].astype(int)
    student_assessments['id_student'] = student_assessments['id_student'].astype(int)
    student_assessments['id_assessment'] = student_assessments['id_assessment'].astype(int)
    student_registration['id_student'] = student_registration['id_student'].astype(int)
    vle['id_site'] = vle['id_site'].astype(int)
    student_vle['id_site'] = student_vle['id_site'].astype(int)
    student_vle['id_student'] = student_vle['id_student'].astype(int)

    print("Building relationships...")
    # Modules keep first-appearance order from the courses table.
    module_mapping = {code: idx for idx, code in enumerate(courses['code_module'].unique())}
    student_id_mapping = _index_ids(
        student_registration['id_student'],
        student_assessments['id_student'],
        student_vle['id_student'],
    )
    # Raw assessment/site IDs are large and sparse. Using them directly as
    # node IDs makes DGL allocate one node per integer up to the largest ID,
    # so they are compacted the same way student IDs are.
    assessment_mapping = _index_ids(
        student_assessments['id_assessment'],
        assessments['id_assessment'],
    )
    material_mapping = _index_ids(student_vle['id_site'], vle['id_site'])

    student_info = student_info.drop_duplicates(subset='id_student')
    print(f"Total unique students in student_info: {student_info['id_student'].nunique()}")

    relations = {
        ('student', 'registered_in', 'module'): (
            _encode_ids(student_registration['id_student'], student_id_mapping, 'id_student'),
            _encode_ids(student_registration['code_module'], module_mapping, 'code_module'),
        ),
        ('student', 'submitted', 'assessment'): (
            _encode_ids(student_assessments['id_student'], student_id_mapping, 'id_student'),
            _encode_ids(student_assessments['id_assessment'], assessment_mapping, 'id_assessment'),
        ),
        ('student', 'interacted_with', 'material'): (
            _encode_ids(student_vle['id_student'], student_id_mapping, 'id_student'),
            _encode_ids(student_vle['id_site'], material_mapping, 'id_site'),
        ),
        ('module', 'includes', 'assessment'): (
            _encode_ids(assessments['code_module'], module_mapping, 'code_module'),
            _encode_ids(assessments['id_assessment'], assessment_mapping, 'id_assessment'),
        ),
        ('module', 'uses', 'material'): (
            _encode_ids(vle['code_module'], module_mapping, 'code_module'),
            _encode_ids(vle['id_site'], material_mapping, 'id_site'),
        ),
    }

    print("Creating DGL graph...")
    # Explicit node counts keep the graph consistent with the mappings even if
    # the highest-indexed node of a type happens to have no edges.
    graph = dgl.heterograph(
        relations,
        num_nodes_dict={
            'student': len(student_id_mapping),
            'module': len(module_mapping),
            'assessment': len(assessment_mapping),
            'material': len(material_mapping),
        },
    )

    # Dict insertion order equals index order, so position i holds the
    # original ID of node i. Module codes are strings and are not stored.
    numeric_mappings = (
        ('student', student_id_mapping),
        ('assessment', assessment_mapping),
        ('material', material_mapping),
    )
    for node_type, mapping in numeric_mappings:
        graph.nodes[node_type].data['orig_id'] = torch.tensor(list(mapping), dtype=torch.int64)

    print("Graph created successfully!")
    return graph

if __name__ == "__main__":
    # Imported here because it is only needed for error reporting below.
    import traceback

    start_time = time.time()
    try:
        # Only the work that can fail lives inside the try body.
        graph = preprocess_data()
        dgl.save_graphs('data/processed/oulad_graph_with_features.bin', [graph])
    except Exception as e:
        # Still best-effort (no re-raise), but keep the traceback so the
        # failing step can be located instead of only seeing the message.
        print(f"Error: {e}")
        traceback.print_exc()
    else:
        print(f"Graph saved successfully in {time.time() - start_time:.2f} seconds!")