In [1]:
import pandas as pd
import torch
import dgl
import torch.nn as nn
from torch.utils.data import DataLoader
from dgl.nn import GraphConv
from sklearn.preprocessing import LabelEncoder


In [2]:
# Define paths to the datasets
assessments_path = 'C:/Users/dell latitude 7400/Documents/case_study/prj_Graphtransformers/data/raw/assessments.csv'
courses_path = 'C:/Users/dell latitude 7400/Documents/case_study/prj_Graphtransformers/data/raw/courses.csv'
student_assessment_path = 'C:/Users/dell latitude 7400/Documents/case_study/prj_Graphtransformers/data/raw/studentAssessment.csv'
student_info_path = 'C:/Users/dell latitude 7400/Documents/case_study/prj_Graphtransformers/data/raw/studentInfo.csv'
student_registration_path = 'C:/Users/dell latitude 7400/Documents/case_study/prj_Graphtransformers/data/raw/studentRegistration.csv'
student_vle_path = 'C:/Users/dell latitude 7400/Documents/case_study/prj_Graphtransformers/data/raw/studentVle.csv'
vle_path = 'C:/Users/dell latitude 7400/Documents/case_study/prj_Graphtransformers/data/raw/vle.csv'

# Load datasets
assessments = pd.read_csv(assessments_path)
courses = pd.read_csv(courses_path)
student_assessment = pd.read_csv(student_assessment_path)
student_info = pd.read_csv(student_info_path)
student_registration = pd.read_csv(student_registration_path)
student_vle = pd.read_csv(student_vle_path)
vle = pd.read_csv(vle_path)

# Verify the first few rows of each dataset
print("Assessments Data:", assessments.head())
print("Courses Data:", courses.head())
print("Student Assessment Data:", student_assessment.head())
print("Student Info Data:", student_info.head())
print("Student Registration Data:", student_registration.head())
print("Student VLE Data:", student_vle.head())
print("VLE Data:", vle.head())


Assessments Data:   code_module code_presentation  id_assessment assessment_type   date  weight
0         AAA             2013J           1752             TMA   19.0    10.0
1         AAA             2013J           1753             TMA   54.0    20.0
2         AAA             2013J           1754             TMA  117.0    20.0
3         AAA             2013J           1755             TMA  166.0    20.0
4         AAA             2013J           1756             TMA  215.0    30.0
Courses Data:   code_module code_presentation  module_presentation_length
0         AAA             2013J                         268
1         AAA             2014J                         269
2         BBB             2013J                         268
3         BBB             2014J                         262
4         BBB             2013B                         240
Student Assessment Data:    id_assessment  id_student  date_submitted  is_banked  score
0           1752       11391              18        

In [3]:
# Print the column names of each DataFrame to ensure consistency and correct references in later processing
print("Columns in Assessments:", assessments.columns.tolist())
print("Columns in Courses:", courses.columns.tolist())
print("Columns in Student Assessment:", student_assessment.columns.tolist())
print("Columns in Student Info:", student_info.columns.tolist())
print("Columns in Student Registration:", student_registration.columns.tolist())
print("Columns in Student VLE:", student_vle.columns.tolist())
print("Columns in VLE:", vle.columns.tolist())


Columns in Assessments: ['code_module', 'code_presentation', 'id_assessment', 'assessment_type', 'date', 'weight']
Columns in Courses: ['code_module', 'code_presentation', 'module_presentation_length']
Columns in Student Assessment: ['id_assessment', 'id_student', 'date_submitted', 'is_banked', 'score']
Columns in Student Info: ['code_module', 'code_presentation', 'id_student', 'gender', 'region', 'highest_education', 'imd_band', 'age_band', 'num_of_prev_attempts', 'studied_credits', 'disability', 'final_result']
Columns in Student Registration: ['code_module', 'code_presentation', 'id_student', 'date_registration', 'date_unregistration']
Columns in Student VLE: ['code_module', 'code_presentation', 'id_student', 'id_site', 'date', 'sum_click']
Columns in VLE: ['id_site', 'code_module', 'code_presentation', 'activity_type', 'week_from', 'week_to']


In [4]:
# Preprocess and clean data
def preprocess_data(student_info):
    # Convert id_student to string and encode the final_result
    student_info['id_student'] = student_info['id_student'].astype(str)
    encoder = LabelEncoder()
    student_info['final_result'] = encoder.fit_transform(student_info['final_result'])
    return student_info

# Call the preprocess function
student_info = preprocess_data(student_info)


In [5]:
# Print columns again to double-check
print("Courses columns:", courses.columns.tolist())
print("Student VLE columns:", student_vle.columns.tolist())

# Check specifically for 'code_module' in both DataFrames
print("'code_module' in Courses:", 'code_module' in courses.columns.tolist())
print("'code_module' in Student VLE:", 'code_module' in student_vle.columns.tolist())


Courses columns: ['code_module', 'code_presentation', 'module_presentation_length']
Student VLE columns: ['code_module', 'code_presentation', 'id_student', 'id_site', 'date', 'sum_click']
'code_module' in Courses: True
'code_module' in Student VLE: True


In [6]:
def prepare_time_series_features(student_vle, courses):
    print("Checking data types before merge:")
    print("Student VLE 'code_module' type:", student_vle['code_module'].dtype)
    print("Courses 'code_module' type:", courses['code_module'].dtype)
    print("Student VLE 'code_presentation' type:", student_vle['code_presentation'].dtype)
    print("Courses 'code_presentation' type:", courses['code_presentation'].dtype)

    # Ensure data types match
    student_vle['code_module'] = student_vle['code_module'].astype(str)
    courses['code_module'] = courses['code_module'].astype(str)
    student_vle['code_presentation'] = student_vle['code_presentation'].astype(str)
    courses['code_presentation'] = courses['code_presentation'].astype(str)
    
    # Aggregate clicks by day, student, and course info
    daily_clicks = student_vle.groupby(['id_student', 'date', 'code_module', 'code_presentation']).agg({'sum_click': 'sum'}).reset_index()
    
    print("Aggregated Daily Clicks:")
    print(daily_clicks.head())  # Show the first few rows to verify
    
    # Merge with courses to include the module presentation length
    daily_clicks = pd.merge(daily_clicks, courses[['code_module', 'code_presentation', 'module_presentation_length']],
                            on=['code_module', 'code_presentation'], how='left')

    print("Merged Daily Clicks with Courses:")
    print(daily_clicks.head())  # Verify the merge was successful

    # Normalize the date values by the length of the course presentation
    daily_clicks['date'] = daily_clicks['date'] / daily_clicks['module_presentation_length']
    
    return daily_clicks

# Prepare daily clicks data
daily_clicks = prepare_time_series_features(student_vle, courses)


Checking data types before merge:
Student VLE 'code_module' type: object
Courses 'code_module' type: object
Student VLE 'code_presentation' type: object
Courses 'code_presentation' type: object
Aggregated Daily Clicks:
   id_student  date code_module code_presentation  sum_click
0        6516   -23         AAA             2014J         28
1        6516   -22         AAA             2014J         82
2        6516   -20         AAA             2014J         41
3        6516   -17         AAA             2014J          7
4        6516   -12         AAA             2014J          2
Merged Daily Clicks with Courses:
   id_student  date code_module code_presentation  sum_click  \
0        6516   -23         AAA             2014J         28   
1        6516   -22         AAA             2014J         82   
2        6516   -20         AAA             2014J         41   
3        6516   -17         AAA             2014J          7   
4        6516   -12         AAA             2014J          2 

In [7]:
# Normalize the date values and verify the resulting data
daily_clicks['normalized_date'] = daily_clicks['date'] / daily_clicks['module_presentation_length']

# Drop unnecessary columns if needed (e.g., module_presentation_length if no longer required)
daily_clicks = daily_clicks.drop(columns=['module_presentation_length'])

# Verify the normalized dataset
print("Daily Clicks with Normalized Date:")
print(daily_clicks.head())


Daily Clicks with Normalized Date:
   id_student      date code_module code_presentation  sum_click  \
0        6516 -0.085502         AAA             2014J         28   
1        6516 -0.081784         AAA             2014J         82   
2        6516 -0.074349         AAA             2014J         41   
3        6516 -0.063197         AAA             2014J          7   
4        6516 -0.044610         AAA             2014J          2   

   normalized_date  
0        -0.000318  
1        -0.000304  
2        -0.000276  
3        -0.000235  
4        -0.000166  


In [8]:
# Ensure `id_student` has the same data type in both DataFrames
daily_clicks['id_student'] = daily_clicks['id_student'].astype(str)
student_info['id_student'] = student_info['id_student'].astype(str)

# Merge daily_clicks with student_info to enrich the data
enriched_data = pd.merge(
    daily_clicks,
    student_info[['id_student', 'code_module', 'code_presentation', 'gender', 'region',
                  'highest_education', 'imd_band', 'age_band', 'num_of_prev_attempts',
                  'studied_credits', 'disability', 'final_result']],
    on=['id_student', 'code_module', 'code_presentation'],
    how='left'
)

# Verify the enriched data
print("Enriched Data:")
print(enriched_data.head())


Enriched Data:
  id_student      date code_module code_presentation  sum_click  \
0       6516 -0.085502         AAA             2014J         28   
1       6516 -0.081784         AAA             2014J         82   
2       6516 -0.074349         AAA             2014J         41   
3       6516 -0.063197         AAA             2014J          7   
4       6516 -0.044610         AAA             2014J          2   

   normalized_date gender    region highest_education imd_band age_band  \
0        -0.000318      M  Scotland  HE Qualification   80-90%     55<=   
1        -0.000304      M  Scotland  HE Qualification   80-90%     55<=   
2        -0.000276      M  Scotland  HE Qualification   80-90%     55<=   
3        -0.000235      M  Scotland  HE Qualification   80-90%     55<=   
4        -0.000166      M  Scotland  HE Qualification   80-90%     55<=   

   num_of_prev_attempts  studied_credits disability  final_result  
0                     0               60          N            

In [9]:
import dgl
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
import torch

def build_graph_optimized_with_correct_features(enriched_data):
    print("Building the graph with optimized features...")

    # One-hot encode categorical features
    categorical_columns = ['gender', 'region', 'highest_education', 'imd_band', 'age_band']
    encoder = OneHotEncoder(sparse_output=False)  # Use `sparse_output` instead of `sparse`
    encoded_categorical_features = encoder.fit_transform(enriched_data[categorical_columns])

    # Combine numerical and encoded categorical features
    numerical_features = enriched_data[['studied_credits', 'num_of_prev_attempts']].values
    enriched_data['combined_features'] = list(np.hstack([numerical_features, encoded_categorical_features]))

    # Aggregate features by unique student nodes
    grouped_features = enriched_data.groupby('id_student')['combined_features'].first()
    all_features = torch.tensor(np.vstack(grouped_features.values), dtype=torch.float32)

    # Get unique student IDs for nodes
    unique_students = enriched_data['id_student'].unique()
    student_mapping = {student: idx for idx, student in enumerate(unique_students)}

    # Map students to node IDs
    enriched_data['student_node'] = enriched_data['id_student'].map(student_mapping)

    # Create edges based on the same course and presentation (one edge per group)
    edges_src = []
    edges_dst = []
    grouped = enriched_data.groupby(['code_module', 'code_presentation'])
    for _, group in grouped:
        students = group['student_node'].tolist()
        # Create edges by connecting each student to the next (linear connections)
        for i in range(len(students) - 1):
            edges_src.append(students[i])
            edges_dst.append(students[i + 1])
            edges_src.append(students[i + 1])  # Reverse edge for undirected graph
            edges_dst.append(students[i])

    # Create the graph
    graph = dgl.graph((edges_src, edges_dst))

    # Add features to the graph
    graph.ndata['features'] = all_features

    print("Graph built successfully!")
    print(f"Graph has {graph.number_of_nodes()} nodes and {graph.number_of_edges()} edges.")
    print(f"Node features shape: {graph.ndata['features'].shape}")

    return graph, student_mapping

# Call the function to build the graph
graph, student_mapping = build_graph_optimized_with_correct_features(enriched_data)


Building the graph with optimized features...
Graph built successfully!
Graph has 26074 nodes and 3616194 edges.
Node features shape: torch.Size([26074, 36])


In [10]:
# Print basic graph information
print("Graph Information:")
print(graph)

# Verify node and edge features
print("Node features shape:", graph.ndata['features'].shape)
print("Number of edges:", graph.number_of_edges())


Graph Information:
Graph(num_nodes=26074, num_edges=3616194,
      ndata_schemes={'features': Scheme(shape=(36,), dtype=torch.float32)}
      edata_schemes={})
Node features shape: torch.Size([26074, 36])
Number of edges: 3616194


GNN

In [45]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn import GraphConv
import dgl



class GNNModel(nn.Module):
    def __init__(self, in_feats, hidden_feats, out_feats, num_layers):
        super(GNNModel, self).__init__()
        
        # Linear layer to project input features to hidden_dim
        self.input_proj = nn.Linear(in_feats, hidden_feats)

        # GraphConv layers
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            self.layers.append(GraphConv(hidden_feats, hidden_feats, activation=F.relu))

        # Output layer for node-level classification
        self.output_layer = nn.Linear(hidden_feats, out_feats)

    def forward(self, g, features):
        # Ensure the graph has self-loops
        g = add_self_loops_if_needed(g)

        # Project input features to hidden_dim
        h = self.input_proj(features)

        # Pass through GNN layers
        for layer in self.layers:
            h = layer(g, h)

        # Output for each node
        out = self.output_layer(h)
        return out


# Training loop
def train_model(model, g, features, labels, epochs, optimizer, loss_fn):
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        
        # Forward pass
        logits = model(g, features)
       
        # Compute loss
        loss = loss_fn(logits, labels)
        
        # Backward pass
        loss.backward()
        optimizer.step()

        # Compute accuracy
        train_acc = (logits.argmax(dim=1) == labels).float().mean().item()
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}, Train Acc: {train_acc:.4f}")



# Evaluate the model
def evaluate_model(model, g, features, labels):
    model.eval()
    with torch.no_grad():
        logits = model(g, features)
        predicted = logits.argmax(dim=1)
        correct = (predicted == labels).sum().item()
        accuracy = correct / labels.size(0)
        print(f"Evaluation Accuracy: {accuracy:.4f}")




In [48]:
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Nombre de paramètres dans le modèle : {num_params}")
# Assurez-vous que les labels sont alignés avec les nœuds
labels = enriched_data.groupby('id_student')['final_result'].first().values
labels = torch.tensor(labels, dtype=torch.long).to(device)

# Vérifiez la correspondance
assert graph.number_of_nodes() == labels.shape[0], "Mismatch between nodes and labels!"


Nombre de paramètres dans le modèle : 15108


In [49]:
# Vérification des caractéristiques des nœuds
print("Node feature shape:", graph.ndata['features'].shape)

# Vérification des labels
print("Label shape:", labels.shape)

# Assurez-vous que le nombre de nœuds correspond au nombre de labels
assert graph.number_of_nodes() == labels.shape[0], "Mismatch between nodes and labels!"


Node feature shape: torch.Size([26074, 36])
Label shape: torch.Size([26074])


In [53]:
# Hyperparamètres
in_feats = graph.ndata['features'].shape[1]
hidden_feats = 64
out_feats = len(enriched_data['final_result'].unique())
num_layers = 3
epochs = 50
learning_rate = 0.001

# Modèle
model = GNNModel(in_feats, hidden_feats, out_feats, num_layers).to(device)

# Optimiseur et fonction de perte
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)
loss_fn = nn.CrossEntropyLoss()




In [56]:
# Entraînement
train_model(model, graph, graph.ndata['features'], labels, epochs, optimizer, loss_fn)

# Évaluation
evaluate_model(model, graph, graph.ndata['features'], labels)

Epoch 1/50, Loss: 1.2598, Train Acc: 0.4366
Epoch 2/50, Loss: 1.2590, Train Acc: 0.4382
Epoch 3/50, Loss: 1.2588, Train Acc: 0.4382
Epoch 4/50, Loss: 1.2591, Train Acc: 0.4374
Epoch 5/50, Loss: 1.2586, Train Acc: 0.4381
Epoch 6/50, Loss: 1.2582, Train Acc: 0.4382
Epoch 7/50, Loss: 1.2583, Train Acc: 0.4378
Epoch 8/50, Loss: 1.2580, Train Acc: 0.4383
Epoch 9/50, Loss: 1.2577, Train Acc: 0.4377
Epoch 10/50, Loss: 1.2576, Train Acc: 0.4379
Epoch 11/50, Loss: 1.2575, Train Acc: 0.4385
Epoch 12/50, Loss: 1.2571, Train Acc: 0.4383
Epoch 13/50, Loss: 1.2569, Train Acc: 0.4384
Epoch 14/50, Loss: 1.2568, Train Acc: 0.4386
Epoch 15/50, Loss: 1.2566, Train Acc: 0.4388
Epoch 16/50, Loss: 1.2563, Train Acc: 0.4385
Epoch 17/50, Loss: 1.2561, Train Acc: 0.4386
Epoch 18/50, Loss: 1.2560, Train Acc: 0.4392
Epoch 19/50, Loss: 1.2557, Train Acc: 0.4389
Epoch 20/50, Loss: 1.2554, Train Acc: 0.4396
Epoch 21/50, Loss: 1.2550, Train Acc: 0.4389
Epoch 22/50, Loss: 1.2548, Train Acc: 0.4388
Epoch 23/50, Loss: 