<a href="https://colab.research.google.com/github/AmurdAmzer/SAKT-Paper_Implementation/blob/main/SAKTPaperImplementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cell 1: Verify the data uploaded via Colab's File Browser

In [4]:
import os

# Sanity check: show which CSV files are present in the working directory
# before trying to load one of them.
print("Files in current directory:")
csv_files = [entry for entry in os.listdir() if entry.endswith(".csv")]
for csv_name in csv_files:
  print(f" - {csv_name}")

Files in current directory:
 - skill_builder_data_corrected_collapsed.csv


# Cell 2: Imports and File Loading

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Name of the CSV file to load (remember to use the actual file name).
# Kept in one constant so it only has to be changed in a single place.
DATA_FILE = 'skill_builder_data_corrected_collapsed.csv'

# Load the CSV file
# encoding='ISO-8859-1' handles special characters in the data
# low_memory=False prevents dtype warnings for mixed types
df = pd.read_csv(DATA_FILE, encoding='ISO-8859-1', low_memory=False)

# Show basic information about the dataset
print(f"Dataset shape: {df.shape}")  # (rows, columns)
print(f"\nColumn names: {list(df.columns)}")  # all column names

# Preview the first 5 rows. The label is printed on its own line: embedding
# the frame inside the f-string puts the column header on the same line as
# the label, which shifts it out of alignment with the data rows.
print("\nFirst 5 rows:")
print(df.head())


Dataset shape: (346860, 31)

Column names: ['Unnamed: 0', 'order_id', 'assignment_id', 'user_id', 'assistment_id', 'problem_id', 'original', 'correct', 'attempt_count', 'ms_first_response', 'tutor_mode', 'answer_type', 'sequence_id', 'student_class_id', 'position', 'type', 'base_sequence_id', 'skill_id', 'skill_name', 'teacher_id', 'school_id', 'hint_count', 'hint_total', 'overlap_time', 'template_id', 'answer_id', 'answer_text', 'first_action', 'bottom_hint', 'opportunity', 'opportunity_original']

First 5 rows:    Unnamed: 0  order_id  assignment_id  user_id  assistment_id  problem_id  \
0           1  33022537         277618    64525          33139       51424   
1           2  33022709         277618    64525          33150       51435   
2           3  35450204         220674    70363          33159       51444   
3           4  35450295         220674    70363          33110       51395   
4           5  35450311         220674    70363          33196       51481   

   original 

# Cell 3: Key Statistics - Understanding the dataset size and scope

In [9]:
# CELL 3: Key Statistics - Understanding the dataset size and scope
print("=== DATASET OVERVIEW ===")

# Every row is one student-problem interaction
print(f"Total interactions: {len(df)}")

# Distinct-value counts for the identifier columns:
#   user_id    - one per student
#   problem_id - individual questions students attempt
#   skill_id   - knowledge concepts being tested (crucial for SAKT, as we'll
#                create embeddings for each skill)
id_columns = (
  ("Unique students", "user_id"),
  ("Unique problems", "problem_id"),
  ("Unique skills", "skill_id"),
)
for label, column in id_columns:
  print(f"{label}: {df[column].nunique()}")

# Overall performance: share of interactions answered correctly
accuracy = df['correct'].mean()
print(f"\nCorrect rate: {accuracy:.2%}")

# Data completeness for skill_id (critical for SAKT):
# SAKT needs skill_id to work - rows without it must be removed
skill_missing = df['skill_id'].isna()
print(f"Rows with skill_id: {(~skill_missing).sum()}")
print(f"Missing skill_id: {skill_missing.sum()} ({skill_missing.mean():.1%})")

=== DATASET OVERVIEW ===
Total interactions: 346860
Unique students: 4217
Unique problems: 26688
Unique skills: 149

Correct rate: 64.53%
Rows with skill_id: 283105
Missing skill_id: 63755 (18.4%)


# Cell 4: Examine One Student's Learning Journey

This helps us understand the sequential nature of the data.

In [10]:
# CELL 4: Examine One Student's Learning Journey
# This helps us understand the sequential nature of the data

# 0-based position in the activity ranking: 10 selects the 11th most active
# student, skipping the very top of the ranking to avoid outliers.
STUDENT_RANK = 10

# Rank students by number of attempts (most active students first)
student_activity = df['user_id'].value_counts()

# Clamp the rank so a small or filtered dataset (fewer than 11 students)
# falls back to the last student in the ranking instead of raising IndexError.
# An empty dataset still raises IndexError, since there is no student to show.
rank = min(STUDENT_RANK, len(student_activity) - 1)
student_id = student_activity.index[rank]

# Get all data for this student, sorted by time
# order_id represents the sequence of attempts
student_data = df[df['user_id'] == student_id].sort_values('order_id')

# Display student summary
print(f"Student {student_id} attempted {len(student_data)} problems")
print(f"Skills attempted: {student_data['skill_id'].nunique()}")
print(f"Correct rate: {student_data['correct'].mean():.2%}")

# Show their first 10 attempts to see the sequential pattern
print("\nFirst 10 attempts:")
print(student_data[['order_id', 'skill_id', 'correct', 'ms_first_response']].head(10))

Student 78980 attempted 1112 problems
Skills attempted: 94
Correct rate: 58.45%

First 10 attempts:
        order_id skill_id  correct  ms_first_response
110037  20882484       58        1              75090
184857  20882593      277        0               9044
184858  20882618      277        1              14863
184859  20882633      277        1              14619
184860  20882650      277        1              15088
184861  20882693      277        1              40358
184862  20960207      277        1              17053
202327  20960246      279        1              34975
202328  20960333      279        1               8606
202329  21076113      279        0              17704
