In [1]:
import pandas as pd
import os

# Load the OULAD tables from a directory of CSV files.
# Paths are built with os.path.join, so the native path separator of the
# platform running the notebook is used (backslash on Windows, slash elsewhere).
# Every expected CSV file must sit directly inside the given directory.

def load_oulad_data(data_path):
    """Load all OULAD CSV files into pandas DataFrames.

    Parameters
    ----------
    data_path : str or os.PathLike
        Directory that directly contains the seven OULAD CSV files
        (``courses.csv``, ``assessments.csv``, ``vle.csv``,
        ``studentInfo.csv``, ``studentRegistration.csv``,
        ``studentAssessment.csv``, ``studentVle.csv``).

    Returns
    -------
    dict
        Maps each table key (``'courses'``, ``'assessments'``, ``'vle'``,
        ``'student_info'``, ``'student_registration'``,
        ``'student_assessment'``, ``'student_vle'``) to the DataFrame read
        from the matching file, in that order.

    Raises
    ------
    Whatever ``pd.read_csv`` raises when a file is missing or unreadable;
    no error handling is done here.
    """
    # Result key -> CSV file name. Insertion order fixes the order of the
    # returned dict: core tables first, then the per-student tables.
    table_files = {
        'courses': "courses.csv",
        'assessments': "assessments.csv",
        'vle': "vle.csv",
        'student_info': "studentInfo.csv",
        'student_registration': "studentRegistration.csv",
        'student_assessment': "studentAssessment.csv",
        'student_vle': "studentVle.csv",
    }

    # os.path.join picks the right separator instead of a hard-coded "\",
    # which only resolves to a directory separator on Windows.
    return {
        table_name: pd.read_csv(os.path.join(data_path, file_name))
        for table_name, file_name in table_files.items()
    }

# Load the data. The dataset directory can be overridden by setting the
# OULAD_DATA_DIR environment variable; when it is unset, the original local
# path is used. That default is a raw string so its backslashes are kept
# literally instead of being read as escape sequences.
OULAD_DATA_DIR = os.environ.get(
    "OULAD_DATA_DIR", r"C:\Users\Ritam\Projects\XAIDashboard\dataset"
)
oulad_data = load_oulad_data(OULAD_DATA_DIR)

In [2]:
# Data exploration
# Function to explore the main student information table
def explore_student_data(oulad_data):
    """Print a summary of the ``student_info`` table.

    The report covers, in order: the table's shape and column names, its
    first three rows, the absolute and relative frequencies of
    ``final_result`` (relative values rounded to three decimals), and value
    counts for the ``gender``, ``age_band`` and ``disability`` columns.

    Args:
        oulad_data: Mapping that holds the student table under the
            ``'student_info'`` key.

    Returns:
        None. Everything is written to standard output.
    """
    info = oulad_data['student_info']

    # Table overview
    print("=== STUDENT INFO EXPLORATION ===")
    print(f"Shape: {info.shape}")
    print(f"Columns: {list(info.columns)}")

    print("\n--- Sample Data ---")
    print(info.head(3))

    # Outcome column: raw counts first, then proportions
    print("\n--- Target Variable Distribution ---")
    outcome = info['final_result']
    print(outcome.value_counts())
    print(outcome.value_counts(normalize=True).round(3))

    # One line per demographic column: printed label -> column name
    print("\n--- Demographic Breakdown ---")
    demographics = (
        ("Gender:", 'gender'),
        ("Age bands:", 'age_band'),
        ("Disability:", 'disability'),
    )
    for label, column in demographics:
        print(label, info[column].value_counts().to_dict())

# Run the student-info exploration on the tables held in `oulad_data`;
# the function prints its report and returns nothing.
explore_student_data(oulad_data)

=== STUDENT INFO EXPLORATION ===
Shape: (32593, 12)
Columns: ['code_module', 'code_presentation', 'id_student', 'gender', 'region', 'highest_education', 'imd_band', 'age_band', 'num_of_prev_attempts', 'studied_credits', 'disability', 'final_result']

--- Sample Data ---
  code_module code_presentation  id_student gender                region  \
0         AAA             2013J       11391      M   East Anglian Region   
1         AAA             2013J       28400      F              Scotland   
2         AAA             2013J       30268      F  North Western Region   

       highest_education imd_band age_band  num_of_prev_attempts  \
0       HE Qualification  90-100%     55<=                     0   
1       HE Qualification   20-30%    35-55                     0   
2  A Level or Equivalent   30-40%    35-55                     0   

   studied_credits disability final_result  
0              240          N         Pass  
1               60          N         Pass  
2               

In [3]:
# Check VLE engagement patterns
def explore_vle_data(oulad_data):
    """Print headline figures about VLE usage.

    Reads two tables from ``oulad_data``:

    * ``'student_vle'`` - its row count, the number of distinct
      ``id_student`` values, and the mean / maximum of ``sum_click``.
    * ``'vle'`` - the ten most frequent ``activity_type`` values. These
      counts are taken over the rows of the ``vle`` table itself, not over
      the rows of ``student_vle``.

    Args:
        oulad_data: Mapping that provides the ``'student_vle'`` and ``'vle'``
            DataFrames.

    Returns:
        None. Everything is written to standard output.
    """
    interactions = oulad_data['student_vle']

    print("=== VLE INTERACTION EXPLORATION ===")
    print(f"Total interactions: {len(interactions):,}")
    n_students = interactions['id_student'].nunique()
    print(f"Unique students with VLE data: {n_students:,}")

    # Click statistics per row of the interaction table
    clicks = interactions['sum_click']
    print(f"Clicks per interaction - Mean: {clicks.mean():.1f}, Max: {clicks.max()}")

    # Ten most common activity types among the rows of the vle table
    top_types = oulad_data['vle']['activity_type'].value_counts().head(10)
    print("\n--- Top Activity Types ---")
    print(top_types)

# Run the VLE exploration on the tables held in `oulad_data`;
# the function prints its report and returns nothing.
explore_vle_data(oulad_data)



=== VLE INTERACTION EXPLORATION ===
Total interactions: 10,655,280
Unique students with VLE data: 26,074
Clicks per interaction - Mean: 3.7, Max: 6977

--- Top Activity Types ---
activity_type
resource         2660
subpage          1055
oucontent         996
url               886
forumng           194
quiz              127
page              102
oucollaborate      82
questionnaire      61
ouwiki             49
Name: count, dtype: int64


In [4]:
# Assess data quality
def assess_data_quality(oulad_data, high_missing_threshold=0.1):
    """Print a quick missing-value report for every table in ``oulad_data``.

    For each table the percentage of null cells is printed. When a table has
    any missing values, the columns whose null count exceeds
    ``high_missing_threshold`` times the number of rows are also listed
    together with their null counts.

    Parameters
    ----------
    oulad_data : dict
        Maps table names to pandas DataFrames.
    high_missing_threshold : float, default 0.1
        Fraction of a table's rows above which a column's null count is
        reported as high missingness. The default keeps the original
        ">10% missing" rule.

    Returns
    -------
    None
        The report is written to standard output.
    """
    print("=== DATA QUALITY ASSESSMENT ===")

    for table_name, df in oulad_data.items():
        # Count nulls per column once; the same Series feeds both the overall
        # percentage and the per-column check below.
        missing_per_col = df.isnull().sum()

        # df.size is rows * columns. A table with no rows or no columns has
        # no cells, so report 0% instead of dividing by zero.
        total_cells = df.size
        if total_cells:
            missing_pct = (missing_per_col.sum() / total_cells) * 100
        else:
            missing_pct = 0.0
        print(f"{table_name}: {missing_pct:.1f}% missing values")

        if missing_pct > 0:
            critical_missing = missing_per_col[
                missing_per_col > len(df) * high_missing_threshold
            ]
            if len(critical_missing) > 0:
                print(f"  ⚠️  High missingness in: {critical_missing.to_dict()}")

# Run the missing-value assessment over every table held in `oulad_data`;
# the function prints its report and returns nothing.
assess_data_quality(oulad_data)


=== DATA QUALITY ASSESSMENT ===
courses: 0.0% missing values
assessments: 0.9% missing values
vle: 27.5% missing values
  ⚠️  High missingness in: {'week_from': 5243, 'week_to': 5243}
student_info: 0.3% missing values
student_registration: 13.8% missing values
  ⚠️  High missingness in: {'date_unregistration': 22521}
student_assessment: 0.0% missing values
student_vle: 0.0% missing values
