# A2: Moodle Event Log Analysis Pipeline

**Purpose:** This notebook analyzes anonymized Moodle event logs to extract student-resource interaction patterns.

## Pipeline Overview

1. **Event Parsing**: Extract course module IDs and parse timestamps
2. **Event Classification**: Distinguish teacher updates from student accesses
3. **Availability Inference**: Determine when resources became available
4. **Matrix Construction**: Build student √ó resource interaction matrices
5. **Feature Aggregation**: Summarize by resource type with confidence intervals
6. **Visualization**: Interactive dashboard for exploration

In [42]:
import pandas as pd
import numpy as np
import re
from datetime import datetime, timedelta
from typing import Dict, Any, Optional, Iterable

pd.set_option('display.max_colwidth', 200)

## 2. Helper Functions

This section defines utility functions for:
- Extracting course module IDs from event descriptions
- Parsing Moodle date strings in multiple formats
- Classifying events as teacher updates or student accesses
- Deriving standardized resource types from context strings

In [43]:
def extract_module_id(description: object) -> Optional[str]:
    """Extract a course module ID from an event description like "... course module id '123456' ...".
    Returns the ID as a string or None if not found.
    """
    if pd.isna(description):
        return None
    s = str(description)
    for pat in (r"course module id '(\d+)'", r"module id '(\d+)'", r"cmid[^\d]*(\d+)"):
        m = re.search(pat, s, flags=re.IGNORECASE)
        if m:
            return m.group(1)
    return None


def parse_moodle_date(date_str: object) -> Optional[datetime]:
    """Parse Moodle date strings into datetime (naive). Supports:
    - 'DD/MM/YYYY HH:MM'
    - 'DD/MM/YY, HH:MM:SS'
    - 'DD/MM/YYYY'
    - 'DD/MM/YY'
    Returns None if parsing fails.
    """
    if pd.isna(date_str):
        return None
    s = str(date_str).strip()
    # Try the most specific patterns first
    fmts = ["%d/%m/%Y %H:%M", "%d/%m/%y, %H:%M:%S", "%d/%m/%Y", "%d/%m/%y"]
    for fmt in fmts:
        try:
            return datetime.strptime(s, fmt)
        except Exception:
            continue
    # Heuristic: if there is a space, try first token as date
    try:
        first = s.split()[0]
        return datetime.strptime(first, "%d/%m/%Y")
    except Exception:
        return None


def is_teacher_event(row: pd.Series) -> bool:
    """Check if the event was performed by a teacher."""
    return bool(row.get("Es_profesor", False))


def is_resource_access_event(row: pd.Series) -> bool:
    """Detect a (student) access event to a course module."""
    if is_teacher_event(row):
        return False
    event_name = str(row.get("Nombre evento", "")).lower()
    component = str(row.get("Componente", "")).lower()

    # Keep patterns explicit and narrow to avoid false positives.
    access_patterns: Iterable[str] = (
        "course module viewed",
        "m√≥dulo de curso visto",
        "module viewed",
        "url viewed",
        "resource viewed",
        "file viewed",
        "book viewed",
        "page viewed",
        "folder viewed",
        "glossary viewed",
        "assign viewed",
        "quiz viewed",
        "forum viewed",
    )
    return any(pat in event_name for pat in access_patterns) or (
        "viewed" in event_name and "course" in component
    )


def is_teacher_update_event(row: pd.Series) -> bool:
    """Detect a teacher updating/creating a resource (create/update/modify operations)."""
    if not is_teacher_event(row):
        return False
    text = (str(row.get("Descripci√≥n", "")) + " " + str(row.get("Nombre evento", ""))).lower()
    update_patterns: Iterable[str] = (
        "updated", "created", "modified", "restored",
        "actualizado", "creado", "modificado", "restaurado"
    )
    return any(pat in text for pat in update_patterns)


def add_resource_type(matrix_df: pd.DataFrame,
                      context_col: str = "Resource_Context",
                      output_col: str = "Resource_Type") -> pd.DataFrame:
    """Derive a standardized resource type from a context label like 'Archivo: Tema 1.pdf'."""
    if context_col not in matrix_df.columns:
        return matrix_df.copy()

    result = matrix_df.copy()
    parts = result[context_col].astype("string").str.split(":", n=1, expand=True)
    # Ensure both columns exist
    if parts.shape[1] == 1:
        parts = pd.concat([parts, pd.Series([pd.NA] * len(parts), index=parts.index)], axis=1)

    left = parts[0].str.strip()
    right = parts[1].str.lstrip()

    result[output_col] = np.select(
        [
            right.str.startswith("UML", na=False),
            right.str.startswith("R√∫brica", na=False),
            left.str.startswith("Archivo", na=False),
            left.str.startswith("URL", na=False),
            left.str.startswith("Kaltura", na=False),
        ],
        [
            "V√≠deo",
            "R√∫brica",
            "Material te√≥rico",
            "Herramienta",
            "V√≠deo",
        ],
        default=left.fillna("Recurso")
    )

    return result


## 3. Resource Availability Inference

This function determines when each resource became available to students by analyzing teacher update events and student access patterns.

In [44]:
def get_resource_availability_dates(
    merged_df: pd.DataFrame,
    academic_year_start: datetime,
    *,
    availability_strategy: str = "first_update_or_start",  # NEW default
    debug: bool = False
) -> Dict[str, Dict[str, Any]]:
    """
    Infer availability (publication) date per Module_ID.

    Strategies
    ----------
    - 'first_update_or_start' (recommended):
        availability = first teacher create/update ON OR AFTER academic_year_start;
        if none exists, availability = academic_year_start.
        (cohort-consistent, independent of student access timing)

    - 'last_update_before_access' (original behavior):
        if any student access exists:
            availability = last teacher update ON OR BEFORE first student access;
            if none exists, availability = academic_year_start
        else:
            availability = None (Unknown)

    - 'course_start':
        availability = academic_year_start for all resources.

    Returns
    -------
    Dict[module_id] -> { 'Resource_Context', 'Availability_Date', 'Confidence' }
    """

    required = {
        "Hora", "Descripci√≥n", "Contexto del evento",
        "Nombre evento", "Componente", "Es_profesor"
    }
    missing = required - set(merged_df.columns)
    if missing:
        raise ValueError(f"Missing required columns for availability inference: {sorted(missing)}")

    df = merged_df.copy()
    if "Fecha_parsed" not in df.columns:
        df["Fecha_parsed"] = df["Hora"].apply(parse_moodle_date)
    if "Module_ID" not in df.columns:
        df["Module_ID"] = df["Descripci√≥n"].apply(extract_module_id)

    resource_events = df[df["Module_ID"].notna()].copy()
    if resource_events.empty:
        return {}

    availability_info: Dict[str, Dict[str, Any]] = {}

    for module_id, module_events in resource_events.groupby("Module_ID"):
        module_events = module_events.sort_values("Fecha_parsed")

        resource_context = (
            module_events["Contexto del evento"].dropna().iloc[0]
            if not module_events.empty else f"Module {module_id}"
        )

        # Classify
        teacher_updates = module_events[module_events.apply(is_teacher_update_event, axis=1)]
        student_access  = module_events[module_events.apply(is_resource_access_event, axis=1)]

        availability_date: Optional[datetime] = None
        confidence = "Unknown"

        strategy = availability_strategy.lower()

        if strategy == "course_start":
            availability_date = academic_year_start
            confidence = "Medium"  # assumption by design

        elif strategy == "first_update_or_start":
            # First teacher create/update ON OR AFTER course start
            tu = teacher_updates[teacher_updates["Fecha_parsed"] >= academic_year_start]
            if not tu.empty:
                availability_date = tu["Fecha_parsed"].min()
                confidence = "High"
            else:
                availability_date = academic_year_start
                confidence = "Medium"

        elif strategy == "last_update_before_access":
            if not student_access.empty:
                first_access = student_access["Fecha_parsed"].min()
                tu = teacher_updates[teacher_updates["Fecha_parsed"] <= first_access]
                if not tu.empty:
                    availability_date = tu["Fecha_parsed"].max()
                    confidence = "High"
                else:
                    availability_date = academic_year_start
                    confidence = "Medium"
            else:
                availability_date = None
                confidence = "Unknown"
        else:
            raise ValueError(f"Unknown availability_strategy='{availability_strategy}'")

        if debug:
            print(f"[DEBUG avail] Module {module_id} | strategy={availability_strategy} | "
                  f"availability={availability_date} | conf={confidence}")

        availability_info[str(module_id)] = {
            "Resource_Context": resource_context,
            "Availability_Date": availability_date,
            "Confidence": confidence,
        }

    return availability_info

## 4. Student-Resource Matrix Construction

Builds a matrix with one row per student-resource pair containing timing and access count features.

In [45]:
def create_student_resource_matrix(
    merged_df: pd.DataFrame,
    academic_year_start: datetime,
    interval_start: datetime,
    interval_end: datetime,
    *,
    debug: bool = False
) -> pd.DataFrame:
    """Build a student‚Äìresource interaction matrix restricted to a time window."""
    # Validate columns
    required = {
        "Hora", "Descripci√≥n", "Contexto del evento", "Nombre evento", "Componente",
        "Es_profesor", "Nombre_pseudonimo"
    }
    missing = required - set(merged_df.columns)
    if missing:
        raise ValueError(f"Missing required columns for matrix creation: {sorted(missing)}")

    print(f"Creating student‚Äìresource matrix from {interval_start:%Y-%m-%d} to {interval_end:%Y-%m-%d}‚Ä¶")

    # 1) Availability over the full history
    # Strategy selection based on resource type
    availability_info_all = get_resource_availability_dates(
        merged_df,
        academic_year_start,
        availability_strategy="last_update_before_access",  # default for most resources
        debug=debug
    )
    
    # Apply a consistent strategy only for Rubrics (R√∫bricas)
    availability_info_rubric = get_resource_availability_dates(
        merged_df[merged_df["Contexto del evento"].str.contains("R√∫brica", na=False)],
        academic_year_start,
        availability_strategy="first_update_or_start",
        debug=debug
    )
    
    # Merge both (rubric overrides general)
    availability_info = {**availability_info_all, **availability_info_rubric}

    if not availability_info:
        print("No resource availability data found.")
        return pd.DataFrame()

    # 2) Prepare logs and restrict to interval
    df = merged_df.copy()
    if "Fecha_parsed" not in df.columns:
        df["Fecha_parsed"] = df["Hora"].apply(parse_moodle_date)
    if "Module_ID" not in df.columns:
        df["Module_ID"] = df["Descripci√≥n"].apply(extract_module_id)

    interval_mask = df["Fecha_parsed"].between(interval_start, interval_end, inclusive="both")
    df_interval = df[interval_mask]

    # Student resource access events within the window
    student_access_events = df_interval[
        (df_interval["Module_ID"].notna())
        & (~df_interval["Es_profesor"].astype(bool))
        & (df_interval.apply(is_resource_access_event, axis=1))
    ].copy()

    print(f"Found {len(student_access_events)} student resource access events in interval.")

    # 3) Reference sets
    eval_columns = [
        "Hito 1", "Hito 2", "Hito 3", "Trabajo",
        "Coevaluaci√≥n 1", "Coevaluaci√≥n 2", "Coevaluaci√≥n 3", "Asistencia"
    ]
    # Keep only evaluation columns that exist
    eval_columns = [c for c in eval_columns if c in df.columns]

    all_students = df.loc[~df["Es_profesor"].astype(bool), "Nombre_pseudonimo"].dropna().unique()
    all_resources = list(availability_info.keys())

    print(f"Students: {len(all_students)}, Resources: {len(all_resources)}")

    # Precompute one row per student for evaluation fields (stable per student)
    # If multiple rows per student exist, use first non-null per column.
    eval_by_student = (
        df.loc[~df["Es_profesor"].astype(bool), ["Nombre_pseudonimo"] + eval_columns]
          .groupby("Nombre_pseudonimo", dropna=True)
          .agg(lambda s: s.dropna().iloc[0] if s.dropna().size else np.nan)
    ) if eval_columns else pd.DataFrame()

    # 4) Build matrix
    interaction_rows = []
    for student in all_students:
        # Evaluation snapshot for this student (if available)
        if not eval_by_student.empty and student in eval_by_student.index:
            student_eval = eval_by_student.loc[student].to_dict()
        else:
            student_eval = {}

        # All resources
        for resource_id in all_resources:
            avail = availability_info[resource_id]
            resource_context = avail["Resource_Context"]
            availability_date = avail["Availability_Date"]
            confidence = avail["Confidence"]

            events_sr = student_access_events[
                (student_access_events["Nombre_pseudonimo"] == student)
                & (student_access_events["Module_ID"].astype(str) == str(resource_id))
            ]

            row = {
                "Student_ID": student,
                "Resource_ID": resource_id,
                "Resource_Context": resource_context,
                "Resource_Availability_Date": availability_date,
                "Availability_Confidence": confidence,
            }

            # Attach student evaluation fields
            for c in eval_columns:
                row[c] = student_eval.get(c, np.nan)

            # Access metrics (within interval)
            if not events_sr.empty:
                events_sr = events_sr.sort_values("Fecha_parsed")
                first_access = events_sr["Fecha_parsed"].min()
                last_access = events_sr["Fecha_parsed"].max()
                access_count = len(events_sr)

                # Time to first access from availability
                if isinstance(availability_date, datetime):
                    delta_first = (first_access - availability_date).total_seconds()
                    row["Time_To_First_Access_Hours"] = delta_first / 3600.0
                    row["Time_To_First_Access_Minutes"] = delta_first / 60.0
                else:
                    row["Time_To_First_Access_Hours"] = np.nan
                    row["Time_To_First_Access_Minutes"] = np.nan

                # Time from last access to end of interval
                delta_last = (interval_end - last_access).total_seconds()
                row["Time_Since_Last_Access_Hours"] = delta_last / 3600.0
                row["Time_Since_Last_Access_Minutes"] = delta_last / 60.0

                row.update({
                    "Accessed_Resource": True,
                    "First_Access_Date": first_access,
                    "Last_Access_Date": last_access,
                    "Access_Count": access_count,
                })
            else:
                row.update({
                    "Accessed_Resource": False,
                    "First_Access_Date": None,
                    "Last_Access_Date": None,
                    "Access_Count": 0,
                    "Time_To_First_Access_Hours": np.nan,
                    "Time_To_First_Access_Minutes": np.nan,
                    "Time_Since_Last_Access_Hours": np.nan,
                    "Time_Since_Last_Access_Minutes": np.nan,
                })

            interaction_rows.append(row)

    matrix_df = pd.DataFrame(interaction_rows)

    # Derived flags
    matrix_df["Accessed_Within_24h"] = matrix_df["Time_To_First_Access_Hours"] <= 24
    matrix_df["Accessed_Within_Week"] = matrix_df["Time_To_First_Access_Hours"] <= (7 * 24)
    matrix_df["Multiple_Access"] = matrix_df["Access_Count"] > 1

    # Resource type classification
    print("Classifying resource types‚Ä¶")
    matrix_df = add_resource_type(matrix_df)

    # Order rows for readability
    matrix_df = matrix_df.sort_values(["Student_ID", "Resource_ID"]).reset_index(drop=True)
    print(f"Matrix built with {len(matrix_df)} student‚Äìresource rows.")
    return matrix_df


## 5. Aggregation by Resource Type

Computes summary statistics per student across resource types, including means and 95% confidence intervals.

In [46]:
import numpy as np
import pandas as pd
from scipy import stats

def summarise_by_resource_type(matrix_df: pd.DataFrame, ci: float = 0.95) -> pd.DataFrame:
    """
    Aggregate per student and resource type:
      - Avg_Time_First_Access (minutes) and its CI half-width
      - Avg_Time_Last_Access  (minutes) and its CI half-width
      - Total_Number_Access   (count)
    Returns a wide table with one row per Student_ID.
    """
    required_cols = {
        "Student_ID", "Resource_Type", "Accessed_Resource",
        "Time_To_First_Access_Minutes", "Time_Since_Last_Access_Minutes", "Access_Count"
    }
    missing = required_cols - set(matrix_df.columns)
    if missing:
        raise ValueError(f"Matrix is missing required columns: {sorted(missing)}")

    # Only rows where the student actually accessed the resource
    df_ok = matrix_df[matrix_df["Accessed_Resource"]].copy()

    def ci_half_width(series: pd.Series) -> float:
        n = series.notna().sum()
        if n < 2:
            return np.nan  # insufficient sample size
        se = series.std(ddof=1) / np.sqrt(n)
        tval = stats.t.ppf((1 + ci) / 2, df=n - 1)
        return float(tval * se)

    agg = (
        df_ok
        .groupby(["Student_ID", "Resource_Type"], dropna=True)
        .agg(
            Avg_Time_First_Access = ("Time_To_First_Access_Minutes", "mean"),
            CI_Time_First_Access  = ("Time_To_First_Access_Minutes", ci_half_width),
            Avg_Time_Last_Access  = ("Time_Since_Last_Access_Minutes", "mean"),
            CI_Time_Last_Access   = ("Time_Since_Last_Access_Minutes", ci_half_width),
            Total_Number_Access   = ("Access_Count", "sum"),
        )
    )

    # Long ‚Üí wide
    wide = (
        agg.unstack(level="Resource_Type")
           .sort_index(axis=1, level=1)
    )

    # Flatten MultiIndex columns: (metric, resource_type) ‚Üí "ResourceType_Metric"
    wide.columns = [
        f"{rtype}_{metric}".replace(" ", "_")
        for metric, rtype in wide.columns
    ]

    return wide.reset_index()


## 6. Milestone Consistency Validation

Validates that milestone dates are consistent (within ¬±3 days) across academic years.

In [47]:
from datetime import timedelta

# Default tolerance (¬±3 days)
TOLERANCE = timedelta(days=3)

def check_milestone_consistency(milestones_by_year: dict, tolerance: timedelta = TOLERANCE) -> pd.DataFrame:
    """
    Ensure each milestone label has approximately the same duration across years.
    Returns a pivot (rows = year, cols = label) with durations in days.
    """
    rows = []
    for year, items in milestones_by_year.items():
        for ms in items:
            duration = ms["end"] - ms["start"] + timedelta(days=1)  # inclusive bounds
            rows.append({
                "year": year,
                "label": ms["label"],
                "start": ms["start"],
                "end": ms["end"],
                "duration": duration,
            })
    df_ms = pd.DataFrame(rows)

    pivot = (
        df_ms.pivot(index="year", columns="label", values="duration")
             .applymap(lambda x: x.days)  # show in days
             .sort_index()
    )

    problems = []
    for label in pivot.columns:
        dur_series = pivot[label]
        reference = dur_series.iloc[0]  # first year as reference
        diffs = (dur_series - reference).abs()
        if (diffs > tolerance.days).any():
            problems.append({
                "label": label,
                "reference": reference,
                "max_diff": int(diffs.max()),
                "years_off": dur_series[diffs > tolerance.days].index.tolist(),
            })

    if not problems:
        print(f"‚úÖ All milestones within ¬±{tolerance.days} days tolerance.")
    else:
        print(f"‚ö†Ô∏è Differences larger than ¬±{tolerance.days} days detected:")
        for p in problems:
            print(f"  - {p['label']}: ref {p['reference']} days ‚Üí "
                  f"max diff {p['max_diff']} days in {p['years_off']}")

    return pivot


## 7. Pipeline Configuration

Define input files, milestone dates, and academic year parameters.

In [48]:
from pathlib import Path
from datetime import datetime

# Input merged (anonymized) files per academic year
files = {
    2022: "merged_data_2022.csv",
    2023: "merged_data_2023.csv",
    2024: "merged_data_2024.csv",
    2025: "merged_data_2025.csv",
}

# Output directory for matrices and summaries
out_dir = Path("matrices_out")
out_dir.mkdir(exist_ok=True, parents=True)

# Milestone calendar (year-specific)
milestones_calendar = {
    2022: [
        {"label": "Hito1", "start": datetime(2022, 2, 7),  "end": datetime(2022, 3, 28)},  # also course start
        {"label": "Hito2", "start": datetime(2022, 3, 29), "end": datetime(2022, 4, 29)},
        {"label": "Hito3", "start": datetime(2022, 4, 30), "end": datetime(2022, 5, 22)},
    ],
    2023: [
        {"label": "Hito1", "start": datetime(2023, 2, 6),  "end": datetime(2023, 3, 24)},  # also course start
        {"label": "Hito2", "start": datetime(2023, 3, 25), "end": datetime(2023, 4, 25)},
        {"label": "Hito3", "start": datetime(2023, 4, 26), "end": datetime(2023, 5, 23)},
    ],
    2024: [
        {"label": "Hito1", "start": datetime(2024, 2, 5),  "end": datetime(2024, 4, 2)},   # also course start
        {"label": "Hito2", "start": datetime(2024, 4, 3),  "end": datetime(2024, 4, 29)},
        {"label": "Hito3", "start": datetime(2024, 4, 30), "end": datetime(2024, 5, 20)},
    ],
    2025: [
        {"label": "Hito1", "start": datetime(2025, 2, 6),  "end": datetime(2025, 4, 1)},   # also course start
        {"label": "Hito2", "start": datetime(2025, 4, 2),  "end": datetime(2025, 4, 29)},
        {"label": "Hito3", "start": datetime(2025, 4, 30), "end": datetime(2025, 5, 20)},
    ],
}

# Run consistency check
pivot_table = check_milestone_consistency(milestones_calendar, tolerance=TOLERANCE)
print("\nDurations (days):")
print(pivot_table)


‚ö†Ô∏è Differences larger than ¬±3 days detected:
  - Hito1: ref 50 days ‚Üí max diff 8 days in [2024, 2025]
  - Hito2: ref 32 days ‚Üí max diff 5 days in [2024, 2025]
  - Hito3: ref 23 days ‚Üí max diff 5 days in [2023]

Durations (days):
label  Hito1  Hito2  Hito3
year                      
2022      50     32     23
2023      47     32     28
2024      58     27     21
2025      55     28     21



DataFrame.applymap has been deprecated. Use DataFrame.map instead.



## 8. Execute Analysis Pipeline

Run the complete analysis for all configured years.

In [49]:
matrices = {}

for year, csv_path in files.items():
    print(f"\n=== Academic year {year-1}-{year} ===")

    # Basic file existence check
    if not Path(csv_path).exists():
        print(f"  [SKIP] Missing input file: {csv_path}")
        continue

    try:
        merged_df = pd.read_csv(csv_path)
    except Exception as e:
        print(f"  [ERROR] Failed to read {csv_path}: {e}")
        continue

    # Academic year start = start of Hito1
    academic_start = milestones_calendar[year][0]["start"]

    for ms in milestones_calendar[year]:
        label = ms["label"]
        start, end = ms["start"], ms["end"]
        print(f"  ¬∑ {label}: {start:%d-%b-%Y} ‚Üí {end:%d-%b-%Y}")

        try:
            matrix = create_student_resource_matrix(
                merged_df=merged_df,
                academic_year_start=academic_start,
                interval_start=start,
                interval_end=end,
                debug=False,
            )
        except Exception as e:
            print(f"    [ERROR] Matrix creation failed for {year}-{label}: {e}")
            continue

        matrices[(year, label)] = matrix

        # Save matrix (even if empty, for traceability)
        matrix_path = out_dir / f"matrix_{year}_{label}.csv"
        matrix.to_csv(matrix_path, index=False)
        print(f"    Saved matrix ‚Üí {matrix_path} (rows={len(matrix)})")

        # Summarize by resource type (skip gracefully if matrix empty)
        if not matrix.empty:
            try:
                summary = summarise_by_resource_type(matrix, ci=0.95)
                summary_path = out_dir / f"summary_{year}_{label}.csv"
                summary.to_csv(summary_path, index=False)
                print(f"    Saved summary ‚Üí {summary_path} (rows={len(summary)})")
            except Exception as e:
                print(f"    [WARN] Summary failed for {year}-{label}: {e}")
        else:
            print(f"    [INFO] Matrix empty for {year}-{label}; skipping summary.")



=== Academic year 2021-2022 ===
  ¬∑ Hito1: 07-Feb-2022 ‚Üí 28-Mar-2022
Creating student‚Äìresource matrix from 2022-02-07 to 2022-03-28‚Ä¶
Found 6024 student resource access events in interval.
Students: 71, Resources: 179
Classifying resource types‚Ä¶
Matrix built with 12709 student‚Äìresource rows.
    Saved matrix ‚Üí matrices_out/matrix_2022_Hito1.csv (rows=12709)
    Saved summary ‚Üí matrices_out/summary_2022_Hito1.csv (rows=71)
  ¬∑ Hito2: 29-Mar-2022 ‚Üí 29-Apr-2022
Creating student‚Äìresource matrix from 2022-03-29 to 2022-04-29‚Ä¶
Found 1795 student resource access events in interval.
Students: 71, Resources: 179
Classifying resource types‚Ä¶
Matrix built with 12709 student‚Äìresource rows.
    Saved matrix ‚Üí matrices_out/matrix_2022_Hito2.csv (rows=12709)
    Saved summary ‚Üí matrices_out/summary_2022_Hito2.csv (rows=65)
  ¬∑ Hito3: 30-Apr-2022 ‚Üí 22-May-2022
Creating student‚Äìresource matrix from 2022-04-30 to 2022-05-22‚Ä¶
Found 2738 student resource access events i

## 9. Interactive Dashboard

Run the cell below to open an **interactive dashboard in a new browser tab**.

The dashboard will be available at `http://localhost:5006`.

In [54]:
# =============================================================================
# INTERACTIVE DASHBOARD WITH PLOTLY (Opens in browser tab)
# =============================================================================

import glob
import re
from pathlib import Path

import pandas as pd
import panel as pn
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

pn.extension('plotly')

# Expected metric suffixes in summary files
METRIC_SUFFIXES = [
    'Avg_Time_First_Access',
    'Avg_Time_Last_Access',
    'CI_Time_First_Access',
    'CI_Time_Last_Access',
    'Total_Number_Access',
]

# Display order for milestones
ORDERED_MILESTONES = ['Hito1', 'Hito2', 'Hito3']

# Load all summary files
summary_frames = []
for filepath in glob.glob('matrices_out/summary_*.csv'):
    match = re.match(r'summary_(\d{4})_(Hito\d)\.csv', Path(filepath).name)
    if match:
        year, milestone = int(match.group(1)), match.group(2)
        df = pd.read_csv(filepath).assign(Year=year, Milestone=milestone)
        summary_frames.append(df)

if not summary_frames:
    raise ValueError("No summary CSV files found in 'matrices_out/'.")

summary_all = pd.concat(summary_frames, ignore_index=True)

# Identify resource types and metrics from column names
id_cols = {'Student_ID', 'Year', 'Milestone'}
var_cols = [c for c in summary_all.columns if c not in id_cols]

resource_types = set()
metrics = set()
for col in var_cols:
    for suffix in METRIC_SUFFIXES:
        if col.endswith(suffix):
            res_type = col[:-(len(suffix) + 1)]
            resource_types.add(res_type)
            metrics.add(suffix)
            break

resource_types = sorted(resource_types)
metrics = sorted(metrics)

# Precompute global axis ranges for consistent visualization
global_range = {}
for col in var_cols:
    suffix = next((s for s in METRIC_SUFFIXES if col.endswith(s)), None)
    if not suffix:
        continue
    res_type = col[:-(len(suffix) + 1)]
    data = pd.to_numeric(summary_all[col], errors='coerce').dropna()
    if len(data) > 0:
        global_range[(res_type, suffix)] = (float(data.min()), float(data.max()))

print(f"Loaded {len(summary_all)} summary records.")
print(f"Resource types: {resource_types}")
print(f"Metrics: {metrics}")

# =============================================================================
# CREATE WIDGETS
# =============================================================================

year_widget = pn.widgets.Select(
    name='Course Year',
    options=sorted(summary_all.Year.unique().tolist()),
    width=120
)

milestone_widget = pn.widgets.MultiChoice(
    name='Milestones',
    options=ORDERED_MILESTONES,
    value=['Hito1'],
    width=200
)

resource_widget = pn.widgets.Select(
    name='Resource Type',
    options=resource_types,
    width=200
)

metric_widget = pn.widgets.Select(
    name='Metric',
    options=metrics,
    value='Total_Number_Access',
    width=220
)

chart_widget = pn.widgets.RadioButtonGroup(
    name='Chart Type',
    options=['Histogram', 'Boxplot'],
    value='Histogram',
    button_type='primary'
)

log_widget = pn.widgets.Checkbox(name='Log Y-axis', value=False)


# =============================================================================
# PLOTTING FUNCTION (Same as your original)
# =============================================================================

def is_time_metric(metric_name: str) -> bool:
    """Check if metric represents a time duration."""
    return metric_name.startswith(('Avg_', 'CI_'))


@pn.depends(year_widget, milestone_widget, resource_widget, metric_widget, chart_widget, log_widget)
def create_plot(year, milestones, resource, metric, chart_type, use_log):
    """Create the Plotly visualization."""
    
    if not milestones:
        return pn.pane.Markdown("**Select at least one milestone.**")
    
    # Filter data
    df = summary_all[
        (summary_all.Year == year) &
        (summary_all.Milestone.isin(milestones))
    ].copy()
    
    if df.empty:
        return pn.pane.Markdown("**No data for the selected filters.**")
    
    col = f"{resource}_{metric}"
    if col not in df.columns:
        return pn.pane.Markdown(f"**Column '{col}' not found.**")
    
    # Order milestones
    df = df[df.Milestone.isin(ORDERED_MILESTONES)]
    df['Milestone'] = pd.Categorical(
        df['Milestone'],
        categories=ORDERED_MILESTONES,
        ordered=True
    )
    
    # Convert time metrics to days
    is_time = is_time_metric(metric)
    df[col] = pd.to_numeric(df[col], errors='coerce')
    if is_time:
        df[col] = df[col] / 1440.0  # minutes ‚Üí days
    
    # Get global range
    gmin, gmax = global_range.get((resource, metric), (None, None))
    if is_time and gmin is not None:
        gmin, gmax = gmin / 1440.0, gmax / 1440.0
    
    axis_label = 'Days' if is_time else 'Access count'
    title = f"{resource} ¬∑ {metric.replace('_', ' ')} ({year} ‚Äì {', '.join(milestones)})"
    
    # Color mapping for milestones
    color_map = dict(zip(ORDERED_MILESTONES, px.colors.qualitative.Plotly))
    
    if chart_type == 'Histogram':
        # Create subplots: violin on top, histogram below
        fig = make_subplots(
            rows=2, cols=1,
            shared_xaxes=True,
            row_heights=[0.32, 0.68],
            vertical_spacing=0.05,
            subplot_titles=('Distribution (Violin)', 'Frequency (Histogram)')
        )
        
        # Add violin plots
        for milestone in ORDERED_MILESTONES:
            df_ms = df[df['Milestone'] == milestone]
            if df_ms.empty:
                continue
            fig.add_trace(
                go.Violin(
                    x=df_ms[col],
                    name=milestone,
                    line_color=color_map.get(milestone),
                    fillcolor=color_map.get(milestone),
                    side='positive',
                    points=False,
                    box_visible=False,
                    meanline_visible=True,
                    showlegend=False,
                    opacity=0.65,
                    legendgroup=milestone
                ),
                row=1, col=1
            )
        
        # Add histograms
        for milestone in ORDERED_MILESTONES:
            df_ms = df[df['Milestone'] == milestone]
            if df_ms.empty:
                continue
            fig.add_trace(
                go.Histogram(
                    x=df_ms[col],
                    name=milestone,
                    nbinsx=20,
                    opacity=0.6,
                    marker_color=color_map.get(milestone),
                    marker_line=dict(width=1, color='white'),
                    legendgroup=milestone
                ),
                row=2, col=1
            )
        
        fig.update_layout(
            barmode='overlay',
            height=520,
            title_text=title,
            template='plotly_white',
            legend_title='Milestone'
        )
        fig.update_yaxes(title_text='Density', row=1, col=1)
        fig.update_yaxes(title_text='Frequency', row=2, col=1)
        
        if gmin is not None:
            fig.update_xaxes(range=[gmin, gmax], title_text=axis_label, row=2, col=1)
        else:
            fig.update_xaxes(title_text=axis_label, row=2, col=1)
        
        if use_log:
            fig.update_yaxes(type='log', row=2, col=1)
    
    else:
        # Boxplot
        fig = px.box(
            df,
            y=col,
            color='Milestone',
            points='all',
            title=title,
            template='plotly_white',
            category_orders={'Milestone': ORDERED_MILESTONES}
        )
        fig.update_layout(height=480, legend_title='Milestone')
        
        if gmin is not None:
            fig.update_yaxes(title=axis_label, range=[gmin, gmax])
        else:
            fig.update_yaxes(title=axis_label)
        
        if use_log:
            fig.update_yaxes(type='log')
    
    return pn.pane.Plotly(fig)


# =============================================================================
# BUILD DASHBOARD LAYOUT
# =============================================================================

dashboard = pn.Column(
    pn.pane.Markdown("# Analytics dashboard"),
    pn.Row(
        year_widget,
        milestone_widget,
        resource_widget,
        metric_widget,
    ),
    pn.Row(
        chart_widget,
        log_widget,
    ),
    pn.layout.Divider(),
    create_plot,
)

# =============================================================================
# LAUNCH DASHBOARD
# =============================================================================

print("Opening dashboard at http://localhost:5006")
dashboard.show(port=5006)

Loaded 830 summary records.
Resource types: ['Consulta', 'Cuestionario', 'Foro', 'Herramienta', 'Material_te√≥rico', 'Otro', 'Reuni√≥n_de_Zoom', 'R√∫brica', 'Tarea', 'V√≠deo']
Metrics: ['Avg_Time_First_Access', 'Avg_Time_Last_Access', 'CI_Time_First_Access', 'CI_Time_Last_Access', 'Total_Number_Access']
üöÄ Opening dashboard at http://localhost:5006
Launching server at http://localhost:5010


<panel.io.server.Server at 0x1197423f0>

## 10. Reproducibility Information

In [None]:
from datetime import datetime
print(f'Timestamp: {datetime.now().isoformat()}')
print(f'pandas: {pd.__version__}')
print(f'numpy: {np.__version__}')