In [1]:
# generate_reports.py

import os
import io
from pathlib import Path
from datetime import datetime

# Data and DB Libraries
import pandas as pd
from dotenv import load_dotenv
from supabase import create_client, Client

# Reporting and Plotting Libraries
from docx import Document
from docx.shared import Inches
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# --- CONFIGURATION ---
# Directory that receives every generated .docx report. main() creates it
# with mkdir(exist_ok=True) (parents are not created) before writing.
OUTPUT_DIR = Path("GeneratedReports")
# Only learners whose users.name starts with this prefix are fetched; the
# value is interpolated into the LIKE pattern in fetch_analytics_data().
LEARNER_PREFIX = "BT"

# --- HELPER FUNCTIONS ---

def add_df_to_doc(doc, df, title=""):
    """Append a pandas DataFrame to a docx document as a 'Table Grid' table.

    Args:
        doc: Document-like object providing ``add_heading``,
            ``add_paragraph`` and ``add_table`` (a python-docx document in
            this script).
        df: The DataFrame to render. A default ``RangeIndex`` is left out of
            the table; any other index is emitted as the leading column(s).
        title: Optional level-2 heading written before the table. Nothing is
            written when it is empty.

    Returns:
        None. When ``df`` is empty the paragraph "No data available." is
        written instead of a table.
    """
    if title:
        doc.add_heading(title, level=2)

    if df.empty:
        doc.add_paragraph("No data available.")
        return

    # A plain 0..n-1 index carries no information, so it is dropped; a
    # labelled index is promoted to regular column(s) so it shows up.
    has_default_index = isinstance(df.index, pd.RangeIndex)
    df_for_table = df.reset_index(drop=has_default_index)

    table = doc.add_table(rows=1, cols=len(df_for_table.columns))
    table.style = 'Table Grid'

    # Header row.
    for header_cell, col_name in zip(table.rows[0].cells, df_for_table.columns):
        header_cell.text = str(col_name)

    # Data rows. itertuples() keeps each column's own dtype, whereas
    # iterrows() coerces the whole row to one dtype and would render the
    # integer 1 as "1.0" whenever the frame also holds a float column.
    for values in df_for_table.itertuples(index=False, name=None):
        row_cells = table.add_row().cells
        for cell, value in zip(row_cells, values):
            cell.text = str(value)

def create_plot_from_df(df, title, y_label, x_col='Decision Point'):
    """Draw one line per column of ``df`` and return the chart as a PNG stream.

    Args:
        df: DataFrame holding the x-axis column plus one column per series.
        title: Chart title.
        y_label: Label for the y axis.
        x_col: Name of the column used for the x axis (and as its label);
            every other column is plotted against it.

    Returns:
        An ``io.BytesIO`` positioned at offset 0 that contains the PNG
        (300 dpi) of the rendered figure.
    """
    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(10, 5))

    # Everything between figure creation and close is guarded so that the
    # figure is released even if plotting or saving raises (e.g. x_col is
    # not a column of df); otherwise figures accumulate across reports.
    try:
        for col in df.columns:
            if col != x_col:
                ax.plot(df[x_col], df[col], marker='o', linestyle='-', label=col)

        ax.set_title(title, fontsize=16)
        ax.set_xlabel(x_col, fontsize=12)
        ax.set_ylabel(y_label, fontsize=12)
        # Legend sits outside the axes, to the right.
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        # Rotate the labels on this figure's own axes rather than on
        # whatever pyplot currently considers the active axes.
        for tick_label in ax.get_xticklabels():
            tick_label.set_rotation(45)
            tick_label.set_ha('right')
        fig.tight_layout(rect=[0, 0, 0.85, 1])  # Leave room for the legend

        # Save this specific figure to an in-memory buffer.
        img_stream = io.BytesIO()
        fig.savefig(img_stream, format='png', dpi=300)
    finally:
        plt.close(fig)  # Close the plot to free memory

    img_stream.seek(0)
    return img_stream


# --- CORE SCRIPT LOGIC ---

def fetch_analytics_data(supabase: Client) -> pd.DataFrame:
    """Fetch analytics rows for the configured learners into a DataFrame.

    Rows come from ``historical_learning_analytics`` inner-joined with
    ``users`` and filtered to users whose name starts with
    ``LEARNER_PREFIX``, ordered by ``user_id`` then ``created_at``.

    The query is read page by page with ``range()`` until an empty page comes
    back. A single un-paginated request is limited by the server's
    per-response row cap (Supabase documents a default of 1,000 rows), which
    would silently drop the most recent records.

    Args:
        supabase: An initialised Supabase client.

    Returns:
        A DataFrame with the selected columns plus ``user_name`` (unpacked
        from the joined ``users`` record). An empty DataFrame is returned
        when nothing matches or when any error occurs; the error is printed.
    """
    print(f"Fetching data for learners starting with '{LEARNER_PREFIX}'...")
    page_size = 1000
    records = []
    try:
        while True:
            # NOTE(review): paging is only fully stable if (user_id,
            # created_at) is unique per row; add a unique tie-breaker column
            # to the ordering if the table has one.
            response = supabase.table('historical_learning_analytics').select(
                'created_at, user_id, goal_id, scenario_attempt_number, decision_number, kc_scores_after_decision, metric_values_after_decision, users!inner(name)'
            ).like(
                'users.name', f'{LEARNER_PREFIX}%'
            ).order(
                'user_id'
            ).order(
                'created_at', desc=False
            ).range(
                len(records), len(records) + page_size - 1
            ).execute()

            page = response.data or []
            if not page:
                break
            # Advance by what was actually returned so a server cap smaller
            # than page_size cannot make us skip rows.
            records.extend(page)

        if not records:
            print("No data found for the specified learners.")
            return pd.DataFrame()

        df = pd.DataFrame(records)

        # Unpack user name from the nested dictionary
        df['user_name'] = [user['name'] for user in df['users']]
        df = df.drop(columns=['users'])

        print(f"Successfully fetched {len(df)} records.")
        return df

    except Exception as e:
        # Deliberate best-effort boundary: callers treat an empty frame as
        # "nothing to report" and stop.
        print(f"An error occurred while fetching data: {e}")
        return pd.DataFrame()


def _dict_records(values) -> list:
    """Return *values* as a list of dicts, using ``{}`` for any non-dict entry."""
    return [value if isinstance(value, dict) else {} for value in values]


def _expand_json_column(values, columns: list) -> pd.DataFrame:
    """Expand dict records into exactly *columns*, filling missing entries with 0."""
    return pd.json_normalize(_dict_records(values)).reindex(columns=columns).fillna(0)


def process_learner_data(raw_df: pd.DataFrame) -> dict:
    """Split the raw analytics rows into one processed DataFrame per learner.

    Args:
        raw_df: Rows as returned by ``fetch_analytics_data``; must contain
            ``user_id``, ``user_name``, ``created_at``, ``goal_id``,
            ``scenario_attempt_number``, ``decision_number`` and the two
            dict-valued columns ``kc_scores_after_decision`` and
            ``metric_values_after_decision``.

    Returns:
        A dict keyed by ``user_id``. Each value is a dict with:
        ``user_name``; ``full_data`` (rows sorted by ``created_at`` with a
        ``Decision Point`` label and one column per KC / metric name, 0 where
        a record lacks the key); ``all_kcs`` and ``all_metrics`` (sorted
        names collected across *all* learners, so every learner's frame has
        the same columns). An empty input yields ``{}``.
    """
    print("Processing raw data...")
    if raw_df.empty:
        return {}

    # Collect every KC / metric name once, across the entire dataset, so the
    # sorted column lists are not rebuilt for each learner.
    kc_names = sorted(
        {key for record in _dict_records(raw_df['kc_scores_after_decision']) for key in record}
    )
    metric_names = sorted(
        {key for record in _dict_records(raw_df['metric_values_after_decision']) for key in record}
    )

    all_learners_data = {}

    for user_id, group in raw_df.groupby('user_id'):
        user_name = group['user_name'].iloc[0]
        print(f"  -> Processing data for {user_name} ({user_id})")

        # Stable sort: rows sharing a timestamp keep their fetched order
        # instead of being shuffled arbitrarily.
        learner_df = group.sort_values('created_at', kind='stable').reset_index(drop=True)

        # Compact label identifying each decision on tables and plot axes.
        learner_df['Decision Point'] = [
            f"G{goal}-S{attempt}-D{decision}"
            for goal, attempt, decision in zip(
                learner_df['goal_id'],
                learner_df['scenario_attempt_number'],
                learner_df['decision_number'],
            )
        ]

        # Expand the JSON columns into real DataFrame columns.
        kc_df = _expand_json_column(learner_df['kc_scores_after_decision'], kc_names)
        metric_df = _expand_json_column(learner_df['metric_values_after_decision'], metric_names)

        processed_df = pd.concat(
            [
                learner_df[['user_id', 'user_name', 'created_at', 'Decision Point']],
                kc_df,
                metric_df,
            ],
            axis=1,
        )

        all_learners_data[user_id] = {
            "user_name": user_name,
            "full_data": processed_df,
            # Fresh lists per learner so callers mutating one do not affect others.
            "all_kcs": list(kc_names),
            "all_metrics": list(metric_names),
        }

    print("Data processing complete.")
    return all_learners_data


def generate_individual_report(learner_data: dict, output_dir: Path):
    """Write the Word report for a single learner.

    The report holds the learner's final KC and metric values (taken from the
    last row of ``full_data``), followed by a table and a line chart of every
    KC and every metric per decision point.

    Args:
        learner_data: One value of the dict built by ``process_learner_data``
            with the keys ``user_name``, ``full_data``, ``all_kcs`` and
            ``all_metrics``.
        output_dir: Existing directory in which
            ``Report_<user name>.docx`` is saved.
    """
    user_name = learner_data['user_name']
    df = learner_data['full_data']
    kcs = learner_data['all_kcs']
    metrics = learner_data['all_metrics']

    print(f"  -> Generating report for {user_name}...")

    doc = Document()
    doc.add_heading(f'Learner Report: {user_name}', 0)
    doc.add_paragraph(f"Report generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    # --- Section 1: Final Scores ---
    doc.add_heading('Final Cumulative Scores', level=1)
    final_scores = df.iloc[-1]  # rows are chronological, so the last one is final

    for names, value_label, index_label, table_title in (
        (kcs, 'Final Score', "Knowledge Component (KC)", "Final KC Scores"),
        (metrics, 'Final Value', "Metric", "Final Metric Scores"),
    ):
        final_table = final_scores[names].to_frame(name=value_label)
        final_table.index.name = index_label
        add_df_to_doc(doc, final_table, table_title)

    doc.add_page_break()

    # --- Section 2: Scores Over Time ---
    doc.add_heading('Performance Over Time', level=1)

    for names, table_title, plot_title, y_label in (
        (kcs, "KC Scores per Decision Point", "KC Scores Over Time", "KC Score"),
        (metrics, "Metric Values per Decision Point", "Metric Values Over Time", "Metric Value"),
    ):
        progression_df = df[['Decision Point'] + names]
        add_df_to_doc(doc, progression_df.set_index('Decision Point'), table_title)

        plot_img = create_plot_from_df(progression_df, plot_title, y_label)
        doc.add_picture(plot_img, width=Inches(6.5))

    # --- Save the document ---
    # The name comes from the database, so neutralise characters that are
    # path separators or invalid in file names; otherwise the save could fail
    # or land outside output_dir. Ordinary names map exactly as before
    # (spaces become underscores).
    unsafe_chars = ' \\/:*?"<>|'
    safe_name = "".join(
        '_' if ch in unsafe_chars or ord(ch) < 32 else ch for ch in user_name
    )
    file_path = output_dir / f"Report_{safe_name}.docx"
    doc.save(file_path)


def _summary_stats(scores: pd.DataFrame) -> pd.DataFrame:
    """Return the mean/min/max rows of ``describe()``, or an empty frame if no columns."""
    if scores.shape[1] == 0:
        # describe() raises on a frame without columns.
        return pd.DataFrame()
    return scores.describe().loc[['mean', 'min', 'max']].round(2)


def _min_max_normalize(values: pd.DataFrame) -> pd.DataFrame:
    """Scale each column to the 0-1 range; a constant column becomes all zeros."""
    lowest = values.min()
    # A zero spread would divide by zero; using 1 instead maps the column to 0.
    spread = (values.max() - lowest).replace(0, 1)
    return (values - lowest) / spread


def generate_master_report(all_learners_data: dict, output_dir: Path):
    """Write ``Master_Report.docx`` with aggregate statistics and leaderboards.

    The report is built from each learner's final row of ``full_data``:
    mean/min/max of every KC and metric across learners, a top-10 leaderboard
    by the sum of final KC scores, and a top-10 leaderboard by a composite
    score (sum of the min-max normalised final metric values).

    Args:
        all_learners_data: Mapping produced by ``process_learner_data``.
            Nothing is written when it is empty.
        output_dir: Existing directory in which the report is saved.
    """
    print("-> Generating Master Report...")
    if not all_learners_data:
        print("   No data available to generate a master report.")
        return

    doc = Document()
    doc.add_heading('Master Learner Report', 0)
    doc.add_paragraph(f"Report generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    doc.add_paragraph(f"This report covers {len(all_learners_data)} learners whose names start with '{LEARNER_PREFIX}'.")

    # --- Compile final scores from all learners into one DataFrame ---
    final_rows = [data['full_data'].iloc[-1].copy() for data in all_learners_data.values()]
    final_scores_df = pd.DataFrame(final_rows).set_index('user_name')

    # Every learner carries the same dataset-wide name lists, so any entry works.
    first_learner = next(iter(all_learners_data.values()))
    kcs = first_learner['all_kcs']
    metrics = first_learner['all_metrics']
    kc_scores = final_scores_df[kcs]
    metric_values = final_scores_df[metrics]

    # --- Section 1: Aggregate Statistics ---
    doc.add_heading('Aggregate Statistics (Final Scores)', level=1)
    add_df_to_doc(doc, _summary_stats(kc_scores), "KC Score Statistics (Across All Learners)")
    add_df_to_doc(doc, _summary_stats(metric_values), "Metric Value Statistics (Across All Learners)")

    doc.add_page_break()

    # --- Section 2: Leaderboards ---
    doc.add_heading('Leaderboards', level=1)

    # KC Leaderboard (based on sum of all final KC scores)
    kc_leaderboard = (
        kc_scores.sum(axis=1)
        .to_frame('Total KC Score')
        .sort_values('Total KC Score', ascending=False)
        .round(2)
    )
    add_df_to_doc(doc, kc_leaderboard.head(10), "Top 10 - KC Leaderboard (by Sum of Final KC Scores)")

    # Metric Leaderboard (based on a normalized composite score).
    # Metrics live on different scales, so each one is scaled to 0-1 before
    # summing; otherwise the largest-valued metric would dominate the ranking.
    doc.add_paragraph(
        "The Metric Leaderboard is calculated by normalizing each metric (scaling from 0 to 1 based on the min/max in this group) "
        "and then summing these normalized scores. This gives a balanced view of overall performance across all metrics."
    )
    # Plain pandas arithmetic replaces the former late sklearn import: same
    # (x - min) / (max - min) scaling, no extra dependency, and it also copes
    # with an empty metric list.
    metric_leaderboard = (
        _min_max_normalize(metric_values)
        .sum(axis=1)
        .to_frame('Composite Metric Score')
        .sort_values('Composite Metric Score', ascending=False)
        .round(3)
    )
    add_df_to_doc(doc, metric_leaderboard.head(10), "Top 10 - Metric Leaderboard (by Composite Score)")

    # --- Save the document ---
    file_path = output_dir / "Master_Report.docx"
    doc.save(file_path)
    print("   Master Report saved.")


def main():
    """Run the full pipeline: load credentials, fetch, process, write reports.

    Credentials are read from ``.env.local``. The function returns early
    (after printing the reason) when credentials are missing, when no rows
    are fetched, or when processing yields no learners.
    """
    # --- Setup ---
    print("Starting report generation script...")
    load_dotenv('.env.local')

    credentials = [
        os.environ.get(variable)
        for variable in ("NEXT_PUBLIC_SUPABASE_URL", "NEXT_PUBLIC_SUPABASE_ANON_KEY")
    ]
    if not all(credentials):
        print("Error: Supabase URL or Key not found in .env.local file.")
        print("Please create a .env.local file with your credentials.")
        return
    url, anon_key = credentials

    # Make sure the destination folder is there before anything is written.
    OUTPUT_DIR.mkdir(exist_ok=True)

    client = create_client(url, anon_key)

    # --- Data Fetching and Processing ---
    fetched = fetch_analytics_data(client)
    if fetched.empty:
        print("Exiting script as no data was fetched.")
        return

    learners = process_learner_data(fetched)
    if not learners:
        print("Exiting script as no learner data could be processed.")
        return

    # --- Report Generation ---
    print("\nGenerating individual learner reports...")
    for learner in learners.values():
        generate_individual_report(learner, OUTPUT_DIR)

    print("\nGenerating master report...")
    generate_master_report(learners, OUTPUT_DIR)

    print(f"\n✅ All reports have been generated and saved in the '{OUTPUT_DIR}' folder.")


# Run the report pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Starting report generation script...
Fetching data for learners starting with 'BT'...
An error occurred while fetching data: BaseSelectRequestBuilder.order() got an unexpected keyword argument 'ascending'
Exiting script as no data was fetched.
