In [2]:
!pip install python-docx

Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/253.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m235.5/253.0 kB[0m [31m6.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.2.0


In [4]:
# ==========================================================
# JHS Student Performance Analysis with Word Report Generation
# Features:
# - Predicts if average score will be <50 or >=50
# - Maintains all original performance categorization
# - Generates comprehensive Word report
# - Includes SHAP values for model interpretability
# ==========================================================

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from docx import Document
from docx.shared import Inches
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (accuracy_score, classification_report,
                           confusion_matrix, roc_auc_score)
import warnings
warnings.filterwarnings('ignore')

# Optional SHAP import for feature explanation
try:
    import shap
    _HAS_SHAP = True
except ImportError:
    _HAS_SHAP = False
    print("SHAP not available. Install with: pip install shap")

# ==========================================================
# CONFIGURATION
# ==========================================================

# File paths
FILE_PATH = "/content/CAPSTONE DATA.xlsx"
OUTPUT_WORD = "/content/Ledzokuku_JHS_Analysis_Report.docx"
OUTPUT_PROCESSED = "/content/Ledzokuku_JHS_Processed.xlsx"
FIG_DIR = "/content/figures"
os.makedirs(FIG_DIR, exist_ok=True)

# Performance thresholds
PASS_THRESHOLD = 50
AT_RISK_LOW = PASS_THRESHOLD
AT_RISK_HIGH = PASS_THRESHOLD + 4  # 50-54 range
MEETS_EXPECTATIONS_THRESHOLD = 70
HIGH_ACHIEVER_THRESHOLD = 80

# Expected subject names (with flexible naming)
EXPECTED_SUBJECTS = ['ENG', 'MATHS', 'SCI', 'SOC', 'CAREER',
                     'COMPUTING', 'RME', 'C A', 'GL', 'FREN']

# Visualization style
plt.style.use('ggplot')
sns.set_palette('viridis')

# ==========================================================
# HELPER FUNCTIONS
# ==========================================================

def clean_column_name(col):
    """Standardize column names by removing spaces and special chars"""
    return str(col).strip().replace('.', '').replace(' ', '_')

def save_figure(fname):
    """Save matplotlib figure to file with consistent formatting"""
    path = os.path.join(FIG_DIR, fname)
    plt.tight_layout()
    plt.savefig(path, dpi=150, bbox_inches='tight')
    plt.close()
    return path

def add_dataframe_to_doc(doc, df, title=None, max_rows=30):
    """
    Insert a pandas DataFrame into Word document as a formatted table
    Handles large tables by truncating rows and columns
    """
    if title:
        paragraph = doc.add_paragraph()
        paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
        paragraph.add_run(title).bold = True

    # Truncate if too large
    df_display = df.copy()
    if df_display.shape[0] > max_rows:
        df_display = df_display.head(max_rows)

    max_cols = 12
    if df_display.shape[1] > max_cols:
        cols = list(df_display.columns[:max_cols-2]) + list(df_display.columns[-2:])
        df_display = df_display[cols]

    # Create table
    table = doc.add_table(rows=1, cols=len(df_display.columns))
    table.style = 'Light Shading Accent 1'

    # Add headers
    header_cells = table.rows[0].cells
    for j, col in enumerate(df_display.columns):
        header_cells[j].text = str(col)

    # Add data rows
    for _, row in df_display.iterrows():
        row_cells = table.add_row().cells
        for j, col in enumerate(df_display.columns):
            value = row[col]
            row_cells[j].text = "" if pd.isna(value) else str(value)

    doc.add_paragraph()

# ==========================================================
# DATA LOADING AND PREPROCESSING
# ==========================================================

def load_and_preprocess_data(file_path):
    """
    Load Excel data and perform initial cleaning and standardization
    Returns cleaned DataFrame
    """
    try:
        df = pd.read_excel(file_path)
    except Exception as e:
        raise SystemExit(f"Failed to load {file_path}: {e}")

    # Standardize column names
    df.columns = [clean_column_name(c) for c in df.columns]

    # Handle common subject name variations
    name_mapping = {
        'MATH': 'MATHS',
        'COM': 'COMPUTING',
        'GH_LAN': 'GL',
        'CAR_TECH': 'CAREER',
        'C A': 'C_A'  # Standardize to underscore format
    }

    # Apply name mapping
    df.rename(columns=name_mapping, inplace=True)

    # Convert numeric columns, coerce errors to NaN
    for col in df.columns:
        if col in [s.replace(' ', '_') for s in EXPECTED_SUBJECTS]:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Remove completely empty rows
    df.dropna(how='all', inplace=True)
    df.reset_index(drop=True, inplace=True)

    # Ensure scores are within 0-100 range
    for subject in [s.replace(' ', '_') for s in EXPECTED_SUBJECTS]:
        if subject in df.columns:
            df[subject] = df[subject].clip(lower=0, upper=100)

    # Calculate average score
    valid_subjects = [s.replace(' ', '_') for s in EXPECTED_SUBJECTS
                     if s.replace(' ', '_') in df.columns and
                     pd.api.types.is_numeric_dtype(df[s.replace(' ', '_')])]
    df['Average_Score'] = df[valid_subjects].mean(axis=1)

    return df

# Load and preprocess data
df = load_and_preprocess_data(FILE_PATH)
print(f"Loaded dataset with {len(df)} students")

# ==========================================================
# SUBJECT ANALYSIS
# ==========================================================

def analyze_subject_performance(df, subjects):
    """
    Analyze performance metrics for each subject:
    - Average and median scores
    - Pass rates
    - At-risk rates
    - Excellence rates
    Returns DataFrame with subject metrics
    """
    subject_metrics = {}

    for subject in subjects:
        if subject in df.columns and pd.api.types.is_numeric_dtype(df[subject]):
            scores = df[subject]

            subject_metrics[subject] = {
                'Average_Score': scores.mean(),
                'Median_Score': scores.median(),
                f'Pass_Rate_>={PASS_THRESHOLD}%': (scores >= PASS_THRESHOLD).mean() * 100,
                f'At_Risk_Rate_{AT_RISK_LOW}-{AT_RISK_HIGH}%': ((scores >= AT_RISK_LOW) & (scores <= AT_RISK_HIGH)).mean() * 100,
                f'Critical_Fail_Rate_<{PASS_THRESHOLD}%': (scores < PASS_THRESHOLD).mean() * 100,
                f'Excellence_Rate_>={HIGH_ACHIEVER_THRESHOLD}%': (scores >= HIGH_ACHIEVER_THRESHOLD).mean() * 100
            }

    metrics_df = pd.DataFrame.from_dict(subject_metrics, orient='index')
    return metrics_df.sort_values('Average_Score', ascending=False)

# Identify valid subjects for analysis
valid_subjects = [s.replace(' ', '_') for s in EXPECTED_SUBJECTS
                 if s.replace(' ', '_') in df.columns and
                 pd.api.types.is_numeric_dtype(df[s.replace(' ', '_')])]

# Perform subject analysis
subject_metrics = analyze_subject_performance(df, valid_subjects)

# ==========================================================
# STUDENT CATEGORIZATION
# ==========================================================

def categorize_students(df):
    """
    Categorize students based on performance thresholds
    Adds columns to DataFrame and returns summary counts
    """
    # Initialize category columns
    categories = {
        'Critical_Fail': 0,
        'At_Risk': 0,
        'Meets_Expectations': 0,
        'High_Achiever': 0
    }

    # Check if core subjects are available
    core_subjects = ['ENG', 'MATHS', 'SCI']
    has_core = all(s in df.columns for s in core_subjects)

    if has_core:
        # Categorize based on core subjects
        df['Critical_Fail'] = ((df['ENG'] < PASS_THRESHOLD) |
                              (df['MATHS'] < PASS_THRESHOLD) |
                              (df['SCI'] < PASS_THRESHOLD)).astype(int)

        df['At_Risk'] = (((df['ENG'] >= AT_RISK_LOW) & (df['ENG'] <= AT_RISK_HIGH)) |
                         ((df['MATHS'] >= AT_RISK_LOW) & (df['MATHS'] <= AT_RISK_HIGH)) |
                         ((df['SCI'] >= AT_RISK_LOW) & (df['SCI'] <= AT_RISK_HIGH))).astype(int)

        df['Meets_Expectations'] = ((df['ENG'] >= MEETS_EXPECTATIONS_THRESHOLD) &
                                   (df['MATHS'] >= MEETS_EXPECTATIONS_THRESHOLD) &
                                   (df['SCI'] >= MEETS_EXPECTATIONS_THRESHOLD)).astype(int)

        df['High_Achiever'] = ((df['ENG'] >= HIGH_ACHIEVER_THRESHOLD) &
                              (df['MATHS'] >= HIGH_ACHIEVER_THRESHOLD) &
                              (df['SCI'] >= HIGH_ACHIEVER_THRESHOLD)).astype(int)
    else:
        # Fallback to average score if available
        if 'Average_Score' in df.columns:
            df['Critical_Fail'] = (df['Average_Score'] < PASS_THRESHOLD).astype(int)
            df['At_Risk'] = ((df['Average_Score'] >= AT_RISK_LOW) &
                            (df['Average_Score'] <= AT_RISK_HIGH)).astype(int)
            df['Meets_Expectations'] = (df['Average_Score'] >= MEETS_EXPECTATIONS_THRESHOLD).astype(int)
            df['High_Achiever'] = (df['Average_Score'] >= HIGH_ACHIEVER_THRESHOLD).astype(int)

    # Calculate summary counts
    total_students = len(df)
    summary = {
        'Critical_Fail': int(df['Critical_Fail'].sum()),
        'At_Risk': int(df['At_Risk'].sum()),
        'Meets_Expectations': int(df['Meets_Expectations'].sum()),
        'High_Achiever': int(df['High_Achiever'].sum()),
        'Total_Students': total_students
    }

    return summary

# Categorize students and get summary
student_summary = categorize_students(df)

# ==========================================================
# VISUALIZATION FUNCTIONS
# ==========================================================

def plot_subject_averages(metrics_df):
    """Create bar plot of subject average scores with threshold lines"""
    plt.figure(figsize=(10, 6))
    ax = sns.barplot(x=metrics_df.index, y='Average_Score', data=metrics_df.reset_index())

    # Add threshold lines
    plt.axhline(y=HIGH_ACHIEVER_THRESHOLD, color='gold', linestyle=':',
               label=f'High ({HIGH_ACHIEVER_THRESHOLD})')
    plt.axhline(y=MEETS_EXPECTATIONS_THRESHOLD, color='green', linestyle='--',
               label=f'Meets ({MEETS_EXPECTATIONS_THRESHOLD})')
    plt.axhline(y=PASS_THRESHOLD, color='orange', linestyle='-',
               label=f'Pass ({PASS_THRESHOLD})')

    plt.xticks(rotation=45)
    plt.ylabel('Average Score')
    plt.title('Subject Average Scores with Performance Thresholds')
    plt.legend()

    return save_figure('subject_averages.png')

def plot_core_correlations(df):
    """Create heatmap of correlations between core subjects"""
    core_subjects = [s for s in ['ENG', 'MATHS', 'SCI', 'GL', 'STEM_Score', 'Core_Avg']
                    if s in df.columns and pd.api.types.is_numeric_dtype(df[s])]

    if len(core_subjects) >= 2:
        plt.figure(figsize=(8, 6))
        sns.heatmap(df[core_subjects].corr(), annot=True, fmt='.2f',
                   cmap='coolwarm', vmin=-1, vmax=1)
        plt.title('Core Subject Correlations')
        return save_figure('core_correlations.png')
    return None

# Generate visualizations
subject_avg_plot = plot_subject_averages(subject_metrics)
core_corr_plot = plot_core_correlations(df)

# ==========================================================
# MODIFIED MACHINE LEARNING ANALYSIS (Predicting Average Score <50)
# ==========================================================

def train_average_score_model(df):
    """
    Train model to predict if average score will be <50 or >=50
    Returns:
    - Trained model
    - Results dictionary
    - Confusion matrix DataFrame
    - Feature importances
    - SHAP values and plots (if available)
    """
    # Create target variable (1 if average score <50, else 0)
    if 'Average_Score' not in df.columns:
        return None, {'note': 'Average_Score not available'}, None, None, None

    y = (df['Average_Score'] < PASS_THRESHOLD).astype(int)

    # Skip if not enough failing examples
    if y.sum() < 5:
        return None, {'note': f'Insufficient students with average <{PASS_THRESHOLD}'}, None, None, None

    # Select features (all subject scores)
    features = [s for s in valid_subjects if s in df.columns]

    if len(features) < 2:
        return None, {'note': 'Insufficient subjects for modeling'}, None, None, None

    X = df[features].fillna(df[features].median())

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42, stratify=y
    )

    # Train model
    model = RandomForestClassifier(
        n_estimators=200,
        max_depth=5,
        random_state=42,
        class_weight='balanced'
    )
    calibrated_model = CalibratedClassifierCV(model, cv=5, method='isotonic')
    calibrated_model.fit(X_train, y_train)

    # Evaluate
    y_pred = calibrated_model.predict(X_test)
    y_proba = calibrated_model.predict_proba(X_test)[:, 1]

    # Create confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    cm_df = pd.DataFrame(cm,
                       index=['Actual >=50', 'Actual <50'],
                       columns=['Predicted >=50', 'Predicted <50'])

    # Plot confusion matrix
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.title('Confusion Matrix for Average Score Prediction')
    cm_plot = save_figure('confusion_matrix_avg_score.png')

    # Get feature importances
    importance_model = RandomForestClassifier(
        n_estimators=200,
        max_depth=5,
        random_state=42,
        class_weight='balanced'
    )
    importance_model.fit(X_train, y_train)
    feature_importances = pd.Series(
        importance_model.feature_importances_,
        index=features
    ).sort_values(ascending=False)

    # Plot feature importance
    plt.figure(figsize=(8, 5))
    sns.barplot(x=feature_importances.values, y=feature_importances.index)
    plt.title('Feature Importances for Average Score Prediction')
    fi_plot = save_figure('feature_importance_avg_score.png')

    # SHAP Analysis - FIXED VERSION
    shap_results = {}
    if _HAS_SHAP:
        try:
            # Create SHAP explainer using the trained model
            explainer = shap.TreeExplainer(importance_model)

            # Calculate SHAP values - ensure we're using the right class
            shap_values = explainer.shap_values(X_test)

            # Handle binary classification SHAP values structure
            if isinstance(shap_values, list) and len(shap_values) == 2:
                # For binary classification, shap_values is a list of [class0, class1]
                shap_values_class1 = shap_values[1]  # Use class 1 (average <50)
            else:
                # For some versions, it might be a single array
                shap_values_class1 = shap_values

            # Ensure the shape matches
            if shap_values_class1.shape != X_test.shape:
                print(f"SHAP shape mismatch: {shap_values_class1.shape} vs {X_test.shape}")
                # Try to handle common shape issues
                if len(shap_values_class1.shape) == 3:
                    shap_values_class1 = shap_values_class1[:, :, 1]  # For 3D arrays
                elif len(shap_values_class1.shape) == 1:
                    shap_values_class1 = shap_values_class1.reshape(-1, 1)

            # Summary plot with proper shape handling
            plt.figure(figsize=(10, 8))
            shap.summary_plot(shap_values_class1, X_test, feature_names=features, show=False)
            plt.title('SHAP Summary Plot - Impact on Predicting Average <50')
            shap_summary_plot = save_figure('shap_summary_plot.png')

            # Force plot for a typical example with proper error handling
            try:
                plt.figure(figsize=(12, 6))
                # Use the first sample that's correctly predicted for better demonstration
                correct_predictions = (y_pred == y_test)
                if correct_predictions.any():
                    sample_idx = np.where(correct_predictions)[0][0]
                else:
                    sample_idx = 0

                shap.force_plot(explainer.expected_value[1] if isinstance(explainer.expected_value, list) else explainer.expected_value,
                              shap_values_class1[sample_idx, :],
                              X_test.iloc[sample_idx, :],
                              feature_names=features,
                              matplotlib=True,
                              show=False)
                plt.title('SHAP Force Plot - Individual Prediction Explanation')
                shap_force_plot = save_figure('shap_force_plot.png')
            except Exception as force_error:
                print(f"Force plot failed: {force_error}")
                shap_force_plot = None

            # Dependence plot for most important feature
            try:
                most_important_feature = feature_importances.index[0]
                feature_idx = list(features).index(most_important_feature)

                plt.figure(figsize=(10, 6))
                shap.dependence_plot(feature_idx, shap_values_class1, X_test,
                                   feature_names=features, show=False)
                plt.title(f'SHAP Dependence Plot - {most_important_feature}')
                shap_dependence_plot = save_figure('shap_dependence_plot.png')
            except Exception as dep_error:
                print(f"Dependence plot failed: {dep_error}")
                shap_dependence_plot = None

            shap_results = {
                'shap_values': shap_values_class1,
                'explainer': explainer,
                'summary_plot': shap_summary_plot,
                'force_plot': shap_force_plot,
                'dependence_plot': shap_dependence_plot,
                'expected_value': explainer.expected_value[1] if isinstance(explainer.expected_value, list) else explainer.expected_value
            }

        except Exception as e:
            print(f"SHAP analysis failed: {e}")
            shap_results = {'error': str(e)}

    # Store results
    results = {
        'features': features,
        'accuracy': accuracy_score(y_test, y_pred),
        'classification_report': classification_report(y_test, y_pred, output_dict=True),
        'confusion_matrix': cm_df,
        'confusion_matrix_plot': cm_plot,
        'feature_importance_plot': fi_plot,
        'feature_importances': feature_importances,
        'shap_results': shap_results
    }

    try:
        results['auc'] = roc_auc_score(y_test, y_proba)
    except:
        results['auc'] = None

    # Add probabilities to dataframe
    df['Avg_Score_Fail_Probability'] = calibrated_model.predict_proba(X)[:, 1]

    return calibrated_model, results, cm_df, feature_importances, shap_results

# Train model for average score prediction
avg_score_model, ml_results, confusion_matrix, feature_importances, shap_results = train_average_score_model(df)

# ==========================================================
# REPORT GENERATION
# ==========================================================

def get_subject_recommendations(subject):
    """Return practical recommendations for each subject"""
    subject = subject.upper()

    if subject.startswith('ENG'):
        return [
            "Daily 30-minute reading comprehension exercises",
            "Weekly vocabulary building with contextual usage",
            "Peer-assisted learning for struggling readers",
            "Structured writing practice with rubrics"
        ]
    elif subject.startswith('MATH'):
        return [
            "Daily basic arithmetic drills (15 minutes)",
            "Hands-on activities with real-world applications",
            "Small-group problem solving sessions",
            "Weekly formative assessments with immediate feedback"
        ]
    elif subject.startswith('SCI'):
        return [
            "Practical experiments using local materials",
            "Concept mapping for key topics",
            "Relate concepts to everyday phenomena",
            "Structured note-taking practice"
        ]
    else:
        return [
            "Targeted vocabulary building",
            "Structured reading comprehension practice",
            "Peer discussion groups",
            "Weekly short-answer assessments"
        ]

def generate_word_report():
    """Create comprehensive Word report with all analysis results"""
    doc = Document()

    # Title page
    doc.add_heading('Ledzokuku Municipality — JHS Student Performance Analysis', level=1)
    doc.add_paragraph(f"Data file: {os.path.basename(FILE_PATH)}")
    doc.add_paragraph(f"Analysis date: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')}")
    doc.add_paragraph(
        f"Thresholds: PASS={PASS_THRESHOLD} | AT-RISK={AT_RISK_LOW}-{AT_RISK_HIGH} | "
        f"MEETS={MEETS_EXPECTATIONS_THRESHOLD} | HIGH={HIGH_ACHIEVER_THRESHOLD}"
    )

    # 1. Executive Summary
    doc.add_heading('1. Executive Summary', level=2)
    doc.add_paragraph(
        f"This report analyzes performance data for {student_summary['Total_Students']} JHS students. "
        f"Key findings include {student_summary['Critical_Fail']} students below passing threshold "
        f"({student_summary['Critical_Fail']/student_summary['Total_Students']:.1%}) and "
        f"{student_summary['High_Achiever']} high achievers."
    )

    # 2. Subject Performance
    doc.add_heading('2. Subject Performance Analysis', level=2)
    if not subject_metrics.empty:
        add_dataframe_to_doc(doc, subject_metrics.round(1),
                           title="Subject Performance Metrics")
        if subject_avg_plot:
            doc.add_picture(subject_avg_plot, width=Inches(6))
    else:
        doc.add_paragraph("No valid subject data available for analysis.")

    # 3. Student Categorization
    doc.add_heading('3. Student Performance Categories', level=2)
    doc.add_paragraph(
        f"Critical Fail (<{PASS_THRESHOLD}% in any core subject): "
        f"{student_summary['Critical_Fail']} students "
        f"({student_summary['Critical_Fail']/student_summary['Total_Students']:.1%})"
    )
    doc.add_paragraph(
        f"At-Risk ({AT_RISK_LOW}-{AT_RISK_HIGH}% in any core subject): "
        f"{student_summary['At_Risk']} students "
        f"({student_summary['At_Risk']/student_summary['Total_Students']:.1%})"
    )
    doc.add_paragraph(
        f"Meets Expectations (≥{MEETS_EXPECTATIONS_THRESHOLD}% in all core subjects): "
        f"{student_summary['Meets_Expectations']} students"
    )
    doc.add_paragraph(
        f"High Achievers (≥{HIGH_ACHIEVER_THRESHOLD}% in all core subjects): "
        f"{student_summary['High_Achiever']} students"
    )

    # 4. Core Subject Relationships
    if core_corr_plot:
        doc.add_heading('4. Core Subject Relationships', level=2)
        doc.add_paragraph("Correlation matrix showing relationships between core subjects:")
        doc.add_picture(core_corr_plot, width=Inches(6))

    # 5. Average Score Prediction
    doc.add_heading('5. Average Score Prediction (<50 vs >=50)', level=2)

    if 'accuracy' in ml_results:
        doc.add_paragraph(
            f"Machine learning model trained to predict if average score will be "
            f"below {PASS_THRESHOLD} with accuracy of {ml_results['accuracy']:.1%}."
        )

        # Add confusion matrix
        doc.add_picture(ml_results['confusion_matrix_plot'], width=Inches(4))
        add_dataframe_to_doc(doc, ml_results['confusion_matrix'],
                           title="Confusion Matrix for Average Score Prediction")

        # Add feature importance
        doc.add_paragraph("Most important subjects for predicting average score:")
        doc.add_picture(ml_results['feature_importance_plot'], width=Inches(6))

        # Add SHAP analysis if available
        if _HAS_SHAP and ml_results.get('shap_results') and not ml_results['shap_results'].get('error'):
            doc.add_heading('SHAP Analysis - Model Interpretability', level=3)
            doc.add_paragraph(
                "SHAP (SHapley Additive exPlanations) values show how each feature "
                "contributes to the prediction for individual students:"
            )

            # SHAP Summary Plot
            doc.add_paragraph("SHAP Summary Plot - Global Feature Importance:", style='Heading 3')
            doc.add_picture(ml_results['shap_results']['summary_plot'], width=Inches(6))
            doc.add_paragraph(
                "This plot shows both feature importance (vertical dispersion) and "
                "the impact of each feature on the prediction (color). Red indicates "
                "higher values push the prediction towards average <50."
            )

            # SHAP Force Plot
            if ml_results['shap_results'].get('force_plot'):
                doc.add_paragraph("SHAP Force Plot - Individual Prediction Explanation:", style='Heading 3')
                doc.add_picture(ml_results['shap_results']['force_plot'], width=Inches(6))
                doc.add_paragraph(
                    "This shows how each feature contributes to a specific student's prediction. "
                    "Features pushing the prediction to the right increase the likelihood of average <50."
                )

            # SHAP Dependence Plot
            if ml_results['shap_results'].get('dependence_plot'):
                doc.add_paragraph("SHAP Dependence Plot - Feature Relationship:", style='Heading 3')
                doc.add_picture(ml_results['shap_results']['dependence_plot'], width=Inches(6))
                doc.add_paragraph(
                    f"This shows how the most important feature ({feature_importances.index[0]}) "
                    "interacts with other features to affect the prediction."
                )
        elif not _HAS_SHAP:
            doc.add_paragraph(
                "Note: Install SHAP (pip install shap) for advanced model interpretability features."
            )

        # Add classification report
        cr_df = pd.DataFrame(ml_results['classification_report']).transpose()
        add_dataframe_to_doc(doc, cr_df.round(3), title="Classification Report")

        # Show top at-risk students
        if 'Avg_Score_Fail_Probability' in df.columns:
            top_risk = df.nlargest(10, 'Avg_Score_Fail_Probability')
            display_cols = ['Avg_Score_Fail_Probability', 'Average_Score'] + valid_subjects
            display_cols = [c for c in display_cols if c in top_risk.columns]
            add_dataframe_to_doc(
                doc,
                top_risk[display_cols].round(2),
                title="Top 10 Students at Risk of Average <50"
            )
    else:
        doc.add_paragraph(ml_results.get('note', 'No model trained'))

    # 6. Practical Recommendations
    doc.add_heading('6. Practical Recommendations', level=2)
    doc.add_paragraph("Targeted interventions for improving student performance:")

    # Subject-specific recommendations
    for subject in valid_subjects[:5]:  # Show top 5 subjects
        doc.add_heading(f"{subject} Interventions", level=3)
        for rec in get_subject_recommendations(subject):
            doc.add_paragraph(f"• {rec}", style='List Bullet')

    # Implementation plan
    doc.add_heading('12-Week Implementation Plan', level=3)
    plan = [
        "Weeks 1-2: Diagnostic testing and student grouping",
        "Weeks 3-6: Intensive remedial sessions (3x weekly)",
        "Weeks 7-9: Progress monitoring and adjustment",
        "Weeks 10-12: Final assessment and reporting"
    ]
    for item in plan:
        doc.add_paragraph(item, style='List Bullet')

    # Save document
    doc.save(OUTPUT_WORD)
    print(f"Report saved to {OUTPUT_WORD}")

# Generate the report
generate_word_report()

# ==========================================================
# SAVE PROCESSED DATA WITH SHAP VALUES
# ==========================================================

try:
    # Add SHAP values to dataframe if available
    if _HAS_SHAP and shap_results and 'shap_values' in shap_results and 'explainer' in shap_results:
        # Get SHAP values for all data
        X_all = df[valid_subjects].fillna(df[valid_subjects].median())
        shap_values_all = shap_results['explainer'].shap_values(X_all)

        # Handle binary classification SHAP values structure for all data
        if isinstance(shap_values_all, list) and len(shap_values_all) == 2:
            shap_values_class1_all = shap_values_all[1]  # Use class 1 (average <50)
        else:
            shap_values_class1_all = shap_values_all

        # Add SHAP values for each subject
        for i, subject in enumerate(valid_subjects):
            if i < shap_values_class1_all.shape[1]:  # Ensure we don't exceed array bounds
                df[f'SHAP_{subject}'] = shap_values_class1_all[:, i]

    df.to_excel(OUTPUT_PROCESSED, index=False)
    print(f"Processed data with SHAP values saved to {OUTPUT_PROCESSED}")
except Exception as e:
    print(f"Error saving processed data: {e}")

print("Analysis complete.")

Loaded dataset with 458 students
SHAP shape mismatch: (115, 10, 2) vs (115, 10)
Force plot failed: In v0.20, force plot now requires the base value as the first parameter! Try shap.plots.force(explainer.expected_value, shap_values) or for multi-output models try shap.plots.force(explainer.expected_value[0], shap_values[..., 0]).
Report saved to /content/Ledzokuku_JHS_Analysis_Report.docx
Error saving processed data: Expected a 1D array, got an array with shape (458, 2)
Analysis complete.


<Figure size 1200x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>