In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
import warnings
import shap # Import the library for explanations

# Suppress everything routed through the `warnings` module for the rest of the
# run, so any warnings emitted by the libraries used below are hidden as well.
warnings.filterwarnings('ignore')

# --- STEP 1: LOAD AND PREPARE THE FULL DATASET ---
# Only the read itself sits inside the `try`, so a FileNotFoundError can only
# come from the CSV being absent.
try:
    df = pd.read_csv('final_synthetic_dropout_data_rajasthan.csv')
except FileNotFoundError:
    print("Error: 'final_synthetic_dropout_data_rajasthan.csv' not found.")
    print("Please run the 'generate_dataset.py' script first to create the dataset.")
    # `exit()` is a helper the `site` module injects for interactive sessions;
    # it is not guaranteed to exist in every run mode and, called without an
    # argument, reports success. Raise SystemExit with a non-zero status so
    # the failure is visible to whatever launched the script.
    raise SystemExit(1) from None
else:
    print("Dataset 'final_synthetic_dropout_data_rajasthan.csv' loaded successfully.")

# Simplify Economic Categories as previously decided: every status whose text
# contains 'OBC' is relabelled 'General_Tier'. `na=False` makes missing values
# count as "no match", so they are left as they are.
df.loc[df['FamilyEconomicStatus'].str.contains('OBC', na=False), 'FamilyEconomicStatus'] = 'General_Tier'

# Features are every column except the label and the identifier.
X = df.drop(['IsDropout', 'StudentID'], axis=1)
y = df['IsDropout']

# One-hot encode the text/categorical columns; `drop_first=True` drops one
# dummy per column so the remaining ones are not perfectly collinear.
categorical_features = X.select_dtypes(include=['object', 'category']).columns
X_encoded = pd.get_dummies(X, columns=categorical_features, drop_first=True)
# Any boolean columns (including boolean dummies, where pandas produces them)
# are cast to 0/1 integers.
for col in X_encoded.select_dtypes(include=['bool']).columns:
    X_encoded[col] = X_encoded[col].astype(int)

print("Data preprocessing complete.")

# --- STEP 2: SPLIT DATA TO SIMULATE A REAL-WORLD SCENARIO ---
# 80/20 hold-out, stratified on the label, with a fixed seed for repeatability.
split_parts = train_test_split(
    X_encoded,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)
X_train, X_test, y_train, y_test = split_parts
# StudentID was removed from the features, so look it up again through the
# row labels that the held-out rows still carry.
test_student_ids = df['StudentID'].loc[X_test.index]
print(f"\nTraining model on {len(X_train)} records.")
print(f"Generating report for {len(X_test)} unseen records.")

# --- STEP 3: TRAIN THE FINAL XGBOOST MODEL ---
print("\nTraining final XGBoost model...")
model_settings = {
    'random_state': 42,
    'use_label_encoder': False,
    'eval_metric': 'logloss',
}
xgb_final_model = XGBClassifier(**model_settings)
xgb_final_model.fit(X_train, y_train)
print("XGBoost model is ready.")


# --- STEP 4: GENERATE PREDICTIONS AND SHAP-BASED EXPLANATIONS ---
print("\nCalculating dropout chances and generating SHAP explanations...")
# Keep only the second probability column as the dropout chance.
# NOTE(review): presumably that column is the IsDropout == 1 class - confirm
# against xgb_final_model.classes_.
class_probabilities = xgb_final_model.predict_proba(X_test)
dropout_chances = class_probabilities[:, 1]
# One report row per held-out student, indexed like `test_student_ids`.
report_df = test_student_ids.to_frame(name='StudentID').assign(DropoutChance=dropout_chances)

# Tree-specific SHAP explainer built on the fitted model.
explainer = shap.TreeExplainer(xgb_final_model)
# Per-student, per-feature contributions for the held-out rows.
# NOTE(review): downstream code indexes this by row position and zips a row
# with X_test.columns, i.e. it assumes a 2-D (rows x features) result - confirm
# for the installed SHAP version.
shap_values = explainer.shap_values(X_test)

def get_shap_analysis(student_index, top_n=3):
    """Summarise one student's SHAP values as top risk and protective factors.

    Reads row ``student_index`` of the module-level ``shap_values`` and pairs
    each value with the column name at the same position in
    ``X_test.columns``. A "common sense" override layer then forces the sign
    for a fixed set of features before the values are ranked.

    The override patterns are matched as substrings of the *raw* encoded
    column name. Matching against the display name (underscores replaced by
    spaces) would make any pattern that itself contains an underscore, such as
    ``'FamilyEconomicStatus_General_Tier'``, impossible to match.

    NOTE(review): the override looks only at the column name, not at the
    student's actual value in that column, so a forced-protective feature is
    reported as protective whatever the student's value is.

    Args:
        student_index: Positional row index into ``shap_values``.
        top_n: Maximum number of entries returned in each list.

    Returns:
        A dict with two keys. ``'red'`` lists features whose final impact is
        positive, largest first; ``'green'`` lists features whose final impact
        is negative, most negative first. Each entry is ``"- <name>"`` with
        underscores in the name replaced by spaces. Features with an impact of
        exactly zero appear in neither list.
    """
    # Features whose contribution is always reported as lowering the risk.
    protective_patterns = (
        'IsPreparingCompetitiveExam', 'HasReliableInternet', 'HasOwnLaptop',
        'FamilyEconomicStatus_General_Tier',
    )
    # Features whose contribution is always reported as raising the risk.
    risk_patterns = ('WorksPartTime', 'MediumChanged', 'IsFirstGenerationLearner')

    student_shap_values = shap_values[student_index]
    feature_impacts_raw = dict(zip(X_test.columns, student_shap_values))

    final_impacts = {}
    for feature, value in feature_impacts_raw.items():
        if any(pattern in feature for pattern in protective_patterns):
            final_impacts[feature] = -abs(value)  # force negative (protective)
        elif any(pattern in feature for pattern in risk_patterns):
            final_impacts[feature] = abs(value)  # force positive (risky)
        else:
            final_impacts[feature] = value  # keep the model's own finding

    # Split by the sign of the final impact.
    red_flags_dict = {name: val for name, val in final_impacts.items() if val > 0}
    green_flags_dict = {name: val for name, val in final_impacts.items() if val < 0}

    # Strongest first: largest positive for red, most negative for green.
    top_red_flags = sorted(red_flags_dict, key=red_flags_dict.get, reverse=True)[:top_n]
    top_green_flags = sorted(green_flags_dict, key=green_flags_dict.get)[:top_n]

    return {
        'red': [f"- {flag.replace('_', ' ')}" for flag in top_red_flags],
        'green': [f"- {flag.replace('_', ' ')}" for flag in top_green_flags]
    }


def assign_risk_level(chance):
    """Translate a dropout probability into a risk label.

    Each cut-off is exclusive: a value must be strictly greater than it to
    earn the label. Anything that exceeds no cut-off is 'Low'.

    Args:
        chance: Dropout probability.

    Returns:
        One of 'Critical', 'High', 'Medium' or 'Low'.
    """
    # Ordered from the highest cut-off down, so the first one exceeded wins.
    tiers = (
        (0.5, 'Critical'),
        (0.3, 'High'),
        (0.1, 'Medium'),
    )
    return next((label for cutoff, label in tiers if chance > cutoff), 'Low')

# Label every held-out student with a risk tier derived from the probability.
report_df['RiskLevel'] = report_df['DropoutChance'].map(assign_risk_level)

# Generate analysis for all students in the test set. SHAP rows are addressed
# by position, so iterate positions 0..len(X_test)-1.
report_df['Analysis'] = list(map(get_shap_analysis, range(len(X_test))))


# --- STEP 5: DISPLAY THE FINAL REPORT ---
# Keep students whose dropout chance lies in [0.30, 0.99], both ends inclusive.
in_report_range = report_df['DropoutChance'].between(0.3, 0.99)
filtered_report = report_df[in_report_range]
# Show at most 50 of them, drawn with a fixed seed.
sample_size = min(50, len(filtered_report))
display_sample = filtered_report.sample(n=sample_size, random_state=42)
NUM_STUDENTS_TO_SHOW = len(display_sample)

REPORT_DIVIDER = "----------------------------------------------------------------------"
# (key in the analysis dict, heading printed above that list)
FLAG_SECTIONS = (
    ('red', "Top Red Flags (Factors increasing risk):"),
    ('green', "Top Green Flags (Factors decreasing risk):"),
)

print(f"\n--- STUDENT RISK REPORT (Random Sample of {NUM_STUDENTS_TO_SHOW} students with 30-99% Dropout Chance) ---")
print(REPORT_DIVIDER)

for _, row in display_sample.iterrows():
    print(f"StudentID: {row['StudentID']}")
    print(f"Dropout Chance: {row['DropoutChance']:.2%}")
    print(f"Risk Level: {row['RiskLevel']}")

    analysis = row['Analysis']
    for section_key, heading in FLAG_SECTIONS:
        section_flags = analysis[section_key]
        # A section with no flags is omitted entirely, heading included.
        if not section_flags:
            continue
        print(heading)
        for flag in section_flags:
            print(f"  {flag}")

    print(REPORT_DIVIDER)



Dataset 'final_synthetic_dropout_data_rajasthan.csv' loaded successfully.
Data preprocessing complete.

Training model on 8000 records.
Generating report for 2000 unseen records.

Training final XGBoost model...
XGBoost model is ready.

Calculating dropout chances and generating SHAP explanations...

--- STUDENT RISK REPORT (Random Sample of 50 students with 30-99% Dropout Chance) ---
----------------------------------------------------------------------
StudentID: RJ_6319
Dropout Chance: 60.14%
Risk Level: Critical
Top Red Flags (Factors increasing risk):
  - IsFirstGenerationLearner
  - IsMotherLiterate
  - MarksTrend
Top Green Flags (Factors decreasing risk):
  - HasReliableInternet
  - FailureRate LatestTerm
  - IsPreparingCompetitiveExam
----------------------------------------------------------------------
StudentID: RJ_11897
Dropout Chance: 89.68%
Risk Level: Critical
Top Red Flags (Factors increasing risk):
  - IsMotherLiterate
  - MediumChanged
  - MarksTrend
Top Green Flags (