In [None]:
from pathlib import Path
import sys

import numpy as np

# Work out the directory one level above the notebook's working directory
# (presumably the project root holding the `src` package - confirm).
parent_dir = str(Path().resolve().parents[0])
print(parent_dir)

# Put that directory at the front of the module search path. The membership
# check keeps repeated runs of this cell from inserting duplicates.
path_set = set(sys.path)
if not (parent_dir in path_set):
    sys.path.insert(0, parent_dir)

print(sys.path)

# Metric RAG

* median, decile, status (Red / Amber / Green), percentage
* All cost categories - cost per pupil
* Utilities and Premises and staff - per m^2

In [None]:
import src.pipeline.rag as rag
import pickle
import src.pipeline.comparator_sets as comparators
import glob 
import os
import time

# Create the output directory if it is missing, then empty it so that every
# run of the notebook starts from a clean slate.
from pathlib import Path
import shutil

Path("output/metric-rag").mkdir(parents=True, exist_ok=True)

files = glob.glob("output/metric-rag/*")
for f in files:
    # Results are written into per-URN sub-directories, which os.remove()
    # cannot delete: remove whole trees for real directories and unlink
    # everything else (plain files and symlinks).
    if os.path.isdir(f) and not os.path.islink(f):
        shutil.rmtree(f)
    else:
        os.remove(f)

## Loading saved comparator sets

In [None]:
# NOTE: unpickling can execute arbitrary code - only load files that were
# produced by this project's own pipeline.

# Maintained schools data saved by an earlier step.
with Path('output/comparator-sets/maintained_schools.pkl').open('rb') as schools_file:
    ms_schools = pickle.load(schools_file)

# Pupil-based comparator sets saved by an earlier step.
with Path('output/comparator-sets/ms_pupil_comparators.pkl').open('rb') as pupil_file:
    pupil_comparators = pickle.load(pupil_file)
     

Select the school for the RAG calculation

In [None]:
# Start the clock; the final cell reports the elapsed time.
start_time = time.time()

# URN of the school the RAG calculation is run for.
target_school = 103341

# Look up the comparator set for that school and index it by URN.
comparator_set = (
    comparators
    .get_comparator_set_by(
        lambda school: school['URN'] == target_school,
        ms_schools,
        pupil_comparators,
    )
    .set_index('URN')
)
comparator_set

Compute the rag calculation for the cost categories given in the [Benchmarking cost categories doc](https://educationgovuk.sharepoint.com.mcas.ms/:w:/r/sites/DfEFinancialBenchmarking/_layouts/15/Doc.aspx?sourcedoc=%7B622FB0F9-7CB1-445A-8FFA-664F8857F036%7D&file=Benchmarking%20cost%20categories%20and%20sub-categories.docx&action=default&mobileredirect=true)

In [None]:
import pandas as pd
import numpy as np
import json

def get_category_series(category_name, data, basis):
    """Normalise one cost category's columns by pupil count or floor area.

    Every column of ``data`` whose name starts with ``category_name`` is
    treated as a sub-category of that category. Each sub-category is divided
    by the basis column, and a ``<category_name>_Total`` column holding the
    normalised sum of the sub-categories is added.

    Parameters
    ----------
    category_name : str
        Column-name prefix that identifies the category.
    data : pandas.DataFrame
        Source frame. It is not modified. It must contain the basis column
        ("Number of pupils" or "Total Internal Floor Area").
    basis : str
        ``"pupil"`` divides by "Number of pupils"; any other value divides by
        "Total Internal Floor Area".

    Returns
    -------
    tuple[pandas.DataFrame, list[str]]
        A copy of ``data`` restricted to ``rag.base_cols``, ``is_close`` and
        the category columns (now per-basis values), plus the list of
        category column names with the total column last.
    """
    total_col = category_name + '_Total'

    category_cols = (
        data.columns.isin(rag.base_cols)
        | data.columns.isin(["is_close"])
        | data.columns.str.startswith(category_name)
    )
    df = data[data.columns[category_cols]].copy()
    basis_data = data['Number of pupils' if basis == "pupil" else "Total Internal Floor Area"]

    # Sub-category columns only. A total column already present in the input
    # is excluded so it is neither summed into the new total nor normalised
    # a second time.
    prefixed = df.columns[df.columns.str.startswith(category_name)].values.tolist()
    component_cols = [col for col in prefixed if col != total_col]

    # Sum the raw amounts first, then divide everything by the basis exactly
    # once. (Creating the total before collecting the prefixed columns would
    # cause it to be divided by the basis twice.)
    raw_total = df[component_cols].sum(axis=1)

    for sub_category in component_cols:
        df[sub_category] = df[sub_category] / basis_data

    df[total_col] = raw_total / basis_data

    return df, component_cols + [total_col]
    

def category_stats(category_name, data, ofsted_rating, rag_mapping):
    """Build the RAG statistics for one (sub-)category column.

    The first row of ``data`` is treated as the school being assessed; its
    value is compared with the distribution of the whole column.

    Parameters
    ----------
    category_name : str
        Name of the column in ``data`` to analyse.
    data : pandas.DataFrame
        Frame containing ``category_name`` and a boolean ``is_close`` column.
    ofsted_rating : str or missing
        Rating of the assessed school. Only "outstanding" (any letter case)
        selects the outstanding mapping; any other value, including a
        missing/non-string rating, selects "other".
    rag_mapping : mapping
        Looked up as ``rag_mapping[key][decile]`` where ``key`` is
        "outstanding" or "other", suffixed with "_10" when more than ten
        rows are flagged ``is_close``.

    Returns
    -------
    dict
        ``value``, ``mean``, ``diff_mean``, ``key``, ``percentage_diff``,
        ``percentile``, ``decile``, ``rag`` and ``data`` (all rows of
        ``data`` as a list of records, index included as a column).
    """
    # Number of rows flagged as close comparators.
    close_count = int(data["is_close"].sum())

    # A missing rating arrives as NaN/None rather than a string; calling
    # .lower() on it would raise, so anything non-string falls into "other".
    is_outstanding = isinstance(ofsted_rating, str) and ofsted_rating.lower() == "outstanding"
    key = "outstanding" if is_outstanding else "other"
    if close_count > 10:
        key += "_10"

    series = data[category_name]

    # Bin positions (0-based). duplicates="drop" merges identical bin edges,
    # so there can be fewer than 100 / 10 distinct bins.
    percentiles = pd.qcut(series, 100, labels=False, duplicates="drop")
    deciles = pd.qcut(series, 10, labels=False, duplicates="drop")

    # nan_to_num maps a missing bin/value to 0 so the casts never fail.
    percentile = int(np.nan_to_num(percentiles.iloc[0]))
    decile = int(np.nan_to_num(deciles.iloc[0]))
    value = float(np.nan_to_num(series.iloc[0]))
    mean = float(np.nan_to_num(series.mean()))

    diff = value - mean
    # Expressed relative to the school's own value; 0 when that value is 0.
    diff_percent = (diff / value) * 100 if value != 0 else 0

    return {
        'value': value,
        'mean': mean,
        'diff_mean': diff,
        'key': key,
        'percentage_diff': diff_percent,
        'percentile': percentile,
        'decile': decile,
        'rag': rag_mapping[key][decile],
        # The `index` keyword is omitted: it is only meaningful for the
        # 'split'/'tight' orients and is not accepted by pandas < 2.0.
        'data': data.reset_index().to_dict(orient='records')
    }


def compute_category_rag(category_name, settings, comparator_set, stats):
    """Compute RAG statistics for every sub-category of one cost category.

    The first row of ``comparator_set`` is taken as the target school. Each
    row is flagged via ``rag.is_close_comparator(settings["type"], target,
    row)``, the category columns are normalised with
    ``get_category_series`` and ``category_stats`` is run per sub-category.

    Parameters
    ----------
    category_name : str
        Column-name prefix of the category.
    settings : mapping
        Category settings; ``settings["type"]`` selects the basis and the
        whole mapping is handed to ``category_stats`` as the RAG mapping.
    comparator_set : pandas.DataFrame
        Comparator schools with the target school in the first row. It is
        left unmodified.
    stats : dict
        Accumulator; one entry per sub-category is added to it.

    Returns
    -------
    dict
        The same ``stats`` dict, updated in place.
    """
    target = comparator_set.iloc[0]
    ofsted = target["OfstedRating (name)"]

    # Attach the closeness flags to a derived frame instead of writing into
    # the caller's DataFrame, so repeated calls (one per category) and
    # earlier notebook cells keep seeing the original data.
    flagged = comparator_set.assign(
        is_close=comparator_set.apply(
            lambda row: rag.is_close_comparator(settings["type"], target, row), axis=1
        )
    )

    series, sub_categories = get_category_series(category_name, flagged, settings['type'])

    for sub_category in sub_categories:
        stats[sub_category] = category_stats(sub_category, series, ofsted, settings)

    return stats

def compute_rag(comparator_set):
    """Run the RAG calculation for every category in ``rag.category_settings``.

    Each category is processed in the mapping's iteration order by
    ``compute_category_rag``, which adds its results to a shared dict.

    Parameters
    ----------
    comparator_set : pandas.DataFrame
        Comparator schools, passed unchanged to ``compute_category_rag``.

    Returns
    -------
    dict
        The accumulated statistics returned by the last
        ``compute_category_rag`` call (an empty dict when there are no
        categories).
    """
    results = {}
    for category, category_config in rag.category_settings.items():
        results = compute_category_rag(category, category_config, comparator_set, results)
    return results

def write_rag(urn,year,comparator_set_type,type,data):
    """Write RAG results as JSON under ``output/metric-rag``.

    The file is written to
    ``output/metric-rag/{urn}/{year}/{comparator_set_type}-{type}.json``.
    Missing parent directories are created first; an existing file is
    overwritten.

    Parameters
    ----------
    urn : int or str
        School identifier, used as the first directory level.
    year : int or str
        Used as the second directory level.
    comparator_set_type : str
        First part of the file name.
    type : str
        Second part of the file name (shadows the builtin; name kept for
        callers using keywords).
    data : object
        JSON-serialisable payload passed to ``json.dump``.
    """
    target = Path(f'output/metric-rag/{urn}/{year}/{comparator_set_type}-{type}.json')

    # open() does not create directories, and only output/metric-rag itself
    # exists up front - create the per-URN / per-year levels on demand.
    target.parent.mkdir(parents=True, exist_ok=True)

    # The context manager closes the file; no explicit close() is needed.
    with target.open('w') as file:
        json.dump(data, file)



# Compute the RAG statistics for the selected school's comparator set.
results = compute_rag(comparator_set)

# Store the results under the URN chosen when the comparator set was built,
# so changing `target_school` can never write results under a stale URN.
write_rag(target_school, 2022, "default", "unmixed", results)
# 
# /{URN}/{type}
#  --- School
#  --- Detail
# 
# /
# 
# //PUPIL
# //School
# {
#     "Teaching and Support staff": {},
#     "Prem"
# }
# 
# //Category detail
# {
#     "Total": {},
#     "Teaching Staff costs": {},
#     "Data": [
#         {}
#     ]   
# }

## Processing Time

In [None]:
# Seconds elapsed since `start_time` was recorded.
elapsed_seconds = time.time() - start_time
print(f'Processing Time: {elapsed_seconds} seconds')