# Metric RAG computation

This notebook computes the Euclidean distance between each establishment and a base establishment of a consistent type (Academies, Maintained schools, SEN). Each establishment is given a weight against every other establishment, and the top 60 for each establishment form the comparator set for that establishment. This is repeated for all establishments within each establishment type and then, finally, across all establishments of all establishment types.

In [None]:
import numpy as np
import pandas as pd
import calculations as calcs
import glob 
import os

# Ensure the comparator-set output folder exists, then empty it so that each
# run of the notebook starts without results left over from a previous run.
from pathlib import Path

Path("output/comparator-sets").mkdir(parents=True, exist_ok=True)

# glob's "*" pattern skips dot-files, so hidden entries are left in place.
files = glob.glob("output/comparator-sets/*")
for stale_path in files:
    os.remove(stale_path)

# Prepare Academy and School Data

Here we prepare the academy and maintained school data by filling in missing values in NumberOfPupils, % Free School Meals and % SEN with the mean (for the time being).

In [None]:
# Load each pre-processed extract and pass it through calcs.prepare_data
# (the notebook text for this section describes that step as filling missing
# values with the mean).
academies_raw = pd.read_csv("output/pre-processing/academies.csv")
academy_data = calcs.prepare_data(academies_raw)

# low_memory=False makes pandas read the file in one pass instead of in
# chunks, which avoids mixed-dtype column inference on this extract.
maintained_raw = pd.read_csv("output/pre-processing/maintained_schools.csv", low_memory=False)
ms_data = calcs.prepare_data(maintained_raw)

# Stack both prepared frames into one combined data set.
all_data = pd.concat([academy_data, ms_data])

# All pupil mix

This creates the comparator sets across both academies and maintained schools.

In [None]:
# Compute the pupil comparator result over the combined data set.
all_mix = calcs.compute_pupils_comparator_matrix(all_data)

# Summarise the result: one line per key with the length of its entry.
for key in all_mix:
    entry = all_mix[key]
    print(f'{key}: {len(entry)}')

Below is an example of extracting a school by name to show how the data structures work.

In [None]:
# Example: look a school up by name and list its 30 nearest comparators.
target_school = 'Glebe Primary School'
display_columns = ['EstablishmentName', 'UKPRN', 'PFI School', 'GOR (name)']

# reset_index() moves the frame's index into a regular column so it can be read
# back as school['URN'] below (assumes the index is named 'URN' - confirm
# against calcs.prepare_data). [0] takes the first row matching the name.
school = all_data[all_data['EstablishmentName'] == target_school][['SchoolPhaseType', 'UKPRN', 'PFI School', 'GOR (name)']].reset_index().to_dict(orient='records')[0]

print(school)
# Position of the target school inside the result's 'urns' array.
colIndex = np.argwhere(all_mix['urns'] == school['URN'])[0][0]
# Look the phase type up with str() rather than an f-string that reuses the
# same quote character inside the braces: that form is a SyntaxError on
# Python versions before 3.12.
data = all_mix[str(school['SchoolPhaseType'])][colIndex]

# argsort is ascending, so these are the positions of the 30 smallest values;
# distances and urns are therefore both in nearest-first order.
top_30_index = np.argsort(data)[:30]
distances = data[top_30_index]
urns = all_mix['urns'][top_30_index]

# Key each distance by its URN. Keep only the first (smallest) distance if a
# URN appears more than once so the lookup index is unique.
distance_by_urn = pd.Series(distances, index=urns)
distance_by_urn = distance_by_urn[~distance_by_urn.index.duplicated()]

top_30 = all_data[all_data.index.isin(urns)][display_columns].sort_index().drop_duplicates()

# The rows above are ordered by index while `distances` is ordered
# nearest-first, so a positional assignment would attach distances to the
# wrong schools (or fail when the row count differs). Align on URN instead.
top_30['Distances'] = top_30.index.map(distance_by_urn)
top_30

# Example using a custom comparator set

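The example below selects a set of URNs based on a defined filter (here, all PFI schools plus the target school) and then computes a custom comparator set over just those schools.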

In [None]:
# Example: build a comparator set from a hand-picked pool of schools.
target_urn = 145110
custom_columns = ['EstablishmentName', 'UKPRN', 'PFI School', 'GOR (name)', 'Percentage SEN', 'Percentage Free school meals', 'Number of pupils', 'Age Average Score', 'Total Internal Floor Area']

# Summary record for the target school, selected by its index value.
school = all_data[all_data.index == target_urn][['SchoolPhaseType', 'UKPRN', 'PFI School', 'GOR (name)']].reset_index().to_dict(orient='records')[0]

# Candidate pool: every row flagged 'PFI School', plus the target school
# itself so that it is present in the result whether or not it is PFI.
custom_comparator_schools = all_data[(all_data['PFI School'] == 'PFI School') | (all_data.index == target_urn)]

print(school)

result = calcs.compute_custom_comparator('PFI Comparator', custom_comparator_schools)
# Position of the target school inside the result's 'urns' array.
colIndex = np.argwhere(result['urns'] == target_urn)[0][0]
data = result['PFI Comparator'][colIndex]

# NOTE: despite the "top_30" names (kept so later cells can still refer to
# them), this example keeps the 60 smallest distances. argsort is ascending,
# so distances and urns are both in nearest-first order.
top_30_index = np.argsort(data)[:60]
distances = data[top_30_index]
urns = result['urns'][top_30_index]

# Key each distance by its URN. Keep only the first (smallest) distance if a
# URN appears more than once so the lookup index is unique.
distance_by_urn = pd.Series(distances, index=urns)
distance_by_urn = distance_by_urn[~distance_by_urn.index.duplicated()]

top_30 = custom_comparator_schools[custom_comparator_schools.index.isin(urns)][custom_columns].sort_index().drop_duplicates()

# The rows above are ordered by index while `distances` is ordered
# nearest-first, so a positional assignment would attach distances to the
# wrong schools (or fail when the row count differs). Align on URN instead.
top_30['Distances'] = top_30.index.map(distance_by_urn)
top_30