# Metric RAG computation

This computes the Euclidean distance for each establishment from a base establishment of a consistent type (Academies, Maintained schools, SEN). Each establishment is given a weight against every other establishment and the top 60 for each establishment creates the comparator set for that establishment. This is repeated for all establishments in the establishment types set and then finally across all establishments across all establishment types.

In [1]:
import numpy as np
import pandas as pd
import calculations as calcs
import glob 
import os

# Create and clean directory
from pathlib import Path
Path("output/comparator-sets").mkdir(parents=True, exist_ok=True)

files = glob.glob("output/comparator-sets/*")
for f in files:
    os.remove(f)

# Prepare Academy and School Data

Here we prepare the academy and maintained school data by filling in missing values in NumberOfPupils, % Free School Meals and, % Sen with the mean (at this time). 

In [2]:
academy_data = calcs.prepare_data(pd.read_csv("output/pre-processing/academies.csv"))
ms_data = calcs.prepare_data(pd.read_csv("output/pre-processing/maintained_schools.csv", low_memory=False))
all_data = pd.concat([academy_data, ms_data])

# All pupil mix

This creates the comparators sets across both academy and maintained schools

In [3]:
all_mix = calcs.compute_pupils_comparator_matrix(all_data)

for key in all_mix:
     print(f'{key}: {len(all_mix[key])}')

urns: 21012
All-through: 148
Alternative Provision: 137
Nursery: 374
Post-16: 63
Primary: 15955
Pupil referral unit: 161
Secondary: 3118
Special: 1007
University technical college: 49


Below is an example of extracting a school by name to show how the data structures work

In [4]:
target_school = 'Glebe Primary School'

school = all_data[all_data['EstablishmentName'] == target_school][['SchoolPhaseType', 'UKPRN', 'PFI School', 'GOR (name)']].reset_index().to_dict(orient='records')[0]

print(school)
colIndex = np.argwhere(all_mix['urns'] == school['URN'])[0][0]
data = all_mix[f'{school['SchoolPhaseType']}'][colIndex]

top_30_index = np.argsort(data)[:30]
distances = data[top_30_index]

urns = all_mix['urns'][top_30_index]
top_30 = all_data[all_data.index.isin(urns)][['EstablishmentName', 'UKPRN', 'PFI School', 'GOR (name)']].sort_index().drop_duplicates()

top_30['Distances'] = distances
top_30

{'URN': 145110, 'SchoolPhaseType': 'Primary', 'UKPRN': 10065458.0, 'PFI School': 'Non-PFI school', 'GOR (name)': 'East of England'}


Unnamed: 0_level_0,EstablishmentName,UKPRN,PFI School,GOR (name),Distances
URN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
103237,Nelson Junior and Infant School,10076565.0,Non-PFI school,West Midlands,0.0
106575,Barugh Green Primary School,10072614.0,Non-PFI school,Yorkshire and the Humber,0.002675
106722,Copley Junior School,10071450.0,Non-PFI school,Yorkshire and the Humber,0.004839
106975,Grace Owen Nursery School,10039192.0,Non-PFI school,Yorkshire and the Humber,0.005009
108348,Greenside Primary School,10072218.0,Non-PFI school,North East,0.005012
134798,London Academy,10016231.0,Non-PFI school,London,0.005136
135367,Bradford Academy,10021055.0,Non-PFI school,Yorkshire and the Humber,0.005659
136779,The Heath School,10033801.0,Non-PFI school,North West,0.00649
137045,King Edward VI Camp Hill School for Boys,10034644.0,Non-PFI school,West Midlands,0.006554
137142,St Breock Primary School,10034620.0,Non-PFI school,South West,0.006825


# Example using a custom comparator set

The example below selects a set of URN's based on a defined filter. And the 

In [13]:
target_urn = 145110

school = all_data[all_data.index == target_urn][['SchoolPhaseType', 'UKPRN', 'PFI School', 'GOR (name)']].reset_index().to_dict(orient='records')[0]

custom_comparator_schools = all_data[(all_data['PFI School'] == 'PFI School') | (all_data.index == target_urn)]

print(school)

result = calcs.compute_custom_comparator('PFI Comparator', custom_comparator_schools)
colIndex = np.argwhere(result['urns'] == target_urn)[0][0]
data = result['PFI Comparator'][colIndex]
top_30_index = np.argsort(data)[:60]
distances = data[top_30_index]

urns = result['urns'][top_30_index]
top_30 = custom_comparator_schools[custom_comparator_schools.index.isin(urns)][['EstablishmentName', 'UKPRN', 'PFI School', 'GOR (name)', 'Percentage SEN', 'Percentage Free school meals', 'Number of pupils', 'Age Average Score', 'Total Internal Floor Area']].sort_index().drop_duplicates()

top_30['Distances'] = distances
top_30

{'URN': 145110, 'SchoolPhaseType': 'Primary', 'UKPRN': 10065458.0, 'PFI School': 'Non-PFI school', 'GOR (name)': 'East of England'}


Unnamed: 0_level_0,EstablishmentName,UKPRN,PFI School,GOR (name),Percentage SEN,Percentage Free school meals,Number of pupils,Age Average Score,Total Internal Floor Area,Distances
URN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
136348,Gosforth Junior High Academy,10032217.0,PFI School,North East,2.526596,22.5,760.0,6.861434,5759.0,0.0
136849,Pewsey Vale School,10034111.0,PFI School,South West,8.241758,22.0,353.0,57.852712,5255.0,0.01242
137012,Newport Community School Primary Academy,10034649.0,PFI School,South West,3.067485,16.0,459.0,26.375723,2595.0,0.019359
138160,Cedar Road Primary School,10037465.0,PFI School,East Midlands,2.163462,16.1,415.0,81.292035,1808.0,0.022535
138190,Harrogate High School,10037426.0,PFI School,Yorkshire and the Humber,3.729282,27.8,693.0,7.0,8385.0,0.02368
138423,Eastfield Academy,10038375.0,PFI School,East Midlands,2.678571,30.4,226.0,54.154433,2098.0,0.025495
138493,Spring Lane Primary School,10038441.0,PFI School,East Midlands,3.171247,23.1,448.0,39.251195,2301.0,0.029998
138497,Harpfield Primary Academy,10038445.0,PFI School,West Midlands,1.271186,27.0,225.0,17.0,1675.0,0.032622
138952,Abington Vale Primary School,10039388.0,PFI School,East Midlands,1.822323,10.0,435.0,56.520507,3316.0,0.033604
138953,Ecton Brook Primary School,10038883.0,PFI School,East Midlands,1.505017,25.8,587.0,40.081828,4705.0,0.034632
