# Metric RAG computation

This computes the Euclidean distance of each establishment from a base establishment of the same type (Academies, Maintained schools, SEN). Each establishment is given a weight against every other establishment, and the top 60 for each establishment form the comparator set for that establishment. This is repeated for all establishments within each establishment type and then, finally, across all establishments of all establishment types.

In [None]:
import numpy as np
import multiprocessing as mp
import pandas as pd
import calculations as calcs
import glob 
import os

# Make sure the comparator-set output directory exists, then empty it so
# results from a previous run cannot be mixed with this run's output.
from pathlib import Path

Path("output/comparator-sets").mkdir(parents=True, exist_ok=True)

# glob's "*" skips dotfiles, so hidden files in the directory are left alone.
files = glob.glob("output/comparator-sets/*")
for stale_path in files:
    os.remove(stale_path)

In [None]:
def create_distance_matrix(data):
    """Compute a distance result for every phase/boarding class in ``data``.

    Rows are grouped by ``SchoolPhaseType`` and ``Boarders (name)``. For each
    group the pupil numbers, free-school-meal percentages and SEN percentages
    are collected into arrays and handed to
    ``calcs.non_special_distance_calc``.

    Args:
        data: DataFrame with the columns ``SchoolPhaseType``,
            ``Boarders (name)``, ``NumberOfPupils``,
            ``Percentage Free school meals`` and ``Percentage SEN``.

    Returns:
        dict keyed by ``"<SchoolPhaseType>-<Boarders (name)>"`` whose values
        are whatever ``calcs.non_special_distance_calc`` returns for that
        group.
    """
    # One row per (phase, boarding) class; every cell holds the list of that
    # column's values for the establishments in the class.
    classes = data.groupby(['SchoolPhaseType', 'Boarders (name)']).agg(list)

    distance_classes = {}

    for (phase, boarders), row in classes.iterrows():
        pupils = np.array(row['NumberOfPupils'])
        fsm = np.array(row['Percentage Free school meals'])
        sen = np.array(row['Percentage SEN'])
        distance_classes[f'{phase}-{boarders}'] = calcs.non_special_distance_calc(pupils, fsm, sen)

    # Hand the per-class results back to the caller; without this the
    # function would compute everything and then return None.
    return distance_classes

# Prepare Academy and School Data

Here we prepare the academy and maintained school data by filling in missing values in NumberOfPupils, % Free School Meals and % SEN with the column mean (at this time).

In [None]:
# Load the pre-processed academy data.
academy_data = pd.read_csv("output/pre-processing/academies.csv")

# Fold the 'Unknown' boarding status into 'Not Boarding'.
academy_data['Boarders (name)'] = academy_data['Boarders (name)'].map(
    lambda x: 'Not Boarding' if x == 'Unknown' else x
)

# Fill gaps in the numeric columns with the mean of that column.
for column in ('NumberOfPupils', 'Percentage Free school meals', 'Percentage SEN'):
    academy_data[column] = academy_data[column].fillna(academy_data[column].mean())

# Index by URN, in URN order.
academy_data = academy_data.set_index('URN').sort_index()

In [None]:
# Load the pre-processed maintained-school data.
ms_data = pd.read_csv("output/pre-processing/maintained_schools.csv", low_memory=False)

# Fold the 'Unknown' boarding status into 'Not Boarding'.
ms_data['Boarders (name)'] = ms_data['Boarders (name)'].map(
    lambda status: 'Not Boarding' if status == 'Unknown' else status
)

# Fill gaps in the numeric columns with the mean of that column.
for column in ('NumberOfPupils', 'Percentage Free school meals', 'Percentage SEN'):
    ms_data[column] = ms_data[column].fillna(ms_data[column].mean())

# Index by URN, in URN order.
ms_data = ms_data.set_index('URN').sort_index()

# All pupil mix

This creates the comparator sets across both academies and maintained schools.

In [None]:
# Stack the academy rows on top of the maintained-school rows in one frame.
all_mix = pd.concat(objs=[academy_data, ms_data])

# Display the combined frame as the cell output.
all_mix