In [None]:
from pathlib import Path
import sys  

# Resolve the parent of the current working directory so that imports such as
# `src.pipeline...` can be found when the notebook runs from a sub-folder.
parent_dir = str(Path().resolve().parents[0])
print(parent_dir)

# Prepend it to the import search path, but only once: re-running this cell
# must not stack duplicate entries.
path_set = set(sys.path)
if path_set.isdisjoint({parent_dir}):
    sys.path.insert(0, parent_dir)

print(sys.path)

# Comparator computation

This computes the Euclidean distance between a base establishment and every other establishment of the same type (academies, maintained schools, SEN). Each establishment is given a weight against every other establishment, and the top 60 form the comparator set for that establishment. This is repeated for every establishment within each establishment type, and then finally across all establishments of all types.

In [None]:
import time 
import pickle
import pandas as pd
import src.pipeline.comparator_sets as comparators
import glob 
import os

from pathlib import Path

# Wall-clock reference point; the final cell of the notebook reports the
# elapsed time relative to this value.
start_time = time.time()

# Make sure the output directory exists (no error if it is already there) ...
Path("output/comparator-sets").mkdir(parents=True, exist_ok=True)

# ... then delete whatever a previous run left in it. glob's "*" pattern does
# not match dot-files, so hidden files are left untouched.
files = glob.glob("output/comparator-sets/*")
for stale_path in files:
    os.remove(stale_path)

# Prepare Academy and School Data

Here we prepare the academy and maintained school data by filling in missing values in NumberOfPupils, % Free School Meals, and % SEN with the mean (for the time being).

In [None]:
# Inputs written by the pre-processing stage.
academy_data = pd.read_csv("output/pre-processing/academies.csv")
hard_federations = pd.read_csv("output/pre-processing/hard_federations.csv")
soft_federations = pd.read_csv("output/pre-processing/soft_federations.csv")

# Hard and soft federations are stacked into a single frame.
federations = pd.concat([hard_federations, soft_federations])

# Maintained schools: read the CSV, append the federation rows, then run the
# combined frame through comparators.prepare_data.
ms_data = (
    pd.read_csv("output/pre-processing/maintained_schools.csv", low_memory=False)
    .pipe(lambda ms_raw: pd.concat([ms_raw, federations]))
    .pipe(comparators.prepare_data)
)

# All schools: raw academies stacked on the prepared maintained schools,
# indexed by URN and prepared again as one frame.
all_schools = (
    pd.concat([academy_data, ms_data])
    .set_index('URN')
    .pipe(comparators.prepare_data)
)

# Compute the pupil and building comparators

This creates the comparator sets across both academies and maintained schools.

In [None]:
def _pupil_and_building_comparators(schools):
    """Return the (pupil, building) comparator matrices for one set of schools.

    The pupil matrix is computed first and the building matrix second.
    """
    pupil_matrix = comparators.compute_comparator_matrix(
        schools, comparators.compute_pupils_comparator
    )
    building_matrix = comparators.compute_comparator_matrix(
        schools, comparators.compute_buildings_comparator
    )
    return pupil_matrix, building_matrix


# Maintained schools only.
ms_pupil_comparators, ms_building_comparators = _pupil_and_building_comparators(ms_data)

# Academies only.
# NOTE(review): academy_data is passed here as loaded from CSV, whereas ms_data
# and all_schools have been through comparators.prepare_data - confirm intended.
academy_pupil_comparators, academy_building_comparators = _pupil_and_building_comparators(academy_data)

# Academies and maintained schools together.
pupil_comparators, building_comparators = _pupil_and_building_comparators(all_schools)

Save the comparator sets and the school data to disk

In [None]:
def _dump_pickle(obj, path):
    """Serialise ``obj`` to the file at ``path`` using the highest pickle protocol.

    The file is opened in binary write mode, so an existing file is overwritten.
    The ``with`` block closes the handle on exit (also when ``pickle.dump``
    raises), so no explicit ``close()`` call is needed.
    """
    with open(path, 'wb') as out_file:
        pickle.dump(obj, out_file, protocol=pickle.HIGHEST_PROTOCOL)


# Destination path -> object to persist. Keeping the pairs in one table avoids
# the copy-paste slips of one hand-written block per file (the academies block
# previously closed the maintained-schools handle). Dicts preserve insertion
# order, so the files are written in the order listed.
pickle_outputs = {
    'output/comparator-sets/all_pupil_comparators.pkl': pupil_comparators,
    'output/comparator-sets/all_building_comparators.pkl': building_comparators,
    'output/comparator-sets/ms_pupil_comparators.pkl': ms_pupil_comparators,
    'output/comparator-sets/ms_building_comparators.pkl': ms_building_comparators,
    'output/comparator-sets/academy_pupil_comparators.pkl': academy_pupil_comparators,
    'output/comparator-sets/academy_building_comparators.pkl': academy_building_comparators,
    'output/comparator-sets/maintained_schools.pkl': ms_data,
    'output/comparator-sets/academies.pkl': academy_data,
    'output/comparator-sets/all_schools.pkl': all_schools,
}

for output_path, payload in pickle_outputs.items():
    _dump_pickle(payload, output_path)

In [None]:
# Select by establishment name, using the maintained-school data together with
# its pupil comparators; the result is displayed as the cell output.
comparators.get_comparator_set_by(
    lambda school: school['EstablishmentName'] == 'Glebe Primary School',
    ms_data,
    ms_pupil_comparators,
)

The cell above is an example of extracting a school by name to show how the data structures work.

# Example using a custom comparator set

The example below selects a set of URNs based on a defined filter and then computes a custom comparator set for the target school from that selection.

In [None]:
# Selection criteria: every PFI school, plus the target school itself so that
# it is part of the set even if it is not a PFI school.
target_urn = 145110
custom_comparator_schools = all_schools[
    (all_schools.index == target_urn) | (all_schools['PFI School'] == 'PFI School')
]

# Build a comparator set over just that selection, stored under a custom key.
custom_comparators = comparators.compute_custom_comparator(
    'PFI Comparator',
    custom_comparator_schools,
    comparators.compute_pupils_comparator,
)

# Pull the target school's entry back out of the custom set.
cust_set = comparators.get_comparator_set_by(
    lambda school: school.index == target_urn,
    all_schools,
    custom_comparators,
    is_custom=True,
    comparator_key='PFI Comparator',
)

cust_set[['URN', 'GOR (name)', 'Percentage SEN', 'Percentage Free school meals']]

### Timing (keep at the bottom)

In [None]:
# Elapsed wall-clock time since start_time was recorded near the top of the notebook.
elapsed_seconds = time.time() - start_time
print(f'Processing Time: {elapsed_seconds} seconds')