In [1]:
from pathlib import Path
import sys  

# Locate the directory one level above the notebook's working directory;
# the project package is imported from there.
notebook_dir = Path().resolve()
parent_dir = str(notebook_dir.parents[0])
print(parent_dir)

# Put that directory at the front of the import path, but only if it is not
# already listed, so re-running this cell does not add duplicates.
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

print(sys.path)

/Users/colinbull/appdev/dfe/sfb/education-benchmarking-and-insights/data-pipeline
['/Users/colinbull/appdev/dfe/sfb/education-benchmarking-and-insights/data-pipeline', '/opt/homebrew/Cellar/python@3.12/3.12.2_1/Frameworks/Python.framework/Versions/3.12/lib/python312.zip', '/opt/homebrew/Cellar/python@3.12/3.12.2_1/Frameworks/Python.framework/Versions/3.12/lib/python3.12', '/opt/homebrew/Cellar/python@3.12/3.12.2_1/Frameworks/Python.framework/Versions/3.12/lib/python3.12/lib-dynload', '', '/Users/colinbull/Library/Caches/pypoetry/virtualenvs/fbit-data-pipeline-aJYNke-B-py3.12/lib/python3.12/site-packages']


# Comparator computation

This computes the Euclidean distance between each establishment and every other establishment of the same type (academies, maintained schools, SEN). Each establishment is given a weight against every other establishment, and the 60 closest form the comparator set for that establishment. This is repeated for every establishment within each establishment type, and finally across all establishments of all types.

In [2]:
import time 
import numpy as np
import pandas as pd
import src.pipeline.comparator_sets as comparators
import glob 
import os

# Reference point for the elapsed-time report printed by the last cell.
start_time = time.time()

# Make sure the output directory exists; anything already in it is kept.
from pathlib import Path
output_dir = Path("output/comparator-sets")
output_dir.mkdir(parents=True, exist_ok=True)

# Clearing previous outputs is currently disabled:
# files = glob.glob("output/comparator-sets/*")
# for f in files:
#     os.remove(f)

# Prepare Academy and School Data

Here we prepare the academy and maintained school data by filling in missing values in NumberOfPupils, % Free School Meals, and % SEN with the mean (for the time being).

In [3]:
# Load each pre-processed parquet file and pass it through prepare_data,
# in the same order as before: academies, maintained schools, all schools.
academy_data, ms_data, all_schools = (
    comparators.prepare_data(pd.read_parquet(parquet_path))
    for parquet_path in (
        "output/pre-processing/academies.parquet",
        "output/pre-processing/maintained_schools.parquet",
        "output/pre-processing/all_schools.parquet",
    )
)

In [31]:
def select_top_set(all_urns, all_regions, data, base_set_size=60, final_set_size=30, return_final=False):
    """Rank establishments by `data` and group the closest ones by region.

    The `base_set_size` positions with the smallest values in `data` are
    selected using a stable sort, so ties keep their original order. Among
    those, the entries whose region equals the region of the first (closest)
    entry are identified.

    Args:
        all_urns: NumPy array of identifiers, positionally aligned with `data`.
        all_regions: NumPy array of region labels, positionally aligned with
            `data`.
        data: Values to rank, smallest first. Assumed to be a 1-D sequence of
            distances -- TODO confirm against the caller.
        base_set_size: Number of closest entries to consider.
        final_set_size: Maximum size of the final set; only used when
            `return_final` is True.
        return_final: When False (default) the debugging intermediates are
            returned exactly as before. When True the trimmed final set is
            returned instead.

    Returns:
        By default the tuple
        `(all_urns, top_index, top_urns, same_region, same_region_urns)`.
        With `return_final=True`, a single array holding the same-region
        identifiers first, followed by the remaining closest identifiers,
        limited to `final_set_size` entries.

    Raises:
        IndexError: If `data` is empty, because the region of the closest
            entry cannot be read.
    """
    top_index = np.argsort(data, kind="stable")[:base_set_size]
    top_urns = all_urns[top_index]
    top_regions = all_regions[top_index]

    # Positions (within the top set) that share the closest entry's region.
    same_region = np.argwhere(top_regions == top_regions[0]).flatten()
    same_region_urns = top_urns[same_region]

    if return_final:
        other_region_urns = np.delete(top_urns, same_region)
        # Concatenate first, truncate afterwards. Slicing the other-region
        # URNs with `final_set_size - len(same_region_urns)` goes negative
        # once the same-region group alone exceeds final_set_size, and a
        # negative stop keeps most of the other-region URNs instead of none.
        # NOTE(review): capping the same-region group at final_set_size is
        # an assumption about the intended behaviour -- confirm.
        return np.append(same_region_urns, other_region_urns)[:final_set_size]

    return all_urns, top_index, top_urns, same_region, same_region_urns

In [32]:
# Columns carried into the buildings comparator for this debug run.
feature_columns = [
    "OfstedRating (name)",
    "Percentage SEN",
    "Percentage Free school meals",
    "Number of pupils",
    "Total Internal Floor Area",
    "Age Average Score",
    "GOR (name)",
    "SchoolPhaseType",
    "Percentage Primary Need SPLD",
    "Percentage Primary Need MLD",
    "Percentage Primary Need PMLD",
    "Percentage Primary Need SEMH",
    "Percentage Primary Need SLCN",
    "Percentage Primary Need HI",
    "Percentage Primary Need MSI",
    "Percentage Primary Need PD",
    "Percentage Primary Need ASD",
    "Percentage Primary Need OTH",
]
phase = "Pupil referral unit"

# Keep only the maintained schools of the chosen phase.
is_phase = ms_data["SchoolPhaseType"] == phase
data = ms_data.loc[is_phase, feature_columns].copy()

# Collapse the phase into a single row whose cells are lists of the
# per-school values; reset_index() turns the index into a regular column
# first (read back below as "URN").
group_data = data.reset_index().drop_duplicates().groupby(['SchoolPhaseType']).agg(list)
phase_row = group_data.iloc[0]

# NOTE: `data` is rebound here from the filtered frame to element [1] of the
# comparator result -- presumably the distance data; confirm against
# comparator_sets.
data = comparators.compute_buildings_comparator((phase, phase_row))[1]

all_urns = np.array(group_data["URN"].values[0])
all_regions = np.array(group_data["GOR (name)"].values[0])
select_top_set(all_urns, all_regions, data)

(array([100006, 100007, 100103, 100391, 100889, 100994, 101255, 101493,
        101706, 101972, 102180, 102562, 102708, 102794, 103146, 103887,
        104038, 104288, 104418, 104757, 104850, 106022, 106023, 106666,
        108565, 108666, 108892, 112095, 112096, 112098, 113992, 115836,
        115837, 115838, 115847, 119103, 119106, 119112, 121270, 123349,
        126171, 128088, 130316, 130344, 130349, 130359, 130856, 130984,
        130987, 130991, 131066, 131100, 131134, 131182, 131201, 131294,
        131367, 131506, 131535, 131584, 131619, 131626, 131629, 131652,
        131753, 131769, 131772, 131827, 132027, 132033, 132077, 132128,
        132130, 132133, 132741, 132816, 132824, 133164, 133398, 133410,
        133583, 133660, 133675, 133689, 133744, 133749, 133754, 133778,
        133945, 134108, 134109, 134127, 134130, 134159, 134257, 134260,
        134321, 134366, 134367, 134373, 134374, 134390, 134523, 134525,
        134597, 134623, 134635, 134694, 134697, 134699, 134758, 

# Compute the pupil and building comparators

This creates the comparator sets across both academies and maintained schools.

In [4]:
# Build the comparator sets for maintained schools only; the academy and
# all-schools runs below are currently disabled.
ms_comparators = comparators.compute_comparator_set(ms_data)
# academy_comparators = comparators.compute_comparator_set(academy_data)
# mixed_comparators = comparators.compute_comparator_set(all_schools)

1 [100006 100007 100103 100391 100889 100994 101255 101493 101706 101972
 102180 102562 102708 102794 103146 103887 104038 104288 104418 104757
 104850 106022 106023 106666 108565 108666 108892 112095 112096 112098
 113992 115836 115837 115838 115847 119103 119106 119112 121270 123349
 126171 128088 130316 130344 130349 130359 130856 130984 130987 130991
 131066 131100 131134 131182 131201 131294 131367 131506 131535 131584
 131619 131626 131629 131652 131753 131769 131772 131827 132027 132033
 132077 132128 132130 132133 132741 132816 132824 133164 133398 133410
 133583 133660 133675 133689 133744 133749 133754 133778 133945 134108
 134109 134127 134130 134159 134257 134260 134321 134366 134367 134373
 134374 134390 134523 134525 134597 134623 134635 134694 134697 134699
 134758 134759 134765 134766 134768 134846 134859 134870 134872 134880
 134919 134970 135010 135253 135254 135260 135330 135432 135462 135465
 135466 135467 135502 135504 135558 135640 135656 135778 135850 135851
 135

In [7]:
# Persist the maintained-school comparator sets; the remaining writes are
# currently disabled.
ms_comparators.to_parquet("output/comparator-sets/ms_comparators.parquet")
# academy_comparators.to_parquet("output/comparator-sets/academy_comparators.parquet")
# NOTE(review): the mixed file must be written from mixed_comparators, not
# academy_comparators, when this line is re-enabled.
# mixed_comparators.to_parquet("output/comparator-sets/mixed_comparators.parquet")
# ms_data.to_parquet("output/comparator-sets/maintained_schools.parquet")
# academy_data.to_parquet("output/comparator-sets/academies.parquet")
# all_schools.to_parquet("output/comparator-sets/all_schools.parquet")

In [5]:
# Spot-check the "Building" comparator set of one example establishment.
example_urn = 100007
ms_comparators.loc[example_urn, "Building"]

array([100007, 102562, 102794, 108892, 131753, 133749, 134635, 135558,
       135010, 100994, 132816, 100006, 133660, 135260, 137328, 101255,
       103887, 106022, 108666, 128088, 132824, 134108, 134130, 134257,
       134260, 134766, 134859, 134970, 135656, 135851])

### Timing (keep at the bottom)

In [None]:
# Wall-clock time elapsed since start_time was set near the top of the notebook.
elapsed_seconds = time.time() - start_time
print(f'Processing Time: {elapsed_seconds} seconds')