In [1]:
from pathlib import Path
import sys  

# Resolve the parent of the notebook's current working directory.
parent_dir = str(Path().resolve().parents[0])
print(parent_dir)
# Put that directory at the front of sys.path, unless it is already listed, so
# re-running this cell does not add it twice. Presumably this is what makes the
# `src` package imported in a later cell importable.

path_set = set(sys.path)
if parent_dir not in path_set:
    sys.path.insert(0, parent_dir)

print(sys.path)

/Users/colinbull/appdev/dfe/sfb/education-benchmarking-and-insights/data-pipeline
['/Users/colinbull/appdev/dfe/sfb/education-benchmarking-and-insights/data-pipeline', '/opt/homebrew/Cellar/python@3.12/3.12.2_1/Frameworks/Python.framework/Versions/3.12/lib/python312.zip', '/opt/homebrew/Cellar/python@3.12/3.12.2_1/Frameworks/Python.framework/Versions/3.12/lib/python3.12', '/opt/homebrew/Cellar/python@3.12/3.12.2_1/Frameworks/Python.framework/Versions/3.12/lib/python3.12/lib-dynload', '', '/Users/colinbull/Library/Caches/pypoetry/virtualenvs/fbit-data-pipeline-aJYNke-B-py3.12/lib/python3.12/site-packages']


# VMFI Data processing pipeline

This workbook aims to emulate the data processing that currently occurs in the VMFI pipeline. The logic and processing are largely based on the following document: [Insights data portal - Data sources and sql analysis](https://educationgovuk.sharepoint.com.mcas.ms/:w:/r/sites/VMFI/_layouts/15/Doc.aspx?sourcedoc=%7B38C1DC37-7CDB-48B8-9E22-284F4F311C0B%7D&file=1.%20Insights%20portal%20-%20data%20sources%20and%20sql%20analysis%20v010%20-%20Copy.docx&action=default&mobileredirect=true) and will stay true to this document even if the existing stored procedures do something different. This will form the basis of a gap analysis going forward.

All data loaded in the following workbook comes from the set of CSV files in the `data` folder alongside this workbook. These datasets are for the most part from the list at the start of the linked document. However, because additional standing data is required to fully implement the pipeline, that data has been exported from the development VMFI pipeline database. These files are currently:

| File name | DB Table |
|:----------|----------|
|standing_data_cdc.csv | standing_data.cdc |

In [2]:
import src.pipeline.pre_processing as pre_processing
import pandas as pd
import numpy as np
import time
import glob
import os

In [3]:
# Create the output directory if it does not exist yet (existing contents are kept)
from pathlib import Path
Path("output/pre-processing").mkdir(parents=True, exist_ok=True)

# Clean-up of previous outputs is currently disabled; uncomment to delete them before a run.
# files = glob.glob("output/pre-processing/*")
# for f in files:
#     os.remove(f)

In [4]:
start_time = time.time()
current_year = 2022

## CDC data load and preparation

School buildings condition dataset. Based on the surveys performed throughout 2018-2019.

The data in the file `data/standing_data_cdc.csv` is just an export of the data in the `standing_data.cdc` table, without the Year and Import ID fields. In future this will likely have to be read directly from the source database as per [this document](https://educationgovuk.sharepoint.com.mcas.ms/:w:/r/sites/VMFI/_layouts/15/Doc.aspx?sourcedoc=%7B38C1DC37-7CDB-48B8-9E22-284F4F311C0B%7D&file=1.%20Insights%20portal%20-%20data%20sources%20and%20sql%20analysis%20v010%20-%20Copy.docx&action=default&mobileredirect=true).

In [5]:
cdc = pre_processing.prepare_cdc_data('data/cdc.csv', current_year)

In [6]:
#cdc.to_csv('output/pre-processing/cdc.csv')
cdc

Unnamed: 0_level_0,Total Internal Floor Area,Age Average Score,Building Age
URN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100150,2803.0,48.358188,1988.333333
100162,2105.0,133.162945,1880.0
100164,2934.0,97.0,1925.0
100166,2040.0,91.705882,1947.5
105304,1602.0,35.752809,1985.0
...,...,...,...
144913,3111.0,16.704275,2008.333333
144917,2620.0,78.412214,1960.714286
105623,3382.0,7.0,2015.0
144918,4733.0,19.009296,1997.5


## School Census data load

*Pupil Census* - DfE data collection providing information about school and pupil characteristics, for example the percentage of pupils claiming free school meals, or having English as their second language.

*Workforce census* - Single reference for all school workforce statistics based on staff working in publicly funded schools in England.

The following code loads both the workforce and pupil census data and performs an `inner` join by URN on the data sets.

In [7]:
census = pre_processing.prepare_census_data('data/census_workforce.xlsx', 'data/census_pupils.csv')


In [8]:
#census.to_csv('output/pre-processing/census.csv')
census

Unnamed: 0_level_0,region_name,district_administrative_name,ward_name,Full time girls Year group 12,Full time girls Year group 13,Full time boys Year group 12,Full time boys Year group 13,Number of early year pupils (years E1 and E2),Number of nursery pupils (years N1 and N2),Number of pupils,...,Total Number of Auxiliary Staff (Headcount),Total School Workforce (Full-Time Equivalent),Total Number of Teachers in the Leadership Group (Full-time Equivalent),Total Number of Teachers (Full-Time Equivalent),Total Number of Teaching Assistants (Full-Time Equivalent),NonClassroomSupportStaffFTE,Total Number of Auxiliary Staff (Full-Time Equivalent),Teachers with Qualified Teacher Status (%) (Headcount),TotalPupilsNursery,TotalPupilsSixthForm
URN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
141334,East Midlands,Nottingham,Bilborough,0.0,0.0,0.0,0.0,0.0,0.0,325.0,...,13.0,34.17,2.64,13.11,10.29,4.82,5.95,100.000000,0.0,0.0
141396,East Midlands,Nottingham,Aspley,0.0,0.0,0.0,0.0,0.0,57.0,642.0,...,34.0,82.47,4.00,34.00,29.55,10.13,8.79,100.000000,57.0,0.0
141397,East Midlands,Nottingham,Bilborough,0.0,0.0,0.0,0.0,0.0,44.0,500.0,...,27.0,72.81,4.24,24.55,31.84,6.55,9.87,100.000000,44.0,0.0
142223,East Midlands,Nottingham,Wollaton West,0.0,0.0,0.0,0.0,0.0,70.0,1117.0,...,47.0,99.66,4.16,47.12,33.07,6.57,12.90,100.000000,70.0,0.0
144396,East Midlands,Nottingham,Bulwell,0.0,0.0,0.0,0.0,0.0,21.0,217.0,...,11.0,25.57,2.00,11.39,7.36,4.00,2.82,100.000000,21.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104642,North West,Liverpool,Church,0.0,0.0,0.0,0.0,0.0,0.0,421.0,...,18.0,34.47,3.60,15.80,6.27,5.22,7.18,100.000000,0.0,0.0
104643,North West,Liverpool,Cressington,0.0,0.0,0.0,0.0,0.0,28.0,432.0,...,19.0,39.89,3.00,17.40,11.74,3.34,7.41,100.000000,28.0,0.0
104645,North West,Liverpool,Tuebrook and Stoneycroft,0.0,0.0,0.0,0.0,0.0,0.0,237.0,...,10.0,26.47,3.00,12.40,6.78,3.19,4.10,92.307692,0.0,0.0
104646,North West,Liverpool,St Michael's,0.0,0.0,0.0,0.0,0.0,18.0,194.0,...,5.0,22.36,2.00,12.00,6.24,1.44,2.68,100.000000,18.0,0.0


## Special Education Needs (SEN) data load and preparation

Special educational needs dataset. Contains information about the number of pupils, who require various SEN provisions. This loads the `SEN` data, which originates from [here](https://explore-education-statistics.service.gov.uk/find-statistics/special-educational-needs-in-england#dataDownloads-1)

In [9]:
sen = pre_processing.prepare_sen_data('data/sen.csv')

In [10]:
#sen.to_csv("output/pre-processing/sen.csv")
sen

Unnamed: 0_level_0,EHC plan,SEN support,Percentage SEN,Percentage with EHC,Percentage without EHC,Percentage Primary Need SPLD,Percentage Primary Need MLD,Percentage Primary Need SLD,Percentage Primary Need PMLD,Percentage Primary Need SEMH,Percentage Primary Need SLCN,Percentage Primary Need HI,Percentage Primary Need VI,Percentage Primary Need MSI,Percentage Primary Need PD,Percentage Primary Need ASD,Percentage Primary Need OTH
URN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
100000,8.0,59.0,24.723247,2.952030,21.771218,0.738007,1.476015,0.0,0.000000,3.321033,11.439114,0.738007,0.000000,0.738007,0.000000,3.321033,0.369004
100001,0.0,22.0,2.976996,0.000000,2.976996,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
100002,0.0,22.0,8.178439,0.000000,8.178439,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
100003,0.0,145.0,13.875598,0.000000,13.875598,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
100005,2.0,23.0,18.382353,1.470588,16.911765,0.000000,0.000000,0.0,0.735294,0.000000,0.000000,0.000000,0.000000,0.000000,0.735294,16.176471,0.735294
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149557,3.0,2.0,12.195122,7.317073,4.878049,4.878049,0.000000,0.0,0.000000,0.000000,2.439024,0.000000,0.000000,0.000000,0.000000,4.878049,0.000000
149632,58.0,136.0,15.027111,4.492641,10.534469,2.401239,1.161890,0.0,0.000000,1.549187,1.936483,0.619675,0.309837,0.077459,0.542215,2.013943,1.781565
149633,0.0,7.0,8.139535,0.000000,8.139535,2.325581,1.162791,0.0,0.000000,1.162791,1.162791,0.000000,0.000000,0.000000,0.000000,1.162791,1.162791
149635,10.0,30.0,6.116208,1.529052,4.587156,2.293578,0.152905,0.0,0.000000,1.834862,0.305810,0.305810,0.305810,0.000000,0.000000,0.305810,0.611621


## KS2 and KS4 processing

In [11]:
ks2 = pre_processing.prepare_ks2_data('data/ks2.xlsx')

In [12]:
#ks2.to_csv('output/pre-processing/ks2.csv')
ks2

Unnamed: 0_level_0,Ks2Progress
URN,Unnamed: 1_level_1
100000.0,0.5
136807.0,13.0
139837.0,20.7
140686.0,-1.8
100008.0,5.5
...,...
,-0.4
,-1.7
,-0.1
,-2.0


In [13]:
ks4 = pre_processing.prepare_ks4_data('data/ks4.xlsx')

In [14]:
#ks4.to_csv('output/pre-processing/ks4.csv')
ks4

Unnamed: 0_level_0,AverageAttainment,Progress8Measure,Progress8Banding
URN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100053.0,50.3,-0.16,Average
100054.0,65.8,0.77,Well above average
100052.0,44.6,-0.03,Average
100092.0,0.5,-2.22,Well below average
100049.0,41.7,-0.28,Below average
...,...,...,...
137269.0,50.2,0.36,Average
112393.0,40.3,-0.11,Average
112385.0,51.2,0.12,Average
141041.0,40.6,-0.84,Well below average


## AR Data load and preparation

This loads the Annual accounts return dataset and the corresponding mapping file. This extract only contains benchmarking section, which consists of submissions of costs, income, and balances of individual academies.

The mapping file contains the mapping from AR4 cell references to cost categories and descriptions.

In [15]:
academy_ar = pre_processing.prepare_aar_data('data/academy_ar.xlsx')
central_services = pre_processing.prepare_central_services_data('data/academy_ar.xlsx')

In [16]:
#academy_ar.to_csv('output/pre-processing/academy_ar.csv')
academy_ar

Unnamed: 0_level_0,Academy UPIN,Trust UPIN,Date left or closed if in period,Date joined or opened if in period,London Weighting,PFI School,Income_DFE revenue grants,Income_SEN funding,Income_Other DFE grants,Income_Other grants,...,Catering Expenses,Occupation Costs,Total Costs of Supplies and Services,Total Costs of Educational Supplies,Costs of Brought in Professional Services,Total Expenditure,Trust Balance,Financial Position,Trust Financial Position,Is PFI
URN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
138623,111443,137157,,,Neither,Non-PFI school,7967000.0,153000.0,262000.0,403000.0,...,693000.0,1163000.0,1167000.0,750000.0,204000.0,10949000.0,-1899000.0,Deficit,Deficit,False
138630,111451,138199,,,Neither,Non-PFI school,6342000.0,222000.0,7000.0,327000.0,...,95000.0,326000.0,1398000.0,1038000.0,209000.0,7724000.0,2880000.0,Deficit,Surplus,False
143005,111453,135112,,,Neither,Non-PFI school,2798000.0,162000.0,63000.0,36000.0,...,43000.0,243000.0,409000.0,159000.0,122000.0,2949000.0,1985000.0,Surplus,Surplus,False
136296,111710,135428,,,Neither,Non-PFI school,7685000.0,323000.0,215000.0,470000.0,...,422000.0,771000.0,1331000.0,952000.0,71000.0,10613000.0,-1544000.0,Deficit,Deficit,False
137982,113087,136879,,,Neither,Non-PFI school,8021000.0,93000.0,45000.0,95000.0,...,104000.0,398000.0,1341000.0,428000.0,106000.0,10356000.0,-1232000.0,Deficit,Deficit,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150026,164644,151923,,2023-07-01,Neither,Non-PFI school,227000.0,18000.0,2000.0,0.0,...,18000.0,25000.0,15000.0,10000.0,3000.0,252000.0,341000.0,Deficit,Surplus,False
150133,164745,136351,,2023-07-01,Neither,Non-PFI school,671000.0,183000.0,44000.0,0.0,...,5000.0,26000.0,38000.0,14000.0,7000.0,480000.0,1768000.0,Surplus,Surplus,False
150134,164746,135065,,2023-08-01,Neither,Non-PFI school,83000.0,127000.0,0.0,0.0,...,0.0,7000.0,0.0,0.0,0.0,201000.0,5380000.0,Surplus,Surplus,False
150226,164811,135065,,2023-08-01,Neither,Non-PFI school,54000.0,33000.0,0.0,1000.0,...,0.0,5000.0,2000.0,2000.0,0.0,99000.0,5380000.0,Deficit,Surplus,False


In [17]:
central_services

Unnamed: 0,Trust UPIN,Income_DFE revenue grants,Income_SEN funding,Income_Other DFE grants,Income_Other grants,Income_Government source,Income_Academies,Income_Non government,Income_Facilities and services,Income_Catering services,...,Staff Total,Maintenance & Improvement,Premises,Catering Exp,Occupation,Supplies and Services,Educational Supplies,Brought in Professional Services,Total Expenditure,Financial Position
0,122824,740000.0,5000.0,0.0,0.0,0.0,0.0,0.0,37000.0,3000.0,...,678000.0,15000.0,15000.0,0.0,9000.0,1434000.0,451000.0,791000.0,2131000.0,Deficit
1,122836,0.0,0.0,0.0,444000.0,0.0,0.0,444000.0,0.0,0.0,...,17582000.0,6584000.0,10602000.0,25000.0,826000.0,4053000.0,94000.0,2430000.0,33063000.0,Deficit
2,124263,0.0,0.0,0.0,-239000.0,0.0,0.0,-189000.0,0.0,0.0,...,3880000.0,19000.0,336000.0,-7000.0,2222000.0,526000.0,185000.0,88000.0,6964000.0,Deficit
3,126463,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Surplus
4,133307,0.0,0.0,113000.0,42000.0,0.0,0.0,42000.0,0.0,0.0,...,1718000.0,721000.0,771000.0,0.0,68000.0,284000.0,55000.0,91000.0,2841000.0,Deficit
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473,163899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21000.0,0.0,...,96000.0,0.0,0.0,0.0,0.0,21000.0,0.0,20000.0,117000.0,Deficit
2474,164130,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Surplus
2475,164242,175000.0,0.0,11000.0,2000.0,0.0,0.0,2000.0,0.0,0.0,...,78000.0,0.0,0.0,0.0,0.0,211000.0,27000.0,180000.0,289000.0,Deficit
2476,164251,0.0,0.0,0.0,0.0,228000.0,0.0,0.0,815000.0,0.0,...,944000.0,0.0,0.0,79000.0,169000.0,137000.0,17000.0,60000.0,1251000.0,Deficit


Create a summary table for the AR stance of each distinct academy in the table.

Now compute the trust financial position in the same manner as the individual academy position.

## Academy and maintained schools data load and preparation

This reads the main GIAS data (edubasealldataYYYYMMDD file) and the associated links file (links_edubasealldataYYYYMMDD file). This is taken from the [GIAS Service](https://get-information-schools.service.gov.uk/help)

Other columns are tidied up by asserting the correct type for that column. This tidying phase is needed largely because, on load, integer columns are inferred to be floats rather than integers.

In [18]:
schools = pre_processing.prepare_schools_data('data/gias.csv','data/gias_links.csv')


In [19]:
#schools.to_csv('output/pre-processing/schools.csv')
schools.sort_index()

Unnamed: 0_level_0,LA (code),LA (name),EstablishmentNumber,EstablishmentName,TypeOfEstablishment (code),TypeOfEstablishment (name),EstablishmentStatus (code),EstablishmentStatus (name),OpenDate,CloseDate,...,BoardingEstablishment (name),PreviousLA (code),PreviousLA (name),PreviousEstablishmentNumber,OfstedRating (name),MSOA (code),LSOA (code),LA Establishment Number,Has Nursery,Has Sixth Form
URN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100000,201,City of London,3614,The Aldgate School,2,Voluntary aided school,1,Open,NaT,NaT,...,,999,,,Outstanding,E02000001,E01032739,201-3614,True,False
100001,201,City of London,6005,City of London School for Girls,11,Other independent school,1,Open,1920-01-01,NaT,...,Does not have boarders,999,,,,E02000001,E01000002,201-6005,False,True
100002,201,City of London,6006,St Paul's Cathedral School,11,Other independent school,1,Open,1939-01-01,NaT,...,Has boarders,999,,,,E02000001,E01032739,201-6006,False,False
100003,201,City of London,6007,City of London School,11,Other independent school,1,Open,1919-01-01,NaT,...,Does not have boarders,999,,,,E02000001,E01032739,201-6007,False,True
100005,202,Camden,1048,Thomas Coram Centre,15,Local authority nursery school,1,Open,NaT,NaT,...,,999,,,Outstanding,E02007115,E01000937,202-1048,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
402468,679,Monmouthshire,5500,King Henry viii 3-19 School,30,Welsh establishment,1,Open,2023-09-01,NaT,...,,999,,,,999999999,999999999,679-5500,False,False
402469,681,Cardiff,2333,Ysgol Gynradd Groes-Wen Primary,30,Welsh establishment,1,Open,2023-09-01,NaT,...,,999,,,,W02000380,W01001729,681-2333,False,False
402470,668,Pembrokeshire,2398,Ysgol Bro Penfro,30,Welsh establishment,4,Proposed to open,2024-09-01,NaT,...,,999,,,,W02000140,W01000607,668-2398,False,False
402471,679,Monmouthshire,2325,Ysgol Gymraeg Trefynwy,30,Welsh establishment,4,Proposed to open,2024-09-01,NaT,...,,999,,,,W02000339,W01001978,679-2325,False,False


In [20]:
cfo = pre_processing.build_cfo_data('data/cfo.xlsx')

Merge required GIAS, census, sen, cdc, PFI, and AR data with the base academy data

In [21]:
academies = pre_processing.build_academy_data('data/academy_master_list.csv', 'data/gias_all_links.csv',
                                              current_year, schools, census, sen, cdc, 
                                              academy_ar, ks2, ks4, cfo, central_services)

In [22]:
#academies.to_csv('output/pre-processing/academies.csv')
academies.sort_index()

Unnamed: 0_level_0,Company Registration Number,Incorporation Date,Academy Trust UPIN,Trust Name,Academy Name,Academy UPIN_x,Trust Type,Date Opened,Type of Provision - Phase,Regional School Commissioner,...,Other costs_Staff-related insurance_Per Unit_CS,Other costs_Supply teacher insurance_Per Unit_CS,Other costs_Rent and rates_Per Unit_CS,Other costs_Special facilities_Per Unit_CS,Other costs_Other insurance premiums_Per Unit_CS,Other costs_Interest charges for loan and bank_Per Unit_CS,Other costs_Direct revenue financing_Per Unit_CS,Other costs_PFI charges_Per Unit_CS,Other costs_Total_Per Unit_CS,Catering staff and supplies_Net Costs_CS
URN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
105135,05210075,2004-08-19 00:00:00.0000000,135025,St Paul's Academy,St Paul's Academy - Greenwich,119110,Single Academy Trust (SAT),2005-09-01 00:00:00.0000000,Secondary,South London & South East,...,0.0,0.0,0.000000,0.00000,0.000000,0.0,0.000000,0.0,0.000000,0.0
129342,07525820,2011-02-10 00:00:00.0000000,135906,Tove Learning Trust,Grace Academy Solihull,118615,Multi Academy Trust (MAT),2006-09-01 00:00:00.0000000,Secondary,West Midlands,...,0.0,0.0,0.000000,1.78112,0.197902,0.0,0.000000,0.0,17.811201,8000.0
130247,08075785,2012-05-18 00:00:00.0000000,138199,The White Horse Federation,John Madejski Academy,118622,Multi Academy Trust (MAT),2006-09-01 00:00:00.0000000,Secondary,North West London & South Central,...,0.0,0.0,0.000000,0.00000,31.121503,0.0,0.241877,0.0,53.051681,3146000.0
130908,02236171,1988-03-28 00:00:00.0000000,134858,Endeavour Academies Trust,Macmillan Academy,118397,Multi Academy Trust (MAT),2005-09-01 00:00:00.0000000,Secondary,North,...,0.0,0.0,0.000000,0.00000,0.000000,0.0,0.000000,0.0,0.982801,0.0
130909,02303464,1988-10-06 00:00:00.0000000,134876,Dixons Academies Trust,Dixons City Academy,118388,Multi Academy Trust (MAT),2005-09-01 00:00:00.0000000,Secondary,Lancashire & West Yorkshire,...,0.0,0.0,0.000000,0.00000,2.590674,0.0,5.867114,0.0,34.212130,133000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149205,10265276,2016-07-06 00:00:00.0000000,139732,Coast And Vale Learning Trust,Filey School,163970,Multi Academy Trust (MAT),2015-09-01 00:00:00.0000000,Secondary,North,...,0.0,0.0,0.000000,0.00000,0.000000,0.0,0.000000,0.0,5.603985,0.0
149221,08566185,2013-06-12 00:00:00.0000000,137607,Perry Hall Multi-academy Trust,Sledmere Primary School,163968,Multi Academy Trust (MAT),2017-11-01 00:00:00.0000000,Primary,West Midlands,...,0.0,0.0,4.216867,0.00000,1.506024,0.0,11.445783,0.0,28.614458,0.0
149222,10192252,2016-05-20 00:00:00.0000000,139657,Connect Academy Trust,Cockington Primary School,163969,Multi Academy Trust (MAT),2013-09-01 00:00:00.0000000,Primary,South West,...,0.0,0.0,2.365464,0.00000,3.252513,0.0,0.000000,0.0,12.123004,4000.0
149299,09066969,2014-06-02 00:00:00.0000000,135834,Blessed Christopher Wharton Catholic Academy T...,"The Holy Family Catholic School, a Voluntary A...",162962,Multi Academy Trust (MAT),2022-08-01 00:00:00.0000000,Secondary,Lancashire & West Yorkshire,...,0.0,0.0,1.236603,0.00000,0.000000,0.0,0.000000,0.0,3.709810,0.0


Merge required census and cdc data to the maintained schools data set

In [23]:
# Load raw list from CSV
maintained_schools = pre_processing.build_maintained_school_data('data/maintained_schools_master_list.csv','data/gias_all_links.csv',current_year, schools, census, sen, cdc, ks2, ks4)

In [24]:
maintained_schools.to_csv('output/pre-processing/maintained_schools.csv')
#maintained_schools

In [25]:
all_schools = pd.concat([academies,maintained_schools])

In [None]:
all_schools[all_schools["Is PFI"].isna()]

## Federation Capture




In [None]:
(hard_federations, soft_federations) = pre_processing.build_federations_data('data/gias_all_links.csv', maintained_schools.reset_index())

In [None]:
hard_federations.to_csv('output/pre-processing/hard_federations.csv')
hard_federations

In [None]:
soft_federations.to_csv('output/pre-processing/soft_federations.csv')
soft_federations[['LAEstab']]

# Budget Forecast Returns

In [None]:
year = 2023

# Column -> dtype for a BFR cell-mapping file.
bfr_cell_mapping_cols = {
    'EFALineNo': 'Int64',
    'balance_flag': 'Int64',
}

# Column -> dtype for the BFR SOFA extract; also used as the list of columns to read.
bfr_sofa_cols = {
    'TrustUPIN': 'Int64',
    'CreatedBy': 'string',
    'Category': 'string',
    'Title': 'string',
    'EFALineNo': 'Int64',
    'Y1P1': 'float',
    'Y1P2': 'float',
    'Y2P1': 'float',
    'Y2P2': 'float',
}

# Column -> dtype for the BFR three-year forecast extract; also used as the list of columns to read.
bfr_3y_cols = {
    'TrustUPIN': 'Int64',
    'EFALineNo': 'Int64',
    'Y2': 'Int64',
    'Y3': 'Int64',
    'Y4': 'Int64',
}


def _calculate_metrics(bfr):
    bfr_metrics = bfr[['TrustUPIN']].copy().set_index('TrustUPIN')
    bfr_metrics['Revenue reserve as percentage of income'] =\
          round(bfr[bfr['Title']=='Revenue reserves'].set_index('TrustUPIN')[['Y1']]
                /bfr[bfr['Title']=='Total income'].set_index('TrustUPIN')[['Y1']]*100,1)
    bfr_metrics['Staff costs as percentage of income'] =\
          round(bfr[bfr['Title']=='Staff costs'].set_index('TrustUPIN')[['Y1']]
                /bfr[bfr['Title']=='Total income'].set_index('TrustUPIN')[['Y1']]*100,1)
    bfr_metrics['Expenditure as percentage of income'] =\
          round(bfr[bfr['Title']=='Total expenditure'].set_index('TrustUPIN')[['Y1']]
                /bfr[bfr['Title']=='Total income'].set_index('TrustUPIN')[['Y1']]*100,1)
    bfr_metrics['percent self-generated income'] =\
          round(bfr[bfr['Title']=='Self-generated income'].set_index('TrustUPIN')[['Y1']]/
                (bfr[bfr['Title']=='Self-generated income'].set_index('TrustUPIN')[['Y1']] +
                  bfr[bfr['Title']=='Grant funding'].set_index('TrustUPIN')[['Y1']])*100,0)
    bfr_metrics['percent grant funding'] = 100 - bfr_metrics['percent self-generated income']
    return bfr_metrics

def _calculate_slopes(matrix):
    x = np.array([1,2,3,4,5,6])
    x_bar = 3.5
    x_x_bar = x - x_bar
    y_bar = np.mean(matrix, axis=1)
    y_y_bar = matrix - np.vstack(y_bar)
    slope_array = np.sum(x_x_bar*y_y_bar,axis=1)/np.sum(x_x_bar**2)
    return slope_array

def _assign_slope_flag(df):
    percentile_10 = np.nanpercentile(df['slope'].values, 10)
    percentile_90 = np.nanpercentile(df['slope'].values, 90)
    df['slope_flag'] = 0
    df.loc[df['slope'] < percentile_10, 'slope_flag'] = -1
    df.loc[df['slope'] > percentile_90, 'slope_flag'] = 1
    return df


def _slope_analysis(bfr_dataframe, academies_y2, academies_y1):
    """Fit a trend to each trust's revenue reserves, in total and per pupil.

    Parameters
    ----------
    bfr_dataframe : pandas.DataFrame
        Long-format BFR data with ``TrustUPIN``, ``Title``, ``CreatedBy``,
        ``Category``, ``EFALineNo`` and the forecast columns ``Y1``..``Y4``.
        Only the 'Revenue reserves' and 'Pupil numbers' titles are used.
    academies_y2, academies_y1 : pandas.DataFrame
        Academy-level data for two years and one year before the BFR year,
        with ``Trust UPIN``, ``Trust Balance`` and ``Number of pupils``.
        They supply the historic ``Y-2`` and ``Y-1`` values.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The revenue-reserves rows and the matching per-pupil rows, each with
        the six year columns plus ``slope`` and ``slope_flag``. The two frames
        share the same row index.
    """
    year_columns = ['Y-2','Y-1','Y1','Y2','Y3','Y4']
    bfr_revenue_reserves = bfr_dataframe[bfr_dataframe['Title']=='Revenue reserves']
    bfr_pupil_numbers = bfr_dataframe[bfr_dataframe['Title']=='Pupil numbers']

    # TODO need to add in historic data to this, filling in fake values for now
    for academies_hist, year_column in ((academies_y2, 'Y-2'), (academies_y1, 'Y-1')):
        # Trust balance is repeated on each academy row, so collapse to distinct pairs.
        trust_balance = academies_hist[['Trust UPIN','Trust Balance']].rename(columns={
            'Trust UPIN':'TrustUPIN',
            'Trust Balance':year_column
            }).drop_duplicates()
        bfr_revenue_reserves = pd.merge(
            bfr_revenue_reserves, trust_balance, how='left', on='TrustUPIN')

        # Pupil numbers are per academy, so total them per trust. Each historic
        # year now reads its own frame (Y-1 previously re-used academies_y2).
        trust_pupils = academies_hist[['Trust UPIN','Number of pupils']].rename(columns={
            'Trust UPIN':'TrustUPIN',
            'Number of pupils':year_column
            }).groupby('TrustUPIN').sum()
        bfr_pupil_numbers = pd.merge(
            bfr_pupil_numbers, trust_pupils, how='left', on='TrustUPIN')

    # convert to matrix; na_value maps pd.NA in nullable columns to NaN instead of raising
    matrix_revenue_reserves = bfr_revenue_reserves[year_columns].to_numpy(dtype=float, na_value=np.nan)
    matrix_pupil_numbers = bfr_pupil_numbers[year_columns].to_numpy(dtype=float, na_value=np.nan)

    # NOTE(review): this element-wise division pairs rows by position, so it
    # assumes both frames list the same trusts in the same order - confirm.
    matrix_revenue_reserves_per_pupil = matrix_revenue_reserves/matrix_pupil_numbers

    # determine associated slopes
    bfr_revenue_reserves['slope'] = _calculate_slopes(matrix_revenue_reserves)

    bfr_revenue_reserves_per_pupil = bfr_revenue_reserves[['CreatedBy','Category','Title','EFALineNo']].copy()
    bfr_revenue_reserves_per_pupil['slope'] = _calculate_slopes(matrix_revenue_reserves_per_pupil)
    for position, year_column in enumerate(year_columns):
        bfr_revenue_reserves_per_pupil[year_column] = matrix_revenue_reserves_per_pupil[:, position]

    # flag slopes below the 10th percentile with -1 and above the 90th percentile with 1
    bfr_revenue_reserves = _assign_slope_flag(bfr_revenue_reserves)
    bfr_revenue_reserves_per_pupil = _assign_slope_flag(bfr_revenue_reserves_per_pupil)

    return bfr_revenue_reserves, bfr_revenue_reserves_per_pupil

def _volatility_analysis(bfr):
    bfr['volatility'] = (bfr['Trust Balance'] - bfr['Y1P2'])/abs(bfr['Trust Balance'])

    volatility_conditions = [(bfr['volatility'] <= -0.05),
                            (bfr['volatility'] <= 0.05),
                            (bfr['volatility'] <= 0.1),
                            (bfr['volatility'] > 0.1)]
    volatility_messages = ["AR below forecast", 
                        "stable forecast", 
                        "AR above forecast", 
                        "AR significantly above forecast"]

    bfr['volatility_status'] = np.select(volatility_conditions, volatility_messages, default='')
    return bfr

def _slopes_by_trust(slopes, trust_upins, columns, renames):
    """Return ``columns`` of ``slopes``, renamed, with one row per trust indexed by TrustUPIN."""
    keyed = slopes[columns].rename(columns=renames)
    # Align the trust ids on the row labels first: assigning them directly would pair them by position.
    keyed.index = pd.Index(trust_upins.reindex(slopes.index), name='TrustUPIN')
    return keyed[~keyed.index.duplicated(keep='first')]


def build_bfr_data(bfr_sofa_data_path,bfr_3y_data_path, academies_y2, academies_y1, academies):
    """Load the Budget Forecast Return extracts and derive per-trust metrics.

    Parameters
    ----------
    bfr_sofa_data_path : str or path-like
        CSV with the columns listed in ``bfr_sofa_cols``.
    bfr_3y_data_path : str or path-like
        CSV with the columns listed in ``bfr_3y_cols``.
    academies_y2, academies_y1 : pandas.DataFrame
        Historic academy data passed through to ``_slope_analysis``.
    academies : pandas.DataFrame
        Academy data with ``Trust UPIN`` and ``Trust Balance``, used for the
        volatility analysis.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``bfr_metrics``: one row per trust (index ``TrustUPIN``) holding the
        percentage metrics plus revenue-reserve trends (total and per pupil).
        ``bfr``: the long-format BFR lines with volatility columns added.
    """
    period_columns = ['Y1P1','Y1P2','Y2P1','Y2P2']
    self_generated_lines = [211,220]
    grant_funding_lines = [199,200,205,210]

    bfr_sofa = pd.read_csv(
        bfr_sofa_data_path,
        encoding='unicode-escape',
        dtype=bfr_sofa_cols,
        usecols=list(bfr_sofa_cols),
    )

    bfr_3y = pd.read_csv(
        bfr_3y_data_path,
        encoding='unicode-escape',
        dtype=bfr_3y_cols,
        usecols=list(bfr_3y_cols),
    )

    # remove unused metrics
    bfr_sofa = bfr_sofa[bfr_sofa['EFALineNo'].isin([298,430,335,380,211,220,199,200,205,210,999])]

    # Roll the individual income lines up into two synthetic lines per trust.
    self_gen_income = bfr_sofa[
        bfr_sofa['EFALineNo'].isin(self_generated_lines)
        ].groupby('TrustUPIN')[period_columns].sum().reset_index()
    self_gen_income['Title'] = 'Self-generated income'

    grant_funding = bfr_sofa[
        bfr_sofa['EFALineNo'].isin(grant_funding_lines)
        ].groupby('TrustUPIN')[period_columns].sum().reset_index()
    grant_funding['Title'] = 'Grant funding'

    bfr_sofa = bfr_sofa[~bfr_sofa['EFALineNo'].isin(self_generated_lines + grant_funding_lines)]
    bfr_sofa = pd.concat([bfr_sofa, self_gen_income, grant_funding])
    # Assign the result back: Series.replace(inplace=True) on a column selection
    # is a chained assignment and does not reliably update the frame.
    bfr_sofa['Title'] = bfr_sofa['Title'].replace({
        'Balance c/f to next period ':'Revenue reserves',
        'Pupil numbers (actual and estimated)':'Pupil numbers',
        'Total revenue expenditure':'Total expenditure',
        'Total revenue income':'Total income','Total staff costs':'Staff costs'
        })
    bfr_sofa['Y1'] = bfr_sofa['Y1P1'] + bfr_sofa['Y1P2']
    bfr_sofa = bfr_sofa.drop_duplicates()

    # Map the three-year file's line numbers onto the SOFA line numbers.
    bfr_3y = bfr_3y.assign(
        EFALineNo=bfr_3y['EFALineNo'].replace({2980:298,4300:430,3800:380,9000:999}))
    bfr_3y = bfr_3y[bfr_3y['EFALineNo'].isin([298,430,335,380,999])].drop_duplicates()

    bfr = pd.merge(bfr_sofa, bfr_3y, how='left', on=['TrustUPIN','EFALineNo'])

    # get trust metrics
    bfr_metrics = _calculate_metrics(bfr)
    # Slope analysis
    bfr_revenue_reserves, bfr_revenue_reserves_per_pupil = _slope_analysis(bfr, academies_y2, academies_y1)

    # volatility analysis
    # The trust balance is repeated on each academy row; collapse it first so
    # the merge does not multiply every BFR line by the number of academies.
    trust_balances = academies[['Trust UPIN','Trust Balance']].rename(
        columns={'Trust UPIN': 'TrustUPIN'}).drop_duplicates()
    bfr = pd.merge(bfr, trust_balances, how='left', on='TrustUPIN')
    bfr = _volatility_analysis(bfr)

    # Keep one metrics row per trust. A value-based drop_duplicates would also
    # merge different trusts that happen to have identical metric values.
    bfr_metrics = bfr_metrics[~bfr_metrics.index.duplicated(keep='first')]

    use_columns = ["Y-2","Y-1","Y1","Y2","Y3","slope","slope_flag"]

    # The per-pupil frame carries no trust column, but it shares its row index
    # with bfr_revenue_reserves, so the trust ids can be looked up from there.
    trust_upins = bfr_revenue_reserves['TrustUPIN']

    bfr_revenue_reserves = _slopes_by_trust(bfr_revenue_reserves, trust_upins, use_columns, {
        "Y-2":"revenue_reserves_year_-2",
        "Y-1":"revenue_reserves_year_-1",
        "Y1":"revenue_reserves_year_0",
        "Y2":"revenue_reserves_year_1",
        "Y3":"revenue_reserves_year_2",
        "slope":"revenue_reserves_slope",
        "slope_flag":"revenue_reserves_slope_flag"})

    bfr_revenue_reserves_per_pupil = _slopes_by_trust(bfr_revenue_reserves_per_pupil, trust_upins, use_columns, {
        "Y-2":"revenue_reserves_year_per_pupil_-2",
        "Y-1":"revenue_reserves_year_per_pupil_-1",
        "Y1":"revenue_reserves_year_per_pupil_0",
        "Y2":"revenue_reserves_year_per_pupil_1",
        "Y3":"revenue_reserves_year_per_pupil_2",
        "slope":"revenue_reserves_year_per_pupil_slope",
        "slope_flag":"revenue_reserves_year_per_pupil_slope_flag"})

    # All three frames are now indexed by TrustUPIN, so the index joins match
    # trusts to trusts (previously trust ids were joined against row numbers).
    bfr_metrics = pd.merge(bfr_metrics, bfr_revenue_reserves, left_index=True, right_index=True)
    bfr_metrics = pd.merge(bfr_metrics, bfr_revenue_reserves_per_pupil, left_index=True, right_index=True)
    return bfr_metrics, bfr

In [None]:
# NOTE(review): the same `academies` frame is passed as academies_y2, academies_y1 and
# academies, so the Y-2 / Y-1 history used by the slope analysis is a placeholder
# (see the TODO in _slope_analysis). Swap in real prior-year data once available.
bfr_metrics, bfr = build_bfr_data('data/BFR_SOFA_raw.csv','data/BFR_3Y_raw.csv', academies.copy().reset_index(), academies.copy().reset_index(), academies.copy().reset_index())

### Timing Keep at the bottom

In [None]:
print(f'Processing Time: {time.time() - start_time} seconds')

There are 327 duplicates in the academies outputs, and 346 in the maintained (excluding federations)


academy_ar has 10444 entries, 148 of the urns in this list are duplicated, though they look to be schools which have changed from SAT to MAT

The academies_list contains duplicated LAEstabs due to schools transitioning between SAT / MAT etc.

maintained_schools has 10650 entries, 347 of which are nulls. These can just be dropped

There are a few duplicates in federation data as well: