In [1]:
from pathlib import Path
import sys

import numpy as np
import pandas as pd

# Work out the directory one level above the notebook's working directory so
# that packages located there can be imported (a later cell imports
# `src.pipeline.pre_processing`).
notebook_dir = Path().resolve()
parent_dir = str(notebook_dir.parent)
print(parent_dir)

# Put that directory at the front of sys.path, but only once, so re-running
# this cell does not keep adding duplicate entries.
path_set = {*sys.path}
already_importable = parent_dir in path_set
if not already_importable:
    sys.path.insert(0, parent_dir)

print(sys.path)

/Users/colinbull/appdev/dfe/sfb/education-benchmarking-and-insights/data-pipeline
['/Users/colinbull/appdev/dfe/sfb/education-benchmarking-and-insights/data-pipeline', '/opt/homebrew/Cellar/python@3.12/3.12.2_1/Frameworks/Python.framework/Versions/3.12/lib/python312.zip', '/opt/homebrew/Cellar/python@3.12/3.12.2_1/Frameworks/Python.framework/Versions/3.12/lib/python3.12', '/opt/homebrew/Cellar/python@3.12/3.12.2_1/Frameworks/Python.framework/Versions/3.12/lib/python3.12/lib-dynload', '', '/Users/colinbull/Library/Caches/pypoetry/virtualenvs/fbit-data-pipeline-aJYNke-B-py3.12/lib/python3.12/site-packages']


# VMFI Data processing pipeline

This workbook aims to emulate the current data processing pipeline that occurs in the VMFI pipeline. The logic and processing are largely based on the following document [Insights data portal - Data sources and sql analysis](https://educationgovuk.sharepoint.com.mcas.ms/:w:/r/sites/VMFI/_layouts/15/Doc.aspx?sourcedoc=%7B38C1DC37-7CDB-48B8-9E22-284F4F311C0B%7D&file=1.%20Insights%20portal%20-%20data%20sources%20and%20sql%20analysis%20v010%20-%20Copy.docx&action=default&mobileredirect=true) and will stay true to this document even if the existing stored procedures are doing something different. This will form the basis of a gap analysis going forward.

All data loaded in the following workbook comes from the set of CSV files in the `data` folder alongside this workbook. These datasets are for the most part from the list at the start of the linked document. However, additional standing data is required to fully implement the pipeline, so this data has been exported from the development VMFI pipeline database. These files are currently:

| File name | DB Table |
|:----------|----------|
|standing_data_cdc.csv | standing_data.cdc |

In [2]:
import src.pipeline.pre_processing as pre_processing
import time
import glob
import os

In [3]:
# Create the pre-processing output directory (if needed) and empty it so that
# every run starts from a clean slate. `Path`, `glob` and `os` are imported in
# the cells above.
output_dir = Path("output/pre-processing")
output_dir.mkdir(parents=True, exist_ok=True)

# glob's `*` does not match dot-files, so hidden files are left in place.
files = glob.glob(str(output_dir / "*"))
for f in files:
    # os.remove raises on directories, so only delete regular files/symlinks
    # and leave any sub-directory untouched instead of aborting the cell.
    if os.path.isfile(f) or os.path.islink(f):
        os.remove(f)

In [4]:
# Year passed to the CDC, academy and maintained-school preparation steps
# below (presumably the reporting year being processed — confirm).
current_year = 2022

## CDC data load and preparation

School buildings condition dataset. Based on the surveys performed throughout 2018-2019.

The data in the file `data/standing_data_cdc.csv` is just an export of the data in the `standing_data.cdc` table, without the Year and Import ID fields. In future this will likely have to be read directly from the source database as per [this document](https://educationgovuk.sharepoint.com.mcas.ms/:w:/r/sites/VMFI/_layouts/15/Doc.aspx?sourcedoc=%7B38C1DC37-7CDB-48B8-9E22-284F4F311C0B%7D&file=1.%20Insights%20portal%20-%20data%20sources%20and%20sql%20analysis%20v010%20-%20Copy.docx&action=default&mobileredirect=true).

In [5]:
# Load and prepare the CDC data for `current_year`; the processing itself
# lives in `pre_processing.prepare_cdc_data`.
# NOTE(review): the markdown above refers to `data/standing_data_cdc.csv`
# while this cell reads `data/cdc.csv` — confirm which file name is current.
cdc = pre_processing.prepare_cdc_data('data/cdc.csv', current_year)

In [6]:
# Optional export, disabled by default; uncomment to write the prepared frame
# into the output folder.
# cdc.to_csv('output/pre-processing/cdc.csv')

# Show the prepared CDC frame as the cell output.
cdc

Unnamed: 0_level_0,Total Internal Floor Area,Age Average Score
URN,Unnamed: 1_level_1,Unnamed: 2_level_1
100150,2803.0,48.358188
100162,2105.0,133.162945
100164,2934.0,97.0
100166,2040.0,91.705882
105304,1602.0,35.752809
...,...,...
144913,3111.0,16.704275
144917,2620.0,78.412214
105623,3382.0,7.0
144918,4733.0,19.009296


## School Census data load

*Pupil Census* - DfE data collection providing information about school and pupil characteristics, for example the percentage of pupils claiming free school meals, or having English as their second language.

*Workforce census* - Single reference for all school workforce statistics based on staff working in publicly funded schools in England.

The following code loads both the workforce and pupil census data and performs an `inner` join by URN on the data sets.

In [7]:
# Build the census frame from the workforce census workbook and the pupil
# census CSV; the combination logic lives in
# `pre_processing.prepare_census_data`.
census = pre_processing.prepare_census_data('data/census_workforce.xlsx', 'data/census_pupils.csv')

In [8]:
# Optional export, disabled by default; uncomment to write the prepared frame
# into the output folder.
# census.to_csv('output/pre-processing/census.csv')

# Show the prepared census frame as the cell output.
census

Unnamed: 0_level_0,Percentage claiming Free school meals,Percentage Free school meals,number of pupils whose first language is known or believed to be other than English,Statutory Low Age,Total School Workforce (Headcount),Total Number of Teachers in the Leadership Group (Headcount),Total Number of Teachers (Headcount),Total Number of Teaching Assistants (Headcount),FullTimeOtherHeadCount,Total Number of Auxiliary Staff (Headcount),Total School Workforce (Full-Time Equivalent),Total Number of Teachers in the Leadership Group (Full-time Equivalent),Total Number of Teachers (Full-Time Equivalent),Total Number of Teaching Assistants (Full-Time Equivalent),FullTimeOther,Total Number of Auxiliary Staff (Full-Time Equivalent),Pupil: Teacher Ratio (Full-Time Equivalent of qualified and unqualified teachers),Teachers with Qualified Teacher Status (%) (Headcount),Number of Vacant Teacher Posts
URN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
141334,33.8,52.3,93.0,4,48,3,15,14,6,13,34.17,2.64,13.11,10.29,4.82,5.95,24.8,100.000000,0
141396,23.4,60.3,236.0,3,118,4,39,34,11,34,82.47,4.00,34.00,29.55,10.13,8.79,18.3,100.000000,0
141397,33.2,47.7,127.0,3,105,5,27,42,9,27,72.81,4.24,24.55,31.84,6.55,9.87,19.7,100.000000,0
142223,5.1,8.7,343.0,3,156,5,56,44,9,47,99.66,4.16,47.12,33.07,6.57,12.90,23.0,100.000000,0
144396,56.7,64.8,29.0,3,37,2,13,9,4,11,25.57,2.00,11.39,7.36,4.00,2.82,18.1,100.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104642,2.4,2.6,14.0,4,52,4,18,10,6,18,34.47,3.60,15.80,6.27,5.22,7.18,26.6,100.000000,0
104643,3.5,8.5,13.0,3,68,3,19,24,6,19,39.89,3.00,17.40,11.74,3.34,7.41,24.7,100.000000,0
104645,32.9,33.8,43.0,7,37,3,13,10,4,10,26.47,3.00,12.40,6.78,3.19,4.10,19.1,92.307692,0
104646,29.9,31.9,20.0,3,29,2,12,10,2,5,22.36,2.00,12.00,6.24,1.44,2.68,15.8,100.000000,0


## Special Education Needs (SEN) data load and preparation

Special educational needs dataset. Contains information about the number of pupils who require various SEN provisions. This loads the `SEN` data, which originates from [here](https://explore-education-statistics.service.gov.uk/find-statistics/special-educational-needs-in-england#dataDownloads-1)

In [9]:
# Load and prepare the SEN data; the processing itself lives in
# `pre_processing.prepare_sen_data`.
sen = pre_processing.prepare_sen_data('data/sen.csv')

In [10]:
# Optional export, disabled by default; uncomment to write the prepared frame
# into the output folder.
# sen.to_csv("output/pre-processing/sen.csv")

# Summary statistics rather than the raw rows, as a quick sanity check.
sen.describe()

Unnamed: 0,Total pupils,EHC plan,Percentage SEN,Primary Need SPLD,Primary Need MLD,Primary Need SLD,Primary Need PMLD,Primary Need SEMH,Primary Need SLCN,Primary Need HI,...,Percentage Primary Need SLD,Percentage Primary Need PMLD,Percentage Primary Need SEMH,Percentage Primary Need SLCN,Percentage Primary Need HI,Percentage Primary Need VI,Percentage Primary Need MSI,Percentage Primary Need PD,Percentage Primary Need ASD,Percentage Primary Need OTH
count,24442.0,24442.0,24442.0,24442.0,24442.0,24442.0,24442.0,24442.0,24442.0,24442.0,...,24442.0,24442.0,24442.0,24442.0,24442.0,24442.0,24442.0,24442.0,24442.0,24442.0
mean,371.239342,15.922224,9.706077,6.957778,9.093896,1.374642,0.447754,11.632477,14.110261,0.948736,...,0.803987,0.29185,4.152309,4.575585,0.276888,0.158307,0.061716,0.477125,3.004468,0.570137
std,361.174987,34.361522,25.066705,14.135909,15.285804,9.949306,3.380279,16.855224,16.033571,2.735648,...,5.929642,2.66345,10.77553,4.778444,2.354148,1.431706,0.328265,2.332442,8.983834,1.67472
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,145.0,2.0,1.143416,0.0,0.0,0.0,0.0,2.0,3.0,0.0,...,0.0,0.0,0.90751,1.342658,0.0,0.0,0.0,0.0,0.277778,0.0
50%,240.0,6.0,2.197802,2.0,3.0,0.0,0.0,6.0,10.0,0.0,...,0.0,0.0,2.268041,3.496503,0.0,0.0,0.0,0.138889,1.234568,0.0
75%,440.0,14.0,3.75,7.0,11.0,0.0,0.0,14.0,20.0,1.0,...,0.0,0.0,4.022989,6.43492,0.316456,0.162075,0.0,0.520833,2.461538,0.581395
max,3440.0,772.0,100.0,233.0,437.0,270.0,98.0,337.0,307.0,192.0,...,100.0,100.0,100.0,99.519231,100.0,100.0,23.913043,100.0,100.0,100.0


## KS2 and KS4 processing

In [11]:
# Load and prepare the KS2 data from the ks2 workbook; the processing itself
# lives in `pre_processing.prepare_ks2_data`.
ks2 = pre_processing.prepare_ks2_data('data/ks2.xlsx')

In [12]:
# Optional export, disabled by default; uncomment to write the prepared frame
# into the output folder.
# ks2.to_csv('output/pre-processing/ks2.csv')

# Show the prepared KS2 frame as the cell output.
# NOTE(review): the saved output shows URN as a float index with blank (missing)
# labels in the tail rows — confirm whether `prepare_ks2_data` should drop
# those rows and/or cast URN to an integer type.
ks2

Unnamed: 0_level_0,Ks2Progress
URN,Unnamed: 1_level_1
100000.0,0.5
136807.0,13.0
139837.0,20.7
140686.0,-1.8
100008.0,5.5
...,...
,-0.4
,-1.7
,-0.1
,-2.0


In [13]:
# Load and prepare the KS4 data from the ks4 workbook; the processing itself
# lives in `pre_processing.prepare_ks4_data`.
ks4 = pre_processing.prepare_ks4_data('data/ks4.xlsx')

In [14]:
# Optional export, disabled by default; uncomment to write the prepared frame
# into the output folder.
# ks4.to_csv('output/pre-processing/ks4.csv')

# Show the prepared KS4 frame as the cell output.
# NOTE(review): the saved output shows URN rendered as a float (e.g. 100053.0)
# — confirm whether it should be cast to an integer type.
ks4

Unnamed: 0_level_0,AverageAttainment,Progress8Measure,Progress8Banding
URN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100053.0,50.3,-0.16,Average
100054.0,65.8,0.77,Well above average
100052.0,44.6,-0.03,Average
100092.0,0.5,-2.22,Well below average
100049.0,41.7,-0.28,Below average
...,...,...,...
137269.0,50.2,0.36,Average
112393.0,40.3,-0.11,Average
112385.0,51.2,0.12,Average
141041.0,40.6,-0.84,Well below average


## AR Data load and preparation

This loads the Annual accounts return dataset and the corresponding mapping file. This extract contains only the benchmarking section, which consists of submissions of costs, income, and balances of individual academies.

The mapping file contains the mapping from AR4 cell references to cost categories and descriptions.

In [15]:
# Load and prepare the academy accounts return (AAR) data from the workbook;
# the processing itself lives in `pre_processing.prepare_aar_data`.
aar = pre_processing.prepare_aar_data('data/academy_ar.xlsx')

In [16]:
# Optional export, disabled by default; uncomment to write the prepared frame
# into the output folder. (The frame is named `aar` in this notebook; the
# previous comment referred to an undefined `academy_ar`.)
# aar.to_csv('output/pre-processing/academy_ar.csv')

# Show the prepared AAR frame as the cell output.
# NOTE(review): the saved output lists Academy UPIN 164811 twice with different
# Trust UPINs — confirm the index is expected to be non-unique.
aar

Unnamed: 0_level_0,Trust UPIN,PFI School,DFE/EFA Revenue grants (includes Coronavirus Government Funding,of which: Coronavirus Government Funding,SEN funding,Other DfE/EFA Revenue Grants,Other income - LA & other Government grants,"Government source, non-grant",Academies,Non-Government,...,Trust_All income from facilities and services,Trust_Income from catering,Trust_Receipts from supply teacher insurance claims,Trust_Donations and/or voluntary funds,Trust_Other self-generated income,Trust_Investment income,Central Services Balance,Central Services Financial Position,Academy Financial Position,Trust Financial Position
Academy UPIN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
111443,137157,Non-PFI school,7967000.0,41000.0,153000.0,262000.0,0.0,0.0,0.0,403000.0,...,539000.0,1063000.0,0.0,127000.0,473000.0,0.0,-1830000.0,Deficit,Deficit,Deficit
111451,138199,Non-PFI school,6342000.0,80000.0,222000.0,7000.0,203000.0,0.0,0.0,124000.0,...,442000.0,1000.0,0.0,702000.0,0.0,0.0,-8541000.0,Deficit,Deficit,Surplus
111453,135112,Non-PFI school,2798000.0,25000.0,162000.0,63000.0,0.0,0.0,0.0,36000.0,...,286000.0,82000.0,0.0,426000.0,0.0,0.0,-3191000.0,Deficit,Surplus,Surplus
111710,135428,Non-PFI school,7685000.0,79000.0,323000.0,215000.0,83000.0,0.0,0.0,387000.0,...,96000.0,252000.0,0.0,19000.0,9000.0,15000.0,0.0,Deficit,Deficit,Deficit
113087,136879,Non-PFI school,8021000.0,0.0,93000.0,45000.0,81000.0,0.0,0.0,14000.0,...,77000.0,0.0,0.0,14000.0,666000.0,26000.0,0.0,Deficit,Deficit,Deficit
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164746,140031,Non-PFI school,1103000.0,76000.0,1213000.0,123000.0,18000.0,0.0,0.0,0.0,...,17000.0,78000.0,0.0,1000.0,361000.0,0.0,-1067000.0,Deficit,Deficit,Surplus
164811,135065,Non-PFI school,54000.0,0.0,33000.0,0.0,1000.0,0.0,0.0,0.0,...,2168000.0,334000.0,0.0,525000.0,0.0,0.0,-8019000.0,Deficit,Deficit,Surplus
164811,140031,Non-PFI school,949000.0,11000.0,1219000.0,0.0,37000.0,0.0,0.0,0.0,...,17000.0,78000.0,0.0,1000.0,361000.0,0.0,-1067000.0,Deficit,Deficit,Surplus
164812,139706,Non-PFI school,137000.0,28000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,221000.0,3000.0,0.0,104000.0,0.0,0.0,-950000.0,Deficit,Surplus,Surplus


## Academy and maintained schools data load and preparation

This reads the main GIAS data (edubasealldataYYYYMMDD file) and the associated links file (links_edubasealldataYYYYMMDD file). This is taken from the [GIAS Service](https://get-information-schools.service.gov.uk/help)

Other columns are tidied up by asserting the correct type for that column. This tidying phase is largely needed because, on load, integer columns are inferred as floats rather than integers.

In [17]:
# Load and prepare the schools frame from the GIAS extract and its links file;
# the processing itself lives in `pre_processing.prepare_schools_data`.
schools = pre_processing.prepare_schools_data('data/gias.csv','data/gias_links.csv')


In [18]:
# Optional export, disabled by default; uncomment to write the prepared frame
# into the output folder.
# schools.to_csv('output/pre-processing/schools.csv')

# Display a copy ordered by index; `schools` itself is not reassigned here.
schools.sort_index()

Unnamed: 0_level_0,LA (code),LA (name),EstablishmentNumber,EstablishmentName,TypeOfEstablishment (code),TypeOfEstablishment (name),EstablishmentStatus (code),EstablishmentStatus (name),OpenDate,CloseDate,...,UrbanRural (name),BoardingEstablishment (name),PreviousLA (code),PreviousLA (name),PreviousEstablishmentNumber,OfstedRating (name),MSOA (code),LSOA (code),LA Establishment Number,HeadName
URN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100000,201,City of London,3614,The Aldgate School,2,Voluntary aided school,1,Open,NaT,NaT,...,(England/Wales) Urban major conurbation,,999,,,Outstanding,E02000001,E01032739,201-3614,Miss Alexandra Allan
100001,201,City of London,6005,City of London School for Girls,11,Other independent school,1,Open,1920-01-01,NaT,...,(England/Wales) Urban major conurbation,Does not have boarders,999,,,,E02000001,E01000002,201-6005,Mrs Jenny Brown
100002,201,City of London,6006,St Paul's Cathedral School,11,Other independent school,1,Open,1939-01-01,NaT,...,(England/Wales) Urban major conurbation,Has boarders,999,,,,E02000001,E01032739,201-6006,
100003,201,City of London,6007,City of London School,11,Other independent school,1,Open,1919-01-01,NaT,...,(England/Wales) Urban major conurbation,Does not have boarders,999,,,,E02000001,E01032739,201-6007,Mr Alan Bird
100005,202,Camden,1048,Thomas Coram Centre,15,Local authority nursery school,1,Open,NaT,NaT,...,(England/Wales) Urban major conurbation,,999,,,Outstanding,E02007115,E01000937,202-1048,Ms Perina Holness
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
402468,679,Monmouthshire,5500,King Henry viii 3-19 School,30,Welsh establishment,1,Open,2023-09-01,NaT,...,,,999,,,,999999999,999999999,679-5500,
402469,681,Cardiff,2333,Ysgol Gynradd Groes-Wen Primary,30,Welsh establishment,1,Open,2023-09-01,NaT,...,(England/Wales) Rural village,,999,,,,W02000380,W01001729,681-2333,
402470,668,Pembrokeshire,2398,Ysgol Bro Penfro,30,Welsh establishment,4,Proposed to open,2024-09-01,NaT,...,(England/Wales) Rural town and fringe,,999,,,,W02000140,W01000607,668-2398,
402471,679,Monmouthshire,2325,Ysgol Gymraeg Trefynwy,30,Welsh establishment,4,Proposed to open,2024-09-01,NaT,...,(England/Wales) Urban city and town,,999,,,,W02000339,W01001978,679-2325,


Merge required GIAS, census, SEN, CDC, PFI, and AAR data with the base academy data

In [None]:
# Combine the academy master list with the frames prepared in the cells above;
# the merge logic itself lives in `pre_processing.build_academy_data`.
academies = pre_processing.build_academy_data(
    'data/academy_master_list.csv',
    current_year,
    schools,
    census,
    sen,
    cdc,
    aar,
    ks2,
    ks4,
)

In [None]:
# Optional export, disabled by default.
# NOTE(review): `output_schemas` is not imported in the cells shown in this
# notebook — import it before re-enabling the line below.
# academies.to_csv('output/pre-processing/academies.csv', columns=output_schemas.academies_output)

# Spot-check a single row by index label (147756 is presumably a URN — confirm).
academies.loc[147756]

Merge required census and cdc data to the maintained schools data set

In [None]:
# Load raw list from CSV
# Combine the maintained-schools master list with the frames prepared in the
# cells above; the merge logic itself lives in
# `pre_processing.build_maintained_school_data`.
maintained_schools = pre_processing.build_maintained_school_data(
    'data/maintained_schools_master_list.csv',
    current_year,
    schools,
    census,
    sen,
    cdc,
    ks2,
    ks4,
)

In [None]:
# Columns written to the maintained-schools CSV export: this list is passed as
# `columns=` to `maintained_schools.to_csv(...)` later in this cell.
# NOTE(review): two entries ("Total Income ... I18" and "Total Expenditure ...
# I17") carry literal double quotes inside the column name; presumably this
# mirrors the header of the source file — confirm.
maintained_schools_output = [
    "number of pupils whose first language is known or believed to be other than English",
    "Utilities_Water and sewerage:",
    "Utilities_Energy",
    "UrbanRural (name)",
    "UKPRN",
    "Type",
    "Total pupils",
    "Total School Workforce (Headcount)",
    "Total School Workforce (Full-Time Equivalent)",
    "Total Number of Teaching Assistants (Headcount)",
    "Total Number of Teaching Assistants (Full-Time Equivalent)",
    "Total Number of Teachers in the Leadership Group (Headcount)",
    "Total Number of Teachers in the Leadership Group (Full-time Equivalent)",
    "Total Number of Teachers (Headcount)",
    "Total Number of Teachers (Full-Time Equivalent)",
    "Total Number of Auxiliary Staff (Headcount)",
    "Total Number of Auxiliary Staff (Full-Time Equivalent)",
    "Total Internal Floor Area",
    "Total Income   I01 to I18",
    '"Total Income   I01 to I08, I11 to I15, I18"',
    "Total Expenditure  E01 to E32",
    '"Total Expenditure  E01 to E29 and E31 to E32 minus I9, I10, I16 and I17"',
    "TelephoneNum",
    "Teaching and Teaching support staff_Teaching staff",
    "Teaching and Teaching support staff_Supply teaching staff",
    "Teaching and Teaching support staff_Educational consultancy",
    "Teaching and Teaching support staff_Education support staff",
    "Teaching and Teaching support staff_Agency supply teaching staff",
    "Teachers with Qualified Teacher Status (%) (Headcount)",
    "Statutory Low Age",
    "Status",
    "SchoolWebsite",
    "SchoolPhaseType",
    "SchoolCapacity",
    "School Financial Position",
    "School Balance",
    "Revenue Reserve   B01 plus B02 plus B06",
    "Pupil: Teacher Ratio (Full-Time Equivalent of qualified and unqualified teachers)",
    "Prov_VI",
    "Prov_SPLD",
    "Prov_SLD",
    "Prov_SLCN",
    "Prov_SEMH",
    "Prov_PMLD",
    "Prov_PD",
    "Prov_OTH",
    "Prov_MSI",
    "Prov_MLD",
    "Prov_HI",
    "Prov_ASD",
    "Progress8Measure",
    "Progress8Banding",
    "Primary Need VI",
    "Primary Need SPLD",
    "Primary Need SLD",
    "Primary Need SLCN",
    "Primary Need SEMH",
    "Primary Need PMLD",
    "Primary Need PD",
    "Primary Need OTH",
    "Primary Need MSI",
    "Primary Need MLD",
    "Primary Need HI",
    "Primary Need ASD",
    "Premises staff and services_Premises staff",
    "Premises staff and services_Other occupation costs",
    "Premises staff and services_Maintenance of premises",
    "Premises staff and services_Cleaning and caretaking",
    "Postcode",
    "Percentage SEN",
    "Percentage Free school meals",
    "PFI School",
    "Overall Phase",
    "Other grants and payments",
    "Other costs_Supply teacher insurance",
    "Other costs_Staff-related insurance",
    "Other costs_Staff development and training",
    "Other costs_Special facilities",
    "Other costs_Rent and rates",
    "Other costs_PFI charges",
    "Other costs_Other insurance premiums",
    "Other costs_Interest charges for loan and bank",
    "Other costs_Indirect employee expenses",
    "Other costs_Grounds maintenance",
    "Other costs_Direct revenue financing",
    "OfstedRating (name)",
    "OfstedLastInsp",
    "OfficialSixthForm (code)",
    "NurseryProvision (name)",
    "Number of Vacant Teacher Posts",
    "Non-educational support staff_Professional services (non-curriculum)",
    "Non-educational support staff_Other staff",
    "Non-educational support staff_Administrative and clerical staff",
    "No of pupils in 6th form",
    "No Teachers",
    "MSOA (code)",
    "Lowest age of pupils",
    "London Weighting",
    "Lead school in federation",
    "LastChangedDate",
    "LSOA (code)",
    "LA Establishment Number",
    "LA (name)",
    "LA (code)",
    "Ks2Progress",
    "In-year Balance   Total Income (I01 to I18) minus Total Expenditure (E01 to E32)",
    "IT_ICT learning resources",
    "I18  Additional grant for schools",
    "I17  Community focused school facilities income",
    "I16  Community focussed school funding and   or grants",
    "I15  Pupil focussed extended school funding and   or grants",
    "I13  Donations and or private funds",
    "I12  Income from contributions to visits etc",
    "I11  Receipts from other insurance claims",
    "I10  Receipts from supply teacher insurance claims",
    "I08  Income from facilities and services",
    "I06  Other government grants",
    "I05  Pupil Premium",
    "I04  Funding for minority ethnic pupils",
    "I03  SEN funding",
    "I02  Funding for 6th form students",
    "I01  Funds delegated by the LA",
    "Highest age of pupils",
    "HeadName",
    "Gender (name)",
    "GOR (name)",
    "FullTimeOtherHeadCount",
    "FullTimeOther",
    "FTE of Teaching Assistants",
    "FTE of Support Staff",
    "FTE of Admin Staff",
    "EstablishmentNumber",
    "EstablishmentName",
    "Educational supplies_Learning resources (not ICT equipment)",
    "Educational supplies_Examination fees",
    "EHC plan",
    "E32 Community focused school costs",
    "E31  Community focused school staff",
    "Catering_Income from catering",
    "Catering_Catering supplies",
    "Catering_Catering staff",
    "BoardingEstablishment (name)",
    "Boarders (name)",
    "AverageAttainment",
    "Age Average Score",
    "AdmissionsPolicy (name)",
    "AdmissionsPolicy (code)",
    "Administrative supplies_Administrative supplies (non educational)",
    "% of teachers with QTS",
    "% of pupils with EAL",
    "% of pupils who are Boarders",
    "% of pupils known to be eligible for free school meals (Performa",
    "% of pupils known to be eligible for and claiming free school me",
]

# Write only the columns listed in `maintained_schools_output`. A listed column
# that is absent from the frame surfaces as a KeyError from the export, so no
# separate column look-up is needed beforehand (the previous bare
# `maintained_schools[...]` expression was evaluated and discarded).
maintained_schools.to_csv('output/pre-processing/maintained_schools.csv', columns=maintained_schools_output)

# Show the full maintained-schools frame as the cell output.
maintained_schools

## Federation Capture




In [None]:
# Derive the hard and soft federation frames from the GIAS links file and the
# maintained-schools frame; the helper returns them as a pair.
hard_federations, soft_federations = pre_processing.build_federations_data(
    'data/gias_all_links.csv',
    maintained_schools,
)

In [None]:
# Persist the hard-federation frame to the output folder, then display it.
hard_federations.to_csv('output/pre-processing/hard_federations.csv')
hard_federations

In [None]:
# Persist the full soft-federation frame to the output folder, then display
# only its `LAEstab` column (double brackets keep the result a DataFrame).
soft_federations.to_csv('output/pre-processing/soft_federations.csv')
soft_federations[['LAEstab']]