In [16]:
from pathlib import Path
import sys  

# Work out the directory one level above the notebook's working directory
# (presumably the package root) and echo it for reference.
parent_dir = str(Path.cwd().resolve().parents[0])
print(parent_dir)

# Put that directory at the front of the import search path, unless it is
# already listed, so that the imports in later cells can be resolved.
if parent_dir not in set(sys.path):
    sys.path.insert(0, parent_dir)

print(sys.path)

/Users/colinbull/appdev/dfe/sfb/education-benchmarking-and-insights/data-pipeline
['/Users/colinbull/appdev/dfe/sfb/education-benchmarking-and-insights/data-pipeline', '/Users/colinbull/appdev/dfe/sfb/education-benchmarking-and-insights/data-pipeline/src', '/Users/colinbull/Library/Application Support/JetBrains/Toolbox/apps/PyCharm-P/ch-0/241.14494.241/PyCharm.app/Contents/plugins/python/helpers-pro/jupyter_debug', '/Users/colinbull/Library/Application Support/JetBrains/Toolbox/apps/PyCharm-P/ch-0/241.14494.241/PyCharm.app/Contents/plugins/python/helpers/pydev', '/opt/homebrew/Cellar/python@3.12/3.12.2_1/Frameworks/Python.framework/Versions/3.12/lib/python312.zip', '/opt/homebrew/Cellar/python@3.12/3.12.2_1/Frameworks/Python.framework/Versions/3.12/lib/python3.12', '/opt/homebrew/Cellar/python@3.12/3.12.2_1/Frameworks/Python.framework/Versions/3.12/lib/python3.12/lib-dynload', '', '/Users/colinbull/Library/Caches/pypoetry/virtualenvs/fbit-data-pipeline-aJYNke-B-py3.12/lib/python3.12/si

# VMFI Data processing pipeline

This workbook aims to emulate the current data processing pipeline that occurs in VMFI pipeline. The logic and processing is largely based on the following document [Insights data portal - Data sources and sql analysis](https://educationgovuk.sharepoint.com.mcas.ms/:w:/r/sites/VMFI/_layouts/15/Doc.aspx?sourcedoc=%7B38C1DC37-7CDB-48B8-9E22-284F4F311C0B%7D&file=1.%20Insights%20portal%20-%20data%20sources%20and%20sql%20analysis%20v010%20-%20Copy.docx&action=default&mobileredirect=true) and will stay true to this document even if the existing stored procedures are doing something different. This will form the basis of a gap analysis going forward. 

All data loaded in the following workbook comes from the set of CSV files in the `data` folder alongside this workbook. These datasets are for the most part from the list at the start of the linked document. However, because additional standing data is required to fully implement the pipeline, this data has been exported from the development VMFI pipeline database. These files are currently:

| File name | DB Table |
|:----------|----------|
|standing_data_cdc.csv | standing_data.cdc |

In [17]:
import src.pre_processing as pre_processing
import time
import glob
import os

# Make sure the pre-processing output folder exists, then delete whatever a
# previous run left in it so this run starts from an empty folder.
from pathlib import Path
Path("output/pre-processing").mkdir(parents=True, exist_ok=True)

for stale_output in glob.glob("output/pre-processing/*"):
    os.remove(stale_output)

# Start of the run (read again by the timing cell at the bottom) and the
# year passed to the preparation functions below.
start_time = time.time()
current_year = 2022



## CDC data load and preparation

School buildings condition dataset. Based on the surveys performed throughout 2018-2019.

The data in the file `data/standing_data_cdc.csv` is just an export of the data in the `standing_data.cdc` table, without the Year and Import ID fields. In future this will likely have to be read directly from the source database as per [this document](https://educationgovuk.sharepoint.com.mcas.ms/:w:/r/sites/VMFI/_layouts/15/Doc.aspx?sourcedoc=%7B38C1DC37-7CDB-48B8-9E22-284F4F311C0B%7D&file=1.%20Insights%20portal%20-%20data%20sources%20and%20sql%20analysis%20v010%20-%20Copy.docx&action=default&mobileredirect=true).

In [19]:
# Prepare the CDC frame from the standing-data CSV export for `current_year`.
cdc = pre_processing.prepare_cdc_data(
    'data/standing_data_cdc.csv',
    current_year,
)

In [20]:
# Save the prepared CDC frame into the pre-processing output folder.
cdc.to_csv('output/pre-processing/cdc.csv')
# Bare last expression: the notebook renders the frame as the cell result.
cdc

Unnamed: 0_level_0,GIFA,Block Age,Total Internal Floor Area,Proportion Area,Indicative Age,Age Score,Age Average Score
URN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
100150,89.0,,2803.0,0.031752,,,48.358188
100150,2221.0,1961-1970,2803.0,0.792365,1965,45.164823,48.358188
100150,436.0,2001-2010,2803.0,0.155548,2005,2.64431,48.358188
100150,57.0,1991-2000,2803.0,0.020335,1995,0.549055,48.358188
100162,131.0,,2105.0,0.062233,,,133.162945
...,...,...,...,...,...,...,...
144919,284.0,1981-1990,1762.0,0.161180,1985,5.963678,38.370602
144919,392.0,2001-2010,1762.0,0.222474,2005,3.782066,38.370602
144919,740.0,2011-2020,1762.0,0.419977,2015,2.939841,38.370602
144919,31.0,2001-2010,1762.0,0.017594,2005,0.299092,38.370602


## School Census data load

*Pupil Census* - DfE data collection providing information about school and pupil characteristics, for example the percentage of pupils claiming free school meals, or having English as their second language.

*Workforce census* - Single reference for all school workforce statistics based on staff working in publicly funded schools in England.

The following code loads both the workforce and pupil census data and performs an `inner` join by URN on the data sets.

In [21]:
# Prepare the census frame from the school workforce census workbook and the
# pupil census standing-data export.
census = pre_processing.prepare_census_data(
    'data/School_Tables_School_Workforce_Census_2022.xlsx',
    'data/standing_data_census_pupils.csv',
)

In [22]:
# Save the prepared census frame into the pre-processing output folder.
census.to_csv('output/pre-processing/census.csv')
# Bare last expression: the notebook renders the frame as the cell result.
census

Unnamed: 0_level_0,% of pupils known to be eligible for and claiming free school me,% of pupils known to be eligible for free school meals (Performa,number of pupils whose first language is known or believed to be other than English,Statutory Low Age,Total School Workforce (Headcount),Total Number of Teachers in the Leadership Group (Headcount),Total Number of Teachers (Headcount),Total Number of Teaching Assistants (Headcount),FullTimeOtherHeadCount,Total Number of Auxiliary Staff (Headcount),Total School Workforce (Full-Time Equivalent),Total Number of Teachers in the Leadership Group (Full-time Equivalent),Total Number of Teachers (Full-Time Equivalent),Total Number of Teaching Assistants (Full-Time Equivalent),FullTimeOther,Total Number of Auxiliary Staff (Full-Time Equivalent),Pupil: Teacher Ratio (Full-Time Equivalent of qualified and unqualified teachers),Teachers with Qualified Teacher Status (%) (Headcount),Number of Vacant Teacher Posts
URN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
141334,33.8,52.3,93.0,4,48,3,15,14,6,13,34.17,2.64,13.11,10.29,4.82,5.95,24.8,100.000000,0
141396,23.4,60.3,236.0,3,118,4,39,34,11,34,82.47,4.00,34.00,29.55,10.13,8.79,18.3,100.000000,0
141397,33.2,47.7,127.0,3,105,5,27,42,9,27,72.81,4.24,24.55,31.84,6.55,9.87,19.7,100.000000,0
142223,5.1,8.7,343.0,3,156,5,56,44,9,47,99.66,4.16,47.12,33.07,6.57,12.90,23.0,100.000000,0
144396,56.7,64.8,29.0,3,37,2,13,9,4,11,25.57,2.00,11.39,7.36,4.00,2.82,18.1,100.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104642,2.4,2.6,14.0,4,52,4,18,10,6,18,34.47,3.60,15.80,6.27,5.22,7.18,26.6,100.000000,0
104643,3.5,8.5,13.0,3,68,3,19,24,6,19,39.89,3.00,17.40,11.74,3.34,7.41,24.7,100.000000,0
104645,32.9,33.8,43.0,7,37,3,13,10,4,10,26.47,3.00,12.40,6.78,3.19,4.10,19.1,92.307692,0
104646,29.9,31.9,20.0,3,29,2,12,10,2,5,22.36,2.00,12.00,6.24,1.44,2.68,15.8,100.000000,0


## Special Educational Needs (SEN) data load and preparation

Special educational needs dataset. Contains information about the number of pupils who require various SEN provisions. This loads the `SEN` data, which originates from [here](https://explore-education-statistics.service.gov.uk/find-statistics/special-educational-needs-in-england#dataDownloads-1).

In [23]:
# Prepare the SEN frame from the downloaded SEN CSV.
sen = pre_processing.prepare_sen_data(
    'data/SEN.csv',
)

In [24]:
# Save the prepared SEN frame into the pre-processing output folder.
sen.to_csv("output/pre-processing/sen.csv")
# Bare last expression: the notebook renders the frame as the cell result.
sen

Unnamed: 0_level_0,Total pupils,EHC plan,Percentage SEN,Primary Need SPLD,Primary Need MLD,Primary Need SLD,Primary Need PMLD,Primary Need SEMH,Primary Need SLCN,Primary Need HI,...,Prov_SLD,Prov_PMLD,Prov_SEMH,Prov_SLCN,Prov_HI,Prov_VI,Prov_MSI,Prov_PD,Prov_ASD,Prov_OTH
URN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100000,271,8,2.95203,2,4,0,0,9,31,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100001,739,0,0.0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100002,269,0,0.0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100003,1045,0,0.0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100005,136,2,1.470588,0,0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149557,41,3,7.317073,2,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
149632,1291,58,4.492641,31,15,0,0,20,25,8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
149633,86,0,0.0,2,1,0,0,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
149635,654,10,1.529052,15,1,0,0,12,2,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## KS2 and KS4 processing

In [9]:
# Prepare the KS2 frame from the revised 2022-2023 workbook.
ks2 = pre_processing.prepare_ks2_data(
    'data/2022-2023_england_ks2revised.xlsx',
)

FileNotFoundError: [Errno 2] No such file or directory: 'data/2022-2023_england_ks2revised.xlsx'

In [None]:
# Save the prepared KS2 frame alongside the other pre-processing outputs.
# Writing to `output/pre-processing/` (rather than directly to `output/`)
# keeps the file inside the folder that the setup cell creates and empties
# on every run, so a stale copy cannot survive between runs.
ks2.to_csv('output/pre-processing/ks2.csv')
# Bare last expression: the notebook renders the frame as the cell result.
ks2

In [None]:
# Prepare the KS4 frame from the revised 2022-2023 workbook.
ks4 = pre_processing.prepare_ks4_data(
    'data/2022-2023_england_ks4revised.xlsx',
)

In [None]:
# Save the prepared KS4 frame alongside the other pre-processing outputs.
# Writing to `output/pre-processing/` (rather than directly to `output/`)
# keeps the file inside the folder that the setup cell creates and empties
# on every run, so a stale copy cannot survive between runs.
ks4.to_csv('output/pre-processing/ks4.csv')
# Bare last expression: the notebook renders the frame as the cell result.
ks4

## AR Data load and preparation

This loads the Annual accounts return dataset and the corresponding mapping file. This extract only contains benchmarking section, which consists of submissions of costs, income, and balances of individual academies.

The mapping file, contains the mapping from AR4 cell references to cost categories and descriptions.

In [10]:
# Prepare the annual accounts return data; the call yields three results that
# are unpacked into the AR frame, the trust aggregate and the academy AR frame.
ar, trust_agg, academy_ar = pre_processing.prepare_aar_data(
    'data/SFB_Academies_2022-23_20240418.xlsx',
)

FileNotFoundError: [Errno 2] No such file or directory: 'data/SFB_Academies_2022-23_20240418.xlsx'

In [None]:
# Save the prepared AR frame into the pre-processing output folder.
ar.to_csv('output/pre-processing/ar.csv')
# Bare last expression: the notebook renders the frame as the cell result.
ar

Create a summary table for the AR stance of each distinct academy in the table.

Now compute the trust financial position in the same manner as the individual academy position.

## Academy and maintained schools data load and preparation

This reads the main GIAS data (edubasealldataYYYYMMDD file) and the associated links file (links_edubasealldataYYYYMMDD file). This is taken from the [GIAS Service](https://get-information-schools.service.gov.uk/help)

Other columns are tidied up by asserting the correct type for that column. This tidying phase is largely because, on load, integer columns will be inferred to be floats as opposed to integers.

In [None]:
# Prepare the schools frame from the GIAS extract and its links file.
schools = pre_processing.prepare_schools_data(
    'data/edubasealldata20240312.csv',
    'data/links_edubasealldata20240312.csv',
)


In [None]:
# Save the prepared schools frame into the pre-processing output folder.
schools.to_csv('output/pre-processing/schools.csv')
# Display the frame ordered by its index; the saved CSV keeps the original order.
schools.sort_index()

Merge required GIAS, census, SEN, CDC, PFI, and AR data with the base academy data

In [None]:
# Build the academy data set from the raw master list plus the frames
# prepared in the earlier cells.
academies = pre_processing.build_academy_data(
    'data/master_list_raw.csv',
    current_year,
    schools,
    census,
    sen,
    cdc,
    academy_ar,
    trust_agg,
    ks2,
    ks4,
)

In [None]:
# Save the academy data set into the pre-processing output folder.
academies.to_csv('output/pre-processing/academies.csv')
# Display the frame ordered by its index; the saved CSV keeps the original order.
academies.sort_index()

Merge required census and cdc data to the maintained schools data set

In [None]:
# Build the maintained schools data set from the raw CSV list.
maintained_schools = pre_processing.build_maintained_school_data(
    'data/maintained_schools_raw.csv',
)

In [None]:
# Save the maintained schools data set into the pre-processing output folder.
maintained_schools.to_csv('output/pre-processing/maintained_schools.csv')
# Bare last expression: the notebook renders the frame as the cell result.
maintained_schools

## Federation Capture




In [None]:
# Build the federation data from the links export; the call yields two results
# that are unpacked into the hard and soft federation frames.
hard_federations, soft_federations = pre_processing.build_federations_data(
    'data/alllinksdata20240417.csv',
)

In [None]:
# Save the hard federations frame into the pre-processing output folder.
hard_federations.to_csv('output/pre-processing/hard_federations.csv')
# Bare last expression: the notebook renders the frame as the cell result.
hard_federations

In [None]:
# Save the full soft federations frame into the pre-processing output folder.
soft_federations.to_csv('output/pre-processing/soft_federations.csv')
# Display only the `LAEstab` column; the saved CSV still holds every column.
soft_federations[['LAEstab']]

### Timing (keep at the bottom)

In [None]:
# Report the time elapsed since `start_time` was recorded in the setup cell.
elapsed_seconds = time.time() - start_time
print(f'Processing Time: {elapsed_seconds} seconds')