# Setup


In [23]:
import pandas as pd
import xlrd # pd.read_excel dependency
import openpyxl # pd.read_excel dependency
import jinja2 # dataframe styling dependency # OPTIONAL, you can delete this statement and just not run the cell that needs it (It's just a display cell)
import numpy as np
from tqdm import tqdm
from warnings import simplefilter
import copy
from itertools import product
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [24]:
# Section 1, STEP 4: Update the filename_crosswalk dictionary with the new year as a key and the new filename as a value
filename_crosswalk = {
    2024: "24-RC-Pub-Data-Set.xlsx",
    2023: "23-RC-Pub-Data-Set.xlsx",
    2022: "2022-Report-Card-Public-Data-Set.xlsx",
    2021: "2021-RC-Pub-Data-Set.xlsx",
    2020: "2020-Report-Card-Public-Data-Set.xlsx",
    2019: "2019-Report-Card-Public-Data-Set.xlsx",
    2018: "Report-Card-Public-Data-Set.xlsx",
    2017: "rc17.txt",
    2016: "rc16.txt",
    2015: "rc15.txt",
    2014: "rc14.txt",
    2013: "rc13.txt",
    2012: "rc12.txt",
    2011: "rc11u.txt",
    2010: "rc10.txt",
    2009: "rc09.txt",
    2008: "rc08u.txt"
}

START_YEAR = min(filename_crosswalk.keys())
END_YEAR = max(filename_crosswalk.keys())

assessment_crosswalk = {
    2017: "rc17_assessment.txt",
    2016: "rc16_assessment.txt",
    2015: "rc15-assessment.txt"
}

# Section 1, STEP 5: New demographics
demographic_key = {
    "Female": "Female",
    "FEMALE": "Female",
    "Male": "Male",
    "MALE": "Male",
    "White": "White",
    "WHITE": "White",
    "WHITE %": "White",
    "White %": "White",
    "Asian": "Asian",
    "ASIAN": "Asian",
    "ASIAN %": "Asian",
    "Asian %": "Asian",
    "Asian5": "Asian",
    "Black": "Black",
    "BLACK": "Black",
    "BLACK %": "Black",
    "Black or African American": "Black",
    "Black or African American %": "Black",
    "Black or African American3": "Black",
    "Latinx": "Latinx",
    "HISPANIC": "Latinx",
    "HISPANIC %": "Latinx",
    "Hispanic or Latino": "Latinx",
    "Hispanic or Latino %": "Latinx",
    "Hispanic": "Latinx",
    "Hispanic or Latino4": "Latinx",
    "American Indian or Alaska Native": "American Indian or Alaska Native",
    "NATIVE AMER": "American Indian or Alaska Native",
    "Native Amer": "American Indian or Alaska Native",
    "NATIVE AMERICAN": "American Indian or Alaska Native",
    "NATIVE AMERICAN %": "American Indian or Alaska Native",
    "American Indian or Alaska Native %": "American Indian or Alaska Native",
    "Am Ind/Alaska Nat": "American Indian or Alaska Native",
    "Am Ind/Alaska Nat2": "American Indian or Alaska Native",
    "Am Ind/Alaska Nat7": "American Indian or Alaska Native",
    "Native Hawaiian or Other Pacific Islander": "Native Hawaiian or Other Pacific Islander",
    "NATIVE HAWAIIAN AND OTHERS": "Native Hawaiian or Other Pacific Islander",
    "NATIVE HAWAIIAN AND OTHERS %": "Native Hawaiian or Other Pacific Islander",
    "Native Hawaiian or Other Pacific Islander %": "Native Hawaiian or Other Pacific Islander",
    "NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER %": "Native Hawaiian or Other Pacific Islander",
    "NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER": "Native Hawaiian or Other Pacific Islander",
    "HawaiiPacIslander": "Native Hawaiian or Other Pacific Islander",
    "Hawaiian/Pac Islander": "Native Hawaiian or Other Pacific Islander",
    "Nat Haw/Other Pac Isndr": "Native Hawaiian or Other Pacific Islander",
    "Nat Haw/Other Pac Isndr6": "Native Hawaiian or Other Pacific Islander",
    "Two or More Races": "Two or More Races",
    "MULTIRACIAL": "Two or More Races",
    "MULTIRACIAL %": "Two or More Races",
    "MULTIRACIAL/ETHNIC": "Two or More Races",
    "MULTIRACIAL/ETHNIC %": "Two or More Races",
    "MultiRace": "Two or More Races",
    "MultiRace3": "Two or More Races",
    "TWO OR MORE RACES": "Two or More Races",
    "TWO OR MORE RACES %": "Two or More Races",
    "TOW OR MORE RACES": "Two or More Races",
    "TOW OR MORE RACES %": "Two or More Races",
    "Two or More Races %": "Two or More Races",
    "Two or More Race": "Two or More Races",
    "Two or More Races8": "Two or More Races",
    "EL": "EL",
    "EL %": "EL",
    "LEP": "EL",
    "L.E.P.": "EL",
    "LEP %": "EL",
    "Low Income": "Low Income",
    "LOW INCOME": "Low Income",
    "Low Income %": "Low Income",
    "LOW INCOME %": "Low Income",
    "Low-Income": "Low Income",
    "LOW-INCOME": "Low Income",
    "Low-Income %": "Low Income",
    "LOW-INCOME %": "Low Income",
    "LowIncome": "Low Income",
    "Migrant": "Migrant",
    "MIGRANT": "Migrant",
    "MIGRANT %": "Migrant",
    "Homeless": "Homeless",
    "Homeless %": "Homeless",
    "HOMELESS": "Homeless",
    "HOMELESS %": "Homeless",
    "IEP": "IEP",
    "I.E.P.": "IEP",
    "IEP %": "IEP",
    "Children with Disabilities": "Children with Disabilities",
    "CWD": "Children with Disabilities",
    "CWD %": "Children with Disabilities",
    "UNKNOWN": "Unknown",
    "Unknown": "Unknown",
    "UNKNOWN RACE": "Unknown",
    "Unknown Race": "Unknown",
    'Non Binary': 'Non Binary',
    'Youth in Care': 'Youth in Care',
    'YOUTH IN CARE' : 'Youth in Care',
    'Youth In Care' : 'Youth in Care',
    'YIC' : 'Youth in Care',
    'yic' : 'Youth in Care',
    'Yic' : 'Youth in Care',
}

PARCC_lgd = list(product(range(1, 6), range(
    3, 9), ['Homeless', 'Children with Disabilities', 'Unknown', 'Non Binary']))
PARCC_ld = list(product(range(
    1, 6), ['Homeless', 'Children with Disabilities', 'Unknown', 'Non Binary']))

# These metric-demo combos are not present in any report cards
absent_metric_demo_combos = [
    'Student Enrollment - Female', 'Student Enrollment - Male', 'Student Enrollment - Migrant', 'Student Enrollment - Unknown', 'Student Enrollment - Non Binary',
    'Student Attendance Rate - Homeless', 'Student Attendance Rate - Children with Disabilities', 'Student Attendance Rate - Unknown', 'Student Attendance Rate - Non Binary',
    'Chronic Absenteeism - Migrant', 'Chronic Absenteeism - Homeless', 'Chronic Absenteeism - Unknown', 'Total Teacher FTE - EL', 'Total Teacher FTE - Low Income', 'Chronic Absenteeism - Non Binary',
    'Total Teacher FTE - Migrant', 'Total Teacher FTE - Homeless', 'Total Teacher FTE - IEP', 'Total Teacher FTE - Children with Disabilities', 'Total Teacher FTE - Non Binary',
    '% 9th Grade on Track - Female', '% 9th Grade on Track - Male', '% 9th Grade on Track - Migrant', '% 9th Grade on Track - Homeless', '% 9th Grade on Track - Unknown', '% 9th Grade on Track - Non Binary',
    '# CTE Participants - Unknown', '# CTE Participants - Non Binary',
    '4-Year Graduation Rate (Perkins) - Unknown', '4-Year Graduation Rate (Perkins) - Non Binary',
    'Postsecondary Placement Rate (Perkins) - Unknown', 'Postsecondary Placement Rate (Perkins) - Non Binary',
    'Nontraditional Program Enrollment Rate (Perkins) - Unknown', 'Nontraditional Program Enrollment Rate (Perkins) - Non Binary',
    '# Students enrolled in Dual Credit Coursework - Male', '# Students enrolled in Dual Credit Coursework - Migrant', '# Students enrolled in Dual Credit Coursework - Unknown', '# Students enrolled in Dual Credit Coursework - Non Binary',
    '% Students enrolled in Dual Credit Coursework - Male', '% Students enrolled in Dual Credit Coursework - Migrant', '% Students enrolled in Dual Credit Coursework - Unknown', '% Students enrolled in Dual Credit Coursework - Non Binary',
    '# Students who took AP classes Grade 9 - Children with Disabilities', '# Students who took AP classes Grade 10 - Children with Disabilities', '# Students who took AP classes Grade 11 - Children with Disabilities', '# Students who took AP classes Grade 12 - Children with Disabilities'] \
    + list(map(lambda x: f'# Students who took {x[0]} classes Grade {x[1]} - {x[2]}', list(product(['Dual Credit', 'AP', 'IB'], range(9, 13), ['Female', 'Male', 'Migrant', 'Homeless', 'Unknown', 'Non Binary'])))) \
    + ['High School 4-Year Graduation Rate - Unknown', 'High School 4-Year Graduation Rate - Non Binary',
       'High School 6-Year Graduation Rate - Unknown', 'High School 6-Year Graduation Rate - Non Binary'] \
    + list(map(lambda x: f'% All students IAR ELA Level {x[0]} - Grade {x[1]} - {x[2]}', list(product(range(1, 6), range(3, 9), ['Unknown', 'Non Binary'])))) + list(
        map(lambda x: f'% All students IAR Mathematics Level {x[0]} - Grade {x[1]} - {x[2]}', list(product(range(1, 6), range(3, 9), ['Unknown', 'Non Binary'])))) \
    + ['# Students IAR ELA Participation - Homeless', '# Students IAR ELA Participation - Migrant', '# Students IAR ELA Participation - Unknown', '# Students IAR Math Participation - Non Binary',
       '% Students IAR ELA Participation - Homeless', '% Students IAR ELA Participation - Migrant', '% Students IAR ELA Participation - Unknown', '% Students IAR Math Participation - Non Binary',
       '# Students IAR Math Participation - Homeless', '# Students IAR Math Participation - Migrant', '# Students IAR Math Participation - Unknown', '# Students IAR ELA Participation - Non Binary',
       '% Students IAR Math Participation - Homeless', '% Students IAR Math Participation - Migrant', '% Students IAR Math Participation - Unknown', '% Students IAR ELA Participation - Non Binary',
       'IAR ELA No Participation Rate - Migrant', 'IAR ELA No Participation Rate - Homeless', 'IAR ELA No Participation Rate - Unknown', 'IAR ELA No Participation Rate - Non Binary',
       'IAR Math No Participation Rate - Migrant', 'IAR Math No Participation Rate - Homeless', 'IAR Math No Participation Rate - Unknown', 'IAR Math No Participation Rate - Non Binary',
       '# Discipline Incidents - Migrant', '# Discipline Incidents - Homeless', '# Discipline Incidents - Children with Disabilities', '# Discipline Incidents - Unknown', '# Discipline Incidents - Non Binary',
       '# Discipline Incidents - Expulsion: Received Educational Srvcs - Migrant', '# Discipline Incidents - Expulsion: Received Educational Srvcs - Homeless', '# Discipline Incidents - Expulsion: Received Educational Srvcs - Children with Disabilities', '# Discipline Incidents - Expulsion: Received Educational Srvcs - Unknown',
       '# Discipline Incidents - Expulsion: Did not Receive Educational Srvcs - Migrant', '# Discipline Incidents - Expulsion: Did not Receive Educational Srvcs - Homeless', '# Discipline Incidents - Expulsion: Did not Receive Educational Srvcs - Children with Disabilities', '# Discipline Incidents - Expulsion: Did not Receive Educational Srvcs - Unknown',
       '# Discipline Incidents - In-school Suspension - Migrant', '# Discipline Incidents - In-school Suspension - Homeless', '# Discipline Incidents - In-school Suspension - Children with Disabilities', '# Discipline Incidents - In-school Suspension - Unknown',
       '# Discipline Incidents - Out-of-School Suspension - Migrant', '# Discipline Incidents - Out-of-School Suspension - Homeless', '# Discipline Incidents - Out-of-School Suspension - Children with Disabilities', '# Discipline Incidents - Out-of-School Suspension - Unknown',
       '# Discipline Incidents - Removal - Migrant', '# Discipline Incidents - Removal - Homeless', '# Discipline Incidents - Removal - Children with Disabilities', '# Discipline Incidents - Removal - Unknown',
       'Student Mobility Rate - Migrant', 'Student Mobility Rate - Homeless', 'Student Mobility Rate - Unknown', 'Student Mobility Rate - Non Binary',
       'SAT Reading Students Level 1 % - Unknown', 'SAT Reading Students Level 1 % - Non Binary',
       'SAT Reading Students Level 2 % - Unknown', 'SAT Reading Students Level 2 % - Non Binary',
       'SAT Reading Students Level 3 % - Unknown', 'SAT Reading Students Level 3 % - Non Binary',
       'SAT Reading Students Level 4 % - Unknown', 'SAT Reading Students Level 4 % - Non Binary',
       'SAT Math Students Level 1 % - Unknown', 'SAT Math Students Level 1 % - Non Binary',
       'SAT Math Students Level 2 % - Unknown', 'SAT Math Students Level 2 % - Non Binary',
       'SAT Math Students Level 3 % - Unknown', 'SAT Math Students Level 3 % - Non Binary',
       'SAT Math Students Level 4 % - Unknown', 'SAT Math Students Level 4 % - Non Binary',
       '# Students SAT Math Participation - Migrant', '# Students SAT Math Participation - Homeless', '# Students SAT Math Participation - Unknown', '# Students SAT Math Participation - Non Binary',
       '% Students SAT Math Participation - Migrant', '% Students SAT Math Participation - Homeless', '% Students SAT Math Participation - Unknown', '% Students SAT Math Participation - Non Binary',
       '# Students SAT ELA Participation - Migrant', '# Students SAT ELA Participation - Homeless', '# Students SAT ELA Participation - Unknown', '# Students SAT ELA Participation - Non Binary',
       '% SAT ELA Participation - Migrant', '% SAT ELA Participation - Homeless', '% SAT ELA Participation - Unknown', '% SAT ELA Participation - Non Binary',
       'SAT ELA No Participation Rate - Migrant', 'SAT ELA No Participation Rate - Homeless', 'SAT ELA No Participation Rate - Unknown', 'SAT ELA No Participation Rate - Non Binary',
       'SAT Math No Participation Rate - Migrant', 'SAT Math No Participation Rate - Homeless', 'SAT Math No Participation Rate - Unknown', 'SAT Math No Participation Rate - Non Binary',
       '# ISA Proficiency Student - Unknown', '# ISA Proficiency Student - Non Binary',
       '% ISA Participation - Migrant', '% ISA Participation - Homeless', '% ISA Participation - Unknown', '% ISA Participation - Non Binary',
       '# ISA Participation - Migrant', '# ISA Participation - Homeless', '# ISA Participation - Unknown', '# ISA Participation - Non Binary',
       'ISA No Participation Rate - Migrant', 'ISA No Participation Rate - Homeless', 'ISA No Participation Rate - Unknown', 'ISA No Participation Rate - Non Binary',
       ] + list(map(lambda x: f'All students PARCC ELA Level {x[0]} - Grade {x[1]} - {x[2]}', PARCC_lgd)) + list(
    map(lambda x: f'All students PARCC Mathematics Level {x[0]} - Grade {x[1]} - {x[2]}', PARCC_lgd)) \
    + list(map(lambda x: f'All students PARCC ELA Level {x[0]} - High School - {x[1]}', PARCC_ld)) + list(
        map(lambda x: f'All students PARCC Math Level {x[0]} - High School - {x[1]}', PARCC_ld)) \
    + list(map(lambda x: f'All students PARCC ELA I Level {x[0]} - {x[1]}', PARCC_ld)) + list(
        map(lambda x: f'All students PARCC ELA II Level {x[0]} - {x[1]}', PARCC_ld)) + list(
        map(lambda x: f'All students PARCC ELA III Level {x[0]} - {x[1]}', PARCC_ld)) \
    + list(map(lambda x: f'All students PARCC ALG I Level {x[0]} - {x[1]}', PARCC_ld)) + list(
        map(lambda x: f'All students PARCC ALG II Level {x[0]} - {x[1]}', PARCC_ld)) \
    + list(map(lambda x: f'All students PARCC GEO Level {x[0]} - {x[1]}', PARCC_ld)) \
    + list(map(lambda x: f'All students PARCC MATH I Level {x[0]} - {x[1]}', PARCC_ld)) + list(
        map(lambda x: f'All students PARCC MATH II Level {x[0]} - {x[1]}', PARCC_ld)) + list(
        map(lambda x: f'All students PARCC MATH III Level {x[0]} - {x[1]}', PARCC_ld)) \
    + list(map(lambda x: f'Total Students PARCC Math Participation - {x}', ['Homeless', 'Children with Disabilities', 'Unknown', 'Non Binary', 'Migrant'])) \
    + list(map(lambda x: f'Total Students PARCC Math Participation % - {x}', ['Homeless', 'Children with Disabilities', 'Unknown', 'Non Binary', 'Migrant'])) \
    + list(map(lambda x: f'Total Students PARCC ELA Participation - {x}', ['Homeless', 'Children with Disabilities', 'Unknown', 'Non Binary', 'Migrant'])) \
    + list(map(lambda x: f'Total Students PARCC ELA Participation % - {x}',
           ['Homeless', 'Children with Disabilities', 'Unknown', 'Non Binary', 'Migrant'])) \
    + list(map(lambda x: f'ISAT {x[0]} {x[1]} - Grade {x[2]} - {x[3]}', list(product(['Reading', 'Mathematics'], [
           'Academic Warning', 'Below', 'Meets', 'Exceeds'], range(3, 9), ['Homeless', 'Children with Disabilities', 'Unknown', 'Non Binary'])))) \
    + ['High School 4-Year Cohort Graduates - Unknown', 'High School 4-Year Cohort Graduates - Non Binary'] \
    + ['Student Attendance Rate - Youth in Care', 'Chronic Absenteeism - Youth in Care', 'Total Teacher FTE - Youth in Care', '% 9th Grade on Track - Youth in Care', '# CTE Participants - Youth in Care', '4-Year Graduation Rate (Perkins) - Youth in Care', 'Postsecondary Placement Rate (Perkins) - Youth in Care', 'Nontraditional Program Enrollment Rate (Perkins) - Youth in Care', '# Students who took Dual Credit classes Grade 9 - Youth in Care', '# Students who took Dual Credit classes Grade 10 - Youth in Care', '# Students who took Dual Credit classes Grade 11 - Youth in Care', '# Students who took Dual Credit classes Grade 12 - Youth in Care', '# Students enrolled in Dual Credit Coursework - Youth in Care', '% Students enrolled in Dual Credit Coursework - Youth in Care', '# Students who took IB classes Grade 9 - Youth in Care', '# Students who took IB classes Grade 10 - Youth in Care', '# Students who took IB classes Grade 11 - Youth in Care', '# Students who took IB classes Grade 12 - Youth in Care', '# Students who took AP classes Grade 9 - Youth in Care', '# Students who took AP classes Grade 10 - Youth in Care', '# Students who took AP classes Grade 11 - Youth in Care', '# Students who took AP classes Grade 12 - Youth in Care', 'High School 4-Year Graduation Rate - Youth in Care', 'High School 6-Year Graduation Rate - Youth in Care', '# Students IAR Math Participation - Youth in Care', '% Students IAR Math Participation - Youth in Care', '# Students IAR ELA Participation - Youth in Care', '% Students IAR ELA Participation - Youth in Care', '# Students SAT Math Participation - Youth in Care', '% Students SAT Math Participation - Youth in Care', '# Students SAT ELA Participation - Youth in Care', '% SAT ELA Participation - Youth in Care', '# ISA Proficiency Student - Youth in Care', '% ISA Participation - Youth in Care', '# ISA Participation - Youth in Care', 'All students PARCC ELA Level 1 - Grade 3 - Youth in Care', 'All students PARCC ELA Level 2 - Grade 3 - Youth in Care', 'All students PARCC ELA Level 3 - Grade 3 - Youth in Care', 'All students PARCC ELA Level 4 - Grade 3 - Youth in Care', 'All students PARCC ELA Level 5 - Grade 3 - Youth in Care', 'All students PARCC Mathematics Level 1 - Grade 3 - Youth in Care', 'All students PARCC Mathematics Level 2 - Grade 3 - Youth in Care', 'All students PARCC Mathematics Level 3 - Grade 3 - Youth in Care', 'All students PARCC Mathematics Level 4 - Grade 3 - Youth in Care', 'All students PARCC Mathematics Level 5 - Grade 3 - Youth in Care', 'All students PARCC ELA Level 1 - Grade 4 - Youth in Care', 'All students PARCC ELA Level 2 - Grade 4 - Youth in Care', 'All students PARCC ELA Level 3 - Grade 4 - Youth in Care', 'All students PARCC ELA Level 4 - Grade 4 - Youth in Care', 'All students PARCC ELA Level 5 - Grade 4 - Youth in Care', 'All students PARCC Mathematics Level 1 - Grade 4 - Youth in Care', 'All students PARCC Mathematics Level 2 - Grade 4 - Youth in Care', 'All students PARCC Mathematics Level 3 - Grade 4 - Youth in Care', 'All students PARCC Mathematics Level 4 - Grade 4 - Youth in Care', 'All students PARCC Mathematics Level 5 - Grade 4 - Youth in Care', 'All students PARCC ELA Level 1 - Grade 5 - Youth in Care', 'All students PARCC ELA Level 2 - Grade 5 - Youth in Care', 'All students PARCC ELA Level 3 - Grade 5 - Youth in Care', 'All students PARCC ELA Level 4 - Grade 5 - Youth in Care', 'All students PARCC ELA Level 5 - Grade 5 - Youth in Care', 'All students PARCC Mathematics Level 1 - Grade 5 - Youth in Care', 'All students PARCC Mathematics Level 2 - Grade 5 - Youth in Care', 'All students PARCC Mathematics Level 3 - Grade 5 - Youth in Care', 'All students PARCC Mathematics Level 4 - Grade 5 - Youth in Care', 'All students PARCC Mathematics Level 5 - Grade 5 - Youth in Care', 'All students PARCC ELA Level 1 - Grade 6 - Youth in Care', 'All students PARCC ELA Level 2 - Grade 6 - Youth in Care', 'All students PARCC ELA Level 3 - Grade 6 - Youth in Care', 'All students PARCC ELA Level 4 - Grade 6 - Youth in Care', 'All students PARCC ELA Level 5 - Grade 6 - Youth in Care', 'All students PARCC Mathematics Level 1 - Grade 6 - Youth in Care', 'All students PARCC Mathematics Level 2 - Grade 6 - Youth in Care', 'All students PARCC Mathematics Level 3 - Grade 6 - Youth in Care', 'All students PARCC Mathematics Level 4 - Grade 6 - Youth in Care', 'All students PARCC Mathematics Level 5 - Grade 6 - Youth in Care', 'All students PARCC ELA Level 1 - Grade 7 - Youth in Care', 'All students PARCC ELA Level 2 - Grade 7 - Youth in Care', 'All students PARCC ELA Level 3 - Grade 7 - Youth in Care', 'All students PARCC ELA Level 4 - Grade 7 - Youth in Care', 'All students PARCC ELA Level 5 - Grade 7 - Youth in Care', 'All students PARCC Mathematics Level 1 - Grade 7 - Youth in Care', 'All students PARCC Mathematics Level 2 - Grade 7 - Youth in Care', 'All students PARCC Mathematics Level 3 - Grade 7 - Youth in Care', 'All students PARCC Mathematics Level 4 - Grade 7 - Youth in Care', 'All students PARCC Mathematics Level 5 - Grade 7 - Youth in Care', 'All students PARCC ELA Level 1 - Grade 8 - Youth in Care', 'All students PARCC ELA Level 2 - Grade 8 - Youth in Care', 'All students PARCC ELA Level 3 - Grade 8 - Youth in Care', 'All students PARCC ELA Level 4 - Grade 8 - Youth in Care', 'All students PARCC ELA Level 5 - Grade 8 - Youth in Care', 'All students PARCC Mathematics Level 1 - Grade 8 - Youth in Care', 'All students PARCC Mathematics Level 2 - Grade 8 - Youth in Care', 'All students PARCC Mathematics Level 3 - Grade 8 - Youth in Care', 'All students PARCC Mathematics Level 4 - Grade 8 - Youth in Care', 'All students PARCC Mathematics Level 5 - Grade 8 - Youth in Care', 'All students PARCC ELA Level 1 - High School - Youth in Care', 'All students PARCC ELA Level 2 - High School - Youth in Care', 'All students PARCC ELA Level 3 - High School - Youth in Care', 'All students PARCC ELA Level 4 - High School - Youth in Care', 'All students PARCC ELA Level 5 - High School - Youth in Care', 'All students PARCC Math Level 1 - High School - Youth in Care', 'All students PARCC Math Level 2 - High School - Youth in Care', 'All students PARCC Math Level 3 - High School - Youth in Care', 'All students PARCC Math Level 4 - High School - Youth in Care', 'All students PARCC Math Level 5 - High School - Youth in Care', 'All students PARCC ELA I Level 1 - Youth in Care', 'All students PARCC ELA I Level 2 - Youth in Care', 'All students PARCC ELA I Level 3 - Youth in Care', 'All students PARCC ELA I Level 4 - Youth in Care', 'All students PARCC ELA I Level 5 - Youth in Care', 'All students PARCC ELA II Level 1 - Youth in Care', 'All students PARCC ELA II Level 2 - Youth in Care', 'All students PARCC ELA II Level 3 - Youth in Care', 'All students PARCC ELA II Level 4 - Youth in Care', 'All students PARCC ELA II Level 5 - Youth in Care', 'All students PARCC ELA III Level 1 - Youth in Care', 'All students PARCC ELA III Level 2 - Youth in Care', 'All students PARCC ELA III Level 3 - Youth in Care', 'All students PARCC ELA III Level 4 - Youth in Care', 'All students PARCC ELA III Level 5 - Youth in Care', 'All students PARCC ALG I Level 1 - Youth in Care', 'All students PARCC ALG I Level 2 - Youth in Care', 'All students PARCC ALG I Level 3 - Youth in Care', 'All students PARCC ALG I Level 4 - Youth in Care', 'All students PARCC ALG I Level 5 - Youth in Care', 'All students PARCC ALG II Level 1 - Youth in Care', 'All students PARCC ALG II Level 2 - Youth in Care', 'All students PARCC ALG II Level 3 - Youth in Care', 'All students PARCC ALG II Level 4 - Youth in Care', 'All students PARCC ALG II Level 5 - Youth in Care', 'All students PARCC GEO Level 1 - Youth in Care', 'All students PARCC GEO Level 2 - Youth in Care', 'All students PARCC GEO Level 3 - Youth in Care', 'All students PARCC GEO Level 4 - Youth in Care', 'All students PARCC GEO Level 5 - Youth in Care', 'All students PARCC MATH I Level 1 - Youth in Care', 'All students PARCC MATH I Level 2 - Youth in Care', 'All students PARCC MATH I Level 3 - Youth in Care', 'All students PARCC MATH I Level 4 - Youth in Care', 'All students PARCC MATH I Level 5 - Youth in Care', 'All students PARCC MATH II Level 1 - Youth in Care', 'All students PARCC MATH II Level 2 - Youth in Care', 'All students PARCC MATH II Level 3 - Youth in Care', 'All students PARCC MATH II Level 4 - Youth in Care', 'All students PARCC MATH II Level 5 - Youth in Care', 'All students PARCC MATH III Level 1 - Youth in Care', 'All students PARCC MATH III Level 2 - Youth in Care', 'All students PARCC MATH III Level 3 - Youth in Care', 'All students PARCC MATH III Level 4 - Youth in Care', 'All students PARCC MATH III Level 5 - Youth in Care', 'Total Students PARCC Math Participation - Youth in Care', 'Total Students PARCC Math Participation % - Youth in Care', 'Total Students PARCC ELA Participation - Youth in Care', 'Total Students PARCC ELA Participation % - Youth in Care', 'ISAT Reading Academic Warning - Grade 3 - Youth in Care', 'ISAT Reading Below - Grade 3 - Youth in Care', 'ISAT Reading Meets - Grade 3 - Youth in Care', 'ISAT Reading Exceeds - Grade 3 - Youth in Care', 'ISAT Mathematics Academic Warning - Grade 3 - Youth in Care', 'ISAT Mathematics Below - Grade 3 - Youth in Care', 'ISAT Mathematics Meets - Grade 3 - Youth in Care', 'ISAT Mathematics Exceeds - Grade 3 - Youth in Care', 'ISAT Reading Academic Warning - Grade 4 - Youth in Care', 'ISAT Reading Below - Grade 4 - Youth in Care', 'ISAT Reading Meets - Grade 4 - Youth in Care', 'ISAT Reading Exceeds - Grade 4 - Youth in Care', 'ISAT Mathematics Academic Warning - Grade 4 - Youth in Care', 'ISAT Mathematics Below - Grade 4 - Youth in Care', 'ISAT Mathematics Meets - Grade 4 - Youth in Care', 'ISAT Mathematics Exceeds - Grade 4 - Youth in Care', 'ISAT Reading Academic Warning - Grade 5 - Youth in Care', 'ISAT Reading Below - Grade 5 - Youth in Care', 'ISAT Reading Meets - Grade 5 - Youth in Care', 'ISAT Reading Exceeds - Grade 5 - Youth in Care', 'ISAT Mathematics Academic Warning - Grade 5 - Youth in Care', 'ISAT Mathematics Below - Grade 5 - Youth in Care', 'ISAT Mathematics Meets - Grade 5 - Youth in Care', 'ISAT Mathematics Exceeds - Grade 5 - Youth in Care', 'ISAT Reading Academic Warning - Grade 6 - Youth in Care', 'ISAT Reading Below - Grade 6 - Youth in Care', 'ISAT Reading Meets - Grade 6 - Youth in Care', 'ISAT Reading Exceeds - Grade 6 - Youth in Care', 'ISAT Mathematics Academic Warning - Grade 6 - Youth in Care', 'ISAT Mathematics Below - Grade 6 - Youth in Care', 'ISAT Mathematics Meets - Grade 6 - Youth in Care', 'ISAT Mathematics Exceeds - Grade 6 - Youth in Care', 'ISAT Reading Academic Warning - Grade 7 - Youth in Care', 'ISAT Reading Below - Grade 7 - Youth in Care', 'ISAT Reading Meets - Grade 7 - Youth in Care', 'ISAT Reading Exceeds - Grade 7 - Youth in Care', 'ISAT Mathematics Academic Warning - Grade 7 - Youth in Care', 'ISAT Mathematics Below - Grade 7 - Youth in Care', 'ISAT Mathematics Meets - Grade 7 - Youth in Care', 'ISAT Mathematics Exceeds - Grade 7 - Youth in Care', 'ISAT Reading Academic Warning - Grade 8 - Youth in Care', 'ISAT Reading Below - Grade 8 - Youth in Care', 'ISAT Reading Meets - Grade 8 - Youth in Care', 'ISAT Reading Exceeds - Grade 8 - Youth in Care', 'ISAT Mathematics Academic Warning - Grade 8 - Youth in Care', 'ISAT Mathematics Below - Grade 8 - Youth in Care', 'ISAT Mathematics Meets - Grade 8 - Youth in Care', 'ISAT Mathematics Exceeds - Grade 8 - Youth in Care', 'High School 4-Year Cohort Graduates - Youth in Care']


# Reading Files and Cleaning


In [25]:
def get_layout_file(short_year, sheet=0):
    if int(short_year) > 12:
        x = "x"
    else:
        x = ""
    if short_year == "12" or short_year == "16" or short_year == "15":
        return pd.read_excel("./data/RC" + short_year + "-layout.xls" + x, header=None, sheet_name=sheet)
    else:
        return pd.read_excel("./data/RC" + short_year + "_layout.xls" + x, header=None, sheet_name=sheet)

In [26]:
def label_proficiency(layout_sheet, y):
    if y == 2015:
        layout_sheet.iloc[11046:11102, 1] = 'SCHOOL'
        layout_sheet.iloc[11131:11187, 1] = 'DISTRICT'
        layout_sheet.iloc[11216:11272, 1] = 'STATE'
        return layout_sheet
    elif y == 2016:
        layout_sheet.iloc[11054:11110, 1] = 'SCHOOL'
        layout_sheet.iloc[11139:11195, 1] = 'DISTRICT'
        layout_sheet.iloc[11224:11280, 1] = 'STATE'
        return layout_sheet
    elif y == 2017:
        layout_sheet.iloc[8113:8169, 1] = 'SCHOOL'
        layout_sheet.iloc[8198:8254, 1] = 'DISTRICT'
        layout_sheet.iloc[8283:8339, 1] = 'STATE'
        return layout_sheet
    else:
        return None

In [27]:
def clean_layout_file(layout_file, demographic_key):
    layout_file = layout_file.rename(
        columns={0: 'Column #', 1: 'Second Qualifier', 2: "Demographic", 5: "Metric"})
    layout_file["Demographic"] = layout_file['Demographic'].str.strip().replace(
        demographic_key)
    sq_mask = ~(layout_file['Second Qualifier'].isnull()) & (
        layout_file['Second Qualifier'].str.strip() != '')
    layout_file.loc[sq_mask, 'Demographic'] = layout_file.loc[sq_mask, 'Demographic'] + \
        ' (' + layout_file.loc[sq_mask, 'Second Qualifier'] + ')'
    layout_file = layout_file.iloc[:, [0, 2, 5]]

    # Drop rows that don't have a column number (header rows for categories)
    layout_file['Column #'] = pd.to_numeric(
        layout_file['Column #'], errors='coerce')
    layout_file = layout_file[layout_file['Column #'].notnull()]
    layout_file['Column #'] = layout_file['Column #'].astype(int)

    # Reset index to column number
    layout_file.index = layout_file['Column #'] - 1
    layout_file.index.name = None

    # Drop Column Number column
    layout_file = layout_file.drop(columns='Column #')

    # Replace demographic keys with Advance Illinois standard
    # also clean up mistakes in demographics
    # This makes it so that the demographic terms used in each year do not need to be tracked
    layout_file['Metric'] = layout_file['Metric'].str.strip()
    layout_file['Demographic'] = layout_file['Demographic'].str.strip()

    # Create mask for all rows with demographics
    mask = ~(layout_file["Demographic"].isnull()) & (
        layout_file["Demographic"] != "ALL") & (layout_file["Demographic"] != "ALL STUDENTS")
    # Combine Metric and Demographic columns
    layout_file.loc[mask, "Metric"] = layout_file.loc[mask, "Metric"].astype(
        str) + " - " + layout_file.loc[mask, "Demographic"].astype(str)

    layout_file = layout_file.replace({'MEETSS': 'MEETS'})

    return layout_file

In [28]:
layout = {}
layout_assessment = {}
# NOTE: the demographics column may have other notes besides just demographic info

for year in range(2008, 2018):
    s = "{:02d}".format(year - 2000)

    # Grab Column Number, Demographic, and Metric columns
    # Combine two demographic columns if there are two
    layout[year] = get_layout_file(s)
    layout[year] = clean_layout_file(layout[year], demographic_key)
    if year > 2014:
        layout_assessment[year] = get_layout_file(s, 1)
        layout_assessment[year] = label_proficiency(
            layout_assessment[year], year)
        layout_assessment[year] = clean_layout_file(
            layout_assessment[year], demographic_key)
        layout_assessment[year] = layout_assessment[year].iloc[6:]
        layout_assessment[year].index = layout_assessment[year].index + \
            layout[year].index[-1] + 1

        layout[year] = pd.concat((layout[year], layout_assessment[year]))

In [29]:
# Replace demographics in teacher data with Advance Illinois standard
for year in layout.keys():
    teacher_demos = layout[year]['Metric'].str.extract(r'% (.*) TEACH')[0]
    teacher_demos = teacher_demos.dropna()
    teacher_demos = teacher_demos[(
        teacher_demos != 'CLASSES NOT TAUGHT BY HIGHLY QUALIFIED') & (teacher_demos != 'of')]
    layout[year].loc[teacher_demos.index,
                     'Demographic'] = teacher_demos.replace(demographic_key)

    layout[year]['Metric'] = layout[year]['Metric'].str.replace(
        'TEACH ER', 'TEACHER')
    layout[year]['Metric'] = layout[year]['Metric'].str.replace(
        'TEACHER- ', 'TEACHER - ')

    layout[year].loc[teacher_demos.index, 'Metric'] = layout[year].loc[teacher_demos.index, 'Metric'].str.replace(
        r'(% )(.*)( TEACH)', lambda m: m[1] + demographic_key[m[2]] + m[3], regex=True)

In [30]:
# Replace demographics in enrollment data with Advance Illinois standard
for year in layout.keys():
    enroll_demos = layout[year]['Metric'].str.extract(r'^\w+ - (.*) %$')[0]
    special_enroll = layout[year]['Metric'].str.extract(
        r'(.*) (?:SCHOOL|DISTRICT|STATE) %$')[0]

    enroll_demos = enroll_demos.dropna()
    special_enroll = special_enroll.dropna()
    special_enroll = special_enroll[special_enroll.apply(
        lambda x: x in demographic_key.keys())]

    layout[year].loc[enroll_demos.index,
                     'Demographic'] = enroll_demos.replace(demographic_key)
    layout[year].loc[special_enroll.index,
                     'Demographic'] = special_enroll.replace(demographic_key)

    layout[year].loc[enroll_demos.index, 'Metric'] = layout[year].loc[enroll_demos.index, 'Metric'].str.replace(
        r'(^\w+ - )(.*)( %)$', lambda m: m[1] + demographic_key[m[2]] + m[3], regex=True)
    layout[year].loc[special_enroll.index, 'Metric'] = layout[year].loc[special_enroll.index, 'Metric'].str.replace(
        r'(.*)( (?:SCHOOL|DISTRICT|STATE) %$)', lambda m: demographic_key[m[1]] + m[2], regex=True)

In [31]:
report_card = {}

if 'REPORT_CARD' in globals():
    report_card = copy.deepcopy(REPORT_CARD)
else:
    for year in tqdm(filename_crosswalk.keys()):
        if year > 2017:
            wkbk = pd.read_excel(
                "./data/" + filename_crosswalk[year], sheet_name=None, dtype='object')
            wkbk.pop('Revision History', None)
            wkbk.pop('Important Notes', None)

            if year == 2021:
                for k in wkbk.keys():
                    wkbk[k].loc[(wkbk[k]['RCDTS'] == '310458000802001') & (
                        wkbk[k]['Type'] == 'District'), 'RCDTS'] = '310458000800000'

            report_card[year] = wkbk['General'].copy()

            for k in filter(lambda x: x not in ['General', 'Finance'], wkbk.keys()):
                report_card[year] = pd.merge(
                    report_card[year], wkbk[k], on='RCDTS', how='outer', validate="1:1", suffixes=('', f"_{k}"))

        elif year > 2014:
            report_card[year] = pd.read_csv("./data/" + filename_crosswalk[year], sep=";",
                                            header=None, dtype='object')

            report_card_w_assessment = pd.read_csv("./data/" + assessment_crosswalk[year], sep=";",
                                                   header=None, dtype='object').iloc[:, 6:]

            report_card_w_assessment.columns = report_card_w_assessment.columns + \
                layout_assessment[year].index[0] - 6

            report_card[year] = pd.concat(
                (report_card[year], report_card_w_assessment), axis=1).rename(columns=layout[year]['Metric'])
        else:
            report_card[year] = pd.read_csv("./data/" + filename_crosswalk[year], sep=";",
                                            header=None, dtype='object').rename(columns=layout[year]['Metric'])
    REPORT_CARD = copy.deepcopy(report_card)

In [32]:
crosswalk = pd.read_excel(
    'Local Historic Crosswalk.xlsx', sheet_name='Name Crosswalk')
crosswalk.index = crosswalk['Year']
crosswalk = crosswalk.drop(columns='Year')

demo_info = pd.read_excel(
    'Local Historic Crosswalk.xlsx', sheet_name='Details')
disagg_info = demo_info.groupby('Metric')['Disaggregated'].max()
# True if index is ever disaggregated, false otherwise

# Exploration


In [33]:
# EXPLORATORY CELL
search_string = "counselor".lower()

results = []

for year in range(START_YEAR, 2018):
    results.append(pd.Series(layout[year].loc[layout[year]['Metric'].str.lower(
    ).str.contains(search_string, regex=False), 'Metric'], name=year).reset_index(drop=True))

for year in range(2018, END_YEAR + 1):
    results.append(pd.Series(report_card[year].columns[report_card[year].columns.str.lower(
    ).str.contains(search_string, regex=False)], name=year).reset_index(drop=True))
    # print(year, list(pd.Series(map(lambda x: x.split(
    #     ' - ')[0], report_card[year].columns[report_card[year].columns.str.lower().str.contains(search_string)])).drop_duplicates()))


pd.DataFrame(results)#.loc[2024].to_clipboard()

Unnamed: 0,0,1
2008,,
2009,,
2010,,
2011,,
2012,,
2013,,
2014,,
2015,,
2016,,
2017,,


# Preprocessing


In [34]:
# Adds all possible demographic categories to each column that has disaggregated data
def add_demo_columns(columns, disagg_data):
    out_columns = []
    for col in columns:
        out_columns.append(col)
        if disagg_data[col]:
            out_columns += list(map(lambda x: col + " - " + x,
                                pd.Series(demographic_key.values()).unique()))
    return out_columns

In [35]:
# Section 2, Step 5: Typos and formatting issues
# Replace demographic designations in report cards with standardized ones
for year in range(2018, END_YEAR + 1):
    report_card[year].columns = report_card[year].columns.str.strip()
    report_card[year].columns = report_card[year].columns.str.replace(
        'Black or African American', 'Black')
    report_card[year].columns = report_card[year].columns.str.replace(
        'Hispanic or Latino', 'Latinx')
    report_card[year].columns = report_card[year].columns.str.replace(
        'Hispanic', 'Latinx')
    report_card[year].columns = report_card[year].columns.str.replace(
        'CWD', 'Children with Disabilities')
    report_card[year].columns = report_card[year].columns.str.replace(
        'Hawaiian/Pac Islander', 'Native Hawaiian or Other Pacific Islander')
    report_card[year].columns = report_card[year].columns.str.replace(
        'Am Ind/Alaska Nat', 'American Indian or Alaska Native')
    report_card[year].columns = report_card[year].columns.str.replace(
        'MultiRace', 'Two or More Races')
    report_card[year].columns = report_card[year].columns.str.replace(
        'LowIncome', 'Low Income')
    report_card[year].columns = report_card[year].columns.str.replace(
        r'\bTwo or More Race\b', 'Two or More Races', regex=True)
    report_card[year].columns = report_card[year].columns.str.replace(
        '% Homeless students IAR Mathematics Level 1 - Grade 32', '% Homeless students IAR Mathematics Level 1 - Grade 5')
    report_card[year].columns = report_card[year].columns.str.replace(
        'Homeless students IAR Mathematics Level 1 - Grade 3.1', 'Homeless students IAR Mathematics Level 1 - Grade 5')

for year in range(2019, END_YEAR + 1):
    report_card[year] = report_card[year].rename(
        columns={'# ISA Participation - White Count': '# ISA Participation - White'})


report_card[2018] = report_card[2018].rename(
    columns={'Math Participation IEP %.1': 'Math Participation EL %', 'Math Participation Total IEP Count.1': 'Math Participation Total EL Count'})

report_card[2019] = report_card[2019].rename(
    columns={'% Math Participation - IEP.1': '% Math Participation - EL'})

In [36]:
# Add all possible demographic categories to each column with disaggregated data
columns = add_demo_columns(crosswalk.columns, disagg_info)
columns = list(filter(lambda x: x not in absent_metric_demo_combos, columns))

# Create new object filtering out old columns
new_columns = list(filter(lambda x: x not in crosswalk.columns, columns))

# Create new crosswalk with demographic info
demo_crosswalk = crosswalk.copy()
demo_crosswalk[new_columns] = np.nan

for col in new_columns:
    split = col.split(' - ')
    if len(split) > 2:
        metric = ' - '.join(split[:-1])
        demo = split[-1]
    else:
        metric, demo = split
    demo_formats = demo_info.copy().loc[demo_info['Metric'] == metric, [
        'Year', 'Disaggregation Format', 'Special Format']]

    if (demo in ['IEP', 'EL', 'Low Income', 'Homeless']):
        demo_formats.loc[demo_formats['Special Format'].notnull(
        ), 'Disaggregation Format'] = demo_formats.loc[demo_formats['Special Format'].notnull(), 'Special Format']

    demo_formats = demo_formats.set_index('Year')['Disaggregation Format']

    demo_formats = demo_formats.str.replace('demo', demo)
    demo_formats = demo_formats.str.replace('DEMO', demo)
    demo_crosswalk[col] = demo_formats

master_data = pd.DataFrame(columns=['Year'] + columns)

datasets = {}

In [37]:
def scope_data(rename_vals, year, scope, demo_crosswalk, report_card, dropped_district_columns):
    scoped_rename_vals = rename_vals.drop(
        demo_crosswalk.loc[year, ['School Name', 'School Type']])

    if scope == 'DISTRICT':
        scoped_rename_vals.index = scoped_rename_vals.index.str.replace(
            'SCHOOL', scope)
        scoped_rename_vals.index = [
            demo_crosswalk.loc[year, 'RCDTS']] + list(scoped_rename_vals.index[1:])
        found_columns = [item.replace('SCHOOL', scope) for item in demo_crosswalk.loc[year].dropna(
        ) if item.replace('SCHOOL', scope) in report_card[year].columns]
        found_columns.remove("DISTRICT TYPE NAME")
        found_columns.remove("DISTRICT NAME")
    else:
        scoped_rename_vals.index = scoped_rename_vals.index.str.replace(
            'DISTRICT', scope).str.replace('SCHOOL', scope)
        scoped_rename_vals.index = [
            demo_crosswalk.loc[year, 'RCDTS']] + list(scoped_rename_vals.index[1:])

        found_columns = [item.replace('DISTRICT', scope).replace('SCHOOL', scope) for item in demo_crosswalk.loc[year].dropna(
        ) if item.replace('DISTRICT', scope).replace('SCHOOL', scope) in report_card[year].columns]

    found_columns = [demo_crosswalk.loc[year, 'RCDTS'],
                     'SCHOOL TYPE NAME'] + found_columns

    scoped_data = report_card[year].loc[:, found_columns]
    dropped_district_columns[year] = list(
        set(scoped_rename_vals.index) - set(found_columns))
    dropped_district_columns[year].sort()
    scoped_data = scoped_data.rename(columns=scoped_rename_vals)

    return scoped_data, dropped_district_columns

In [38]:
def adjust_typing(data, verbose=0):

    for col in data.columns[8:]:
        if verbose > 10:
            print(col)
        try:
            data[col] = pd.to_numeric(data[col])
        except:
            data[col] = data[col].astype(str)
            data[col] = data[col].str.replace(",", "")
            data[col] = data[col].str.replace("*", "")
            data[col] = data[col].str.strip()
            data[col] = data[col].str.replace("Not Provided", "")
            data[col] = data[col].str.replace("nan", '')
            data[col] = pd.to_numeric(data[col])
    return data

In [39]:
dropped_columns = {}
dropped_district_columns = {}
dropped_state_columns = {}

for year in range(START_YEAR, END_YEAR + 1):
    # the dropna here drops the columns that are not included in the crosswalk
    # and thus not included in the report card for this year.
    years_columns = demo_crosswalk.loc[year].dropna()
    # swap index and vals for renaming

    rename_vals = pd.Series(years_columns.index.values, index=years_columns)
    # this list comprehension drops any columns that are not found in the report card this year
    # this should drop demographic columns that are not found in this year, such as homeless enrollment
    # counts for 2008 it should not however, drop columns that should be found in the report card.
    # Because it is capable of dropping columns that should be there, the dropped columns are recorded in
    # dropped_columns to be checked later
    found_columns = [
        item for item in years_columns if item in report_card[year].columns]

    data = report_card[year].loc[:, found_columns]
    dropped_columns[year] = list(set(years_columns) - set(found_columns))
    dropped_columns[year].sort()
    data_copy = data.copy()
    data = data.rename(columns=rename_vals)

    # District and state level aggregation for 2008-2017
    if year < 2018:
        data['Type'] = 'School'

        district_data, dropped_district_columns = scope_data(
            rename_vals, year, 'DISTRICT', demo_crosswalk, report_card, dropped_district_columns)
        district_data['RCD'] = district_data['RCDTS'].str[:9]
        district_data = district_data[district_data['SCHOOL TYPE NAME'] != 'CHARTER SCH'].drop(
            columns='SCHOOL TYPE NAME')
        district_data['RCDTS'] = district_data['RCD'] + '000000'
        district_data = district_data.groupby(
            'RCD').max().reset_index(drop=True)
        district_data['Type'] = 'District'

        state_data, dropped_state_columns = scope_data(
            rename_vals, year, 'STATE', demo_crosswalk, report_card, dropped_state_columns)
        state_data = pd.DataFrame(state_data.max()).T.drop(
            columns=['RCDTS', 'SCHOOL TYPE NAME', 'City', 'County'])
        state_data['Type'] = 'Statewide'

        data = pd.concat((data, district_data, state_data),
                         axis=0, ignore_index=True)
        data = data[['RCDTS', 'Type'] +
                    [item for item in data.columns if item not in ['RCDTS', 'Type']]]

    data = adjust_typing(data)
    data['Year'] = year
    datasets[year] = data

In [40]:
display(dropped_district_columns)
dropped_columns

{2008: ['% Native Hawaiian or Other Pacific Islander TEACH - DISTRICT',
  '% Two or More Races TEACH - DISTRICT',
  '% Unknown TEACH - DISTRICT',
  'ATTENDANCE RATE DISTRICT % - Native Hawaiian or Other Pacific Islander',
  'DISTRICT - Children with Disabilities %',
  'DISTRICT - Native Hawaiian or Other Pacific Islander %',
  'DISTRICT - Youth in Care %',
  'GR3 MATH DISTRICT BELOW - Native Hawaiian or Other Pacific Islander (ISAT)',
  'GR3 MATH DISTRICT EXCEEDS - Native Hawaiian or Other Pacific Islander (ISAT)',
  'GR3 MATH DISTRICT MEETS - Native Hawaiian or Other Pacific Islander (ISAT)',
  'GR3 READ DISTRICT BELOW - Native Hawaiian or Other Pacific Islander (ISAT)',
  'GR3 READ DISTRICT EXCEEDS - Native Hawaiian or Other Pacific Islander (ISAT)',
  'GR3 READ DISTRICT MEETS - Native Hawaiian or Other Pacific Islander (ISAT)',
  'GR4 MATH DISTRICT BELOW - Native Hawaiian or Other Pacific Islander (ISAT)',
  'GR4 MATH DISTRICT EXCEEDS - Native Hawaiian or Other Pacific Islander (ISA

{2008: ['% Native Hawaiian or Other Pacific Islander TEACH - DISTRICT',
  '% Two or More Races TEACH - DISTRICT',
  '% Unknown TEACH - DISTRICT',
  'ATTENDANCE RATE SCHOOL % - Native Hawaiian or Other Pacific Islander',
  'GR3 MATH SCHOOL BELOW - Native Hawaiian or Other Pacific Islander (ISAT)',
  'GR3 MATH SCHOOL EXCEEDS - Native Hawaiian or Other Pacific Islander (ISAT)',
  'GR3 MATH SCHOOL MEETS - ALL (ISAT)',
  'GR3 MATH SCHOOL MEETS - Male (ISAT)',
  'GR3 MATH SCHOOL MEETS - Native Hawaiian or Other Pacific Islander (ISAT)',
  'GR3 READ SCHOOL BELOW - Native Hawaiian or Other Pacific Islander (ISAT)',
  'GR3 READ SCHOOL EXCEEDS - Native Hawaiian or Other Pacific Islander (ISAT)',
  'GR3 READ SCHOOL MEETS - ALL (ISAT)',
  'GR3 READ SCHOOL MEETS - Female (ISAT)',
  'GR3 READ SCHOOL MEETS - Male (ISAT)',
  'GR3 READ SCHOOL MEETS - Native Hawaiian or Other Pacific Islander (ISAT)',
  'GR4 MATH SCHOOL BELOW - Native Hawaiian or Other Pacific Islander (ISAT)',
  'GR4 MATH SCHOOL EXCEED

In [41]:
dot_ones = {}
for i in range(START_YEAR, END_YEAR + 1):
    c = report_card[i].columns.astype(str)
    dot_ones[i] = (list(c[c.str.contains('.1', regex=False)]))
dot_ones

{2008: [],
 2009: [],
 2010: [],
 2011: [],
 2012: [],
 2013: [],
 2014: [],
 2015: [],
 2016: [],
 2017: [],
 2018: ['Five Essential Survey Ambitious Instruction.1',
  'General Admin 2016-17 - Dollars.1',
  'All students DLM Mathematics Emerging Grade 3.1'],
 2019: ['Five Essential Survey Ambitious Instruction.1',
  '% All Students (Peer Districts) - Black.1',
  '% All Students (Peer Districts) - Latinx.1',
  '% All Students (Peer Districts) - Asian.1',
  '% All Students (Peer Districts) - Nat Haw/Other Pac Isndr.1',
  '% All Students (Peer Districts) - American Indian or Alaska Native.1',
  '% All Students (Peer Districts) - Two or More Races.1'],
 2020: [],
 2021: [],
 2022: [],
 2023: [],
 2024: []}

# Final Dataset Creation and Processing


In [42]:
master_data = pd.concat(datasets.values(), ignore_index=True)
master_data = master_data.loc[:, ['Year'] + columns]
master_data = master_data.apply(
    lambda x: x.str.strip() if x.dtype == 'object' else x)

In [43]:
master_data.loc[master_data['Type'] ==
                'Statewide', 'RCDTS'] = '650000000800000'
# Pre-2018 data fills district data to the school level, but this erases that in keeping with the newer protocol
master_data.loc[master_data['Type'] == 'School', list(master_data.columns[master_data.columns.str.contains(
    "Teacher FTE")]) + ['Pupil Teacher Ratio - Elementary', 'Pupil Teacher Ratio - High School']] = np.nan
master_data.columns = master_data.columns.str.replace(
    'Student Enrollment - ', '% Student Enrollment - ')
master_data.columns = master_data.columns.str.replace(
    'Total Teacher FTE - ', '% Teachers - ')

In [44]:
with pd.option_context('display.max_rows', 10000, 'display.max_columns', 100):
    for t in ['Statewide']:
        print(t)
        df = master_data[master_data['Type'] == t].groupby(['Year'])[list(filter(
            lambda x: x not in master_data.columns[:9], master_data.columns))].mean().replace({np.NaN: '.'}).T
        # count().astype(bool).replace({False:'.',True:'X'}).T)
        display(df.tail(20).style.format(
            precision=1, thousands=",", decimal="."))

Statewide


Year,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
High School 4-Year Cohort Graduates - Male,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,65055.0
High School 4-Year Cohort Graduates - White,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,61931.0
High School 4-Year Cohort Graduates - Asian,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,7597.0
High School 4-Year Cohort Graduates - Black,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,19318.0
High School 4-Year Cohort Graduates - Latinx,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,36335.0
High School 4-Year Cohort Graduates - American Indian or Alaska Native,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,263.0
High School 4-Year Cohort Graduates - Native Hawaiian or Other Pacific Islander,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,110.0
High School 4-Year Cohort Graduates - Two or More Races,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,4755.0
High School 4-Year Cohort Graduates - EL,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,11384.0
High School 4-Year Cohort Graduates - Low Income,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,56546.0


In [45]:
query_string = "Type == 'Statewide' and Year >= 2011 and Year <= 2024"
master_data.query(query_string)[[
    "Year",
    "Student Enrollment",
    "% Student Enrollment - White",
    "% Student Enrollment - Asian",
    "% Student Enrollment - Black",
    "% Student Enrollment - Latinx",
    "% Student Enrollment - American Indian or Alaska Native",
    "% Student Enrollment - Native Hawaiian or Other Pacific Islander",
    "% Student Enrollment - Two or More Races"
]]

Unnamed: 0,Year,Student Enrollment,% Student Enrollment - White,% Student Enrollment - Asian,% Student Enrollment - Black,% Student Enrollment - Latinx,% Student Enrollment - American Indian or Alaska Native,% Student Enrollment - Native Hawaiian or Other Pacific Islander,% Student Enrollment - Two or More Races
19092,2011,2074806.0,51.4,4.1,18.3,23.0,0.3,0.1,2.8
23830,2012,2066692.0,51.0,4.2,18.0,23.6,0.3,0.1,2.8
28554,2013,2054155.0,50.6,4.3,17.6,24.1,0.3,0.1,3.0
33208,2014,2046857.0,49.9,4.5,17.5,24.6,0.3,0.1,3.1
37828,2015,2054556.0,49.3,4.6,17.5,25.1,0.3,0.1,3.1
42414,2016,2041779.0,48.8,4.7,17.3,25.5,0.3,0.1,3.2
47062,2017,2028162.0,48.5,4.9,17.0,25.7,0.4,0.1,3.4
51816,2018,2001529.0,48.0,5.1,16.8,26.2,0.3,0.1,3.5
56554,2019,1984519.0,47.6,5.1,16.7,26.4,0.3,0.1,3.8
61274,2020,1957018.0,47.5,5.2,16.6,26.6,0.3,0.1,3.8


In [46]:
df = master_data.query('Type == "Statewide"')[
    ['Year', 'All students PARCC MATH Level 4', 'All students PARCC MATH Level 5', 'All students PARCC Proficiency in Math %']].reset_index(drop=True)
df['Proficient'] = df.iloc[:, 1:3].sum(axis=1)
df

Unnamed: 0,Year,All students PARCC MATH Level 4,All students PARCC MATH Level 5,All students PARCC Proficiency in Math %,Proficient
0,2008,,,,0.0
1,2009,,,,0.0
2,2010,,,,0.0
3,2011,,,,0.0
4,2012,,,,0.0
5,2013,,,,0.0
6,2014,,,,0.0
7,2015,25.0,3.2,28.2,28.2
8,2016,26.6,3.8,30.5,30.4
9,2017,26.7,4.5,31.2,31.2


# Data Categorization and Writing to File


In [47]:
def find_columns(starters):
    cols = []
    for col in starters:
        cols += list(master_data.columns[master_data.columns.str.startswith(col)])
    return list(pd.Series(cols).drop_duplicates())

In [48]:
cat_walk = demo_info.drop_duplicates(subset=['Metric', 'Category']).groupby(
    ['Category']).agg({'Metric': list})
# Section 4, Step 1: Add new category to list
cat_walk = cat_walk.loc[['Identifier', 'Enrollment-Attendance', 'Student Performance',
                         'CTE', 'Teachers-Admin', 'ACT', 'IAR', 'ISA', 'ISAT', 'PARCC', 'SAT'], :]
cat_walk['Demo Metrics'] = cat_walk['Metric'].apply(find_columns)

In [54]:
def write_to_sheets(master_data, cat_walk, district=False):
    if district:
        data = master_data[master_data['Type'] == 'District']
        path = 'Historic RC District Data.xlsx'
    else:
        data = master_data
        path = 'Historic Data.xlsx'

    excel_writer = pd.ExcelWriter(path)

    toc = demo_info.groupby(['Metric', 'Category'])[
        'Disaggregated'].max().reset_index()
    toc = toc.set_index('Category', drop=True)
    toc = toc.loc[list(cat_walk.index), ['Metric', 'Disaggregated']]
    toc = pd.merge(toc, (data.groupby(['Year']).count() > 1).replace(
        {False: '', True: 'X'}).T, how='left', left_on='Metric', right_index=True)
    toc.loc[toc['Metric'] == 'Year', range(START_YEAR, END_YEAR + 1)] = 'X'

    toc.to_excel(excel_writer, sheet_name='Table of Contents')

    for cat in tqdm(cat_walk.index[1:]):
        sheet_data = pd.merge(data[cat_walk.loc['Identifier', 'Metric']],
                              data[cat_walk.loc[cat, 'Demo Metrics']], left_index=True, right_index=True)
        years = sheet_data[['Year'] + cat_walk.loc[cat,
                                                   'Demo Metrics']].groupby(['Year']).count().sum(axis=1).astype(bool)

        year_high = years[years].index.max()
        year_low = years[years].index.min()
        sheet_data.query("Year >= @year_low and Year <= @year_high").to_excel(
            excel_writer, sheet_name=cat, index=False)
    excel_writer.close()

In [None]:
write_to_file = True

if write_to_file:
    if 'REPORT_CARD' in globals() or 'report_card' in globals():
        del REPORT_CARD
        del report_card

    write_to_sheets(master_data, cat_walk, district=True)

100%|██████████| 10/10 [01:13<00:00,  7.37s/it]


In [None]:
if write_to_file:
    write_to_sheets(master_data, cat_walk)