# Setup


In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from warnings import simplefilter
import copy
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [2]:
# Maps each report-card year to the file name of its raw data set.
# The loading cell reads these from "./data/": years after 2017 with
# pd.read_excel (all sheets), 2017 and earlier with pd.read_csv (sep=";").
filename_crosswalk = {
    2023: "23-RC-Pub-Data-Set.xlsx",
    2022: "2022-Report-Card-Public-Data-Set.xlsx",
    2021: "2021-RC-Pub-Data-Set.xlsx",
    2020: "2020-Report-Card-Public-Data-Set.xlsx",
    2019: "2019-Report-Card-Public-Data-Set.xlsx",
    2018: "Report-Card-Public-Data-Set.xlsx",
    2017: "rc17.txt",
    2016: "rc16.txt",
    2015: "rc15.txt",
    2014: "rc14.txt",
    2013: "rc13.txt",
    2012: "rc12.txt",
    2011: "rc11u.txt",
    2010: "rc10.txt",
    2009: "rc09.txt",
    2008: "rc08u.txt"
}

# Maps raw demographic label variants (case variants, "%" suffixes, abbreviations,
# labels with a trailing digit, alternate wordings) to one standardized label.
# The distinct values of this dict are also what add_demo_columns uses to generate
# the "<metric> - <demographic>" column names, so adding a new standardized value
# here adds a new family of columns downstream.
demographic_key = {
    "Female": "Female",
    "FEMALE": "Female",
    "Male": "Male",
    "MALE": "Male",
    "White": "White",
    "WHITE": "White",
    "WHITE %": "White",
    "White %": "White",
    "Asian": "Asian",
    "ASIAN": "Asian",
    "ASIAN %": "Asian",
    "Asian %": "Asian",
    "Asian5": "Asian",
    "Black": "Black",
    "BLACK": "Black",
    "BLACK %": "Black",
    "Black or African American": "Black",
    "Black or African American %": "Black",
    "Black or African American3": "Black",
    "Latinx": "Latinx",
    "HISPANIC": "Latinx",
    "HISPANIC %": "Latinx",
    "Hispanic or Latino": "Latinx",
    "Hispanic or Latino %": "Latinx",
    "Hispanic": "Latinx",
    "Hispanic or Latino4": "Latinx",
    "American Indian or Alaska Native": "American Indian or Alaska Native",
    "NATIVE AMER": "American Indian or Alaska Native",
    "Native Amer": "American Indian or Alaska Native",
    "NATIVE AMERICAN": "American Indian or Alaska Native",
    "NATIVE AMERICAN %": "American Indian or Alaska Native",
    "American Indian or Alaska Native %": "American Indian or Alaska Native",
    "Am Ind/Alaska Nat": "American Indian or Alaska Native",
    "Am Ind/Alaska Nat2": "American Indian or Alaska Native",
    "Am Ind/Alaska Nat7": "American Indian or Alaska Native",
    "Native Hawaiian or Other Pacific Islander": "Native Hawaiian or Other Pacific Islander",
    "NATIVE HAWAIIAN AND OTHERS": "Native Hawaiian or Other Pacific Islander",
    "NATIVE HAWAIIAN AND OTHERS %": "Native Hawaiian or Other Pacific Islander",
    "Native Hawaiian or Other Pacific Islander %": "Native Hawaiian or Other Pacific Islander",
    "NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER %": "Native Hawaiian or Other Pacific Islander",
    "NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER": "Native Hawaiian or Other Pacific Islander",
    "HawaiiPacIslander": "Native Hawaiian or Other Pacific Islander",
    "Hawaiian/Pac Islander": "Native Hawaiian or Other Pacific Islander",
    "Nat Haw/Other Pac Isndr": "Native Hawaiian or Other Pacific Islander",
    "Nat Haw/Other Pac Isndr6": "Native Hawaiian or Other Pacific Islander",
    "Two or More Races": "Two or More Races",
    "MULTIRACIAL": "Two or More Races",
    "MULTIRACIAL %": "Two or More Races",
    "MULTIRACIAL/ETHNIC": "Two or More Races",
    "MULTIRACIAL/ETHNIC %": "Two or More Races",
    "MultiRace": "Two or More Races",
    "MultiRace3": "Two or More Races",
    "TWO OR MORE RACES": "Two or More Races",
    "TWO OR MORE RACES %": "Two or More Races",
    # "TOW" is a misspelled key kept as-is -- presumably it mirrors a typo in a
    # source layout file; do not "correct" it without checking the raw layouts.
    "TOW OR MORE RACES": "Two or More Races",
    "TOW OR MORE RACES %": "Two or More Races",
    "Two or More Races %": "Two or More Races",
    "Two or More Race": "Two or More Races",
    "Two or More Races8": "Two or More Races",
    "EL": "EL",
    "EL %": "EL",
    "LEP": "EL",
    "L.E.P.": "EL",
    "LEP %": "EL",
    "Low Income": "Low Income",
    "LOW INCOME": "Low Income",
    "Low Income %": "Low Income",
    "LOW INCOME %": "Low Income",
    "Low-Income": "Low Income",
    "LOW-INCOME": "Low Income",
    "Low-Income %": "Low Income",
    "LOW-INCOME %": "Low Income",
    "LowIncome": "Low Income",
    "Migrant": "Migrant",
    "MIGRANT": "Migrant",
    "MIGRANT %": "Migrant",
    "Homeless": "Homeless",
    "Homeless %": "Homeless",
    "HOMELESS": "Homeless",
    "HOMELESS %": "Homeless",
    # IEP and "Children with Disabilities" are kept as two separate standardized labels
    "IEP": "IEP",
    "I.E.P.": "IEP",
    "IEP %": "IEP",
    "Children with Disabilities": "Children with Disabilities",
    "CWD": "Children with Disabilities",
    "CWD %": "Children with Disabilities",
    "UNKNOWN": "Unknown",
    "Unknown": "Unknown",
    "UNKNOWN RACE": "Unknown",
    "Unknown Race": "Unknown",
}

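# These metric-demographic combinations do not appear in any year's report card;
# they are filtered out of the generated column list before the demographic crosswalk is built.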
absent_metric_demo_combos = ['Student Enrollment - Female', 'Student Enrollment - Male', 'Student Enrollment - Migrant', 'Student Enrollment - Unknown',
                             'Student Attendance Rate - Homeless', 'Student Attendance Rate - Children with Disabilities', 'Student Attendance Rate - Unknown',
                             'Chronic Absenteeism - Migrant', 'Chronic Absenteeism - Homeless', 'Chronic Absenteeism - Unknown', 'Total Teacher FTE - EL', 'Total Teacher FTE - Low Income',
                             'Total Teacher FTE - Migrant', 'Total Teacher FTE - Homeless', 'Total Teacher FTE - IEP', 'Total Teacher FTE - Children with Disabilities',
                             '% 9th Grade on Track - Female', '% 9th Grade on Track - Male', '% 9th Grade on Track - Migrant', '% 9th Grade on Track - Homeless', '% 9th Grade on Track - Unknown',
                             '# CTE Participants - Unknown',
                             '4-Year Graduation Rate (Perkins) - Unknown',
                             'Postsecondary Placement Rate (Perkins) - Unknown',
                             'Nontraditional Program Enrollment Rate (Perkins) - Unknown',
                             '# Students who took Dual Credit classes Grade 9 - Female', '# Students who took Dual Credit classes Grade 9 - Male', '# Students who took Dual Credit classes Grade 9 - Migrant', '# Students who took Dual Credit classes Grade 9 - Homeless', '# Students who took Dual Credit classes Grade 9 - Unknown',
                             '# Students who took Dual Credit classes Grade 10 - Female', '# Students who took Dual Credit classes Grade 10 - Male', '# Students who took Dual Credit classes Grade 10 - Migrant', '# Students who took Dual Credit classes Grade 10 - Homeless', '# Students who took Dual Credit classes Grade 10 - Unknown',
                             '# Students who took Dual Credit classes Grade 11 - Female', '# Students who took Dual Credit classes Grade 11 - Male', '# Students who took Dual Credit classes Grade 11 - Migrant', '# Students who took Dual Credit classes Grade 11 - Homeless', '# Students who took Dual Credit classes Grade 11 - Unknown',
                             '# Students who took Dual Credit classes Grade 12 - Female', '# Students who took Dual Credit classes Grade 12 - Male', '# Students who took Dual Credit classes Grade 12 - Migrant', '# Students who took Dual Credit classes Grade 12 - Homeless', '# Students who took Dual Credit classes Grade 12 - Unknown',
                             '# Students enrolled in Dual Credit Coursework - Male', '# Students enrolled in Dual Credit Coursework - Migrant', '# Students enrolled in Dual Credit Coursework - Unknown',
                             '% Students enrolled in Dual Credit Coursework - Male', '% Students enrolled in Dual Credit Coursework - Migrant', '% Students enrolled in Dual Credit Coursework - Unknown',
                             '# Students who took AP classes Grade 9 - Female', '# Students who took AP classes Grade 9 - Male', '# Students who took AP classes Grade 9 - Migrant', '# Students who took AP classes Grade 9 - Homeless', '# Students who took AP classes Grade 9 - Children with Disabilities', '# Students who took AP classes Grade 9 - Unknown',
                             '# Students who took AP classes Grade 10 - Female', '# Students who took AP classes Grade 10 - Male', '# Students who took AP classes Grade 10 - Migrant', '# Students who took AP classes Grade 10 - Homeless', '# Students who took AP classes Grade 10 - Children with Disabilities', '# Students who took AP classes Grade 10 - Unknown',
                             '# Students who took AP classes Grade 11 - Female', '# Students who took AP classes Grade 11 - Male', '# Students who took AP classes Grade 11 - Migrant', '# Students who took AP classes Grade 11 - Homeless', '# Students who took AP classes Grade 11 - Children with Disabilities', '# Students who took AP classes Grade 11 - Unknown',
                             '# Students who took AP classes Grade 12 - Female', '# Students who took AP classes Grade 12 - Male', '# Students who took AP classes Grade 12 - Migrant', '# Students who took AP classes Grade 12 - Homeless', '# Students who took AP classes Grade 12 - Children with Disabilities', '# Students who took AP classes Grade 12 - Unknown',
                             '# Students who took IB classes Grade 9 - Female', '# Students who took IB classes Grade 9 - Male', '# Students who took IB classes Grade 9 - Migrant', '# Students who took IB classes Grade 9 - Homeless', '# Students who took IB classes Grade 9 - Unknown',
                             '# Students who took IB classes Grade 10 - Female', '# Students who took IB classes Grade 10 - Male', '# Students who took IB classes Grade 10 - Migrant', '# Students who took IB classes Grade 10 - Homeless', '# Students who took IB classes Grade 10 - Unknown',
                             '# Students who took IB classes Grade 11 - Female', '# Students who took IB classes Grade 11 - Male', '# Students who took IB classes Grade 11 - Migrant', '# Students who took IB classes Grade 11 - Homeless', '# Students who took IB classes Grade 11 - Unknown',
                             '# Students who took IB classes Grade 12 - Female', '# Students who took IB classes Grade 12 - Male', '# Students who took IB classes Grade 12 - Migrant', '# Students who took IB classes Grade 12 - Homeless', '# Students who took IB classes Grade 12 - Unknown',
                             'High School 4-Year Graduation Rate - Unknown', 'High School 6-Year Graduation Rate - Unknown',
                             '% All students IAR ELA Level 1 - Grade 3 - Unknown', '% All students IAR ELA Level 1 - Grade 4 - Unknown', '% All students IAR ELA Level 1 - Grade 5 - Unknown', '% All students IAR ELA Level 1 - Grade 6 - Unknown', '% All students IAR ELA Level 1 - Grade 7 - Unknown', '% All students IAR ELA Level 1 - Grade 8 - Unknown',
                             '% All students IAR ELA Level 2 - Grade 3 - Unknown', '% All students IAR ELA Level 2 - Grade 4 - Unknown', '% All students IAR ELA Level 2 - Grade 5 - Unknown', '% All students IAR ELA Level 2 - Grade 6 - Unknown', '% All students IAR ELA Level 2 - Grade 7 - Unknown', '% All students IAR ELA Level 2 - Grade 8 - Unknown',
                             '% All students IAR ELA Level 3 - Grade 3 - Unknown', '% All students IAR ELA Level 3 - Grade 4 - Unknown', '% All students IAR ELA Level 3 - Grade 5 - Unknown', '% All students IAR ELA Level 3 - Grade 6 - Unknown', '% All students IAR ELA Level 3 - Grade 7 - Unknown', '% All students IAR ELA Level 3 - Grade 8 - Unknown',
                             '% All students IAR ELA Level 4 - Grade 3 - Unknown', '% All students IAR ELA Level 4 - Grade 4 - Unknown', '% All students IAR ELA Level 4 - Grade 5 - Unknown', '% All students IAR ELA Level 4 - Grade 6 - Unknown', '% All students IAR ELA Level 4 - Grade 7 - Unknown', '% All students IAR ELA Level 4 - Grade 8 - Unknown',
                             '% All students IAR ELA Level 5 - Grade 3 - Unknown', '% All students IAR ELA Level 5 - Grade 4 - Unknown', '% All students IAR ELA Level 5 - Grade 5 - Unknown', '% All students IAR ELA Level 5 - Grade 6 - Unknown', '% All students IAR ELA Level 5 - Grade 7 - Unknown', '% All students IAR ELA Level 5 - Grade 8 - Unknown',
                             '% All students IAR Mathematics Level 1 - Grade 3 - Unknown', '% All students IAR Mathematics Level 1 - Grade 4 - Unknown', '% All students IAR Mathematics Level 1 - Grade 5 - Unknown', '% All students IAR Mathematics Level 1 - Grade 6 - Unknown', '% All students IAR Mathematics Level 1 - Grade 7 - Unknown', '% All students IAR Mathematics Level 1 - Grade 8 - Unknown',
                             '% All students IAR Mathematics Level 2 - Grade 3 - Unknown', '% All students IAR Mathematics Level 2 - Grade 4 - Unknown', '% All students IAR Mathematics Level 2 - Grade 5 - Unknown', '% All students IAR Mathematics Level 2 - Grade 6 - Unknown', '% All students IAR Mathematics Level 2 - Grade 7 - Unknown', '% All students IAR Mathematics Level 2 - Grade 8 - Unknown',
                             '% All students IAR Mathematics Level 3 - Grade 3 - Unknown', '% All students IAR Mathematics Level 3 - Grade 4 - Unknown', '% All students IAR Mathematics Level 3 - Grade 5 - Unknown', '% All students IAR Mathematics Level 3 - Grade 6 - Unknown', '% All students IAR Mathematics Level 3 - Grade 7 - Unknown', '% All students IAR Mathematics Level 3 - Grade 8 - Unknown',
                             '% All students IAR Mathematics Level 4 - Grade 3 - Unknown', '% All students IAR Mathematics Level 4 - Grade 4 - Unknown', '% All students IAR Mathematics Level 4 - Grade 5 - Unknown', '% All students IAR Mathematics Level 4 - Grade 6 - Unknown', '% All students IAR Mathematics Level 4 - Grade 7 - Unknown', '% All students IAR Mathematics Level 4 - Grade 8 - Unknown',
                             '% All students IAR Mathematics Level 5 - Grade 3 - Unknown', '% All students IAR Mathematics Level 5 - Grade 4 - Unknown', '% All students IAR Mathematics Level 5 - Grade 5 - Unknown', '% All students IAR Mathematics Level 5 - Grade 6 - Unknown', '% All students IAR Mathematics Level 5 - Grade 7 - Unknown', '% All students IAR Mathematics Level 5 - Grade 8 - Unknown',
                             '# Students IAR ELA Participation - Homeless', '# Students IAR ELA Participation - Migrant', '# Students IAR ELA Participation - Unknown',
                             '% Students IAR ELA Participation - Homeless', '% Students IAR ELA Participation - Migrant', '% Students IAR ELA Participation - Unknown',
                             '# Students IAR Math Participation - Homeless', '# Students IAR Math Participation - Migrant', '# Students IAR Math Participation - Unknown',
                             '% Students IAR Math Participation - Homeless', '% Students IAR Math Participation - Migrant', '% Students IAR Math Participation - Unknown',
                             'IAR ELA No Participation Rate - Migrant', 'IAR ELA No Participation Rate - Homeless', 'IAR ELA No Participation Rate - Unknown',
                             'IAR Math No Participation Rate - Migrant', 'IAR Math No Participation Rate - Homeless', 'IAR Math No Participation Rate - Unknown',
                             ]

# Reading Files and Cleaning


In [3]:
def get_layout_file(short_year):
    """Read the layout workbook for one report-card year, without a header row.

    Parameters
    ----------
    short_year : str
        Two-digit year string such as "08" or "15". It must be convertible
        with ``int`` and is concatenated directly into the file path.

    Returns
    -------
    pandas.DataFrame
        The raw sheet as returned by ``pd.read_excel(..., header=None)``.
    """
    # Years after 2012 get an extra "x" appended, turning ".xls" into ".xlsx".
    suffix = "x" if int(short_year) > 12 else ""
    # Three years name the file with a hyphen instead of an underscore.
    hyphenated_years = ("12", "16", "15")
    stem = "-layout.xls" if short_year in hyphenated_years else "_layout.xls"
    return pd.read_excel("./data/RC" + short_year + stem + suffix, header=None)

In [4]:
# Per-year layout tables for 2008-2017, keyed by year. Each table ends up with a
# "Demographic" and a "Metric" column; after the reset_index below, row i supplies
# the name for positional column i when the loading cell renames that year's text file.
layout = {}
# NOTE: the demographics column may have other notes besides just demographic info

for year in range(2008, 2018):
    # Two-digit year string used in the layout file names (e.g. 2008 -> "08")
    s = "{:02d}".format(year - 2000)

    # Grab Column Number, Demographic, and Metric columns
    layout[year] = get_layout_file(s).iloc[:, [0, 2, 5]]

    # Drop rows that don't have a column number (header rows for categories)
    layout[year] = layout[year][pd.to_numeric(
        layout[year][0], errors='coerce').notnull()].reset_index(drop=True)

    # Drop Column Number column and rename others
    layout[year] = layout[year].drop(columns=0)
    layout[year] = layout[year].rename(columns={2: "Demographic", 5: "Metric"})

    # Replace demographic keys with Advance Illinois standard
    # also clean up mistakes in demographics
    # This makes it so that the demographic terms used in each year do not need to be tracked
    # NOTE(review): this replace runs BEFORE whitespace is stripped below, so a raw label
    # with stray leading/trailing spaces would not match demographic_key and would stay
    # unstandardized -- confirm this ordering is intended before changing it, since the
    # external crosswalk may have been built against the current output.
    layout[year]["Demographic"] = layout[year]['Demographic'].replace(
        demographic_key)
    layout[year]['Metric'] = layout[year]['Metric'].str.strip()
    layout[year]['Demographic'] = layout[year]['Demographic'].str.strip()

    # Create mask for all rows with demographics
    # (non-null Demographic that is neither "ALL" nor "ALL STUDENTS")
    mask = ~(layout[year]["Demographic"].isnull()) & (
        layout[year]["Demographic"] != "ALL") & (layout[year]["Demographic"] != "ALL STUDENTS")
    # Combine Metric and Demographic columns into "<Metric> - <Demographic>"
    layout[year].loc[mask, "Metric"] = layout[year].loc[mask, "Metric"].astype(
        str) + " - " + layout[year].loc[mask, "Demographic"].astype(str)

In [5]:
# Replace demographics in teacher data with Advance Illinois standard
for year in layout.keys():
    # Capture the text between "% " and " TEACH" in each metric name (NaN where no match)
    teacher_demos = layout[year]['Metric'].str.extract(r'% (.*) TEACH')[0]
    teacher_demos = teacher_demos.dropna()
    # Discard two captures that match the pattern but are not demographic labels
    teacher_demos = teacher_demos[(
        teacher_demos != 'CLASSES NOT TAUGHT BY HIGHLY QUALIFIED') & (teacher_demos != 'of')]
    # Record the standardized label in the Demographic column of the matching rows
    layout[year].loc[teacher_demos.index,
                     'Demographic'] = teacher_demos.replace(demographic_key)

    # Normalize spacing variants around "TEACHER" in all metric names
    layout[year]['Metric'] = layout[year]['Metric'].str.replace(
        'TEACH ER', 'TEACHER')
    layout[year]['Metric'] = layout[year]['Metric'].str.replace(
        'TEACHER- ', 'TEACHER - ')

    # Rewrite the captured label inside the metric name itself. The lookup is a plain
    # dict index, so a captured label that is not a key of demographic_key raises KeyError.
    layout[year].loc[teacher_demos.index, 'Metric'] = layout[year].loc[teacher_demos.index, 'Metric'].str.replace(
        r'(% )(.*)( TEACH)', lambda m: m[1] + demographic_key[m[2]] + m[3], regex=True)

In [6]:
# Replace demographics in enrollment data with Advance Illinois standard
for year in layout.keys():
    # Pattern 1: "<word> - <label> %"  -> capture <label>
    enroll_demos = layout[year]['Metric'].str.extract(r'^\w+ - (.*) %$')[0]
    # Pattern 2: "<label> SCHOOL %" / "<label> DISTRICT %" / "<label> STATE %" -> capture <label>
    special_enroll = layout[year]['Metric'].str.extract(
        r'(.*) (?:SCHOOL|DISTRICT|STATE) %$')[0]

    enroll_demos = enroll_demos.dropna()
    special_enroll = special_enroll.dropna()
    # Pattern 2 is broad, so keep only captures that are known demographic labels.
    # Pattern 1 captures are NOT filtered this way.
    special_enroll = special_enroll[special_enroll.apply(
        lambda x: x in demographic_key.keys())]

    # Record the standardized label in the Demographic column of the matching rows
    layout[year].loc[enroll_demos.index,
                     'Demographic'] = enroll_demos.replace(demographic_key)
    layout[year].loc[special_enroll.index,
                     'Demographic'] = special_enroll.replace(demographic_key)

    # Rewrite the label inside the metric name. Both lambdas index demographic_key
    # directly; for pattern 1 an unknown label therefore raises KeyError.
    layout[year].loc[enroll_demos.index, 'Metric'] = layout[year].loc[enroll_demos.index, 'Metric'].str.replace(
        r'(^\w+ - )(.*)( %)$', lambda m: m[1] + demographic_key[m[2]] + m[3], regex=True)
    layout[year].loc[special_enroll.index, 'Metric'] = layout[year].loc[special_enroll.index, 'Metric'].str.replace(
        r'(.*)( (?:SCHOOL|DISTRICT|STATE) %$)', lambda m: demographic_key[m[1]] + m[2], regex=True)

In [208]:
# Raw report-card data per year, keyed by year.
report_card = {}

# REPORT_CARD is an in-kernel cache: if an earlier run of this cell already loaded the
# files, reuse a deep copy instead of re-reading them. On a fresh kernel the else-branch
# runs. Note that the cache is NOT refreshed if the files or the layout tables change.
if 'REPORT_CARD' in globals():
    report_card = copy.deepcopy(REPORT_CARD)
else:
    for key in tqdm(filename_crosswalk.keys()):
        if key > 2017:
            # Excel years: read every sheet into a dict of DataFrames, all values as objects
            wkbk = pd.read_excel(
                "./data/" + filename_crosswalk[key], sheet_name=None, dtype='object')
            # Drop the two non-data sheets if present
            wkbk.pop('Revision History', None)
            wkbk.pop('Important Notes', None)

            if key == 2021:
                # One-off patch for 2021: in every sheet, the row typed 'District' that
                # carries RCDTS '310458000802001' is re-keyed to '310458000800000'
                # (presumably a mis-keyed district row in the source file -- TODO confirm).
                for k in wkbk.keys():
                    wkbk[k].loc[(wkbk[k]['RCDTS'] == '310458000802001') & (
                        wkbk[k]['Type'] == 'District'), 'RCDTS'] = '310458000800000'

            report_card[key] = wkbk['General'].copy()

            # Outer-join every remaining sheet except 'Finance' onto 'General' by RCDTS.
            # validate="1:1" makes the merge fail if RCDTS is duplicated on either side;
            # clashing column names from sheet k get the suffix "_<k>".
            for k in filter(lambda x: x not in ['General', 'Finance'], wkbk.keys()):
                report_card[key] = pd.merge(
                    report_card[key], wkbk[k], on='RCDTS', how='outer', validate="1:1", suffixes=('', f"_{k}"))

        else:
            # Text years: semicolon-separated, no header row; positional columns are
            # renamed through that year's layout "Metric" series (row index -> name).
            report_card[key] = pd.read_csv("./data/" + filename_crosswalk[key], sep=";",
                                           header=None, dtype='object').rename(columns=layout[key]['Metric'])
    REPORT_CARD = copy.deepcopy(report_card)

  0%|          | 0/16 [00:00<?, ?it/s]

In [209]:
# Both lookup tables live in the same workbook.
crosswalk_workbook = 'Local Historic Crosswalk.xlsx'

# Year-indexed crosswalk: one column per standardized name; the "Year" column
# becomes the index and is removed from the columns.
crosswalk = pd.read_excel(
    crosswalk_workbook, sheet_name='Name Crosswalk').set_index('Year')

demo_info = pd.read_excel(crosswalk_workbook, sheet_name='Details')
# Per metric, the max of "Disaggregated" over all of its rows:
# True if the metric is ever disaggregated, False otherwise.
disagg_info = demo_info.groupby('Metric')['Disaggregated'].max()

# Exploration


In [210]:
# EXPLORATORY CELL
# Find, for every year, the metric/column names containing search_string
# (case-insensitive), and show them side by side as one row per year.
search_string = "counsel".lower()

results = []

# 2008-2017: search the layout tables' Metric names
for year in range(2008, 2018):
    results.append(pd.Series(layout[year].loc[layout[year]['Metric'].str.lower(
    ).str.contains(search_string), 'Metric'], name=year).reset_index(drop=True))

# 2018-2023: search the report-card column names directly
for year in range(2018, 2024):
    results.append(pd.Series(report_card[year].columns[report_card[year].columns.str.lower(
    ).str.contains(search_string)], name=year).reset_index(drop=True))
    # print(year, list(pd.Series(map(lambda x: x.split(
    #     ' - ')[0], report_card[year].columns[report_card[year].columns.str.lower().str.contains(search_string)])).drop_duplicates()))


pd.DataFrame(results)  # .to_clipboard()

2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018


In [211]:
# Quick distribution check of the 2018 "9th Grade on Track" column
# (values are loaded as objects, so they are coerced to numbers first).
pd.to_numeric(report_card[2018]['9th Grade on Track']).describe()

count    1167.000000
mean       86.490146
std        13.288191
min         0.000000
25%        83.000000
50%        90.000000
75%        94.000000
max       100.000000
Name: 9th Grade on Track, dtype: float64

In [212]:
# EXPLORATORY CELL
# Print the name found at position col_idx in every year, newest first, to check
# that the same field sits at the same position across years.
col_idx = 6
# NOTE(review): year_flag is not used anywhere in this cell.
year_flag = 1

# 2023-2018: positional column of the loaded report card
for year in range(2023, 2017, -1):
    print(report_card[year].columns[col_idx])

# 2017-2012: layout row col_idx + 1 (presumably these layouts have one extra
# leading row compared with 2008-2011 -- TODO confirm)
for year in range(2017, 2011, -1):
    print(layout[year].loc[col_idx + 1, 'Metric'])

# 2011-2008: layout row col_idx
for year in range(2011, 2007, -1):
    print(layout[year].loc[col_idx, 'Metric'])

District Type
District Type
District Type
District Type
District Type
District Type
DISTRICT TYPE CODE (0,1,2,9)
DISTRICT TYPE CODE (0,1,2,9)
DISTRICT TYPE CODE (0,1,2,9)
DISTRICT TYPE CODE (0,1,2,9)
DISTRICT TYPE CODE (0,1,2,9)
DISTRICT TYPE CODE (0,1,2,9)
DISTRICT TYPE CODE (0,1,2,9)
DISTRICT TYPE CODE (0,1,2,9)
DISTRICT TYPE CODE (0,1,2,9)
DISTRICT TYPE CODE (0,1,2,9)


# Preprocessing


In [213]:
def add_demo_columns(columns, disagg_data):
    """Expand metric names with one entry per standardized demographic.

    Every name in ``columns`` is kept, in order. For each name whose entry in
    ``disagg_data`` is truthy, it is followed immediately by
    ``"<name> - <demographic>"`` for every distinct value of the module-level
    ``demographic_key`` mapping, in order of first appearance.

    Parameters
    ----------
    columns : iterable of str
        Base metric/column names.
    disagg_data : mapping or pandas.Series
        Indexed by the same names; a truthy value marks the metric as
        disaggregated. A name missing from it raises ``KeyError``.

    Returns
    -------
    list of str
        The expanded column list.
    """
    # The set of standardized labels does not depend on the column, so build it
    # once instead of constructing a pandas Series on every loop iteration.
    # dict.fromkeys de-duplicates while keeping first-appearance order.
    demographics = list(dict.fromkeys(demographic_key.values()))

    out_columns = []
    for col in columns:
        out_columns.append(col)
        if disagg_data[col]:
            out_columns.extend(col + " - " + demo for demo in demographics)
    return out_columns

In [214]:
# Check before renaming: list the 2023 columns for homeless IAR Mathematics Level 1.
# The output shows one column labelled "Grade 32" where "Grade 5" would be expected.
for i in pd.Series(report_card[2023].columns)[report_card[2023].columns.str.startswith('% Homeless students IAR Mathematics Level 1')]:
    display(i)

'% Homeless students IAR Mathematics Level 1 - Grade 3'

'% Homeless students IAR Mathematics Level 1 - Grade 4'

'% Homeless students IAR Mathematics Level 1 - Grade 32'

'% Homeless students IAR Mathematics Level 1 - Grade 6'

'% Homeless students IAR Mathematics Level 1 - Grade 7'

'% Homeless students IAR Mathematics Level 1 - Grade 8'

In [215]:
# Replace demographic designations in report cards with standardized ones.
# The (old, new) pairs are applied in this exact order to every column name;
# order matters (e.g. 'Hispanic or Latino' must be handled before 'Hispanic').
# The last pair repairs a single mislabelled 2023 column ("Grade 32" -> "Grade 5").
column_label_fixes = [
    ('Black or African American', 'Black'),
    ('Hispanic or Latino', 'Latinx'),
    ('Hispanic', 'Latinx'),
    ('CWD', 'Children with Disabilities'),
    ('Hawaiian/Pac Islander', 'Native Hawaiian or Other Pacific Islander'),
    ('Am Ind/Alaska Nat', 'American Indian or Alaska Native'),
    ('MultiRace', 'Two or More Races'),
    ('LowIncome', 'Low Income'),
    ('Two or More Race ', 'Two or More Races '),
    ('% Homeless students IAR Mathematics Level 1 - Grade 32',
     '% Homeless students IAR Mathematics Level 1 - Grade 5'),
]

for year in range(2018, 2024):
    renamed_columns = report_card[year].columns
    for old_label, new_label in column_label_fixes:
        renamed_columns = renamed_columns.str.replace(old_label, new_label)
    report_card[year].columns = renamed_columns

In [216]:
# Re-check after renaming: the same 2023 columns should now read Grade 3 through Grade 8,
# with no "Grade 32" label left.
for i in pd.Series(report_card[2023].columns)[report_card[2023].columns.str.startswith('% Homeless students IAR Mathematics Level 1')]:
    display(i)

'% Homeless students IAR Mathematics Level 1 - Grade 3'

'% Homeless students IAR Mathematics Level 1 - Grade 4'

'% Homeless students IAR Mathematics Level 1 - Grade 5'

'% Homeless students IAR Mathematics Level 1 - Grade 6'

'% Homeless students IAR Mathematics Level 1 - Grade 7'

'% Homeless students IAR Mathematics Level 1 - Grade 8'

In [217]:
# Add all possible demographic categories to each column with disaggregated data
columns = add_demo_columns(crosswalk.columns, disagg_info)
columns = list(filter(lambda x: x not in absent_metric_demo_combos, columns))

# Create new object filtering out old columns
# (i.e. only the generated "<metric> - <demographic>" names)
new_columns = list(filter(lambda x: x not in crosswalk.columns, columns))

# Create new crosswalk with demographic info
demo_crosswalk = crosswalk.copy()
demo_crosswalk[new_columns] = np.nan

for col in new_columns:
    # Split "<metric> - <demographic>" on the LAST " - ", because the metric
    # name itself may contain " - ".
    split = col.split(' - ')
    if len(split) > 2:
        metric = ' - '.join(split[:-1])
        demo = split[-1]
    else:
        metric, demo = split
    # Per-year naming templates for this metric, taken from the 'Details' sheet
    demo_formats = demo_info.copy().loc[demo_info['Metric'] == metric, [
        'Year', 'Disaggregation Format', 'Special Format']]

    # For these four demographics, a non-null 'Special Format' overrides the
    # regular 'Disaggregation Format' for that year.
    if (demo in ['IEP', 'EL', 'Low Income', 'Homeless']):
        demo_formats.loc[demo_formats['Special Format'].notnull(
        ), 'Disaggregation Format'] = demo_formats.loc[demo_formats['Special Format'].notnull(), 'Special Format']

    demo_formats = demo_formats.set_index('Year')['Disaggregation Format']

    # Fill the 'demo' / 'DEMO' placeholder in each template with the standardized label
    demo_formats = demo_formats.str.replace('demo', demo)
    demo_formats = demo_formats.str.replace('DEMO', demo)
    # Assignment aligns on the Year index; years without a template stay NaN
    demo_crosswalk[col] = demo_formats

# Empty frame with the target column order. Within this notebook, master_data is
# rebuilt from scratch by the final pd.concat cell.
master_data = pd.DataFrame(columns=['Year'] + columns)

# Per-year cleaned frames, filled by the main processing loop
datasets = {}

In [218]:
def scope_data(rename_vals, year, scope, demo_crosswalk, report_card, dropped_district_columns):
    """Select and rename one year's columns for a given scope.

    The school-level source names in ``rename_vals`` are rewritten to the
    requested scope by text substitution: for ``'DISTRICT'`` only "SCHOOL" is
    replaced; for any other scope both "DISTRICT" and "SCHOOL" are replaced.

    Parameters
    ----------
    rename_vals : pandas.Series
        Index = source column names for ``year``; values = standardized names.
        Its first entry (after the name/type entries are dropped) is treated
        as the RCDTS identifier column.
    year : hashable
        Row label in ``demo_crosswalk`` and key in ``report_card``.
    scope : str
        Scope text, e.g. ``'DISTRICT'`` or ``'STATE'``.
    demo_crosswalk : pandas.DataFrame
        Indexed by year; needs 'RCDTS', 'School Name' and 'School Type' columns.
    report_card : mapping
        ``report_card[year]`` is that year's raw DataFrame.
    dropped_district_columns : dict
        Updated in place: ``[year]`` receives the sorted scoped source names
        that were not found in the report card.

    Returns
    -------
    tuple
        ``(scoped_data, dropped_district_columns)`` - the selected columns
        renamed to standardized names, and the same dict that was passed in.

    Raises
    ------
    ValueError
        For ``'DISTRICT'`` scope, if "DISTRICT TYPE NAME" or "DISTRICT NAME"
        is not among the found columns.
    """
    def to_scope(name):
        # Same substitution rule as applied to the rename index below
        if scope == 'DISTRICT':
            return name.replace('SCHOOL', scope)
        return name.replace('DISTRICT', scope).replace('SCHOOL', scope)

    # School name/type have no scoped counterpart to rename, so drop them first
    scoped_rename_vals = rename_vals.drop(
        demo_crosswalk.loc[year, ['School Name', 'School Type']])
    id_column = demo_crosswalk.loc[year, 'RCDTS']

    scoped_index = scoped_rename_vals.index
    if scope != 'DISTRICT':
        scoped_index = scoped_index.str.replace('DISTRICT', scope)
    scoped_index = scoped_index.str.replace('SCHOOL', scope)
    # The identifier keeps its original (school-level) source name
    scoped_rename_vals.index = [id_column] + list(scoped_index[1:])

    # Scoped crosswalk names that actually exist in this year's report card
    available = report_card[year].columns
    found_columns = []
    for item in demo_crosswalk.loc[year].dropna():
        candidate = to_scope(item)
        if candidate in available:
            found_columns.append(candidate)

    if scope == 'DISTRICT':
        # Remove the first match of each: these arise from the school name/type entries
        found_columns.remove("DISTRICT TYPE NAME")
        found_columns.remove("DISTRICT NAME")

    found_columns = [id_column, 'SCHOOL TYPE NAME'] + found_columns

    scoped_data = report_card[year].loc[:, found_columns]
    dropped_district_columns[year] = sorted(
        set(scoped_rename_vals.index) - set(found_columns))

    return scoped_data.rename(columns=scoped_rename_vals), dropped_district_columns

In [219]:
def adjust_typing(data, first_numeric_col=8):
    """Convert the metric columns of ``data`` to numeric dtype, in place.

    Every column from position ``first_numeric_col`` onward is passed through
    ``pd.to_numeric``. If that fails, thousands separators (",") and
    surrounding whitespace are removed from the column's strings and the
    conversion is retried once.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose leading columns are identifiers/text and whose remaining
        columns hold numeric values, possibly as strings. Modified in place.
    first_numeric_col : int, default 8
        Position of the first column to convert. The default matches the
        original hard-coded offset.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.

    Raises
    ------
    ValueError
        If a column still cannot be parsed after cleaning. The column is left
        unmodified in that case.
    """
    for col in data.columns[first_numeric_col:]:
        try:
            data[col] = pd.to_numeric(data[col])
        except (ValueError, TypeError):
            # Only parsing failures trigger the clean-up retry; a bare except here
            # would also swallow KeyboardInterrupt and unrelated programming errors.
            cleaned = data[col].str.replace(",", "", regex=False).str.strip()
            data[col] = pd.to_numeric(cleaned)
    return data

In [220]:
# Per-year lists of crosswalk source names that were NOT found in the report card,
# for the school-level pass and for the district / state scoped passes.
dropped_columns = {}
dropped_district_columns = {}
dropped_state_columns = {}

for year in range(2008, 2024):
    # the dropna here drops the columns that are not included in the crosswalk
    # and thus not included in the report card for this year.
    years_columns = demo_crosswalk.loc[year].dropna()
    # swap index and vals for renaming

    rename_vals = pd.Series(years_columns.index.values, index=years_columns)
    # this list comprehension drops any columns that are not found in the report card this year
    # this should drop demographic columns that are not found in this year, such as homeless enrollment
    # counts for 2008 it should not however, drop columns that should be found in the report card.
    # Because it is capable of dropping columns that should be there, the dropped columns are recorded in
    # dropped_columns to be checked later
    found_columns = [
        item for item in years_columns if item in report_card[year].columns]

    data = report_card[year].loc[:, found_columns]
    dropped_columns[year] = list(set(years_columns) - set(found_columns))
    dropped_columns[year].sort()
    # NOTE(review): data_copy is not used again in this cell.
    data_copy = data.copy()
    data = data.rename(columns=rename_vals)

    # District and state level aggregation for 2008-2017
    # (these years' files are one row per school, with district/state values in extra columns)
    if year < 2018:
        data['Type'] = 'School'

        district_data, dropped_district_columns = scope_data(
            rename_vals, year, 'DISTRICT', demo_crosswalk, report_card, dropped_district_columns)
        # District key = first 9 characters of RCDTS
        district_data['RCD'] = district_data['RCDTS'].str[:9]
        # Charter school rows are excluded from the district roll-up
        district_data = district_data[district_data['SCHOOL TYPE NAME'] != 'CHARTER SCH'].drop(
            columns='SCHOOL TYPE NAME')
        # District-level RCDTS = district key padded with six zeros
        district_data['RCDTS'] = district_data['RCD'] + '000000'
        # Collapse each district's school rows into one row via column-wise max.
        # NOTE(review): values are still strings here (adjust_typing runs later), so
        # max is a string comparison -- presumably every school row in a district
        # carries the same district value, making max just a de-duplication; confirm.
        district_data = district_data.groupby(
            'RCD').max().reset_index(drop=True)
        district_data['Type'] = 'District'

        state_data, dropped_state_columns = scope_data(
            rename_vals, year, 'STATE', demo_crosswalk, report_card, dropped_state_columns)
        # One statewide row: column-wise max over all rows, minus the school-specific fields.
        # drop() raises KeyError if any of these four columns is absent.
        state_data = pd.DataFrame(state_data.max()).T.drop(
            columns=['RCDTS', 'SCHOOL TYPE NAME', 'City', 'County'])
        state_data['Type'] = 'Statewide'

        data = pd.concat((data, district_data, state_data),
                         axis=0, ignore_index=True)
        # Put RCDTS and Type first, keeping all other columns in their existing order
        data = data[['RCDTS', 'Type'] +
                    [item for item in data.columns if item not in ['RCDTS', 'Type']]]

    # Convert columns from position 8 onward to numeric
    data = adjust_typing(data)
    data['Year'] = year
    datasets[year] = data

In [221]:
# Review, per year, the crosswalk source names that were not found in the report card:
# first for the district scope (via display), then for the school-level pass (cell output).
display(dropped_district_columns)
dropped_columns

{2008: ['% Native Hawaiian or Other Pacific Islander TEACH - DISTRICT',
  '% Two or More Races TEACH - DISTRICT',
  '% Unknown TEACH - DISTRICT',
  'ATTENDANCE RATE DISTRICT % - Native Hawaiian or Other Pacific Islander',
  'DISTRICT - Children with Disabilities %',
  'DISTRICT - Native Hawaiian or Other Pacific Islander %',
  'HS GRAD RATE DISTRICT % - Children with Disabilities',
  'HS GRAD RATE DISTRICT % - Homeless',
  'HS GRAD RATE DISTRICT % - Native Hawaiian or Other Pacific Islander',
  'Homeless DISTRICT %',
  'IEP DISTRICT %'],
 2009: ['% Native Hawaiian or Other Pacific Islander TEACH - DISTRICT',
  '% Two or More Races TEACH - DISTRICT',
  '% Unknown TEACH - DISTRICT',
  'ATTENDANCE RATE DISTRICT % - Native Hawaiian or Other Pacific Islander',
  'DISTRICT - Children with Disabilities %',
  'DISTRICT - Native Hawaiian or Other Pacific Islander %',
  'HS GRAD RATE DISTRICT % - Children with Disabilities',
  'HS GRAD RATE DISTRICT % - Homeless',
  'HS GRAD RATE DISTRICT % - Na

{2008: ['% Native Hawaiian or Other Pacific Islander TEACH - DISTRICT',
  '% Two or More Races TEACH - DISTRICT',
  '% Unknown TEACH - DISTRICT',
  'ATTENDANCE RATE SCHOOL % - Native Hawaiian or Other Pacific Islander',
  'HS GRAD RATE SCHOOL % - Children with Disabilities',
  'HS GRAD RATE SCHOOL % - Homeless',
  'HS GRAD RATE SCHOOL % - Native Hawaiian or Other Pacific Islander',
  'Homeless SCHOOL %',
  'IEP SCHOOL %',
  'SCHOOL - Children with Disabilities %',
  'SCHOOL - Native Hawaiian or Other Pacific Islander %'],
 2009: ['% Native Hawaiian or Other Pacific Islander TEACH - DISTRICT',
  '% Two or More Races TEACH - DISTRICT',
  '% Unknown TEACH - DISTRICT',
  'ATTENDANCE RATE SCHOOL % - Native Hawaiian or Other Pacific Islander',
  'HS GRAD RATE SCHOOL % - Children with Disabilities',
  'HS GRAD RATE SCHOOL % - Homeless',
  'HS GRAD RATE SCHOOL % - Native Hawaiian or Other Pacific Islander',
  'Homeless SCHOOL %',
  'IEP SCHOOL %',
  'SCHOOL - Children with Disabilities %',
  '

# Final Dataset Creation and Processing


In [222]:
# Stack every year's frame into one table, then keep only 'Year' plus the
# crosswalk-derived columns, in that order (a missing column raises KeyError).
master_data = pd.concat(datasets.values(), ignore_index=True)
master_data = master_data.loc[:, ['Year'] + columns]
# Trim surrounding whitespace in object-dtype (text) columns; other dtypes pass through.
# NOTE(review): .str.strip() yields NaN for non-string entries of an object column --
# confirm no mixed-type object columns are expected here.
master_data = master_data.apply(
    lambda x: x.str.strip() if x.dtype == 'object' else x)

In [223]:
# Give every statewide row the same fixed RCDTS identifier
# (presumably the state's own code in the newer files -- TODO confirm).
master_data.loc[master_data['Type'] ==
                'Statewide', 'RCDTS'] = '650000000800000'
# Pre-2018 data fills district data to the school level, but this erases that in keeping with the newer protocol
# NOTE(review): the mask is Type == 'School' only, with no Year filter, so this blanks
# these columns for school rows of ALL years, not just pre-2018 -- confirm that is intended.
master_data.loc[master_data['Type'] == 'School', list(master_data.columns[master_data.columns.str.contains(
    "Teacher FTE")]) + ['Pupil Teacher Ratio - Elementary', 'Pupil Teacher Ratio - High School']] = np.nan
# Relabel the demographic breakdown columns as percentages. Only names containing
# the "<metric> - " prefix are affected; the undisaggregated base columns keep their names.
master_data.columns = master_data.columns.str.replace(
    'Student Enrollment - ', '% Student Enrollment - ')
master_data.columns = master_data.columns.str.replace(
    'Total Teacher FTE - ', '% Teachers - ')

In [226]:
# Every column after the first nine is treated as a metric column for the views below.
# Computed once; both views use the same list.
value_columns = [col for col in master_data.columns
                 if col not in master_data.columns[:9]]

with pd.option_context('display.max_rows', 100, 'display.max_columns', 100):
    # Coverage grid: one row per metric, one column per (Type, Year); 'X' where at
    # least one non-null value exists, '.' otherwise. Written to CSV on every run.
    master_data.groupby(['Type', 'Year'])[value_columns].count().astype(bool).replace(
        {False: '.', True: 'X'}).T[['School', 'District', 'Statewide']].to_csv('data_with_holes.csv')
    for t in ['District']:
        print(t)
        # Yearly mean of each metric for this Type, with missing year/metric cells shown
        # as '.'. np.nan is used because the np.NaN alias was removed in NumPy 2.0.
        df = master_data[master_data['Type'] == t].groupby(
            ['Year'])[value_columns].mean().replace({np.nan: '.'}).T
        # Only the last 30 metrics are displayed
        display(df.tail(30).style.format(
            precision=1, thousands=",", decimal="."))

District


Year,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
IAR Math No Participation Rate,.,.,.,.,.,.,.,.,.,.,.,.,.,.,1.4,1.1
IAR Math No Participation Rate - Female,.,.,.,.,.,.,.,.,.,.,.,.,.,.,1.3,1.0
IAR Math No Participation Rate - Male,.,.,.,.,.,.,.,.,.,.,.,.,.,.,1.4,1.1
IAR Math No Participation Rate - White,.,.,.,.,.,.,.,.,.,.,.,.,.,.,1.4,1.1
IAR Math No Participation Rate - Asian,.,.,.,.,.,.,.,.,.,.,.,.,.,.,1.0,1.1
IAR Math No Participation Rate - Black,.,.,.,.,.,.,.,.,.,.,.,.,.,.,2.0,1.5
IAR Math No Participation Rate - Latinx,.,.,.,.,.,.,.,.,.,.,.,.,.,.,1.4,1.1
IAR Math No Participation Rate - American Indian or Alaska Native,.,.,.,.,.,.,.,.,.,.,.,.,.,.,2.0,1.1
IAR Math No Participation Rate - Native Hawaiian or Other Pacific Islander,.,.,.,.,.,.,.,.,.,.,.,.,.,.,3.5,1.3
IAR Math No Participation Rate - Two or More Races,.,.,.,.,.,.,.,.,.,.,.,.,.,.,1.9,1.4


In [225]:
# Export switch: leave False for normal runs so that re-running the notebook does
# not overwrite the output workbooks; set to True to write them.
write_to_file = False

if write_to_file:
    # Full table (all Types), without the DataFrame index
    master_data.to_excel('Historic Data.xlsx', index=False)
    # District rows only
    master_data.query("Type == 'District'").to_excel(
        'Historic RC District Data.xlsx', index=False)