In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Report-card data file name for each year, newest first.
# Years after 2017 are read with pd.read_excel (sheet "General"); 2017 and
# earlier are read with pd.read_csv as headerless ';'-separated text.
filename_crosswalk = {
    2023: "23-RC-Pub-Data-Set.xlsx",
    2022: "2022-Report-Card-Public-Data-Set.xlsx",
    2021: "2021-RC-Pub-Data-Set.xlsx",
    2020: "2020-Report-Card-Public-Data-Set.xlsx",
    2019: "2019-Report-Card-Public-Data-Set.xlsx",
    2018: "Report-Card-Public-Data-Set.xlsx",
    2017: "rc17.txt",
    2016: "rc16.txt",
    2015: "rc15.txt",
    2014: "rc14.txt",
    2013: "rc13.txt",
    2012: "rc12.txt",
    2011: "rc11u.txt",
    2010: "rc10.txt",
    2009: "rc09.txt",
    2008: "rc08u.txt",
}

# Maps every demographic label spelling seen in the layout files and report
# cards to one standardized label. Standardized labels also map to themselves.
#
# The order in which the standardized labels first appear here matters:
# add_demo_columns derives its demographic ordering from
# demographic_key.values(), so keep the groups in this order.
demographic_key = {
    # --- Gender ---
    "Female": "Female",
    "FEMALE": "Female",
    "Male": "Male",
    "MALE": "Male",
    # --- White ---
    "White": "White",
    "WHITE": "White",
    "WHITE %": "White",
    "White %": "White",
    # --- Asian ---
    "Asian": "Asian",
    "ASIAN": "Asian",
    "ASIAN %": "Asian",
    "Asian %": "Asian",
    "Asian5": "Asian",
    # --- Black ---
    "Black": "Black",
    "BLACK": "Black",
    "BLACK %": "Black",
    "Black or African American": "Black",
    "Black or African American %": "Black",
    "Black or African American3": "Black",
    # --- Latinx ---
    "Latinx": "Latinx",
    "HISPANIC": "Latinx",
    "HISPANIC %": "Latinx",
    "Hispanic or Latino": "Latinx",
    "Hispanic or Latino %": "Latinx",
    "Hispanic": "Latinx",
    "Hispanic or Latino4": "Latinx",
    # --- American Indian or Alaska Native ---
    "American Indian or Alaska Native": "American Indian or Alaska Native",
    "NATIVE AMERICAN": "American Indian or Alaska Native",
    "NATIVE AMERICAN %": "American Indian or Alaska Native",
    "American Indian or Alaska Native %": "American Indian or Alaska Native",
    "Am Ind/Alaska Nat": "American Indian or Alaska Native",
    "Am Ind/Alaska Nat2": "American Indian or Alaska Native",
    "Am Ind/Alaska Nat7": "American Indian or Alaska Native",
    # --- Native Hawaiian or Other Pacific Islander ---
    "Native Hawaiian or Other Pacific Islander": "Native Hawaiian or Other Pacific Islander",
    "NATIVE HAWAIIAN AND OTHERS": "Native Hawaiian or Other Pacific Islander",
    "NATIVE HAWAIIAN AND OTHERS %": "Native Hawaiian or Other Pacific Islander",
    "Native Hawaiian or Other Pacific Islander %": "Native Hawaiian or Other Pacific Islander",
    "NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER %": "Native Hawaiian or Other Pacific Islander",
    "NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER": "Native Hawaiian or Other Pacific Islander",
    "HawaiiPacIslander": "Native Hawaiian or Other Pacific Islander",
    "Hawaiian/Pac Islander": "Native Hawaiian or Other Pacific Islander",
    "Nat Haw/Other Pac Isndr": "Native Hawaiian or Other Pacific Islander",
    "Nat Haw/Other Pac Isndr6": "Native Hawaiian or Other Pacific Islander",
    # --- Two or More Races ---
    "Two or More Races": "Two or More Races",
    "MULTIRACIAL": "Two or More Races",
    "MULTIRACIAL %": "Two or More Races",
    # This key used to be listed twice; a repeated key in a dict literal is
    # silently overwritten, so one entry is equivalent.
    # NOTE(review): the second copy may have been meant to be
    # "MULTIRACIAL/ETHNIC" (without " %") - confirm against the layout files.
    "MULTIRACIAL/ETHNIC %": "Two or More Races",
    "MultiRace": "Two or More Races",
    "MultiRace3": "Two or More Races",
    "TWO OR MORE RACES": "Two or More Races",
    "TWO OR MORE RACES %": "Two or More Races",
    # NOTE(review): "TOW" presumably mirrors a misspelling in a source file -
    # verify before "correcting" these two keys.
    "TOW OR MORE RACES": "Two or More Races",
    "TOW OR MORE RACES %": "Two or More Races",
    "Two or More Races %": "Two or More Races",
    "Two or More Race": "Two or More Races",
    "Two or More Races8": "Two or More Races",
    # --- English learners ---
    "EL": "EL",
    "EL %": "EL",
    "LEP": "EL",
    "L.E.P.": "EL",
    "LEP %": "EL",
    # --- Low income ---
    "Low Income": "Low Income",
    "LOW INCOME": "Low Income",
    "Low Income %": "Low Income",
    "LOW INCOME %": "Low Income",
    "Low-Income": "Low Income",
    "LOW-INCOME": "Low Income",
    "Low-Income %": "Low Income",
    "LOW-INCOME %": "Low Income",
    "LowIncome": "Low Income",
    # --- Migrant ---
    "Migrant": "Migrant",
    "MIGRANT": "Migrant",
    "MIGRANT %": "Migrant",
    # --- Homeless ---
    "Homeless": "Homeless",
    "Homeless %": "Homeless",
    "HOMELESS": "Homeless",
    "HOMELESS %": "Homeless",
    # --- IEP ---
    "IEP": "IEP",
    "I.E.P.": "IEP",
    "IEP %": "IEP",
    # --- Children with Disabilities ---
    "Children with Disabilities": "Children with Disabilities",
    "CWD": "Children with Disabilities",
    "CWD %": "Children with Disabilities",
}


In [3]:
def get_layout_file(short_year):
    """Read one year's layout workbook from ./data with no header row.

    Parameters
    ----------
    short_year : str
        Two-digit year as text (for example "08" or "15"). It is converted
        with int() to choose the extension and concatenated into the path.

    Returns
    -------
    The result of ``pd.read_excel(path, header=None)`` where ``path`` is
    ``./data/RC<short_year><sep>layout<ext>``.
    """
    # Years above 12 use ".xlsx"; 12 and below use ".xls".
    extension = ".xlsx" if int(short_year) > 12 else ".xls"
    # Three years spell the file name with a hyphen, the rest with an underscore.
    separator = "-" if short_year in ("12", "16", "15") else "_"
    return pd.read_excel("./data/RC" + short_year + separator + "layout" + extension, header=None)
    
def fix_enrollment(layout_df):
    """Normalize enrollment rows of a layout table to "<LEVEL> ENROLLMENT".

    For each level in SCHOOL, SUBREGION, DISTRICT, STATE:

    * rows whose Metric starts with "<LEVEL> - " get the text after the last
      " - " as their Demographic and "<LEVEL> ENROLLMENT" as their Metric;
    * rows whose Metric begins with "<demo> <LEVEL> %" for one of the special
      demographics (L.E.P., I.E.P., LOW-INCOME, HOMELESS) get that demographic
      and "<LEVEL> ENROLLMENT" as their Metric;
    * rows whose Metric is exactly "<LEVEL> TOTAL ENROLLMENT" are renamed to
      "<LEVEL> ENROLLMENT".

    Parameters
    ----------
    layout_df : pandas.DataFrame
        Must have string-valued 'Metric' and 'Demographic' columns. Missing
        Metric values are allowed and are left untouched.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; it is modified in place.
    """
    special_demos = ('L.E.P.', 'I.E.P.', 'LOW-INCOME', 'HOMELESS')

    for enrollment_key in ('SCHOOL', 'SUBREGION', 'DISTRICT', 'STATE'):
        enrollment_label = enrollment_key + ' ENROLLMENT'

        # na=False: a missing Metric must count as "no match". Without it the
        # mask contains NaN and boolean .loc indexing raises.
        rate_mask = layout_df['Metric'].str.startswith(enrollment_key + ' - ', na=False)
        layout_df.loc[rate_mask, 'Demographic'] = layout_df.loc[rate_mask, 'Metric'].str.split(" - ").str[-1]
        layout_df.loc[rate_mask, 'Metric'] = enrollment_label

        for demo in special_demos:
            # str.match anchors at the start of the string only, so this is a
            # prefix test on the escaped literal "<demo> <LEVEL> %".
            demo_mask = layout_df['Metric'].str.match(f"{re.escape(demo)} {re.escape(enrollment_key)} %", na=False)
            if demo_mask.any():
                layout_df.loc[demo_mask, 'Demographic'] = demo
                layout_df.loc[demo_mask, 'Metric'] = enrollment_label

        layout_df.loc[layout_df['Metric'] == enrollment_key + ' TOTAL ENROLLMENT', 'Metric'] = enrollment_label
    return layout_df

In [4]:
# layout[year] is a two-column table ("Demographic", "Metric") built from that
# year's layout workbook; its "Metric" column is later used to name the
# columns of the headerless report-card text files for 2008-2017.
layout = {}

for year in range(2008, 2018):
    short_year = "{:02d}".format(year - 2000)
    year_layout = get_layout_file(short_year).iloc[:, [0, 2, 5]]

    # Keep only the rows whose first column parses as a number, then drop
    # that column. NOTE(review): presumably the numeric column is the field
    # position and the other rows are headings/notes - confirm in the workbook.
    numbered_rows = pd.to_numeric(year_layout[0], errors='coerce').notnull()
    year_layout = (
        year_layout[numbered_rows]
        .loc[:, [2, 5]]
        .reset_index(drop=True)
        .rename(columns={2: "Demographic", 5: "Metric"})
    )

    year_layout = fix_enrollment(year_layout)
    year_layout["Demographic"] = year_layout["Demographic"].replace(demographic_key)

    # Append the standardized demographic to the metric name, except where
    # the demographic is missing or is the literal "ALL".
    has_demo = year_layout["Demographic"].notnull() & (year_layout["Demographic"] != "ALL")
    year_layout.loc[has_demo, "Metric"] = (
        year_layout.loc[has_demo, "Metric"].astype(str)
        + " - "
        + year_layout.loc[has_demo, "Demographic"].astype(str)
    )

    layout[year] = year_layout

In [5]:
# Preview a fixed window of the 2008 layout: row labels 37 through 59.
row_offset = 25
preview_rows = range(12 + row_offset, 35 + row_offset)
layout[2008].loc[preview_rows]

Unnamed: 0,Demographic,Metric
37,American Indian or Alaska Native,STATE ENROLLMENT - American Indian or Alaska N...
38,Two or More Races,STATE ENROLLMENT - Two or More Races
39,,STATE ENROLLMENT
40,EL,SCHOOL ENROLLMENT - EL
41,EL,SUBREGION ENROLLMENT - EL
42,EL,DISTRICT ENROLLMENT - EL
43,EL,STATE ENROLLMENT - EL
44,Low Income,SCHOOL ENROLLMENT - Low Income
45,Low Income,SUBREGION ENROLLMENT - Low Income
46,Low Income,DISTRICT ENROLLMENT - Low Income


In [6]:
# report_card[year] holds the raw report-card table for that year.
report_card = {}

for year, filename in filename_crosswalk.items():
    path = "./data/" + filename
    if year > 2017:
        # 2018 onward: Excel workbook, "General" sheet, all values kept as object.
        report_card[year] = pd.read_excel(path, sheet_name="General", dtype='object')
    else:
        # 2017 and earlier: headerless ';'-separated text. The integer column
        # labels are renamed through that year's layout "Metric" Series
        # (position i takes the name stored at layout index i).
        raw_text = pd.read_csv(path, sep=";", header=None, dtype='object')
        report_card[year] = raw_text.rename(columns=layout[year]['Metric'])

In [7]:
# Rename the 2018 column 'Student Enrollment - Total' (if present) to the
# plain 'Student Enrollment'.
total_enrollment_rename = {'Student Enrollment - Total': 'Student Enrollment'}
report_card[2018] = report_card[2018].rename(columns=total_enrollment_rename)

In [8]:
# Name crosswalk, indexed by its 'Year' column; crosswalk.loc[year, column]
# is the name that column goes by in that year's data.
crosswalk = pd.read_excel('Local Historic Crosswalk.xlsx', sheet_name='Name Crosswalk').set_index('Year')

demo_info = pd.read_excel('Local Historic Crosswalk.xlsx', sheet_name='Details')


def _ever_disaggregated(col):
    """Return whether any 'Details' row matched on (Year, name) for this column has a truthy 'Disaggregated'."""
    matched = pd.merge(
        crosswalk[col].reset_index(),
        demo_info,
        left_on=['Year', col],
        right_on=['Year', 'Metric'],
        how='left',
    )
    return matched['Disaggregated'].any()


# True if the column is ever disaggregated, False otherwise
disagg_info = pd.Series(
    pd.Series(crosswalk.columns, index=crosswalk.columns).apply(_ever_disaggregated),
    index=crosswalk.columns,
    name='Disaggregated',
)

In [9]:
# Check which enrollment-related names each year exposes.
# 2008-2017: layout metrics that start with "school total enrollment".
for year in range(2008, 2018):
    metric_names = layout[year]['Metric']
    is_total_enrollment = metric_names.str.lower().str.startswith('school total enrollment')
    print(year, list(metric_names[is_total_enrollment]))

# 2018-2023: report-card columns whose name contains "enrollment".
for year in range(2018, 2024):
    column_names = report_card[year].columns
    print(year, list(column_names[column_names.str.lower().str.contains('enrollment')]))

2008 []
2009 []
2010 []
2011 []
2012 []
2013 []
2014 []
2015 []
2016 []
2017 []
2018 ['Student Enrollment', 'Student Enrollment - White %', 'Student Enrollment - Black or African American %', 'Student Enrollment - Hispanic or Latino %', 'Student Enrollment - Asian %', 'Student Enrollment - Native Hawaiian or Other Pacific Islander %', 'Student Enrollment - American Indian or Alaska Native %', 'Student Enrollment - Two or More Races %', 'Student Enrollment - EL %', 'Student Enrollment - IEP %', 'Student Enrollment - Low Income %', 'Student Enrollment - Homeless %', '# of CTE Enrollment']
2019 ['# Student Enrollment', '% Student Enrollment - White', '% Student Enrollment - Black or African American', '% Student Enrollment - Hispanic or Latino', '% Student Enrollment - Asian', '% Student Enrollment - Native Hawaiian or Other Pacific Islander', '% Student Enrollment - American Indian or Alaska Native', '% Student Enrollment - Two or More Races', '% Student Enrollment - Children with Disabi

In [10]:
col_idx = 6
year_flag = 1  # NOTE(review): not referenced in any visible cell - confirm before removing

# Print the name found at position col_idx for every year, newest first.
# 2018-2023 read the report-card column label directly, 2012-2017 read layout
# row col_idx + 1, and 2008-2011 read layout row col_idx.
for year in reversed(range(2018, 2024)):
    print(report_card[year].columns[col_idx])

for year in reversed(range(2012, 2018)):
    print(layout[year].loc[col_idx + 1, 'Metric'])

for year in reversed(range(2008, 2012)):
    print(layout[year].loc[col_idx, 'Metric'])

District Type
District Type
District Type
District Type
District Type
District Type
DISTRICT TYPE CODE (0,1,2,9)
DISTRICT TYPE CODE (0,1,2,9)
DISTRICT TYPE CODE (0,1,2,9)
DISTRICT TYPE CODE (0,1,2,9)
DISTRICT TYPE CODE (0,1,2,9)
DISTRICT TYPE CODE (0,1,2,9)
DISTRICT TYPE CODE (0,1,2,9)
DISTRICT TYPE CODE (0,1,2,9)
DISTRICT TYPE CODE (0,1,2,9)
DISTRICT TYPE CODE (0,1,2,9)


In [11]:
def add_demo_columns(columns, disagg_data):
    """Expand column names with one "<column> - <demographic>" entry per demographic.

    Parameters
    ----------
    columns : iterable of str
        Base column names, in output order.
    disagg_data : mapping or pandas.Series
        Indexed by column name; a truthy value means the column is followed
        by its demographic variants.

    Returns
    -------
    list of str
        Every base column, each immediately followed (when flagged) by
        "<column> - <label>" for every distinct standardized label in
        ``demographic_key.values()``, in first-appearance order.
    """
    # Distinct labels in first-appearance order, computed once rather than
    # once per column.
    demographics = list(dict.fromkeys(demographic_key.values()))

    out_columns = []
    for col in columns:
        out_columns.append(col)
        if disagg_data[col]:
            out_columns.extend(col + " - " + demo for demo in demographics)
    return out_columns

In [12]:
# Display the crosswalk's column labels (the output column names used below).
crosswalk.columns

Index(['RCDTS', 'Type', 'School Name', 'District Name', 'City', 'County',
       'District Type', 'Student Enrollment', 'Student Attendance Rates',
       'Chronic Truancy Rate', 'Chronic Absenteeism'],
      dtype='object')

In [13]:
# Replace demographic designations in report cards with standardized ones.
# This is only done to relevant columns: those whose name starts with a metric
# named in the crosswalk for that year and carries a " - <demographic>" suffix.
for year in range(2018, 2024):
    for col in crosswalk.columns:
        metric = crosswalk.loc[year, col]
        if not isinstance(metric, str):
            # A blank (NaN) crosswalk cell means there is no name to look for
            # this year; str.startswith(NaN) would raise TypeError.
            continue
        if metric == '# Student Enrollment':
            # The total is labelled '# Student Enrollment' but its demographic
            # breakdowns are labelled '% Student Enrollment - ...'.
            metric = '% Student Enrollment'

        matching = [name for name in report_card[year].columns if name.startswith(metric)]
        if len(matching) <= 1:
            # Only the base column (or nothing) matched: no breakdowns to rename.
            continue

        rename_map = {}
        for name in matching:
            parts = name.split(" - ")
            if len(parts) > 1:
                # An unmapped demographic label raises KeyError here.
                rename_map[name] = parts[0] + " - " + demographic_key[parts[1]]

        report_card[year] = report_card[year].rename(columns=rename_map)

In [14]:
# Metric/demographic combinations excluded from the output columns.
# NOTE(review): presumably these never appear in any year's data - confirm.
absent_demo_combos = [
    'Student Enrollment - Female',
    'Student Enrollment - Male',
    'Student Enrollment - Migrant',
    'Student Attendance Rates - Homeless',
    'Student Attendance Rates - Children with Disabilities',
    'Chronic Absenteeism - Migrant',
    'Chronic Absenteeism - Homeless',
]
columns = [
    item
    for item in add_demo_columns(crosswalk.columns, disagg_info)
    if item not in absent_demo_combos
]
new_columns = [item for item in columns if item not in crosswalk.columns]

# Extend the crosswalk with one column per "<metric> - <demographic>" pair.
# Each cell holds the year-specific metric name plus the standardized
# demographic; years with no name for the metric stay NaN.
demo_crosswalk = crosswalk.copy()
for col in new_columns:
    metric, demo = col.split(' - ')
    demo_crosswalk[col] = demo_crosswalk[metric] + " - " + demo
    if metric == 'Student Enrollment':
        # From 2019 on, the enrollment breakdowns carry a '% ' prefix instead
        # of the '# ' prefix used by the total.
        demo_crosswalk.loc[range(2019, 2024), col] = '% Student Enrollment - ' + demo

display(demo_crosswalk.columns)

# Per-year frames, filled in by the next cell.
datasets = {}

Index(['RCDTS', 'Type', 'School Name', 'District Name', 'City', 'County',
       'District Type', 'Student Enrollment', 'Student Attendance Rates',
       'Chronic Truancy Rate', 'Chronic Absenteeism',
       'Student Enrollment - White', 'Student Enrollment - Asian',
       'Student Enrollment - Black', 'Student Enrollment - Latinx',
       'Student Enrollment - American Indian or Alaska Native',
       'Student Enrollment - Native Hawaiian or Other Pacific Islander',
       'Student Enrollment - Two or More Races', 'Student Enrollment - EL',
       'Student Enrollment - Low Income', 'Student Enrollment - Homeless',
       'Student Enrollment - IEP',
       'Student Enrollment - Children with Disabilities',
       'Student Attendance Rates - Female', 'Student Attendance Rates - Male',
       'Student Attendance Rates - White', 'Student Attendance Rates - Asian',
       'Student Attendance Rates - Black', 'Student Attendance Rates - Latinx',
       'Student Attendance Rates - American 

In [15]:
# Year -> sorted list of crosswalk names that were expected but not found.
dropped_columns = {}

for year in range(2008, 2024):
    # dropna removes the columns the crosswalk leaves blank for this year,
    # i.e. the ones not included in this year's report card.
    years_columns = demo_crosswalk.loc[year].dropna()
    # Swap index and values: year-specific name -> standardized name.
    rename_vals = pd.Series(years_columns.index.values, index=years_columns)

    # Keep only the names actually present in this year's report card.
    # This should drop demographic columns that do not exist for the year
    # (such as homeless counts for 2008), but it can also drop columns that
    # should have been there, so everything dropped is recorded in
    # dropped_columns to be checked later.
    available = report_card[year].columns
    found_columns = [item for item in years_columns if item in available]
    dropped_columns[year] = sorted(set(years_columns) - set(found_columns))

    data = report_card[year].loc[:, found_columns].rename(columns=rename_vals)
    data['Year'] = year
    datasets[year] = data

dropped_columns

{2008: ['ATTENDANCE RATE SCHOOL % - Native Hawaiian or Other Pacific Islander',
  'SCHOOL ENROLLMENT - Children with Disabilities',
  'SCHOOL ENROLLMENT - Homeless',
  'SCHOOL ENROLLMENT - IEP',
  'SCHOOL ENROLLMENT - Native Hawaiian or Other Pacific Islander'],
 2009: ['ATTENDANCE RATE SCHOOL % - Native Hawaiian or Other Pacific Islander',
  'SCHOOL ENROLLMENT - Children with Disabilities',
  'SCHOOL ENROLLMENT - Homeless',
  'SCHOOL ENROLLMENT - IEP',
  'SCHOOL ENROLLMENT - Native Hawaiian or Other Pacific Islander'],
 2010: ['ATTENDANCE RATE SCHOOL % - Native Hawaiian or Other Pacific Islander',
  'SCHOOL ENROLLMENT - Children with Disabilities',
  'SCHOOL ENROLLMENT - Homeless',
  'SCHOOL ENROLLMENT - Native Hawaiian or Other Pacific Islander'],
 2011: ['SCHOOL ENROLLMENT - Children with Disabilities',
  'SCHOOL ENROLLMENT - Homeless'],
 2012: ['SCHOOL ENROLLMENT - Children with Disabilities',
  'SCHOOL ENROLLMENT - Homeless'],
 2013: ['SCHOOL ENROLLMENT - Children with Disabilitie

In [16]:
# Stack every year's frame, order the columns as ['Year'] + columns, and
# store RCDTS as text before exporting.
master_data = (
    pd.concat(datasets.values(), ignore_index=True)
    .loc[:, ['Year'] + columns]
    .assign(RCDTS=lambda frame: frame['RCDTS'].astype(str))
)
master_data.to_excel('Historic Data.xlsx', index=False)