In [2]:
import pandas as pd
import numpy as np

### Split the large file into smaller ones
- The file has too many rows to be loaded in one piece when converting it to CSV
- Reading the data directly from the txt files causes encoding issues

In [1]:
# Split enrollment_2021.txt into two parts by line count: the lines before the
# midpoint go to enrollment_2021_A.txt, the remaining lines to enrollment_2021_B.txt.
# The whole file is read into memory before anything is written.
# NOTE(review): only part A contains the file's first line, so if that line is a
# header row, part B has none.
# NOTE(review): no encoding is passed to open(), so the platform default is used
# for both reading and writing - confirm this matches the source file.
with open("enrollment_2021.txt", 'r') as source:
    lines = source.readlines()

# Integer midpoint; with an odd line count the extra line ends up in part B.
midpoint = len(lines) // 2

with open("enrollment_2021_A.txt", 'w') as part_a:
    part_a.writelines(lines[:midpoint])

with open("enrollment_2021_B.txt", 'w') as part_b:
    part_b.writelines(lines[midpoint:])

### Manually load the txt data into Excel and save it as CSV files

### Load the data
- The enrollment data is reported at different granularity levels: school, school district, county, state

In [4]:
# Read part A of the enrollment data from its CSV export.
# low_memory=False makes pandas parse the file in one pass instead of in chunks.
df_enrollment_a = pd.read_csv(
    'enrollment_2021_A.csv',
    low_memory=False,
)
# Preview the first five rows.
df_enrollment_a.head(n=5)

Unnamed: 0,AcademicYear,AggregateLevel,CountyCode,DistrictCode,SchoolCode,CountyName,DistrictName,SchoolName,Charter,ReportingCategory,CumulativeEnrollment
0,2020-21,C,1,,,Alameda,,,All,GF,110211
1,2020-21,C,1,,,Alameda,,,All,GM,117612
2,2020-21,C,1,,,Alameda,,,All,GX,82
3,2020-21,C,1,,,Alameda,,,All,RA,62638
4,2020-21,C,1,,,Alameda,,,All,RB,20475


In [6]:
# Read part B of the enrollment data from its CSV export.
# low_memory=False makes pandas parse the file in one pass instead of in chunks.
df_enrollment_b = pd.read_csv(
    'enrollment_2021_B.csv',
    low_memory=False,
)
# Preview the first five rows.
df_enrollment_b.head(n=5)

Unnamed: 0,AcademicYear,AggregateLevel,CountyCode,DistrictCode,SchoolCode,CountyName,DistrictName,SchoolName,Charter,ReportingCategory,CumulativeEnrollment
0,2020-21,S,19,73452.0,1936228.0,Los Angeles,Rowland Unified,Nogales High,No,RF,118
1,2020-21,S,19,73452.0,1936228.0,Los Angeles,Rowland Unified,Nogales High,No,RH,1629
2,2020-21,S,19,73452.0,1936228.0,Los Angeles,Rowland Unified,Nogales High,No,RI,*
3,2020-21,S,19,73452.0,1936228.0,Los Angeles,Rowland Unified,Nogales High,No,RP,*
4,2020-21,S,19,73452.0,1936228.0,Los Angeles,Rowland Unified,Nogales High,No,RT,*


### Extract the enrollment records with filters
- AggregateLevel = S: school-level data

In [7]:
# Keep only the rows whose AggregateLevel is 'S' (school-level records) in each part.
# The original row labels are preserved; the source frames are not modified.
df_enrollment_a_filtered = df_enrollment_a.loc[
    df_enrollment_a['AggregateLevel'].eq('S')
]
df_enrollment_b_filtered = df_enrollment_b.loc[
    df_enrollment_b['AggregateLevel'].eq('S')
]

In [8]:
# Display the school-level rows kept from part A (pandas truncates the rendering of a long frame).
df_enrollment_a_filtered

Unnamed: 0,AcademicYear,AggregateLevel,CountyCode,DistrictCode,SchoolCode,CountyName,DistrictName,SchoolName,Charter,ReportingCategory,CumulativeEnrollment
44415,2020-21,S,1,10017.0,130419.0,Alameda,Alameda County Office of Education,Alameda County Community,No,GF,87
44416,2020-21,S,1,10017.0,130419.0,Alameda,Alameda County Office of Education,Alameda County Community,No,GM,50
44417,2020-21,S,1,10017.0,130419.0,Alameda,Alameda County Office of Education,Alameda County Community,No,RA,*
44418,2020-21,S,1,10017.0,130419.0,Alameda,Alameda County Office of Education,Alameda County Community,No,RB,43
44419,2020-21,S,1,10017.0,130419.0,Alameda,Alameda County Office of Education,Alameda County Community,No,RD,*
...,...,...,...,...,...,...,...,...,...,...,...
111398,2020-21,S,19,73452.0,1936228.0,Los Angeles,Rowland Unified,Nogales High,No,GF,904
111399,2020-21,S,19,73452.0,1936228.0,Los Angeles,Rowland Unified,Nogales High,No,GM,964
111400,2020-21,S,19,73452.0,1936228.0,Los Angeles,Rowland Unified,Nogales High,No,RA,55
111401,2020-21,S,19,73452.0,1936228.0,Los Angeles,Rowland Unified,Nogales High,No,RB,16


In [10]:
# Preview the first five school-level rows kept from part B.
df_enrollment_b_filtered.head(n=5)

Unnamed: 0,AcademicYear,AggregateLevel,CountyCode,DistrictCode,SchoolCode,CountyName,DistrictName,SchoolName,Charter,ReportingCategory,CumulativeEnrollment
0,2020-21,S,19,73452.0,1936228.0,Los Angeles,Rowland Unified,Nogales High,No,RF,118
1,2020-21,S,19,73452.0,1936228.0,Los Angeles,Rowland Unified,Nogales High,No,RH,1629
2,2020-21,S,19,73452.0,1936228.0,Los Angeles,Rowland Unified,Nogales High,No,RI,*
3,2020-21,S,19,73452.0,1936228.0,Los Angeles,Rowland Unified,Nogales High,No,RP,*
4,2020-21,S,19,73452.0,1936228.0,Los Angeles,Rowland Unified,Nogales High,No,RT,*


In [11]:
# Stack the school-level rows of part A on top of those of part B.
# Row labels from each part are kept as-is (ignore_index=False), so the same
# label may appear in both parts of the combined frame.
combined = pd.concat(
    objs=[df_enrollment_a_filtered, df_enrollment_b_filtered],
    axis=0,
    ignore_index=False,
)

### Resolve format issues
- Make sure all the codes are stored as strings 

In [12]:
# Store the county, district and school codes as strings.
# Each column is cast to int first so that float-typed values such as 73452.0
# become '73452' rather than '73452.0'.
# NOTE(review): the int round-trip cannot retain leading zeros and raises if a
# code is missing (NaN) - confirm both are acceptable for downstream use.
for code_col in ('CountyCode', 'DistrictCode', 'SchoolCode'):
    combined[code_col] = combined[code_col].astype(int).astype(str)

In [13]:
# Preview the first five rows after the code columns were converted to strings.
combined.head(n=5)

Unnamed: 0,AcademicYear,AggregateLevel,CountyCode,DistrictCode,SchoolCode,CountyName,DistrictName,SchoolName,Charter,ReportingCategory,CumulativeEnrollment
44415,2020-21,S,1,10017,130419,Alameda,Alameda County Office of Education,Alameda County Community,No,GF,87
44416,2020-21,S,1,10017,130419,Alameda,Alameda County Office of Education,Alameda County Community,No,GM,50
44417,2020-21,S,1,10017,130419,Alameda,Alameda County Office of Education,Alameda County Community,No,RA,*
44418,2020-21,S,1,10017,130419,Alameda,Alameda County Office of Education,Alameda County Community,No,RB,43
44419,2020-21,S,1,10017,130419,Alameda,Alameda County Office of Education,Alameda County Community,No,RD,*


In [14]:
# Write the combined school-level enrollment data to a UTF-8 CSV file.
# index=False leaves the DataFrame's row labels out of the file.
combined.to_csv(
    'enrollment2021_school_level.csv',
    index=False,
    encoding='utf-8',
)

- The reporting categories used in the `ReportingCategory` column:
  - RB = African American
  - RI = American Indian or Alaska Native
  - RA = Asian
  - RF = Filipino
  - RH = Hispanic or Latino
  - RD = Not Reported
  - RP = Pacific Islander
  - RT = Two or More Races
  - RW = White
  - GM = Male
  - GF = Female
  - GX = Non-Binary Gender (Beginning 2019–20)
  - GZ = Missing Gender
  - SE = English Learners
  - SD = Students with Disabilities
  - SS = Socioeconomically Disadvantaged
  - SM = Migrant
  - SF = Foster
  - SH = Homeless
  - TA = Total