In [8]:
import pandas as pd
import numpy as np
import plotly as plt
import plotly.express as px

## Reading in assessment data for every TN Public school for 5 years of historical data

In [9]:
# Load one school-level assessment file per year; each year gets its own DataFrame.
(
    data_2017,  # lacks the system_name / school_name columns
    data_2018,
    data_2019,
    data_2021,
    data_2022,  # uses different naming conventions
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/school_based/2017.csv',
        '../data/school_based/2018.csv',
        '../data/school_based/2019.csv',
        '../data/school_based/2021.csv',
        '../data/school_based/2022.csv',
    )
)

In [10]:
# Preview the first rows of the 2017 data to see which columns it provides.
data_2017.head()

Unnamed: 0,year,system,school,subject,grade,subgroup,valid_tests,n_below,n_approaching,n_on_track,n_mastered,pct_below,pct_approaching,pct_on_track,pct_mastered,pct_on_mastered
0,2017,10,2,Algebra I,10,All Students,50.0,**,**,**,**,**,**,**,**,**
1,2017,10,2,Algebra I,10,Economically Disadvantaged,25.0,**,**,**,**,**,**,**,**,**
2,2017,10,2,Algebra I,10,Non-Black/Hispanic/Native American,50.0,**,**,**,**,**,**,**,**,**
3,2017,10,2,Algebra I,10,Non-Economically Disadvantaged,25.0,**,**,**,**,**,**,**,**,**
4,2017,10,2,Algebra I,10,Non-English Learners,50.0,**,**,**,**,**,**,**,**,**


## Issues
- 2017 does not have the `school_name` and `system_name` columns.  
  - I will create a mapping from the 2018 dataframe to backfill the missing strings.
- Figure out how to deal with suppressed data and salvage as much student demographic data as possible  
  - `*` = The number of **valid** test scores is less than 10.
  - `**` = Any individual proficiency level is <u>less than</u> **1%** or <u>greater than</u> **99%** of the district average.


In [11]:
# Backfill the district and school names missing from the 2017 data, using 2018 as a lookup.

# Build the lookup: keep the first 2018 row for each (system, school) pair.
mapping_2018 = data_2018.drop_duplicates(subset=['system', 'school'])

# Drop name columns left over from an earlier run of this cell. Without this, re-running
# the cell merges onto an already-backfilled frame, producing suffixed duplicates
# (system_name_x / system_name_y) and a KeyError in the reorder step below.
# On the first run the columns do not exist yet, so this is a no-op.
data_2017 = data_2017.drop(columns=['system_name', 'school_name'], errors='ignore')

# Left merge keeps every 2017 row; pairs that are absent from 2018 get NaN names.
# validate='many_to_one' raises if the lookup is not unique on the keys, which would
# otherwise silently duplicate 2017 rows.
data_2017 = pd.merge(
    data_2017,
    mapping_2018[['system', 'system_name', 'school', 'school_name']],
    on=['system', 'school'],
    how='left',
    validate='many_to_one',
)

# Column order chosen to line up with the later years' dataframes.
# Selecting by this list also drops any column not named here (e.g. 'Unnamed: 0').
new_order = ['year', 'system', 'system_name', 'school', 'school_name', 'subject', 'grade', 'subgroup', 'valid_tests',
             'n_below', 'n_approaching', 'n_on_track', 'n_mastered', 'pct_below',
             'pct_approaching', 'pct_on_track', 'pct_mastered', 'pct_on_mastered']

# Reorder the columns
data_2017 = data_2017[new_order]

In [12]:
# Inspect the 2018 lookup table (de-duplicated on system and school).
mapping_2018

Unnamed: 0,year,system,system_name,school,school_name,test,subject,grade,subgroup,valid_tests,n_below,n_approaching,n_on_track,n_mastered,pct_below,pct_approaching,pct_on_track,pct_mastered,pct_on_mastered
0,2018,10,Anderson County,2,Anderson County High School,EOC,Algebra I,10,All Students,43.0,**,**,**,**,**,**,**,**,**
471,2018,10,Anderson County,5,Andersonville Elementary,MSAA/Alt-Science/Social Studies,ELA,3,All Students,1.0,*,*,*,*,*,*,*,*,*
707,2018,10,Anderson County,10,Briceville Elementary,TNReady,ELA,3,All Students,12.0,**,**,**,**,**,**,**,**,25
901,2018,10,Anderson County,15,Claxton Elementary,TNReady,ELA,3,All Students,56.0,11,27,15,3,19.6,48.2,26.8,5.4,32.1
1188,2018,10,Anderson County,20,Clinton Middle School,EOC,Algebra I,8,All Students,58.0,3,13,22,20,5.2,22.4,37.9,34.5,72.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593903,2018,985,Achievement School District,8125,Memphis Scholars Caldwell-Guthrie,MSAA/Alt-Science/Social Studies,ELA,3,All Students,4.0,*,*,*,*,*,*,*,*,*
594135,2018,985,Achievement School District,8130,Memphis Scholars Raleigh-Egypt,MSAA/Alt-Science/Social Studies,ELA,6,All Students,2.0,*,*,*,*,*,*,*,*,*
594401,2018,985,Achievement School District,8135,Kirby Middle School,TNReady,ELA,6,All Students,103.0,**,**,**,**,**,**,**,**,17.5
594725,2018,985,Achievement School District,8140,Hillcrest High School,EOC,Algebra I,9,All Students,89.0,**,**,**,**,**,**,**,**,**


Looks like the mapping was mostly successful, but it did not completely solve the issue.  The pre-backfill 2017 dataset did not contain NaN values.  After the backfill, 2429 entries in the newly constructed `system_name` and `school_name` columns went unmatched.  I will need to explore this further.


In [50]:
# Check dtypes and non-null counts; name columns with fewer non-null values than the
# row total mark rows that received no name from the left merge.
data_2017.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 440633 entries, 0 to 440632
Data columns (total 18 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   year             440633 non-null  int64  
 1   system           440633 non-null  int64  
 2   system_name      438204 non-null  object 
 3   school           440633 non-null  int64  
 4   school_name      438204 non-null  object 
 5   subject          440633 non-null  object 
 6   grade            440633 non-null  object 
 7   subgroup         440633 non-null  object 
 8   valid_tests      440633 non-null  float64
 9   n_below          440633 non-null  object 
 10  n_approaching    440633 non-null  object 
 11  n_on_track       440633 non-null  object 
 12  n_mastered       440633 non-null  object 
 13  pct_below        440633 non-null  object 
 14  pct_approaching  440633 non-null  object 
 15  pct_on_track     440633 non-null  object 
 16  pct_mastered     440633 non-null  obje

In [65]:
# For each system, list the distinct school codes whose rows still have no system_name.
missing_2017 = (
    data_2017.loc[data_2017['system_name'].isna(), ['system', 'school']]
    .groupby('system')['school']
    .unique()
)
missing_2017

system
10            [105]
11              [0]
150             [0]
190      [425, 520]
231            [25]
300             [0]
470            [83]
580         [0, 75]
650             [0]
792    [2075, 2760]
794           [170]
800            [35]
820           [200]
830             [0]
860             [0]
985    [8035, 8080]
Name: school, dtype: object

In [54]:
# Show a single 2018 row as a reference for that year's column layout.
data_2018.head(n=1)

Unnamed: 0,year,system,system_name,school,school_name,test,subject,grade,subgroup,valid_tests,n_below,n_approaching,n_on_track,n_mastered,pct_below,pct_approaching,pct_on_track,pct_mastered,pct_on_mastered
0,2018,10,Anderson County,2,Anderson County High School,EOC,Algebra I,10,All Students,43.0,**,**,**,**,**,**,**,**,**


In [None]:
# Show a single 2021 row as a reference for that year's column layout.
data_2021.head(n=1)

In [None]:
# 