In [1]:
import pandas as pd
import numpy as np
import plotly as plt
import plotly.express as px
import io

### Reading in assessment data for every TN Public school for 5 years of historical data
    > Technically 6 but the test was canceled for 2020

In [2]:
# Load one school-level assessment file per tested year, in chronological order.
data_2017, data_2018, data_2019, data_2021, data_2022 = map(
    pd.read_csv,
    [
        # 2017: missing data related to district, school, and enrollment
        '../data/school_based/2017.csv',
        # 2018: missing enrollment data
        '../data/school_based/2018.csv',
        '../data/school_based/2019.csv',
        '../data/school_based/2021.csv',
        # 2022: uses different naming conventions than previous years; metric
        # names elsewhere are standardized to align with these 2022 conventions
        '../data/school_based/2022.csv',
    ],
)

In [3]:
# Annual assessment dataframes, paired with the labels used in the report below
dfs = [data_2017, data_2018, data_2019, data_2021, data_2022]
df_names = ['year_2017', 'year_2018', 'year_2019', 'year_2021', 'year_2022']

# Gather every frame's .info() summary in a single in-memory buffer so the
# whole report can be printed in one go; the buffer is closed on leaving `with`.
with io.StringIO() as output:
    for df, name in zip(dfs, df_names):
        output.write(f"{name} info:\n")   # heading for this frame
        df.info(buf=output)               # .info() writes into the buffer
        output.write("\n---\n")           # separator between frames

    # Pull the text out while the buffer is still open
    info_str = output.getvalue()

print(info_str)

year_2017 info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440633 entries, 0 to 440632
Data columns (total 16 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   year             440633 non-null  int64  
 1   system           440633 non-null  int64  
 2   school           440633 non-null  int64  
 3   subject          440633 non-null  object 
 4   grade            440633 non-null  object 
 5   subgroup         440633 non-null  object 
 6   valid_tests      440633 non-null  float64
 7   n_below          440633 non-null  object 
 8   n_approaching    440633 non-null  object 
 9   n_on_track       440633 non-null  object 
 10  n_mastered       440633 non-null  object 
 11  pct_below        440633 non-null  object 
 12  pct_approaching  440633 non-null  object 
 13  pct_on_track     440633 non-null  object 
 14  pct_mastered     440633 non-null  object 
 15  pct_on_mastered  440633 non-null  object 
dtypes: float64(1), int64(3

## Issues
- Figure out how to deal with suppressed data and salvage as much student demographic data as you can      
  - `*` = The number of **valid** test scores is less than 10.
  - `**` = Any individual proficiency level is <u>less than</u> **1%** or <u>greater than</u> **99%** of the district average.


### Combine Datasets with more or less the same naming conventions

In [4]:
# Stack 2018, 2019 and 2021, then relabel their columns with the 2022 names.
# Only the labels change; each renamed column means exactly the same thing.
middle_years = (
    pd.concat([data_2018, data_2019, data_2021])
    .rename(
        columns={
            'n_on_track': 'n_met_expectations',
            'pct_on_track': 'pct_met_expectations',
            'n_mastered': 'n_exceeded_expectations',
            'pct_mastered': 'pct_exceeded_expectations',
            'subgroup': 'student_group',
            'pct_on_mastered': 'pct_met_exceeded',
        }
    )
)
middle_years.head()

Unnamed: 0,year,system,system_name,school,school_name,test,subject,grade,student_group,valid_tests,...,n_met_expectations,n_exceeded_expectations,pct_below,pct_approaching,pct_met_expectations,pct_exceeded_expectations,pct_met_exceeded,enrolled,tested,participation_rate
0,2018,10,Anderson County,2,Anderson County High School,EOC,Algebra I,10,All Students,43.0,...,**,**,**,**,**,**,**,,,
1,2018,10,Anderson County,2,Anderson County High School,EOC,Algebra I,10,Economically Disadvantaged,14.0,...,**,**,**,**,**,**,7.1,,,
2,2018,10,Anderson County,2,Anderson County High School,EOC,Algebra I,10,Non-Black/Hispanic/Native American,43.0,...,**,**,**,**,**,**,**,,,
3,2018,10,Anderson County,2,Anderson County High School,EOC,Algebra I,10,Non-Economically Disadvantaged,29.0,...,**,**,**,**,**,**,**,,,
4,2018,10,Anderson County,2,Anderson County High School,EOC,Algebra I,10,Non-English Learners/Transitional 1-4,43.0,...,**,**,**,**,**,**,**,,,


### Combining `middle_years` dataset with 2022 to unify all data except 2017 

In [5]:
# Append 2022 to the 2018-2021 frame, then discard the enrollment/participation
# fields and the raw n_* counts; identifiers, valid_tests and pct_* columns stay.
assessments = pd.concat([middle_years, data_2022]).drop(
    columns=[
        'enrolled',
        'tested',
        'participation_rate',
        'n_below',
        'n_approaching',
        'n_met_expectations',
        'n_exceeded_expectations',
    ]
)

assessments.head(2)

Unnamed: 0,year,system,system_name,school,school_name,test,subject,grade,student_group,valid_tests,pct_below,pct_approaching,pct_met_expectations,pct_exceeded_expectations,pct_met_exceeded
0,2018,10,Anderson County,2,Anderson County High School,EOC,Algebra I,10,All Students,43.0,**,**,**,**,**
1,2018,10,Anderson County,2,Anderson County High School,EOC,Algebra I,10,Economically Disadvantaged,14.0,**,**,**,**,7.1


In [6]:
# Count the distinct systems and distinct schools reported in each year.
# School codes repeat across systems (the same code appears under several
# districts), so a school is identified by its (system, school) pair; counting
# unique `school` codes alone would undercount schools.
school_keys = assessments[['year', 'system', 'school']].drop_duplicates()
unique_counts = (
    school_keys
    .groupby('year')
    # after de-duplication each row is one school, so `count` = schools per year
    .agg({'system': 'nunique', 'school': 'count'})
    .reset_index()
)

# Melt to long format (one row per year/type) for plotting
melted_counts = unique_counts.melt(id_vars='year', var_name='type', value_name='count')

# Grouped bar chart: systems vs. schools for each year
fig = px.bar(
    melted_counts,
    x='year',
    y='count',
    color='type',
    barmode='group',
    title='Unique Counts of Systems and Schools per Year',
)

# Show the chart
fig.show()

### Fix missing info in 2017 - I am going to set 2017 aside for now and focus on the later years.  There is some mislabeled data and some ambiguity about which test belongs with which subject.  I emailed TDOE, and I will leave this alone until I hear back.

In [7]:
# The 2017 file has only numeric system/school codes, so borrow the names from
# the later years: keep one row per (system, school) pair as a lookup table.
mapping_assessments = assessments.drop_duplicates(subset=['system', 'school'])

# Left-merge so every 2017 row is kept; pairs with no match get NaN names.
# `validate` raises if the lookup ever contains duplicate keys, which would
# otherwise silently multiply 2017 rows.
# NOTE: this cell overwrites `data_2017`; reload the CSV before re-running it.
data_2017 = pd.merge(
    data_2017,
    mapping_assessments[['system', 'system_name', 'school', 'school_name']],
    on=['system', 'school'],
    how='left',
    validate='many_to_one',
)

# Rename 2017 columns to the standard naming conventions
data_2017 = data_2017.rename(
    columns={
        'pct_on_track': 'pct_met_expectations',
        'pct_mastered': 'pct_exceeded_expectations',
        'subgroup': 'student_group',
        'pct_on_mastered': 'pct_met_exceeded',
    }
)

# Drop the raw count columns, which are not needed for analysis
data_2017 = data_2017.drop(
    columns=[
        'n_below',
        'n_approaching',
        'n_on_track',
        'n_mastered',
    ]
)

data_2017.head(n=1)

Unnamed: 0,year,system,school,subject,grade,student_group,valid_tests,pct_below,pct_approaching,pct_met_expectations,pct_exceeded_expectations,pct_met_exceeded,system_name,school_name
0,2017,10,2,Algebra I,10,All Students,50.0,**,**,**,**,**,Anderson County,Anderson County High School


In [8]:
# Distinct subject labels present in the 2017 file, in order of first appearance
pd.unique(data_2017['subject'])


array(['Algebra I', 'Algebra II', 'Biology I', 'Chemistry', 'English II',
       'Geometry', 'English III', 'US History', 'English I', 'ELA',
       'Math', 'Science', 'Integrated Math I', 'Integrated Math II',
       'Integrated Math III'], dtype=object)

Looks like school `0` occurs as a missing value frequently.  I'll look at that.
- The schools referenced here do not occur in the known districts past 2017.  I'll drop the NaN values and hope it doesn't come back to bite me.

In [9]:
# 2017 rows whose school name could not be filled in by the merge
missing_2017 = data_2017[data_2017['school_name'].isna()]

# For each system, list the distinct school codes that have no name.
# (The former intermediate `[['system', 'school']]` selection was overwritten
# before use, so it has been removed; the result is unchanged.)
dropped_missing_2017 = missing_2017.groupby('system')['school'].unique()
dropped_missing_2017

system
10            [105]
11              [0]
150             [0]
190      [425, 520]
231            [25]
300             [0]
470            [83]
580         [0, 75]
650             [0]
792    [2075, 2760]
794           [170]
800            [35]
820           [200]
830             [0]
860             [0]
985    [8035, 8080]
Name: school, dtype: object

### Subject/Test Cleaning

Looks like I need to break this table down a bit further to aggregate by test and then subject. The data are noisy, so I will filter by the aggregated scores reported for each school. The general heuristic is that the majority of students take these tests in a specific grade.  However, some students take the tests earlier or later than their peers.  In these cases, much of the grade-level data is suppressed.  Therefore, aggregations are much less suppressed than grade-level analysis.

In [10]:
# Keep only rows that
#   * do not come from an MSAA / alternate assessment: any test label that
#     contains "MSAA" (this also covers "MSAA/Alt-Science/Social Studies");
#     rows with a missing test label are excluded too, as before,
#   * report the "All Grades" aggregate for the subject, and
#   * are not fully suppressed for too few valid tests (`*`).
is_msaa = assessments['test'].str.contains('MSAA', regex=False, na=True)
is_all_grades = assessments['grade'].str.contains('All Grades', regex=False, na=False)
has_enough_tests = assessments['pct_below'] != '*'

assessments = assessments[~is_msaa & is_all_grades & has_enough_tests]

# TODO: once only "met" vs. "did not meet" is analysed, drop the individual
# proficiency columns and `grade` (no longer descriptive after this filter):
# assessments = assessments.drop(['grade', 'pct_below', 'pct_approaching', 'pct_met_expectations', 'pct_exceeded_expectations'], axis=1)

# TODO: convert pct_met_exceeded to a numeric dtype; the suppressed `**`
# entries need to be handled first.

In [11]:
# Preview the first five rows of the filtered frame
assessments.head(n=5)

Unnamed: 0,year,system,system_name,school,school_name,test,subject,grade,student_group,valid_tests,pct_below,pct_approaching,pct_met_expectations,pct_exceeded_expectations,pct_met_exceeded
26,2018,10,Anderson County,2,Anderson County High School,EOC,Algebra I,All Grades,All Students,209.0,**,**,**,**,23.4
31,2018,10,Anderson County,2,Anderson County High School,EOC,Algebra I,All Grades,Economically Disadvantaged,55.0,49.1,32.7,12.7,5.5,18.2
36,2018,10,Anderson County,2,Anderson County High School,EOC,Algebra I,All Grades,Non-Black/Hispanic/Native American,203.0,**,**,**,**,22.7
37,2018,10,Anderson County,2,Anderson County High School,EOC,Algebra I,All Grades,Non-Economically Disadvantaged,154.0,**,**,**,**,25.3
38,2018,10,Anderson County,2,Anderson County High School,EOC,Algebra I,All Grades,Non-English Learners/Transitional 1-4,208.0,**,**,**,**,23.6


In [12]:
# "All Students" rows whose combined met/exceeded percentage is still suppressed (`**`)
is_suppressed = assessments['pct_met_exceeded'].eq('**')
is_all_students = assessments['student_group'].eq('All Students')
bob = assessments.loc[is_suppressed & is_all_students]
bob
# Not yet applied: numeric conversion of the column
# bob['pct_met_exceeded'] = bob['pct_met_exceeded'].apply(pd.to_numeric)

Unnamed: 0,year,system,system_name,school,school_name,test,subject,grade,student_group,valid_tests,pct_below,pct_approaching,pct_met_expectations,pct_exceeded_expectations,pct_met_exceeded
5710,2018,12,Oak Ridge,25,Jefferson Middle School,EOC,Geometry,All Grades,All Students,13.0,**,**,**,**,**
9152,2018,20,Bedford County,20,Shelbyville Central High School,EOC,Algebra I,All Grades,All Students,352.0,**,**,**,**,**
10895,2018,20,Bedford County,35,Community High School,EOC,US History,All Grades,All Students,102.0,**,**,**,**,**
14141,2018,30,Benton County,27,Camden Jr High School,EOC,Integrated Math I,All Grades,All Students,22.0,**,**,**,**,**
16360,2018,50,Blount County,23,Carpenters Middle School,EOC,Algebra I,All Grades,All Students,28.0,**,**,**,**,**
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
674149,2022,985,Achievement School District,8140,Hillcrest High School,EOC,Geometry,All Grades,All Students,89.0,**,**,**,**,**
674337,2022,987,Tennessee Public Charter School Commission,8005,Bluff City High School,EOC,Algebra I,All Grades,All Students,109.0,**,**,**,**,**
674397,2022,987,Tennessee Public Charter School Commission,8005,Bluff City High School,EOC,Algebra II,All Grades,All Students,98.0,**,**,**,**,**
674469,2022,987,Tennessee Public Charter School Commission,8005,Bluff City High School,EOC,Biology I,All Grades,All Students,134.0,**,**,**,**,**


In [13]:
# TODO: calculate the total percentage "not met" for each student_group per assessment.
# The earlier draft (`assessments['pct_met_exceeded'] - 1`) is not usable as written:
# the column still holds strings such as '**', and the complement is presumably
# 100 - pct_met_exceeded rather than pct - 1 (confirm the percentage scale).

# Inspect column dtypes and non-null counts of the filtered frame
assessments.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 381481 entries, 26 to 675533
Data columns (total 15 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   year                       381481 non-null  int64  
 1   system                     381481 non-null  int64  
 2   system_name                381481 non-null  object 
 3   school                     381481 non-null  int64  
 4   school_name                381481 non-null  object 
 5   test                       381481 non-null  object 
 6   subject                    381481 non-null  object 
 7   grade                      381481 non-null  object 
 8   student_group              381481 non-null  object 
 9   valid_tests                381481 non-null  float64
 10  pct_below                  381481 non-null  object 
 11  pct_approaching            381481 non-null  object 
 12  pct_met_expectations       381481 non-null  object 
 13  pct_exceeded_expectations  3

In [14]:
# Look at the subjects for each test: count the rows per (test, subject) pair.
# Only these two columns are needed, and the table is built from the
# two-column selection that was previously computed but never used.
test_subjects = assessments[['test', 'subject']]
pivot_table = pd.crosstab(test_subjects['test'], test_subjects['subject'])
pivot_table

subject,Algebra I,Algebra II,Biology I,Chemistry,ELA,English I,English II,English III,Geometry,Integrated Math I,Integrated Math II,Integrated Math III,Math,Science,Social Studies,US History
test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
EOC,19623,14094,13090,3378,0,17980,17407,3541,15197,4395,3259,2840,0,0,0,13789
TNReady,0,0,0,0,77569,0,0,0,0,0,0,0,77439,53839,44041,0
