# Exploration for Reported Assessment Results

## Imports and Such

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import geopandas as gpd
from shapely.geometry import Point
import plotly.graph_objects as go
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 100)

In [2]:
# Import assessment data minus fully suppressed scores
assessments = pd.read_pickle('../data/school_based/assessments_clean.pkl')

# Import Suppressed outlier data
suppressed = pd.read_pickle ('../data/school_based/full_suppression.pkl')

# Import Tennessee School District Geometry
tn_leas = gpd.read_file('../data/tn_leas.geojson', index_col='system_name')

## Listy McListface - A Place to look at the lists in my dataframe.

In [3]:
# Assessments Info
assessments.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 335452 entries, 0 to 381480
Data columns (total 27 columns):
 #   Column              Non-Null Count   Dtype   
---  ------              --------------   -----   
 0   year                335452 non-null  int64   
 1   system_name         335452 non-null  object  
 2   school_name         335452 non-null  object  
 3   test                335452 non-null  object  
 4   subject             335452 non-null  object  
 5   subject_area        335452 non-null  object  
 6   student_group       335452 non-null  object  
 7   valid_tests         335452 non-null  float64 
 8   pct_met_exceeded    335452 non-null  float64 
 9   system              335452 non-null  object  
 10  school              335452 non-null  object  
 11  school_lvl          335452 non-null  object  
 12  tot_enrolled        335452 non-null  object  
 13  fte_teachers        334603 non-null  float64 
 14  stu_tchr_ratio      334603 non-null  float64 
 15  school_type   

In [4]:
# Student Groups
student_group_list = np.unique(assessments['student_group'].values).tolist()
student_group_list

['All Students',
 'American Indian or Alaska Native',
 'Asian',
 'Black or African American',
 'Black/Hispanic/Native American',
 'Economically Disadvantaged',
 'English Learner Transitional 1-4',
 'English Learners',
 'English Learners with Transitional 1-4',
 'Female',
 'Gifted',
 'Hispanic',
 'Male',
 'Native Hawaiian or Other Pacific Islander',
 'Non-Black/Hispanic/Native American',
 'Non-Economically Disadvantaged',
 'Non-English Learners/Transitional 1-4',
 'Non-Students with Disabilities',
 'Students with Disabilities',
 'Super Subgroup',
 'White']

In [5]:
# School Types
school_type_list = np.unique(assessments['school_type'].values).tolist()
school_type_list

['1-Regular school',
 '2-Special education school',
 '4-Alternative Education School']

In [6]:
# Subject Areas
subject_area_list = np.unique(assessments['subject_area'].values).tolist()
subject_area_list

['ELA', 'Math', 'Science', 'Social Studies']

## Unsuppressed: Broad Overview of Results

### Overall Unweighted Proficiencies by School-Level, Subject Area, and Year.

In [7]:
# Subject Area Pivot Table
subject_area_pivot = pd.pivot_table(assessments,
                       values = 'pct_met_exceeded',
                       index = ['school_lvl'],
                       columns = ['subject_area','year'],
                       aggfunc = np.mean)

# Get the current list of years
years = list(subject_area_pivot.columns)

subject_area_pivot
#.plot()

subject_area,ELA,ELA,ELA,ELA,Math,Math,Math,Math,Science,Science,Science,Social Studies,Social Studies,Social Studies,Social Studies
year,2018,2019,2021,2022,2018,2019,2021,2022,2018,2021,2022,2018,2019,2021,2022
school_lvl,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
Elementary,0.313354,0.317439,0.288133,0.332412,0.351674,0.40286,0.308109,0.338107,0.519245,0.353617,0.367404,0.338689,0.345175,0.328732,0.364729
High,0.269997,0.331378,0.294335,0.348687,0.231409,0.26061,0.201373,0.229202,0.400077,0.349336,0.355829,0.270426,0.286531,0.305459,0.343403
Middle,0.272938,0.289341,0.244374,0.294892,0.362565,0.390276,0.341143,0.357363,0.51125,0.322881,0.326773,0.337043,0.359389,0.334036,0.383663
Other,0.307405,0.327303,0.316126,0.344306,0.285647,0.308662,0.244099,0.240873,0.41355,0.365386,0.347952,0.304003,0.296822,0.349833,0.327282
Secondary,0.293484,0.276567,0.359758,0.370127,0.308447,0.384527,0.292991,0.387432,0.443152,0.505515,0.530455,0.277826,0.428159,0.291515,0.525471


#### Visual of Log Proficiencies For All Subjects Combined by School Level (Unweighted)

In [8]:
# Apply logarithmic scaling to the values
log_values = np.log(subject_area_pivot.values)

# Create a diverging colorscale for heatmap
colorscale = 'Viridis'

# Create the heatmap figure
fig = go.Figure(data=go.Heatmap(
    z=log_values,
    x=subject_area_pivot.columns.get_level_values('year'),
    y=subject_area_pivot.index,
    colorscale=colorscale,
    zmid=np.median(log_values)  # Set the midpoint of the colorscale
))

# Update the layout
fig.update_layout(
    title='Subject Area Heatmap (Log Scale)',
    xaxis_title='Year',
    yaxis_title='School Level'
)

# Show the figure
fig.show()

### 🗺️ Spatial Join of School Geometry (lat/long point) and District (polygons)

There's some funk going on here.  District names don't match across datasets.  I'm going to do a spatial merge to see which dististricts are associated based on thier physical location.

In [9]:
# Set the CRS for the assessments dataframe
assessments.crs = "EPSG:4269"

# Reproject assessments dataframe to match the CRS of tn_leas dataframe
reproject = assessments.to_crs(tn_leas.crs)

# Perform spatial join
assessments = gpd.sjoin(reproject, tn_leas, how='inner', predicate='intersects')

There is indeed a mismatch in naming conventions between datasets

In [10]:
# Looking at differences in naming conventions
pd.merge(
    left = assessments.groupby(['system_name_left','school_name'])['system_name_right'].nunique().loc[lambda x: x>1].reset_index().drop(columns = 'system_name_right'),
    right = assessments)[['system_name_left', 'school_name', 'system_name_right']].drop_duplicates()#.to_csv('../data/fixerupper.csv', index = False)

subset_assessments = assessments[['system_name_left', 'system_name_right']]
subset_assessments.tail()

Unnamed: 0,system_name_left,system_name_right
381475,Wilson County,Wilson County School District
381476,Wilson County,Wilson County School District
381477,Wilson County,Wilson County School District
381479,Wilson County,Wilson County School District
381480,Wilson County,Wilson County School District


In [11]:
# Load cleaned dictionary mapping
clean_dictionary =pd.read_csv('../data/clean_dictionary.csv')
clean_dictionary = pd.concat([pd.merge(
    left = assessments.groupby(['system_name_left','school_name'])['system_name_right'].nunique().loc[lambda x: x == 1].reset_index().drop(columns = 'system_name_right'),
    right = assessments)[['system_name_left', 'school_name', 'system_name_right']].drop_duplicates(), clean_dictionary])
clean_dictionary.tail(n=7)

Unnamed: 0,system_name_left,school_name,system_name_right
85,Williamson County,Centennial High School,Williamson County School District in Franklin
86,Williamson County,Discovery Virtual K-8 School,Williamson County School District in Franklin
87,Williamson County,Franklin High School,Williamson County School District in Franklin
88,Williamson County,Renaissance High School,Williamson County School District in Franklin
89,Williamson County,Vanguard Virtual High School,Williamson County School District in Franklin
90,Wilson County,Lebanon High School,Wilson County School District in Lebanon
91,Wilson County,Tennessee Virtual On-Line School,Wilson County School District


Proccessing steps: Keep system_name_right, rename as system_name, drop system_name_right, set_system_name at col index 2

In [12]:
# Merge assessments with the clean dictionary
assessments = (pd.merge(left = assessments, right = clean_dictionary))

# Rename the 'system_name_right' column to 'system_name'
assessments.rename(columns={'system_name_right': 'system_name'}, inplace=True)

# Move the 'system_name' column to the third position
columns = list(assessments.columns)
columns.insert(2, columns.pop(columns.index('system_name')))
assessments = assessments[columns]

# Drop the 'system_name_right' column
assessments.drop('system_name_left', axis=1, inplace=True)

In [13]:
assessments.head()

Unnamed: 0,year,system_name,school_name,test,subject,subject_area,student_group,valid_tests,pct_met_exceeded,system,school,school_lvl,tot_enrolled,fte_teachers,stu_tchr_ratio,school_type,magnet,charter,virtual,title_1,lat,long,locale,geometry,pct_met_exceeded_w,stu_tchr_ratio_w,fte_teachers_w,index_right
0,2018,Shelby County School District,Aspire Coleman,TNReady,ELA,ELA,All Students,321.0,0.112,TN-00985,TN-00985-8050,Elementary,570,27.5,20.73,1-Regular school,0,1,NOTVIRTUAL,1,35.2141,-89.923641,11-City: Large,POINT (-89.92364 35.21410),35.952,6654.33,8827.5,42
1,2018,Shelby County School District,Aspire Coleman,TNReady,ELA,ELA,Black or African American,299.0,0.114,TN-00985,TN-00985-8050,Elementary,570,27.5,20.73,1-Regular school,0,1,NOTVIRTUAL,1,35.2141,-89.923641,11-City: Large,POINT (-89.92364 35.21410),34.086,6198.27,8222.5,42
2,2018,Shelby County School District,Aspire Coleman,TNReady,ELA,ELA,Black/Hispanic/Native American,312.0,0.115,TN-00985,TN-00985-8050,Elementary,570,27.5,20.73,1-Regular school,0,1,NOTVIRTUAL,1,35.2141,-89.923641,11-City: Large,POINT (-89.92364 35.21410),35.88,6467.76,8580.0,42
3,2018,Shelby County School District,Aspire Coleman,TNReady,ELA,ELA,Economically Disadvantaged,222.0,0.095,TN-00985,TN-00985-8050,Elementary,570,27.5,20.73,1-Regular school,0,1,NOTVIRTUAL,1,35.2141,-89.923641,11-City: Large,POINT (-89.92364 35.21410),21.09,4602.06,6105.0,42
4,2018,Shelby County School District,Aspire Coleman,TNReady,ELA,ELA,English Learners with Transitional 1-4,11.0,0.182,TN-00985,TN-00985-8050,Elementary,570,27.5,20.73,1-Regular school,0,1,NOTVIRTUAL,1,35.2141,-89.923641,11-City: Large,POINT (-89.92364 35.21410),2.002,228.03,302.5,42


#### 🏫 List of columns in assessments (school level) for use in district analysis.

In [14]:
# Let's figure how how I can pivot this    
districts = assessments[['locale',
                         'year',
                         'system_name', 
                         'school_lvl', 
                         'subject_area', 
                         'student_group', 
                         'pct_met_exceeded_w', 
                         'school_type',
                         'magnet',
                         'charter',
                         'title_1',
                         'fte_teachers_w',
                         'stu_tchr_ratio_w',
                         'valid_tests']]

In [15]:
unique_system_names = districts['system_name'].unique()
unique_system_names

array(['Shelby County School District',
       'Metropolitan Nashville Public School District',
       'Alamo City School District',
       'Crockett County School District in Alamo',
       'Alcoa City School District', 'Fentress County School District',
       'Anderson County School District', 'Clinton City School District',
       'Anderson County School District in Clinton',
       'Arlington Community School District',
       'McMinn County School District in Athens', 'Athens City Schools',
       'Bartlett City School District', 'Bedford County School District',
       'Bells City School District', 'Benton County School District',
       'Bledsoe County School District', 'Blount County School District',
       'Bradford Special School District',
       'Bradley County School District', 'Cleveland City School District',
       'Bristol City School District', 'Campbell County School District',
       'Cannon County School District', 'Carter County School District',
       'Elizabe

### 🏋️ Weighting Metrics Based on Valid Tests

#### 📇 Indices for Weight Pivots

In [16]:
# Indices for pivots
indices = ['system_name',  # School District 
           'school_lvl', # Level of school (Elem, Middle, High)
           'school_type', # Regular, alternative, special education
           'magnet', # Is magnet?
           'charter', # Is charter?
           'title_1', # Is title 1?
           'locale', # Location category of school (rural, large city, etc)
           'subject_area', # Overall content area of 
           'student_group'] # Aggregate student groups (all students, students with disabilities, etc)

#### 🏋️➕ Sum of Valid-Test-Weighted Scores for pct_met_exceeded, fte_teachers, and student_tchr_ratio 

In [17]:
# Sum of valid test scores (The 🏋️)
weight = pd.pivot_table(
    districts,
    values='valid_tests',  # Column to calculate the sum of valid test scores
    index=indices,
    columns='year',
    aggfunc=np.sum
)

# Sum of weighted scores pivot for 'pct_met_exceeded'
sum_weighted_proficiency = pd.pivot_table(
    districts,
    values='pct_met_exceeded_w', # Weighted sum of students who met or exceeded expectations
    index=indices,
    columns='year',
    aggfunc=np.sum
)

# Sum of weighted scores pivot for 'fte_teachers'
sum_weighted_fte = pd.pivot_table(
    districts,
    values='fte_teachers_w', # Weighted sum of full-time equivalent teachers
    index=indices,
    columns='year',
    aggfunc=np.sum
)

# Sum of weighted scores pivot for 'stu_tchr_ratio'
sum_weighted_str = pd.pivot_table(
    districts,
    values='stu_tchr_ratio_w', # Weighted sum of student/teacher ratios
    index=indices,
    columns='year',
    aggfunc=np.sum
)

# Create a multi-level column index
column_index = pd.MultiIndex.from_product([['pct_met_exceeded', 'fte_teachers', 'stu_tchr_ratio'], sum_weighted_proficiency.columns])

# Concatenate the pivot tables horizontally
weighted_sums_pivot = pd.concat([sum_weighted_proficiency, sum_weighted_fte, sum_weighted_str], axis=1)
weighted_sums_pivot.columns = column_index

#### 🏋️⚖️ Weighted Averages for pct_met_exceeded_w, fte_teachers_w, stu_tchr_ratio_w

In [18]:
# Divide sum_weighted_proficiency by weight
weighted_avg_proficiency = sum_weighted_proficiency / weight

# Divide sum_weighted_fte by weight
weighted_avg_fte = sum_weighted_fte / weight

# Divide sum_weighted_str by weight
weighted_avg_str = sum_weighted_str / weight

# Create a multi-level column index for the weighted average pivots
column_index = pd.MultiIndex.from_product([['pct_met_exceeded', 'fte_teachers', 'stu_tchr_ratio'], weighted_avg_proficiency.columns])

# Concatenate the weighted average pivots horizontally
weighted_avg_pivot = pd.concat([weighted_avg_proficiency, weighted_avg_fte, weighted_avg_str], axis=1)
weighted_avg_pivot.columns = column_index

weighted_avg_pivot

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,pct_met_exceeded,pct_met_exceeded,pct_met_exceeded,pct_met_exceeded,fte_teachers,fte_teachers,fte_teachers,fte_teachers,stu_tchr_ratio,stu_tchr_ratio,stu_tchr_ratio,stu_tchr_ratio
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,year,2018,2019,2021,2022,2018,2019,2021,2022,2018,2019,2021,2022
system_name,school_lvl,school_type,magnet,charter,title_1,locale,subject_area,student_group,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
Alamo City School District,Elementary,1-Regular school,0,0,1,32-Town: Distant,ELA,All Students,0.455,0.465,0.458,0.570,40.0,41.0,41.00,37.00,15.28,15.66,14.49,16.38
Alamo City School District,Elementary,1-Regular school,0,0,1,32-Town: Distant,ELA,Black or African American,0.314,0.283,0.200,0.400,40.0,41.0,41.00,37.00,15.28,15.66,14.49,16.38
Alamo City School District,Elementary,1-Regular school,0,0,1,32-Town: Distant,ELA,Black/Hispanic/Native American,0.312,0.274,0.215,0.426,40.0,41.0,41.00,37.00,15.28,15.66,14.49,16.38
Alamo City School District,Elementary,1-Regular school,0,0,1,32-Town: Distant,ELA,Economically Disadvantaged,0.352,0.310,0.307,0.392,40.0,41.0,41.00,37.00,15.28,15.66,14.49,16.38
Alamo City School District,Elementary,1-Regular school,0,0,1,32-Town: Distant,ELA,English Learner Transitional 1-4,0.355,0.333,0.111,0.357,40.0,41.0,41.00,37.00,15.28,15.66,14.49,16.38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wilson County School District in Lebanon,High,1-Regular school,0,0,1,31-Town: Fringe,Science,Non-English Learners/Transitional 1-4,,,0.431,0.471,,,101.73,94.78,,,15.54,18.66
Wilson County School District in Lebanon,High,1-Regular school,0,0,1,31-Town: Fringe,Science,Non-Students with Disabilities,,,0.428,0.469,,,101.73,94.78,,,15.54,18.66
Wilson County School District in Lebanon,High,1-Regular school,0,0,1,31-Town: Fringe,Science,Students with Disabilities,,,0.156,0.235,,,101.73,94.78,,,15.54,18.66
Wilson County School District in Lebanon,High,1-Regular school,0,0,1,31-Town: Fringe,Science,Super Subgroup,,,0.289,0.317,,,101.73,94.78,,,15.54,18.66


### 🤺 Slicing and Lagging
> "... the heavy-sword splendid.  The hard-edgèd weapon;  with Hrunting to aid me, I shall gain me glory .. "
    >> *-Beowulf*

#### 🪟 Setting Variables for lag window slicing
> 🗒️ **lvpp lags** → The lags for **Science** related metrics require some special treatment. The last valid prepandemic measurement for **ELA**, **Math**, and **Social Studies** was in **2019**.   **Science**, however, was not assessed in **2019**, but was in **2018**.  I cannnot directly compare lag windows for all subjects that start in **2019**.  In order to include metrics related to **Science** for fair comparisons, its window must start at **2018**.  Therefore, **lvpp** variables will be used as treatment for window the **last valid prepandemic measure** for each content area (e.g. 2018 or 2019 respectively).

> 🗂️ lvpp → last valid pre-pandemic assessment to 2021 (first full school year after 2020 school closure)

> 🗂️ intra → difference between first and second years school were reopend post-pandemic (2021 - 2022)

> 🗂️ pre_post → difference between last valid pre-pandemic scores and last year in the dataset (lvpp to 2022)

In [19]:
# Set the stop year for all metrics
lvpp_stop = 2021

# Set the start year for Math, ELA, and Social Studies related metrics
lvpp_start_mess = 2019

# Set the start year for Science-related metrics
lvpp_start_science = 2018

# Set the start and stop year for the "intra" lag
intra_start = 2021
intra_stop = 2022

# Science Slicer
science_slice = (slice(None), slice(None), slice(None), slice(None), slice(None), slice(None), slice(None), 'Science')

# Non-Science slicer
subjects_slice = (slice(None), slice(None), slice(None), slice(None), slice(None), slice(None), slice(None))

#### 📝 Assessment Proficiency Lags

> 🗂️ pct_met_exceeded → Changes in weighted average full-time students who displayed **at-least minimum expected proficiency** over time.

In [20]:
# Calculate separate lag scores for Science-related metrics.

# Last Valid Pre-Pandemic met_exceded measurement (lvpp)
weighted_avg_pivot.loc[science_slice, ('pct_met_exceeded', 'lvpp')] = (
    weighted_avg_pivot.loc[science_slice, ('pct_met_exceeded', lvpp_stop)] -
    weighted_avg_pivot.loc[science_slice, ('pct_met_exceeded', lvpp_start_science)]
)

# Calculate the lvpp scores for Math, ELA, and Social Studies related metrics
weighted_avg_pivot.loc[
    subjects_slice + (['Math', 'ELA', 'Social Studies'],),
    ('pct_met_exceeded', 'lvpp')
] = (
    weighted_avg_pivot.loc[
        subjects_slice + (['Math', 'ELA', 'Social Studies'],),
        ('pct_met_exceeded', lvpp_stop)
    ] - weighted_avg_pivot.loc[
        subjects_slice + (['Math', 'ELA', 'Social Studies'],),
        ('pct_met_exceeded', lvpp_start_mess)
    ]
)

# Calculate the intra lag score for 'pct_met_exceeded'
weighted_avg_pivot.loc[subjects_slice, ('pct_met_exceeded', 'intra')] = (
    weighted_avg_pivot.loc[subjects_slice, ('pct_met_exceeded', intra_stop)] -
    weighted_avg_pivot.loc[subjects_slice, ('pct_met_exceeded', intra_start)]
)

# Calculate the 'pre-post' assessment lag scores
weighted_avg_pivot[('pct_met_exceeded', 'pre-post')] = (
    weighted_avg_pivot[('pct_met_exceeded', 2022)] -
    weighted_avg_pivot[('pct_met_exceeded', 'lvpp')]
)

#### 🧑‍🏫 Full Time Equivalent Teachers Lag

> 🗂️ fte_teachers → Changes in weighted-average of **full-time-equivalent teachers** over time.

In [21]:
# Last Valid Pre-Pandemic science-related full-time equivalent teachers measurement (lvpp)
weighted_avg_pivot.loc[science_slice, ('fte_teachers', 'lvpp')] = (
    weighted_avg_pivot.loc[science_slice, ('fte_teachers', lvpp_stop)] -
    weighted_avg_pivot.loc[science_slice, ('fte_teachers', lvpp_start_science)]
)

# Last Valid Pre-Pandemic full-time equivalent teacher measurement (lvpp)
weighted_avg_pivot.loc[
    subjects_slice + (['Math', 'ELA', 'Social Studies'],),
    ('fte_teachers', 'lvpp')
] = (
    weighted_avg_pivot.loc[
        subjects_slice + (['Math', 'ELA', 'Social Studies'],),
        ('fte_teachers', lvpp_stop)
    ] - weighted_avg_pivot.loc[
        subjects_slice + (['Math', 'ELA', 'Social Studies'],),
        ('fte_teachers', lvpp_start_mess)
    ]
)

# Calculate the intra lag score for 'fte_teachers'
weighted_avg_pivot.loc[subjects_slice, ('fte_teachers', 'intra')] = (
    weighted_avg_pivot.loc[subjects_slice, ('fte_teachers', intra_stop)] -
    weighted_avg_pivot.loc[subjects_slice, ('fte_teachers', intra_start)]
)

# Calculate the 'pre-post' assessment lag scores for full-time equivalent teachers
weighted_avg_pivot[('fte_teachers', 'pre-post')] = (
    weighted_avg_pivot[('fte_teachers', 2022)] -
    weighted_avg_pivot[('fte_teachers', 'lvpp')]
)

#### 🧑‍🎓/🧑‍🏫 Student Teacher Ratio Lag

> 🗂️ stu_tchr_ratio → Changes in weighted-average **student-to-teacher ratio** over time.

In [22]:
# Last Valid Pre-Pandemic science-related student/teacher ratio measurement (lvpp)
weighted_avg_pivot.loc[science_slice, ('stu_tchr_ratio', 'lvpp')] = (
    weighted_avg_pivot.loc[science_slice, ('stu_tchr_ratio', lvpp_stop)] -
    weighted_avg_pivot.loc[science_slice, ('stu_tchr_ratio', lvpp_start_science)]
)

# Last Valid Pre-Pandemic non science-related student/teacher ratio measurement (lvpp)
weighted_avg_pivot.loc[
    subjects_slice + (['Math', 'ELA', 'Social Studies'],),
    ('stu_tchr_ratio', 'lvpp')
] = (
    weighted_avg_pivot.loc[
        subjects_slice + (['Math', 'ELA', 'Social Studies'],),
        ('stu_tchr_ratio', lvpp_stop)
    ] - weighted_avg_pivot.loc[
        subjects_slice + (['Math', 'ELA', 'Social Studies'],),
        ('stu_tchr_ratio', lvpp_start_mess)
    ]
)

# Calculate the intra lag score for 'stu_tchr_ratio'
weighted_avg_pivot.loc[subjects_slice, ('stu_tchr_ratio', 'intra')] = (
    weighted_avg_pivot.loc[subjects_slice, ('stu_tchr_ratio', intra_stop)] -
    weighted_avg_pivot.loc[subjects_slice, ('stu_tchr_ratio', intra_start)]
)

# Calculate the 'pre-post' assessment lag scores for student/teacher ratios
weighted_avg_pivot[('stu_tchr_ratio', 'pre-post')] = (
    weighted_avg_pivot[('stu_tchr_ratio', 2022)] -
    weighted_avg_pivot[('stu_tchr_ratio', 'lvpp')]
)


### 🔨 "Un-Pivoting"  Weighted Average Pivot to Prepare for Geometry 
🗂️ weighted_avg_pivot → weighted_average_metrics

In [23]:
# Resetting index of weighted average pivot to create weighted_average_metrics DataFrame
weighted_average_metrics = weighted_avg_pivot.reset_index()

# Compressing hierarchy in columns and joining levels with "_"
weighted_average_metrics.columns = ['_'.join(str(col) for col in column) for column in weighted_average_metrics.columns.values]

# Removing trailing "_" introduced when compressing
for col in weighted_average_metrics.columns:
        # Check if the column name ends with '_'
        if col.endswith('_'):
            # If it does, remove the trailing underscore
            weighted_average_metrics = weighted_average_metrics.rename(columns={col: col[:-1]})

weighted_average_metrics.head()

Unnamed: 0,system_name,school_lvl,school_type,magnet,charter,title_1,locale,subject_area,student_group,pct_met_exceeded_2018,pct_met_exceeded_2019,pct_met_exceeded_2021,pct_met_exceeded_2022,fte_teachers_2018,fte_teachers_2019,fte_teachers_2021,fte_teachers_2022,stu_tchr_ratio_2018,stu_tchr_ratio_2019,stu_tchr_ratio_2021,stu_tchr_ratio_2022,pct_met_exceeded_lvpp,pct_met_exceeded_intra,pct_met_exceeded_pre-post,fte_teachers_lvpp,fte_teachers_intra,fte_teachers_pre-post,stu_tchr_ratio_lvpp,stu_tchr_ratio_intra,stu_tchr_ratio_pre-post
0,Alamo City School District,Elementary,1-Regular school,0,0,1,32-Town: Distant,ELA,All Students,0.455,0.465,0.458,0.57,40.0,41.0,41.0,37.0,15.28,15.66,14.49,16.38,-0.007,0.112,0.577,0.0,-4.0,37.0,-1.17,1.89,17.55
1,Alamo City School District,Elementary,1-Regular school,0,0,1,32-Town: Distant,ELA,Black or African American,0.314,0.283,0.2,0.4,40.0,41.0,41.0,37.0,15.28,15.66,14.49,16.38,-0.083,0.2,0.483,0.0,-4.0,37.0,-1.17,1.89,17.55
2,Alamo City School District,Elementary,1-Regular school,0,0,1,32-Town: Distant,ELA,Black/Hispanic/Native American,0.312,0.274,0.215,0.426,40.0,41.0,41.0,37.0,15.28,15.66,14.49,16.38,-0.059,0.211,0.485,0.0,-4.0,37.0,-1.17,1.89,17.55
3,Alamo City School District,Elementary,1-Regular school,0,0,1,32-Town: Distant,ELA,Economically Disadvantaged,0.352,0.31,0.307,0.392,40.0,41.0,41.0,37.0,15.28,15.66,14.49,16.38,-0.003,0.085,0.395,0.0,-4.0,37.0,-1.17,1.89,17.55
4,Alamo City School District,Elementary,1-Regular school,0,0,1,32-Town: Distant,ELA,English Learner Transitional 1-4,0.355,0.333,0.111,0.357,40.0,41.0,41.0,37.0,15.28,15.66,14.49,16.38,-0.222,0.246,0.579,0.0,-4.0,37.0,-1.17,1.89,17.55


In [24]:
# List of unique system names in weighted assessments
weighted_average_metrics_system_names = weighted_average_metrics['system_name'].tolist()

# List of unique system names in tn_leas
tn_leas_system_names = tn_leas['system_name'].tolist()

# Checking if system names exist in weighted assessments and do not in the district boundaries data set
unique_in_weighted_average_metrics = list(set(weighted_average_metrics_system_names) - set(tn_leas_system_names))

# Check if system name exists in district boundaries, but does not in weighted assessments
unique_in_tn_leas = list(set(tn_leas_system_names) - set(weighted_average_metrics_system_names))

Checking if system names exist in weighted assessments that do not in the district boundaries data set.  I've never been so happy to see an empty list!

In [25]:
unique_in_weighted_average_metrics

[]

###  📊 🤝 🗺️ Merging Weighted Average Metrics with District Geometry Files and converting to GeoDataFrame

In [26]:
weighted_average_metrics = pd.merge(left=weighted_average_metrics, right=tn_leas, on="system_name", how='left')
weighted_average_metrics = gpd.GeoDataFrame(weighted_average_metrics, geometry='geometry')

#### Renaming columns to conform with ERSI standards

In [27]:

rename_dict = {
    'pct_met_exceeded_2018': 'pctm_18',
    'pct_met_exceeded_2019': 'pctm_19',
    'pct_met_exceeded_2021': 'pctm_21',
    'pct_met_exceeded_2022': 'pctm_22',
    'fte_teachers_2018': 'fte_18',
    'fte_teachers_2019': 'fte_19',
    'fte_teachers_2021': 'fte_21',
    'fte_teachers_2022': 'fte_22',
    'stu_tchr_ratio_2018': 'str_18',
    'stu_tchr_ratio_2019': 'str_19',
    'stu_tchr_ratio_2021': 'str_21',
    'stu_tchr_ratio_2022': 'str_22',
    'pct_met_exceeded_lvpp': 'pctm_lvpp',
    'pct_met_exceeded_intra': 'pctm_intra',
    'pct_met_exceeded_pre-post': 'pctm_pp',
    'fte_teachers_lvpp': 'fte_lvpp',
    'fte_teachers_intra': 'fte_intra',
    'fte_teachers_pre-post': 'fte_pp',
    'stu_tchr_ratio_lvpp': 'str_lvpp',
    'stu_tchr_ratio_intra': 'str_intra',
    'stu_tchr_ratio_pre-post': 'str_pp'
}
weighted_average_metrics.rename(columns=rename_dict, inplace=True)

In [28]:
weighted_average_metrics.head(n=1)

Unnamed: 0,system_name,school_lvl,school_type,magnet,charter,title_1,locale,subject_area,student_group,pctm_18,pctm_19,pctm_21,pctm_22,fte_18,fte_19,fte_21,fte_22,str_18,str_19,str_21,str_22,pctm_lvpp,pctm_intra,pctm_pp,fte_lvpp,fte_intra,fte_pp,str_lvpp,str_intra,str_pp,geometry
0,Alamo City School District,Elementary,1-Regular school,0,0,1,32-Town: Distant,ELA,All Students,0.455,0.465,0.458,0.57,40.0,41.0,41.0,37.0,15.28,15.66,14.49,16.38,-0.007,0.112,0.577,0.0,-4.0,37.0,-1.17,1.89,17.55,"POLYGON ((-89.00656 35.83268, -89.00660 35.832..."


There's some funk going on here.  District names don't match across datasets.  I'm going to do a spatial merge to see which dististricts are associated based on thier physical location.

###  🗺️ Exporting Weighed Average Metrics GeoDataFrame as Shapefile

In [29]:
# CAUTION!: Don't turn on the GeoJSON generator unless lost.
weighted_average_metrics.to_file('../data/weighted_average_metrics.geojson', driver='GeoJSON')

### 📊 Visual EDA

In [30]:
# Get unique school levels
school_levels = weighted_assessments['school_lvl'].unique()

# Create the heatmap traces for each school level
heatmaps = []
for school_level in school_levels:
    data = weighted_assessments[weighted_assessments['school_lvl'] == school_level]
    heatmap = go.Heatmap(
        x=data['subject_area'],
        y=data['year'],
        z=data['pct_met_exceeded'],
        colorscale='viridis',
        colorbar=dict(title='pct_met_exceeded'),
        name=school_level
    )
    heatmaps.append(heatmap)

# Create the figure and add the heatmaps
fig = go.Figure(data=heatmaps)

# Customize the layout
fig.update_layout(
    title='Changes in pct_met_exceeded by School Level, Subject Area, and Year',
    xaxis=dict(title='Subject Area'),
    yaxis=dict(title='Year'),
    height=600,
    width=800
)

# Show the facetted heatmap
fig.show()


NameError: name 'weighted_assessments' is not defined

In [None]:
weighted_assessments.head()

Unnamed: 0,system_name,school_lvl,school_type,magnet,charter,title_1,locale,subject_area,student_group,year,fte_teachers,pct_met_exceeded,stu_tchr_ratio
0,Achievement School District,Elementary,1-Regular school,0,0,1,11-City: Large,ELA,All Students,2018,16.068687,0.09099,19.127374
1,Achievement School District,Elementary,1-Regular school,0,0,1,11-City: Large,ELA,All Students,2019,13.614091,0.075048,19.75875
2,Achievement School District,Elementary,1-Regular school,0,0,1,11-City: Large,ELA,All Students,2021,13.43705,0.057583,15.263022
3,Achievement School District,Elementary,1-Regular school,0,0,1,11-City: Large,ELA,All Students,2022,10.265169,0.089914,18.669213
4,Achievement School District,Elementary,1-Regular school,0,0,1,11-City: Large,ELA,All Students,lvpp,-0.177041,-0.017465,-4.495728


In [None]:
subject_area = [weighted_assessments['subject_area'].unique()]
subject_area

[array(['ELA', 'Math', 'Science', 'Social Studies'], dtype=object)]

In [None]:
# Calculate average proficiency per school level, content area, and year
average_proficiency = weighted_assessments.groupby(['school_lvl', 'subject_area', 'year'])['pct_met_exceeded'].mean().reset_index()

# Define subject_area
subject_area = weighted_assessments['subject_area'].unique().tolist()

# Reorder scool levels
school_lvl_order = ['Elementary', 'Middle', 'High', 'Secondary', 'Other']

# Create bar plot
fig = px.bar(average_proficiency, x='year', y='pct_met_exceeded', color='subject_area',
             facet_row='subject_area', facet_col='school_lvl',
             category_orders={'subject_area': subject_area, 'school_lvl': school_lvl_order},
             labels={'pct_met_exceeded': 'Average Proficiency',
                     'school_lvl': 'School Level',
                     'subject_area': 'Subject Area'})

# Update layout
fig.update_layout(
    title='Average Weighted Proficiency by School Level, Content Area, and Year',
   autosize=True,
    width=1200,
    height=900,
)

# Update the graph size to 50% Larger
fig.update_layout(
    autosize=True,
    width=1200,
    height=900,
)

# Show the figure
fig.show()


### σ Stats Models

In [None]:
import statsmodels.api as sm

# Prepare the data for the linear model
X = weighted_avg_pivot[('pct_met_exceeded', 'lvpp')]  # Independent variable
y = weighted_avg_pivot[('fte_teachers', 'intra')]  # Dependent variable

In [None]:
# Check for missing or NaN values in X
print(np.isnan(X).sum())
print(X.isnull().sum())

# Check for missing or NaN values in y
print(np.isnan(y).sum())
print(y.isnull().sum())

30867
30867
21882
21882


In [None]:

# Add a constant term to the independent variable
X = sm.add_constant(X)

# Fit the linear model
model = sm.OLS(y, X)
results = model.fit()

# Print the model summary
print(results.summary())

MissingDataError: exog contains inf or nans