# Stats Models

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import plotly.graph_objects as go

In [2]:
# Load the assessments frame from the pickle stored under ../data/school_based.
# NOTE(review): unpickling can execute arbitrary code — only load this file if
# it comes from a trusted source.
unweighted = pd.read_pickle('../data/school_based/assessments_clean.pkl')

In [3]:
# Display the loaded frame (pandas truncates long frames in the rendered output).
unweighted

Unnamed: 0,year,system_name,school_name,test,subject,subject_area,student_group,valid_tests,pct_met_exceeded,system,...,charter,virtual,title_1,lat,long,locale,geometry,pct_met_exceeded_w,stu_tchr_ratio_w,fte_teachers_w
0,2018,Achievement School District,Aspire Coleman,TNReady,ELA,ELA,All Students,321.0,0.112,TN-00985,...,1,NOTVIRTUAL,1,35.21410,-89.923641,11-City: Large,POINT (-89.92364 35.21410),35.952,6654.33,8827.50
1,2018,Achievement School District,Aspire Coleman,TNReady,ELA,ELA,Black or African American,299.0,0.114,TN-00985,...,1,NOTVIRTUAL,1,35.21410,-89.923641,11-City: Large,POINT (-89.92364 35.21410),34.086,6198.27,8222.50
2,2018,Achievement School District,Aspire Coleman,TNReady,ELA,ELA,Black/Hispanic/Native American,312.0,0.115,TN-00985,...,1,NOTVIRTUAL,1,35.21410,-89.923641,11-City: Large,POINT (-89.92364 35.21410),35.880,6467.76,8580.00
3,2018,Achievement School District,Aspire Coleman,TNReady,ELA,ELA,Economically Disadvantaged,222.0,0.095,TN-00985,...,1,NOTVIRTUAL,1,35.21410,-89.923641,11-City: Large,POINT (-89.92364 35.21410),21.090,4602.06,6105.00
4,2018,Achievement School District,Aspire Coleman,TNReady,ELA,ELA,English Learners with Transitional 1-4,11.0,0.182,TN-00985,...,1,NOTVIRTUAL,1,35.21410,-89.923641,11-City: Large,POINT (-89.92364 35.21410),2.002,228.03,302.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
381475,2022,Wilson County,Wilson Central High School,EOC,Geometry,Math,Non-Economically Disadvantaged,324.0,0.432,TN-00950,...,0,SUPPVIRTUAL,0,36.13409,-86.395110,41-Rural: Fringe,POINT (-86.39511 36.13409),139.968,5514.48,29403.00
381476,2022,Wilson County,Wilson Central High School,EOC,Geometry,Math,Non-English Learners/Transitional 1-4,331.0,0.432,TN-00950,...,0,SUPPVIRTUAL,0,36.13409,-86.395110,41-Rural: Fringe,POINT (-86.39511 36.13409),142.992,5633.62,30038.25
381477,2022,Wilson County,Wilson Central High School,EOC,Geometry,Math,Non-Students with Disabilities,335.0,0.445,TN-00950,...,0,SUPPVIRTUAL,0,36.13409,-86.395110,41-Rural: Fringe,POINT (-86.39511 36.13409),149.075,5701.70,30401.25
381479,2022,Wilson County,Wilson Central High School,EOC,Geometry,Math,Super Subgroup,129.0,0.202,TN-00950,...,0,SUPPVIRTUAL,0,36.13409,-86.395110,41-Rural: Fringe,POINT (-86.39511 36.13409),26.058,2195.58,11706.75


In [4]:
# Keep only the two student groups used below to derive the "remaining" group.
unweighted = unweighted[
    unweighted['student_group'].isin(["All Students", "Super Subgroup"])
]

In [5]:
# List the available columns before choosing the pivot index.
unweighted.columns

Index(['year', 'system_name', 'school_name', 'test', 'subject', 'subject_area',
       'student_group', 'valid_tests', 'pct_met_exceeded', 'system', 'school',
       'school_lvl', 'tot_enrolled', 'fte_teachers', 'stu_tchr_ratio',
       'school_type', 'magnet', 'charter', 'virtual', 'title_1', 'lat', 'long',
       'locale', 'geometry', 'pct_met_exceeded_w', 'stu_tchr_ratio_w',
       'fte_teachers_w'],
      dtype='object')

The data has no category for students who are not in the super subgroup, so we need some algebra to construct it. The calculations are performed with a pivot table.

In [6]:
# Row index for the first pivot: every descriptive column except
# student_group (which becomes the pivot's columns) and the measures.
group_cols = [
    # test identifiers
    'year', 'system_name', 'school_name', 'test', 'subject', 'subject_area',
    'system', 'school',
    # school attributes
    'school_lvl', 'tot_enrolled', 'fte_teachers', 'stu_tchr_ratio',
    'school_type', 'magnet', 'charter', 'virtual', 'title_1',
    # location
    'lat', 'long', 'locale',
]

In [7]:
# First pass: valid_tests only, one column per student group.
# pivot_first is reassigned by the next pivot cell before it is used.
pivot_first = unweighted.pivot_table(
    index = group_cols,
    columns = 'student_group',
    values = 'valid_tests',
)



The calculated "remaining" columns represent the students who are not in the super subgroup (the "non-ssg" group).

In [8]:
# Pivot both measures so that each student group becomes a column under
# its measure (two-level column index: measure, student_group).
pivot_first = unweighted.pivot_table(
    index = group_cols,
    columns = 'student_group',
    values = ['valid_tests', 'pct_met_exceeded'],
)

# Valid tests outside the super subgroup = All Students - Super Subgroup.
pivot_first[('valid_tests', 'remaining')] = (
    pivot_first[('valid_tests', 'All Students')]
    .sub(pivot_first[('valid_tests', 'Super Subgroup')])
)

In [9]:
# Back out the proficiency rate of the "remaining" (non-super-subgroup) students:
#   pr = (na * pa - ns * ps) / nr
# where n = valid_tests, p = pct_met_exceeded, a = All Students,
# s = Super Subgroup and r = remaining.
met_all_students = pivot_first[('valid_tests','All Students')]*pivot_first[('pct_met_exceeded','All Students')]
met_super_subgroup = pivot_first[('valid_tests','Super Subgroup')]*pivot_first[('pct_met_exceeded','Super Subgroup')]

# Only divide where at least one test remains. A zero remainder would otherwise
# yield +/-inf whenever the numerator is not exactly zero, and a negative
# remainder would yield a meaningless rate; both become NaN so that later
# dropna() calls exclude them.
remaining_tests = pivot_first[('valid_tests','remaining')]
pivot_first[('pct_met_exceeded', 'remaining')] = (
    (met_all_students - met_super_subgroup) / remaining_tests.where(remaining_tests > 0)
)


In [10]:
# Display the pivot including the derived "remaining" columns.
pivot_first

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0,Unnamed: 17_level_0,Unnamed: 18_level_0,Unnamed: 19_level_0,pct_met_exceeded,pct_met_exceeded,valid_tests,valid_tests,valid_tests,pct_met_exceeded
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,student_group,All Students,Super Subgroup,All Students,Super Subgroup,remaining,remaining
year,system_name,school_name,test,subject,subject_area,system,school,school_lvl,tot_enrolled,fte_teachers,stu_tchr_ratio,school_type,magnet,charter,virtual,title_1,lat,long,locale,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2
2018,Achievement School District,Aspire Coleman,TNReady,ELA,ELA,TN-00985,TN-00985-8050,Elementary,570,27.50,20.73,1-Regular school,0,1,NOTVIRTUAL,1,35.214100,-89.923641,11-City: Large,0.112,0.113,321.0,318.0,3.0,0.006000
2018,Achievement School District,Aspire Coleman,TNReady,Math,Math,TN-00985,TN-00985-8050,Elementary,570,27.50,20.73,1-Regular school,0,1,NOTVIRTUAL,1,35.214100,-89.923641,11-City: Large,0.096,0.094,322.0,319.0,3.0,0.308667
2018,Achievement School District,Aspire Coleman,TNReady,Science,Science,TN-00985,TN-00985-8050,Elementary,570,27.50,20.73,1-Regular school,0,1,NOTVIRTUAL,1,35.214100,-89.923641,11-City: Large,0.211,0.211,166.0,166.0,0.0,
2018,Achievement School District,Aspire Hanley Elementary,TNReady,Math,Math,TN-00985,TN-00985-8025,Elementary,513,19.50,26.31,1-Regular school,0,1,NOTVIRTUAL,1,35.113582,-89.976593,11-City: Large,0.109,0.107,238.0,233.0,5.0,0.202200
2018,Achievement School District,Aspire Hanley Elementary,TNReady,Science,Science,TN-00985,TN-00985-8025,Elementary,513,19.50,26.31,1-Regular school,0,1,NOTVIRTUAL,1,35.113582,-89.976593,11-City: Large,0.149,0.139,74.0,72.0,2.0,0.509000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022,Wilson County,Barry Tatum Virtual Learning Academy,EOC,Geometry,Math,TN-00950,TN-00950-0087,Other,222,16.93,13.11,1-Regular school,0,0,FACEVIRTUAL,0,36.167459,-86.305948,31-Town: Fringe,0.263,0.222,38.0,18.0,20.0,0.299900
2022,Wilson County,Barry Tatum Virtual Learning Academy,TNReady,ELA,ELA,TN-00950,TN-00950-0087,Other,222,16.93,13.11,1-Regular school,0,0,FACEVIRTUAL,0,36.167459,-86.305948,31-Town: Fringe,0.452,0.393,62.0,28.0,34.0,0.500588
2022,Wilson County,Barry Tatum Virtual Learning Academy,TNReady,Math,Math,TN-00950,TN-00950-0087,Other,222,16.93,13.11,1-Regular school,0,0,FACEVIRTUAL,0,36.167459,-86.305948,31-Town: Fringe,0.541,0.500,61.0,28.0,33.0,0.575788
2022,Wilson County,Barry Tatum Virtual Learning Academy,TNReady,Science,Science,TN-00950,TN-00950-0087,Other,222,16.93,13.11,1-Regular school,0,0,FACEVIRTUAL,0,36.167459,-86.305948,31-Town: Fringe,0.613,0.500,62.0,28.0,34.0,0.706059


In [11]:
# Return to long format: one row per index combination and student group,
# keeping only the pct_met_exceeded measure.
algebra = (
    pivot_first['pct_met_exceeded']
    .melt(ignore_index = False)                       # keep the row index while unpivoting
    .rename(columns = {'value': 'pct_met_exceeded'})
    .reset_index()
)

In [12]:
# na = valid tests (All Students)
# pa = pct_met_exceeded (All Students)
# ns = valid tests (Super Subgroup)
# ps = pct_met_exceeded (Super Subgroup)
# nr = remaining valid tests = na - ns

# pr = pct_met_exceeded (remaining) = (na*pa - ns*ps) / nr

> 🗂️ lvpp → change in pct_met_exceeded from the last valid pre-pandemic assessment (2019) to 2021

In [13]:
# Row index for the second pivot: student_group joins the index, while
# year is left out so it can become the pivot's columns.
group_cols = [
    'student_group',
    # test identifiers
    'system_name', 'school_name', 'test', 'subject', 'subject_area',
    'system', 'school',
    # school attributes
    'school_lvl', 'school_type', 'magnet', 'charter', 'title_1', 'locale',
]

In [14]:
# One column per assessment year, holding pct_met_exceeded for each
# student group / school / subject combination.
pivot_second = algebra.pivot_table(
    index = group_cols,
    columns = 'year',
    values = 'pct_met_exceeded',
)

Calculate the lvpp change: the 2021 rate minus the 2019 rate.

In [15]:
# Last Valid Pre-Pandemic met_exceded measurement (lvpp)
# lvpp: change in pct_met_exceeded from the last valid pre-pandemic
# measurement (2019) to 2021; NaN when either year is missing.
pivot_second['lvpp'] = pivot_second[2021].sub(pivot_second[2019])

In [16]:
# Move the pivot's row index levels back into ordinary columns for modelling.
lvpp = pivot_second.reset_index()

In [17]:
# Display the per-year rates together with the derived lvpp column.
lvpp

year,student_group,system_name,school_name,test,subject,subject_area,system,school,school_lvl,school_type,magnet,charter,title_1,locale,2018,2019,2021,2022,lvpp
0,All Students,Achievement School District,Aspire Coleman,TNReady,ELA,ELA,TN-00985,TN-00985-8050,Elementary,1-Regular school,0,1,1,11-City: Large,0.112000,0.061000,,,
1,All Students,Achievement School District,Aspire Coleman,TNReady,Math,Math,TN-00985,TN-00985-8050,Elementary,1-Regular school,0,1,1,11-City: Large,0.096000,0.114000,,,
2,All Students,Achievement School District,Aspire Coleman,TNReady,Science,Science,TN-00985,TN-00985-8050,Elementary,1-Regular school,0,1,1,11-City: Large,0.211000,,,,
3,All Students,Achievement School District,Aspire Hanley Elementary,TNReady,ELA,ELA,TN-00985,TN-00985-8025,Elementary,1-Regular school,0,1,1,11-City: Large,,0.057000,,,
4,All Students,Achievement School District,Aspire Hanley Elementary,TNReady,Math,Math,TN-00985,TN-00985-8025,Elementary,1-Regular school,0,1,1,11-City: Large,0.109000,0.147000,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32633,remaining,Wilson County,Wilson Central High School,EOC,English I,ELA,TN-00950,TN-00950-0073,High,1-Regular school,0,0,0,41-Rural: Fringe,0.439000,0.473529,0.502820,0.579268,0.029291
32634,remaining,Wilson County,Wilson Central High School,EOC,English II,ELA,TN-00950,TN-00950-0073,High,1-Regular school,0,0,0,41-Rural: Fringe,0.533250,0.725217,0.611364,0.749716,-0.113854
32635,remaining,Wilson County,Wilson Central High School,EOC,English III,ELA,TN-00950,TN-00950-0073,High,1-Regular school,0,0,0,41-Rural: Fringe,0.412840,,,,
32636,remaining,Wilson County,Wilson Central High School,EOC,Geometry,Math,TN-00950,TN-00950-0073,High,1-Regular school,0,0,0,41-Rural: Fringe,0.373010,0.429425,0.437654,0.529321,0.008228


## Although there is a fair bit of uncertainty in the model, OLS indicates a statistically significant difference in ELA impact across school levels for the ssg vs. non-ssg student groups (see the High-school interaction term).

In [18]:
# Elem ela vs. ela in other school levels
# Explanatory
# Explanatory model: ELA rows with a usable lvpp, comparing the super
# subgroup with the derived "remaining" group across school levels.
model_data = (
    lvpp
    .dropna(subset = ['student_group', 'lvpp', 'school_lvl'])
    .loc[lambda rows: (rows['subject_area'] == 'ELA')
                      & (rows['student_group'] != 'All Students')]
)

# Main effects plus the student_group x school_lvl interaction.
formula = 'lvpp ~ C(student_group) * C(school_lvl)'

model = smf.ols(formula = formula, data = model_data).fit()

model.summary()

0,1,2,3
Dep. Variable:,lvpp,R-squared:,0.013
Model:,OLS,Adj. R-squared:,0.01
Method:,Least Squares,F-statistic:,4.174
Date:,"Tue, 13 Jun 2023",Prob (F-statistic):,2.27e-05
Time:,19:09:30,Log-Likelihood:,1858.9
No. Observations:,2800,AIC:,-3698.0
Df Residuals:,2790,BIC:,-3638.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0375,0.004,-8.620,0.000,-0.046,-0.029
C(student_group)[T.remaining],-0.0056,0.006,-0.897,0.370,-0.018,0.007
C(school_lvl)[T.High],0.0048,0.008,0.615,0.539,-0.010,0.020
C(school_lvl)[T.Middle],-0.0266,0.010,-2.737,0.006,-0.046,-0.008
C(school_lvl)[T.Other],-0.0028,0.024,-0.114,0.909,-0.051,0.045
C(school_lvl)[T.Secondary],0.0488,0.063,0.780,0.435,-0.074,0.171
C(student_group)[T.remaining]:C(school_lvl)[T.High],-0.0223,0.011,-1.994,0.046,-0.044,-0.000
C(student_group)[T.remaining]:C(school_lvl)[T.Middle],-0.0143,0.014,-1.029,0.304,-0.041,0.013
C(student_group)[T.remaining]:C(school_lvl)[T.Other],0.0168,0.035,0.487,0.627,-0.051,0.085

0,1,2,3
Omnibus:,848.331,Durbin-Watson:,1.829
Prob(Omnibus):,0.0,Jarque-Bera (JB):,57743.085
Skew:,0.54,Prob(JB):,0.0
Kurtosis:,25.221,Cond. No.,51.2


## It appears that there was a significant difference in ELA impact between charter and non-charter schools (All Students, all school levels).

In [20]:
# Explanatory
# Explanatory model: All Students ELA rows with a usable lvpp,
# charter vs. non-charter schools.
model_data = (
    lvpp
    .dropna(subset = ['student_group', 'lvpp', 'school_lvl'])
    .loc[lambda rows: (rows['subject_area'] == 'ELA')
                      & (rows['student_group'] == 'All Students')]
)

formula = 'lvpp ~ C(charter)'

model = smf.ols(formula = formula, data = model_data).fit()

model.summary()

0,1,2,3
Dep. Variable:,lvpp,R-squared:,0.007
Model:,OLS,Adj. R-squared:,0.006
Method:,Least Squares,F-statistic:,10.47
Date:,"Tue, 13 Jun 2023",Prob (F-statistic):,0.00124
Time:,19:09:30,Log-Likelihood:,1874.7
No. Observations:,1494,AIC:,-3745.0
Df Residuals:,1492,BIC:,-3735.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0445,0.002,-24.297,0.000,-0.048,-0.041
C(charter)[T.1],-0.0272,0.008,-3.236,0.001,-0.044,-0.011

0,1,2,3
Omnibus:,84.695,Durbin-Watson:,1.765
Prob(Omnibus):,0.0,Jarque-Bera (JB):,242.957
Skew:,0.255,Prob(JB):,1.75e-53
Kurtosis:,4.909,Cond. No.,4.71
