# OLS Modeling: ELA - Super Subgroup vs. All Students Based on Locale

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
# Load the pickled metrics table and discard its 'geometry' column.
# NOTE(review): unpickling can execute arbitrary code - only load pickles from a trusted source.
metrics = (
    pd.read_pickle('../data/Weighted_average_metrics.pkl')
    .drop(columns = 'geometry')
)

In [3]:
# Print the frame's column names, non-null counts and dtypes.
metrics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55834 entries, 0 to 55833
Data columns (total 30 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   system_name    55834 non-null  object 
 1   school_lvl     55834 non-null  object 
 2   school_type    55834 non-null  object 
 3   magnet         55834 non-null  int64  
 4   charter        55834 non-null  int64  
 5   title_1        55834 non-null  object 
 6   locale         55834 non-null  object 
 7   subject_area   55834 non-null  object 
 8   student_group  55834 non-null  object 
 9   pctm_18        34510 non-null  float64
 10  pctm_19        28112 non-null  float64
 11  pctm_21        38095 non-null  float64
 12  pctm_22        39521 non-null  float64
 13  fte_18         34510 non-null  float64
 14  fte_19         28112 non-null  float64
 15  fte_21         38095 non-null  float64
 16  fte_22         39521 non-null  float64
 17  str_18         34510 non-null  float64
 18  str_19

## Schools in Tennessee exist in 12 locales.

In [4]:
# Show the distinct values found in the 'locale' column.
metrics['locale'].unique()

array(['32-Town: Distant', '21-Suburb: Large', '41-Rural: Fringe',
       '31-Town: Fringe', '42-Rural: Distant', '33-Town: Remote',
       '43-Rural: Remote', '23-Suburb: Small', '13-City: Small',
       '22-Suburb: Mid-size', '12-City: Mid-size', '11-City: Large'],
      dtype=object)

## The differences in locale are not statistically significant for ELA - All Students (Prob (F-statistic) = 0.245 > 0.05)

In [5]:

# Response: pctm_lvpp. Explanatory: locale, treated as a categorical.
# Drop rows missing any modelled column first, then build the ELA / "All Students"
# mask from that same frame so the mask and the frame share one index.
model_data = metrics.dropna(subset = ['student_group', 'locale', 'pctm_lvpp'])
model_data = model_data.loc[
    (model_data['subject_area'] == 'ELA')
    & (model_data['student_group'] == 'All Students')
]

# student_group is constant after the filter above, so it is left out of the formula;
# locale is the only term that can explain variation in the response.
formula = 'pctm_lvpp ~ C(locale)'

model = smf.ols(formula = formula, data = model_data).fit()

model.summary()

0,1,2,3
Dep. Variable:,pctm_lvpp,R-squared:,0.023
Model:,OLS,Adj. R-squared:,0.005
Method:,Least Squares,F-statistic:,1.259
Date:,"Wed, 14 Jun 2023",Prob (F-statistic):,0.245
Time:,21:54:05,Log-Likelihood:,797.04
No. Observations:,589,AIC:,-1570.0
Df Residuals:,577,BIC:,-1518.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0438,0.012,-3.532,0.000,-0.068,-0.019
C(locale)[T.12-City: Mid-size],0.0247,0.016,1.543,0.123,-0.007,0.056
C(locale)[T.13-City: Small],0.0073,0.017,0.424,0.671,-0.026,0.041
C(locale)[T.21-Suburb: Large],0.0097,0.015,0.665,0.506,-0.019,0.038
C(locale)[T.22-Suburb: Mid-size],-0.0258,0.020,-1.309,0.191,-0.064,0.013
C(locale)[T.23-Suburb: Small],-0.0276,0.031,-0.896,0.371,-0.088,0.033
C(locale)[T.31-Town: Fringe],0.0055,0.017,0.324,0.746,-0.028,0.039
C(locale)[T.32-Town: Distant],0.0053,0.014,0.372,0.710,-0.023,0.033
C(locale)[T.33-Town: Remote],0.0206,0.019,1.111,0.267,-0.016,0.057

0,1,2,3
Omnibus:,86.312,Durbin-Watson:,1.888
Prob(Omnibus):,0.0,Jarque-Bera (JB):,271.06
Skew:,0.685,Prob(JB):,1.38e-59
Kurtosis:,6.028,Cond. No.,18.5


## The differences in locale are not statistically significant for ELA - Super Subgroup (Prob (F-statistic) = 0.3 > 0.05)

In [6]:
# Response: pctm_lvpp. Explanatory: locale, treated as a categorical.
# Drop rows missing any modelled column first, then build the ELA / "Super Subgroup"
# mask from that same frame so the mask and the frame share one index.
model_data = metrics.dropna(subset = ['student_group', 'locale', 'pctm_lvpp'])
model_data = model_data.loc[
    (model_data['subject_area'] == 'ELA')
    & (model_data['student_group'] == 'Super Subgroup')
]

# student_group is constant after the filter above, so it is left out of the formula;
# locale is the only term that can explain variation in the response.
formula = 'pctm_lvpp ~ C(locale)'

model = smf.ols(formula = formula, data = model_data).fit()

model.summary()

0,1,2,3
Dep. Variable:,pctm_lvpp,R-squared:,0.022
Model:,OLS,Adj. R-squared:,0.003
Method:,Least Squares,F-statistic:,1.177
Date:,"Wed, 14 Jun 2023",Prob (F-statistic):,0.3
Time:,21:54:05,Log-Likelihood:,834.13
No. Observations:,577,AIC:,-1644.0
Df Residuals:,565,BIC:,-1592.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0434,0.011,-3.845,0.000,-0.066,-0.021
C(locale)[T.12-City: Mid-size],0.0229,0.015,1.544,0.123,-0.006,0.052
C(locale)[T.13-City: Small],-0.0086,0.016,-0.543,0.587,-0.040,0.022
C(locale)[T.21-Suburb: Large],0.0100,0.013,0.750,0.454,-0.016,0.036
C(locale)[T.22-Suburb: Mid-size],-0.0246,0.018,-1.369,0.171,-0.060,0.011
C(locale)[T.23-Suburb: Small],0.0108,0.031,0.349,0.727,-0.050,0.072
C(locale)[T.31-Town: Fringe],0.0107,0.016,0.685,0.494,-0.020,0.041
C(locale)[T.32-Town: Distant],0.0091,0.013,0.701,0.483,-0.016,0.034
C(locale)[T.33-Town: Remote],0.0185,0.017,1.062,0.289,-0.016,0.053

0,1,2,3
Omnibus:,62.091,Durbin-Watson:,1.798
Prob(Omnibus):,0.0,Jarque-Bera (JB):,282.192
Skew:,0.347,Prob(JB):,5.28e-62
Kurtosis:,6.355,Cond. No.,18.5
