### Importing necessary Libraries

In [35]:
import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.anova import anova_lm
from statsmodels.formula.api import ols
from loguru import logger

### Load the Data

In [36]:
path = "star.xlsx"
df = pd.read_excel(path)  
df.columns

Index(['id', 'schid', 'tchid', 'tchexper', 'absent', 'readscore', 'mathscore',
       'totalscore', 'boy', 'white_asian', 'black', 'tchwhite', 'tchmasters',
       'freelunch', 'schurban', 'schrural', 'small', 'regular', 'aide'],
      dtype='object')

In [37]:
df["tchexper"] = df["tchexper"].replace("", np.nan)
df = df.dropna(subset=["tchexper"])
df["tchexper"] = pd.to_numeric(df["tchexper"], errors="coerce")

### a) Sample means by classroom type

In [38]:
means = {
    "Regular (no aide)": df.loc[(df["regular"] == 1) & (df["aide"] == 0), "totalscore"].mean(),
    "Regular (with aide)": df.loc[df["aide"] == 1, "totalscore"].mean(),
    "Small": df.loc[df["small"] == 1, "totalscore"].mean()
}
means

{'Regular (no aide)': np.float64(918.0428927680798),
 'Regular (with aide)': np.float64(918.5313890261987),
 'Small': np.float64(931.9418872266973)}

### b) Regression Model

In [39]:
X = df[["small", "aide"]]
X = sm.add_constant(X)
y = df["totalscore"]

model_b = sm.OLS(y, X).fit()
logger.info(model_b.summary())

[32m2025-09-27 11:46:43.792[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1m                            OLS Regression Results                            
Dep. Variable:             totalscore   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                  0.007
Method:                 Least Squares   F-statistic:                     20.93
Date:                Sat, 27 Sep 2025   Prob (F-statistic):           8.74e-10
Time:                        11:46:43   Log-Likelihood:                -32963.
No. Observations:                5766   AIC:                         6.593e+04
Df Residuals:                    5763   BIC:                         6.595e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------

### c) Adding teacher experience (tchexper):

In [40]:
X = df[["small", "aide", "tchexper"]]
X = sm.add_constant(X)
model_c = sm.OLS(y, X).fit()
logger.info(model_c.summary())

[32m2025-09-27 11:46:43.815[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1m                            OLS Regression Results                            
Dep. Variable:             totalscore   R-squared:                       0.020
Model:                            OLS   Adj. R-squared:                  0.020
Method:                 Least Squares   F-statistic:                     39.86
Date:                Sat, 27 Sep 2025   Prob (F-statistic):           1.73e-25
Time:                        11:46:43   Log-Likelihood:                -32925.
No. Observations:                5766   AIC:                         6.586e+04
Df Residuals:                    5762   BIC:                         6.588e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------

### d) Add BOY, FREELUNCH, WHITE_ASIAN:

In [41]:
X = df[["small", "aide", "tchexper", "boy", "freelunch", "white_asian"]]
X = sm.add_constant(X)
model_d = sm.OLS(y, X).fit()
logger.info(model_d.summary())

[32m2025-09-27 11:46:43.841[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1m                            OLS Regression Results                            
Dep. Variable:             totalscore   R-squared:                       0.102
Model:                            OLS   Adj. R-squared:                  0.101
Method:                 Least Squares   F-statistic:                     109.3
Date:                Sat, 27 Sep 2025   Prob (F-statistic):          5.60e-131
Time:                        11:46:43   Log-Likelihood:                -32673.
No. Observations:                5766   AIC:                         6.536e+04
Df Residuals:                    5759   BIC:                         6.541e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------

### e) Add TCHWHITE, TCHMASTERS, SCHURBAN, SCHRURAL:

In [43]:
X = df[["small", "aide", "tchexper", "boy", "freelunch", "white_asian",
        "tchwhite", "tchmasters", "schurban", "schrural"]]
X = sm.add_constant(X)
model_e = sm.OLS(y, X).fit()
logger.info(model_e.summary())


[32m2025-09-27 11:47:53.703[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1m                            OLS Regression Results                            
Dep. Variable:             totalscore   R-squared:                       0.106
Model:                            OLS   Adj. R-squared:                  0.104
Method:                 Least Squares   F-statistic:                     68.14
Date:                Sat, 27 Sep 2025   Prob (F-statistic):          5.21e-132
Time:                        11:47:53   Log-Likelihood:                -32662.
No. Observations:                5766   AIC:                         6.535e+04
Df Residuals:                    5755   BIC:                         6.542e+04
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------

### f) Discussion of the importance of previous points:

- Adding controls like **tchexper**, **demographics**, and **school** characteristics helps isolate the **“treatment”** effect of small classes vs. aide from confounding factors.

- If the coefficients on **small** and **aide** remain stable, it suggests ***random assignment worked well (balance across groups).***

- If they shift, it means *selection* or *imbalance* affected raw estimates.

### g) School fixed effects:

In [51]:
X = df[["small", "aide", "tchexper", "boy", "freelunch", "white_asian",
        "tchwhite", "tchmasters", "schurban", "schrural"]]
X = pd.get_dummies(df["schid"], drop_first=True).join(X)
X = sm.add_constant(X)
model_g = sm.OLS(y, X).fit()

anova_lm(model_e, model_g)   # compare with model (e)


ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).