In [11]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm
from tqdm import tqdm


## Question 1

Use the data in homework_10.1.csv and find the fixed effect (the constant term in the regression) for each time (0 through 11). 

In [6]:
# Load the homework dataset; later cells use its `y`, `X`, `time` and `city` columns
df = pd.read_csv('homework_10.1.csv')

In [7]:
# Store `time` as a categorical column (one level per period)
df['time'] = df['time'].astype('category')

# Suppressing the intercept (`- 1`) lets every time level keep its own
# coefficient, which is that period's constant term next to the slope on X
model = smf.ols(formula='y ~ X + C(time) - 1', data=df).fit()

# Keep only the time-dummy coefficients and relabel each by its integer level,
# i.e. the text between the last '[' and the closing ']' of the parameter name
fixed_effects = model.params.filter(like='C(time)')
fixed_effects = fixed_effects.set_axis(
    [int(label.rpartition('[')[2].rstrip(']')) for label in fixed_effects.index]
).sort_index()

# Display the fixed effect of every time period in numeric order
print(fixed_effects)

0     2.879246
1     4.676858
2     6.148259
3     7.162997
4     8.117890
5     9.016350
6     8.626309
7     8.515667
8     8.407278
9     7.467505
10    5.991741
11    4.623149
dtype: float64


## Question 2

What is the fixed effect for each city (0 through 9)?

In [None]:
# Re-read the same CSV so this question starts from a freshly loaded frame
df = pd.read_csv('homework_10.1.csv')

In [8]:
# Store `city` as a categorical column (one level per city)
df['city'] = df['city'].astype('category')

# Suppressing the intercept (`- 1`) lets every city level keep its own
# coefficient, which is that city's constant term next to the slope on X
model = smf.ols(formula='y ~ X + C(city) - 1', data=df).fit()

# Keep only the city-dummy coefficients and relabel each by its integer level,
# i.e. the text between the last '[' and the closing ']' of the parameter name
city_effects = model.params.filter(like='C(city)')
city_effects = city_effects.set_axis(
    [int(label.rpartition('[')[2].rstrip(']')) for label in city_effects.index]
).sort_index()

# Display the fixed effect of every city in numeric order
print(city_effects)

0    11.029943
1     5.227050
2     7.921836
3    12.926031
4    11.296032
5    -0.057783
6     7.704997
7     3.257647
8     3.384002
9     5.450428
dtype: float64


## Question 3

For the following data, model np.exp(Y) as a function of X and Z. 

num = 10000 

X = np.clip(np.random.normal(3, 1, (num,)), 0.01, 100)

Z = np.clip(np.random.normal(3, 1, (num,)), 0.01, 100)

Y = np.log(X + Z) + np.random.normal(0, 1, (num,)) 

In [7]:
# Seed NumPy's global RNG so that Restart & Run All reproduces the same draws
# (and therefore the same regression output) every time.
np.random.seed(0)

# NOTE(review): the question text specifies num = 10000, while this cell uses
# 100000 — confirm which sample size is intended.
num = 100000

# X and Z: Normal(3, 1) draws clipped to [0.01, 100], so X + Z stays positive
# and the logarithm below is always defined.
X = np.clip(np.random.normal(3, 1, (num,)), 0.01, 100)
Z = np.clip(np.random.normal(3, 1, (num,)), 0.01, 100)

# Y: log(X + Z) plus standard-normal noise
Y = np.log(X + Z) + np.random.normal(0, 1, (num,))

In [8]:
# Regress exp(Y) on X and Z with ordinary least squares
data = pd.DataFrame({'X': X, 'Z': Z})
data['expY'] = np.exp(Y)  # response on the original (un-logged) scale

# Design matrix: the two regressors plus an explicit constant column
X_design = sm.add_constant(data.loc[:, ['X', 'Z']])
model = sm.OLS(endog=data['expY'], exog=X_design).fit()

# Full regression table (coefficients, standard errors, fit statistics)
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                   expY   R-squared:                       0.030
Model:                            OLS   Adj. R-squared:                  0.030
Method:                 Least Squares   F-statistic:                     1550.
Date:                Sun, 27 Jul 2025   Prob (F-statistic):               0.00
Time:                        12:23:56   Log-Likelihood:            -3.9968e+05
No. Observations:              100000   AIC:                         7.994e+05
Df Residuals:                   99997   BIC:                         7.994e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0214      0.182     -0.117      0.9

## Question 4

Suppose we were to use the data below to find the standard error of X's coefficient (the coefficient that should be 1.5) in two ways: 

    i) By asking Python to find the standard error. 
    ii) By simulating what happens if we generate the data 100 times, estimating X's coefficient and finding the standard deviation of the 100 estimates.

num = 10000 

Z = np.random.normal(0, 1, (num,)) 

X = Z + np.random.normal(0, 1, (num,)) 

Y = 1.5 * X + 2.3 * Z + np.random.normal(0, X**2, (num,)) 

In [9]:
# Seed NumPy's global RNG so that Restart & Run All reproduces the same dataset
# (and therefore the same reported standard error) every time.
np.random.seed(1)

num = 10000

# Z is standard normal; X is Z plus independent standard-normal noise, so the
# two regressors are correlated.
Z = np.random.normal(0, 1, (num,))
X = Z + np.random.normal(0, 1, (num,))

# The noise scale is X**2, so the error spread varies with X
# (heteroskedastic errors) around the line 1.5*X + 2.3*Z.
Y = 1.5 * X + 2.3 * Z + np.random.normal(0, X**2, (num,))

In [10]:
# Single fit: read the standard error that statsmodels reports for X

# Columns of the design matrix are, in order: constant, X, Z
X_design = sm.add_constant(np.column_stack([X, Z]))
model = sm.OLS(endog=Y, exog=X_design).fit()

# Position 1 of `bse` belongs to X (position 0 is the intercept).
# NOTE(review): `fit()` is called without a `cov_type`, so this is presumably
# the default nonrobust estimate — confirm against the statsmodels docs.
se_python = model.bse[1]

print("Python standard error of X's coefficient:", se_python)

Python standard error of X's coefficient: 0.03475002660146179


In [12]:
# Simulation — estimate std. dev. across 100 trials

def simulate_x_coefs(n_obs, n_trials=100):
    """Regenerate the dataset `n_trials` times and collect X's OLS coefficient.

    Each trial draws a fresh (Z, X, Y) sample of size `n_obs` using the same
    data-generating process as the single-run cell, fits Y on a constant, X
    and Z, and records the estimated coefficient on X.

    All intermediate arrays are local to this function, so the notebook-level
    `X`, `Z`, `Y`, `X_design` and `model` from the single-run cells are no
    longer overwritten by the simulation.

    Parameters
    ----------
    n_obs : int
        Number of observations generated per trial.
    n_trials : int, default 100
        Number of independent datasets to generate and fit.

    Returns
    -------
    list
        One estimate of X's coefficient per trial.
    """
    estimates = []
    for _ in tqdm(range(n_trials)):
        z_sim = np.random.normal(0, 1, n_obs)
        x_sim = z_sim + np.random.normal(0, 1, n_obs)
        y_sim = 1.5 * x_sim + 2.3 * z_sim + np.random.normal(0, x_sim**2, n_obs)
        design = sm.add_constant(np.column_stack((x_sim, z_sim)))
        # params[1] is the coefficient on X (params[0] is the intercept)
        estimates.append(sm.OLS(y_sim, design).fit().params[1])
    return estimates


# Seed the global RNG so the simulated spread is reproducible on a fresh run
np.random.seed(2)
coefs = simulate_x_coefs(num, n_trials=100)

# Sample standard deviation (ddof=1) of X's coefficient estimates
se_simulated = np.std(coefs, ddof=1)

print("Simulated std dev of X's coefficient over 100 runs:", se_simulated)


100%|██████████| 100/100 [00:00<00:00, 492.52it/s]

Simulated std dev of X's coefficient over 100 runs: 0.055960869080901844





## Homework Reflection 

No Homework Reflection this week.