# Exercise 5: Solutions

In [1]:
import wooldridge as woo
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf


# Load the 'wage1' data and pick the regressor (educ) and the regressand (wage).
df = woo.dataWoo('wage1')
x, y = df['educ'], df['wage']

# Building blocks of the simple-regression formulas
#   b1 = Cov(x, y) / Var(x)      b0 = y_bar - b1 * x_bar
cov_matrix = np.cov(x, y)   # 2x2 covariance matrix of (x, y)
cov_xy = cov_matrix[1, 0]   # 2nd row, 1st column: the off-diagonal Cov(x, y)
var_x = x.var(ddof=1)       # ddof=1 -> divide by n - 1 (sample variance)
x_bar, y_bar = x.mean(), y.mean()

# Slope first; the intercept is derived from it.
b1 = cov_xy / var_x
b0 = y_bar - b1 * x_bar

print(f'b1: {b1}\n')
print(f'b0: {b0}\n')

# OLS regression:
reg = smf.ols(formula='wage ~ educ', data=df)
results = reg.fit()

# obtain predicted values and residuals by hand:
# The coefficients are looked up by label. Integer keys such as b[0] on a
# label-indexed Series are deprecated in pandas, so positional access would
# have to go through b.iloc[0] / b.iloc[1] instead.
b = results.params
wage_hat = b['Intercept'] + b['educ'] * df['educ']

# residual = observed wage minus fitted wage
u_hat = df['wage'] - wage_hat


# Output
# Collect data, fitted values and residuals side by side for inspection.
table = pd.DataFrame({'educ': df['educ'],
                      'wage': df['wage'],
                      'wage_hat': wage_hat,
                      'u_hat': u_hat})
print(f'table.head(15): \n{table.head(15)}\n')


# Regression summary:
# `results` already holds the fitted 'wage ~ educ' model estimated earlier in
# this cell, so the identical model is not estimated a second time.
print(f'results.summary(): \n{results.summary()}\n')

b1: 0.5413592546651753

b0: -0.9048516119572207

table.head(15): 
    educ       wage  wage_hat      u_hat
0     11   3.100000  5.050100  -1.950100
1     12   3.240000  5.591459  -2.351459
2     11   3.000000  5.050100  -2.050100
3      8   6.000000  3.426022   2.573978
4     12   5.300000  5.591459  -0.291459
5     16   8.750000  7.756896   0.993104
6     18  11.250000  8.839615   2.410385
7     12   5.000000  5.591459  -0.591459
8     12   3.600000  5.591459  -1.991460
9     17  18.180000  8.298256   9.881745
10    16   6.250000  7.756896  -1.506896
11    13   8.130000  6.132819   1.997181
12    12   8.770000  5.591459   3.178541
13    12   5.500000  5.591459  -0.091459
14    12  22.200001  5.591459  16.608541

results.summary(): 
                            OLS Regression Results                            
Dep. Variable:                   wage   R-squared:                       0.165
Model:                            OLS   Adj. R-squared:                  0.163
Method:             

Confirm the three OLS properties

- The sum, and therefore the sample average of the OLS residuals, is zero.

\begin{equation}
\sum_{i=1}^n\hat{u}_i=0 \;\Rightarrow\; \bar{\hat{u}}=0
\end{equation}

- The sample covariance between $x$ and the OLS residuals $\hat{u}$ is zero.

\begin{equation}
\sum_{i=1}^nx_i\hat{u}_i=0 \rightarrow Cov(x_i,\hat{u}_i)=0
\end{equation}

- The point $(\bar{x},\bar{y})$ is always on the OLS regression line.

\begin{equation}
\bar{y}=\hat{\beta}_0+\hat{\beta}_1\cdot \bar{x}
\end{equation}

In [2]:
import wooldridge as woo
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

df = woo.dataWoo('wage1')

x = df['educ']
y = df['wage']

# OLS regression:
reg = smf.ols(formula='wage ~ educ', data=df)
results = reg.fit()

# obtain coefficients, predicted values and residuals:
b = results.params
wage_hat = results.fittedvalues
u_hat = results.resid

# confirm property (1): the residuals average to zero
# (expect a value at floating-point noise level, not exactly 0)
u_hat_mean = np.mean(u_hat)
print(f'u_hat_mean: {u_hat_mean}\n')

# confirm property (2): the regressor and the residuals are uncorrelated
educ_u_cov = np.cov(x, u_hat)[1, 0]  # 2nd row, 1st column of the covariance matrix
print(f'educ_u_cov: {educ_u_cov}\n')

# confirm property (3): the point (x_bar, y_bar) lies on the regression line
# Coefficients are looked up by label; integer keys such as b[0] on a
# label-indexed Series are deprecated in pandas.
educ_mean = np.mean(x)
wage_pred = b['Intercept'] + b['educ'] * educ_mean
print(f'wage_pred: {wage_pred}\n')

wage_mean = np.mean(y)
print(f'wage_mean: {wage_mean}\n')

u_hat_mean: -8.047639445419766e-15

educ_u_cov: 1.0691976404771032e-15

wage_pred: 5.896102674787044

wage_mean: 5.896102674787035



In [3]:
import wooldridge as woo
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

df = woo.dataWoo('wage1')
x, y = df['educ'], df['wage']

# Fit wage on educ.
reg = smf.ols(formula='wage ~ educ', data=df)
results = reg.fit()

# Pull coefficients, fitted values and residuals from the fitted model.
b = results.params
wage_hat, u_hat = results.fittedvalues, results.resid

# R^2 computed two ways, both relative to the sample variance of wage:
#   (a) variance of the fitted values / variance of wage
#   (b) 1 - variance of the residuals / variance of wage
wage = df['wage']
wage_var = wage.var(ddof=1)
R2_a = wage_hat.var(ddof=1) / wage_var
R2_b = 1 - u_hat.var(ddof=1) / wage_var

print(f'R2_a: {R2_a}\n')
print(f'R2_b: {R2_b}\n')

R2_a: 0.16475751099205083

R2_b: 0.16475751099205194



The $R^2$ is about 0.165, which means that education explains roughly 16.5% of the sample variation in wages.

# Exercise 6: Solutions

In [4]:
import wooldridge as woo
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

# Start this exercise from a freshly loaded copy of the 'wage1' data.
df = woo.dataWoo('wage1')

# Level-level model: wage regressed on the female dummy only.
reg = smf.ols(data=df, formula='wage ~ female')
results = reg.fit()

print(f'results.summary(): \n{results.summary()}\n')

results.summary(): 
                            OLS Regression Results                            
Dep. Variable:                   wage   R-squared:                       0.116
Model:                            OLS   Adj. R-squared:                  0.114
Method:                 Least Squares   F-statistic:                     68.54
Date:                Sun, 20 Sep 2020   Prob (F-statistic):           1.04e-15
Time:                        23:09:11   Log-Likelihood:                -1400.7
No. Observations:                 526   AIC:                             2805.
Df Residuals:                     524   BIC:                             2814.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      7.0995      0.210

Women earn on average about 2.5 dollars per hour less than men. However, we do not hold other relevant factors equal, so we should be careful when interpreting this result.

In [5]:
# Log-level model: the log of wage regressed on the female dummy.
log_wage_formula = 'np.log(wage) ~ female'
reg = smf.ols(formula=log_wage_formula, data=df)
results = reg.fit()

print(f'results.summary(): \n{results.summary()}\n')

results.summary(): 
                            OLS Regression Results                            
Dep. Variable:           np.log(wage)   R-squared:                       0.140
Model:                            OLS   Adj. R-squared:                  0.138
Method:                 Least Squares   F-statistic:                     85.04
Date:                Sun, 20 Sep 2020   Prob (F-statistic):           7.10e-19
Time:                        23:09:11   Log-Likelihood:                -373.88
No. Observations:                 526   AIC:                             751.8
Df Residuals:                     524   BIC:                             760.3
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.8136      0.030

Women earn on average roughly 40% less than men (using the approximation $\%\Delta y \approx 100\cdot\beta_1$ for the log-level model). However, we do not hold other relevant factors equal, so we should be careful when interpreting this result.

There are many other factors that affect wages and may differ between men and women. For instance, women have on average less work experience because they are more likely to spend some time on maternity leave. Women are also more likely to work part-time and to work in different jobs than men. As long as we don't control for such factors, there is an omitted variable bias and we can't interpret the results causally.

The average wage of men is represented by the intercept. Including a dummy for men in addition to the female dummy would result in the so-called dummy variable trap: the two dummies always sum to one, so together with the intercept they would be perfectly collinear.

Excluding experience will probably result in a negative bias.


|Sign of omitted parameter $\beta_2$| Corr($x_1,x_2)>0$ | Corr($x_1,x_2)<0$ |
|---| :--- | :--- |
|$\beta_2>0$|  Positive bias | Negative bias |
|$\beta_2<0$| Negative bias | Positive bias |

In [6]:
# Same log-wage model, now also controlling for education and experience.
reg = smf.ols(
    data=df,
    formula='np.log(wage) ~ female + educ + exper',
)
results = reg.fit()

print(f'results.summary(): \n{results.summary()}\n')

results.summary(): 
                            OLS Regression Results                            
Dep. Variable:           np.log(wage)   R-squared:                       0.353
Model:                            OLS   Adj. R-squared:                  0.349
Method:                 Least Squares   F-statistic:                     94.75
Date:                Sun, 20 Sep 2020   Prob (F-statistic):           5.77e-49
Time:                        23:09:11   Log-Likelihood:                -299.11
No. Observations:                 526   AIC:                             606.2
Df Residuals:                     522   BIC:                             623.3
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.4808      0.105

In [7]:
# Log-wage model with the full set of controls. The right-hand side is
# assembled from a list so that each regressor is easy to see and edit; the
# resulting formula string matches the hand-written one-liner exactly.
regressors = [
    'female', 'educ', 'exper', 'tenure', 'married',
    'ndurman', 'services', 'trade', 'profocc',
]
full_formula = 'np.log(wage) ~ ' + ' + '.join(regressors)

reg = smf.ols(formula=full_formula, data=df)
results = reg.fit()

print(f'results.summary(): \n{results.summary()}\n')

results.summary(): 
                            OLS Regression Results                            
Dep. Variable:           np.log(wage)   R-squared:                       0.487
Model:                            OLS   Adj. R-squared:                  0.478
Method:                 Least Squares   F-statistic:                     54.35
Date:                Sun, 20 Sep 2020   Prob (F-statistic):           3.83e-69
Time:                        23:09:12   Log-Likelihood:                -238.06
No. Observations:                 526   AIC:                             496.1
Df Residuals:                     516   BIC:                             538.8
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.8969      0.105

By adding more control variables, we hold more factors equal. The wage discrimination is still there, but it is much lower now.

# Exercise 5: Solution

The interpretation of the coefficients is as described in the table below:

| Model | Dependent Variable | Independent Variable | Interpretation of $\beta_1$  &nbsp; &nbsp; &nbsp; &nbsp; |
| - | - | - | - | 
| Level-level | y | x | $\Delta y = \beta_1\Delta x$ |
| Level-log | y | log(x) | $\Delta y =(\beta_1/100)\%\Delta x$    |
| Log-level |log(y) | x | $\%\Delta y = (100\beta_1)\Delta x$    |
| Log-log | log(y) | log(x) | $\%\Delta y = \beta_1\%\Delta x$    |



# Exercise 6: Solution

In [8]:
import wooldridge as woo
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

# Switch to the 'crime1' data set for this exercise.
df = woo.dataWoo('crime1')

# Regress narr86 on pcnv, avgsen, ptime86 and qemp86.
crime_formula = 'narr86 ~ pcnv+avgsen+ptime86+qemp86'
reg = smf.ols(data=df, formula=crime_formula)
results = reg.fit()

print(f'results.summary(): \n{results.summary()}\n')


results.summary(): 
                            OLS Regression Results                            
Dep. Variable:                 narr86   R-squared:                       0.042
Model:                            OLS   Adj. R-squared:                  0.041
Method:                 Least Squares   F-statistic:                     29.96
Date:                Sun, 20 Sep 2020   Prob (F-statistic):           2.01e-24
Time:                        23:09:12   Log-Likelihood:                -3393.5
No. Observations:                2725   AIC:                             6797.
Df Residuals:                    2720   BIC:                             6826.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.7068      0.033