In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import statsmodels.formula.api as smf
import statsmodels.api as sm
import statsmodels.stats.api as sms
from statsmodels.stats.anova import anova_lm
from stargazer.stargazer import Stargazer
from scipy import stats
from IPython.display import display, HTML
# Inject a CSS rule that sets the width of elements with class "container" to 85%.
display(HTML("<style>.container {width:85%;}</style>"))

## Data

In [2]:
# Columns kept from the raw file: the lwage response plus its candidate regressors.
selct_columns = ['lwage', 'exper', 'expersq', 'educ',  'age', 'kidslt6', 'kidsge6']

# Load the Wooldridge MROZ file, keep only the columns above, and drop rows
# that have a missing value in any of them.
# educgr bands educ into three ordered categories. include_lowest=True closes
# the first bin on the left so that educ == 5 is labelled 'Diploma'; with
# pd.cut's default right-closed bins the first interval is (5, 11], which
# excludes 5 and would leave those rows as NaN.
df = (
    pd.read_stata('http://fmwww.bc.edu/ec-p/data/wooldridge/mroz.dta')
    .filter(items=selct_columns)
    .dropna()
    .assign(
        educgr=lambda frame: pd.cut(
            frame["educ"],
            bins=[5, 11, 13, 18],
            labels=('Diploma', 'Degree', 'Masters'),
            ordered=True,
            include_lowest=True,
        )
    )
)

## Testing if the explanatory variables are independently significant using the t-test

| Hypothesis                           | Testing criteria                             | Interpretation                                  |
|--------------------------------------|----------------------------------------------|--------------------------------------------------|
| Population parameter = 0 (H0: beta=0) | p-value                                      | Reject H0 if the p-value is less than the level of significance  |
| Population parameter = 0 (H0: beta=0) | Confidence intervals for the coefficients  | Reject H0 if the confidence interval does not contain 0   |


In [3]:
# Two-regressor specification: lwage on exper and age.
olsModel_01 = smf.ols(formula='lwage ~ exper  + age ', data=df).fit()

# The per-coefficient t statistics, p-values and [0.025, 0.975] interval
# bounds are read off this summary.
print(olsModel_01.summary())

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.030
Model:                            OLS   Adj. R-squared:                  0.025
Method:                 Least Squares   F-statistic:                     6.487
Date:                Sat, 29 Jul 2023   Prob (F-statistic):            0.00168
Time:                        11:09:44   Log-Likelihood:                -461.67
No. Observations:                 428   AIC:                             929.3
Df Residuals:                     425   BIC:                             941.5
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.1105      0.195      5.691      0.0

| Variable | Interpretation                                                                |
|----------|-------------------------------------------------------------------------------|
| exper    | Significant at 5% significance level                                          |
| age      | Not significant at 5% significance level (confidence interval contains 0 too) |

### Using ANOVA

In [4]:
# ANOVA table for the fitted two-regressor model, using the anova_lm function
# imported at the top of the notebook.
# NOTE(review): anova_lm defaults to typ=1 (sequential sums of squares), so the
# exper row is not adjusted for age — confirm this is the intended comparison.
anova_table = anova_lm(olsModel_01)
anova_table

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
exper,1.0,6.403509,6.403509,12.558115,0.000438
age,1.0,0.212184,0.212184,0.416121,0.519227
Residual,425.0,216.711758,0.50991,,


## Testing if the explanatory variables are jointly significant using the F-test

In [5]:
# Full specification with all six explanatory variables; the F-statistic and
# Prob (F-statistic) lines of the summary header are what this section reads.
olsModel_02 = smf.ols(
    formula='lwage ~ exper + expersq + educ + age + kidslt6 + kidsge6',
    data=df,
).fit()
print(olsModel_02.summary())

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.158
Model:                            OLS   Adj. R-squared:                  0.146
Method:                 Least Squares   F-statistic:                     13.19
Date:                Sat, 29 Jul 2023   Prob (F-statistic):           1.06e-13
Time:                        11:09:48   Log-Likelihood:                -431.24
No. Observations:                 428   AIC:                             876.5
Df Residuals:                     421   BIC:                             904.9
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.4209      0.317     -1.328      0.1

| Prob (F-statistic) | Interpretation                                        |
|--------------------|------------------------------------------------------|
|                    | Reject H0 if Prob (F-statistic) is less than the significance level, and conclude that the variables are jointly significant at that level |


### Using ANOVA
We fit two models, a restricted one (intercept only) and an unrestricted one (with all the explanatory variables), and define the hypotheses as follows:
<blockquote>
    <p>H0: Restricted model is fine</p>
    <p>H1: We reject restricted model</p>
</blockquote>
Reject H0 if the p-value is less than the significance level.

In [69]:
# Restricted model: intercept only.
model_r = smf.ols('lwage ~ 1', data=df).fit()

# Unrestricted model: all six explanatory variables.
model_ur = smf.ols(
    'lwage ~ exper + expersq+ educ + age + kidslt6 + kidsge6',
    data=df,
).fit()

# Compare the restricted fit against the unrestricted fit; the resulting
# table is the cell's displayed output.
anova_lm(model_r, model_ur)

Unnamed: 0,df_resid,ssr,df_diff,ss_diff,F,Pr(>F)
0,427.0,223.327451,0.0,,,
1,421.0,187.987636,6.0,35.339815,13.190639,1.056864e-13
