In [36]:
import io

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import statsmodels.formula.api as smf
import statsmodels.api as sm
import statsmodels.stats.api as sms
from statsmodels.stats.anova import anova_lm
from stargazer.stargazer import Stargazer
from scipy import stats
from IPython.display import display, HTML
# Inject a CSS rule that sets the width of `.container` elements to 85%
# (presumably the notebook page container — confirm for the front end in use).
display(HTML("<style>.container {width:85%;}</style>"))

## Data

In [37]:
# Columns kept from the MROZ Stata file: log wage plus the explanatory variables
# used by the regressions in this notebook.
selct_columns = ['lwage', 'exper', 'expersq', 'educ',  'age', 'kidslt6', 'kidsge6']
df = (
    pd.read_stata('http://fmwww.bc.edu/ec-p/data/wooldridge/mroz.dta')
    .filter(selct_columns)
    .dropna()
    .assign(
        # pd.cut builds right-closed intervals by default, so without
        # include_lowest=True an `educ` of exactly 5 (the first bin edge) falls
        # outside (5, 11] and ends up with a missing `educgr` after dropna()
        # has already run.
        educgr=lambda X: pd.cut(
            X["educ"],
            bins=[5, 11, 13, 18],
            labels=('Diploma', 'Degree', 'Masters'),
            ordered=True,
            include_lowest=True,
        )
    )
)

## Testing if the explanatory variables are independently significant using the t-test

| Hypothesis                           | Testing criteria                             | Interpretation                                  |
|--------------------------------------|----------------------------------------------|--------------------------------------------------|
| Population parameter = 0 (H0: beta=0) | p-value                                      | Reject H0 if the p-value is less than the level of significance  |
| Population parameter = 0 (H0: beta=0) | Confidence intervals for the coefficients  | Reject H0 if the confidence interval does not contain 0   |


In [38]:
# Baseline specification: log wage regressed on experience and age.
exper_age_formula = 'lwage ~ exper  + age '
olsModel_01 = smf.ols(formula=exper_age_formula, data=df).fit()

# Text summary with coefficients, t-statistics, p-values and confidence intervals.
print(olsModel_01.summary())

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.030
Model:                            OLS   Adj. R-squared:                  0.025
Method:                 Least Squares   F-statistic:                     6.487
Date:                Sat, 29 Jul 2023   Prob (F-statistic):            0.00168
Time:                        19:43:13   Log-Likelihood:                -461.67
No. Observations:                 428   AIC:                             929.3
Df Residuals:                     425   BIC:                             941.5
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.1105      0.195      5.691      0.0

| Variable | Interpretation                                                                |
|----------|-------------------------------------------------------------------------------|
| exper    | Significant at 5% significance level                                          |
| age      | Not significant at 5% significance level (confidence interval contains 0 too) |

### Wald test

| Test               | Description                                                                                                      |
|--------------------|------------------------------------------------------------------------------------------------------------------|
| <b>Wald test</b>   | The Wald test is a statistical hypothesis test used to assess the significance of individual parameters (coefficients) in a statistical model. |
| <b>Hypothesis</b>  | Null Hypothesis (H0): The coefficient for the variable of interest is equal to zero (no effect).               |
|                    | Alternative Hypothesis (Ha): The coefficient for the variable of interest is not equal to zero (there is an effect). |
| <b>Decision</b>    | Reject H0 if the p-value is less than the level of significance.                                                 |


In [57]:
# Wald test of the single restriction H0: coefficient on `age` equals 0.
testResults = olsModel_01.wald_test("(age=0)")

# Depending on the statsmodels version / `scalar` option, `fvalue` and `pvalue`
# come back either as a 1x1 array or as a plain scalar. Squeezing handles both,
# whereas indexing with [0][0] fails on the scalar form.
wald_f = float(np.squeeze(testResults.fvalue))
wald_p = float(np.squeeze(testResults.pvalue))

# One-row table with the statistic and its p-value (last expression is displayed).
pd.DataFrame({"f-statistic": [wald_f], "P-value": [wald_p]})

Unnamed: 0,f-statistic,P-value
0,0.416121,0.5192268108254405


In [49]:
# F statistic of the Wald test as a plain float. np.squeeze accepts both the
# 1x1-array and the scalar form of `fvalue`, unlike [0][0] indexing.
# NOTE(review): the saved output of this cell (6.487) differs from the F = 0.416
# shown for the `age=0` test; execution counts are out of order, so re-run from
# a fresh kernel before trusting it.
float(np.squeeze(testResults.fvalue))

6.487118098469767

In [50]:
# Show the raw test statistic stored on the Wald test result.
# NOTE(review): the saved output of this cell (6.487) does not match the
# F = 0.416 reported for the `age=0` test; execution counts are out of order,
# so this output looks stale — confirm after Restart & Run All.
testResults.statistic

array([[6.4871181]])

In [53]:
# Embedded copy of the mtcars data as CSV text: one header row followed by one
# row per car model. It is parsed with io.StringIO further down; `io` is
# imported in the notebook's import cell rather than here.
mtcars_data="""model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21,6,160,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21,6,160,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2
Valiant,18.1,6,225,105,2.76,3.46,20.22,1,0,3,1
Duster 360,14.3,8,360,245,3.21,3.57,15.84,0,0,3,4
Merc 240D,24.4,4,146.7,62,3.69,3.19,20,1,0,4,2
Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4
Merc 280C,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4
Merc 450SE,16.4,8,275.8,180,3.07,4.07,17.4,0,0,3,3
Merc 450SL,17.3,8,275.8,180,3.07,3.73,17.6,0,0,3,3
Merc 450SLC,15.2,8,275.8,180,3.07,3.78,18,0,0,3,3
Cadillac Fleetwood,10.4,8,472,205,2.93,5.25,17.98,0,0,3,4
Lincoln Continental,10.4,8,460,215,3,5.424,17.82,0,0,3,4
Chrysler Imperial,14.7,8,440,230,3.23,5.345,17.42,0,0,3,4
Fiat 128,32.4,4,78.7,66,4.08,2.2,19.47,1,1,4,1
Honda Civic,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2
Toyota Corolla,33.9,4,71.1,65,4.22,1.835,19.9,1,1,4,1
Toyota Corona,21.5,4,120.1,97,3.7,2.465,20.01,1,0,3,1
Dodge Challenger,15.5,8,318,150,2.76,3.52,16.87,0,0,3,2
AMC Javelin,15.2,8,304,150,3.15,3.435,17.3,0,0,3,2
Camaro Z28,13.3,8,350,245,3.73,3.84,15.41,0,0,3,4
Pontiac Firebird,19.2,8,400,175,3.08,3.845,17.05,0,0,3,2
Fiat X1-9,27.3,4,79,66,4.08,1.935,18.9,1,1,4,1
Porsche 914-2,26,4,120.3,91,4.43,2.14,16.7,0,1,5,2
Lotus Europa,30.4,4,95.1,113,3.77,1.513,16.9,1,1,5,2
Ford Pantera L,15.8,8,351,264,4.22,3.17,14.5,0,1,5,4
Ferrari Dino,19.7,6,145,175,3.62,2.77,15.5,0,1,5,6
Maserati Bora,15,8,301,335,3.54,3.57,14.6,0,1,5,8
Volvo 142E,21.4,4,121,109,4.11,2.78,18.6,1,1,4,2"""

# Convert the CSV string to a DataFrame. It gets its own name so that `df`, the
# MROZ frame that the later `lwage ~ ...` models are fitted on, is not
# overwritten — reusing `df` here makes those cells fail on Restart & Run All.
mtcars_df = pd.read_csv(io.StringIO(mtcars_data), sep=",")

# Fit the multiple linear regression of mpg on disp, carb, hp and cyl.
results = smf.ols('mpg ~ disp + carb + hp + cyl', mtcars_df).fit()

# View the regression model summary.
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                    mpg   R-squared:                       0.788
Model:                            OLS   Adj. R-squared:                  0.757
Method:                 Least Squares   F-statistic:                     25.09
Date:                Sat, 29 Jul 2023   Prob (F-statistic):           9.35e-09
Time:                        19:47:43   Log-Likelihood:                -77.558
No. Observations:                  32   AIC:                             165.1
Df Residuals:                      27   BIC:                             172.4
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     34.0216      2.523     13.482      0.0

In [55]:
# Wald test of the single restriction H0: coefficient on `disp` equals 0.
disp_wald = results.wald_test('(disp = 0)')
print(disp_wald)

<F test: F=array([[5.66005126]]), p=0.02468609623353298, df_denom=27, df_num=1>


### Using ANOVA

In [4]:
# ANOVA table for the exper + age model, using the `anova_lm` imported at the
# top of the notebook (the same function exposed as `sm.stats.anova_lm`).
anova_table = anova_lm(olsModel_01)
anova_table

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
exper,1.0,6.403509,6.403509,12.558115,0.000438
age,1.0,0.212184,0.212184,0.416121,0.519227
Residual,425.0,216.711758,0.50991,,


## Testing if the explanatory variables are jointly significant using the F-test

In [5]:
# Full specification: log wage on experience (and its square), education, age
# and the two child-count variables.
full_formula = 'lwage ~ exper + expersq + educ + age + kidslt6 + kidsge6'
olsModel_02 = smf.ols(full_formula, data=df).fit()

# The summary's F-statistic / Prob (F-statistic) lines carry the joint test.
print(olsModel_02.summary())

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.158
Model:                            OLS   Adj. R-squared:                  0.146
Method:                 Least Squares   F-statistic:                     13.19
Date:                Sat, 29 Jul 2023   Prob (F-statistic):           1.06e-13
Time:                        11:09:48   Log-Likelihood:                -431.24
No. Observations:                 428   AIC:                             876.5
Df Residuals:                     421   BIC:                             904.9
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.4209      0.317     -1.328      0.1

| Prob (F-statistic) | Interpretation                                        |
|--------------------|------------------------------------------------------|
|                    | Reject H0 if Prob (F-statistic) is less than the significance level, and conclude that the variables are jointly significant at that level |


### Using ANOVA
We fit two models: a restricted one (intercept only) and an unrestricted one (with all the explanatory variables), and define the hypotheses as below
<blockquote>
    <p>H0: Restricted model is fine</p>
    <p>H1: We reject restricted model</p>
</blockquote>
Reject H0 if the p-value is less than the significance level

In [69]:
# Restricted model: intercept only.
model_r = smf.ols('lwage ~ 1', data=df).fit()

# Unrestricted model: all explanatory variables included.
model_ur = smf.ols(
    'lwage ~ exper + expersq+ educ + age + kidslt6 + kidsge6',
    data=df,
).fit()

# Compare the restricted model against the unrestricted one; the resulting
# table is the cell's displayed output.
anova_lm(model_r, model_ur)

Unnamed: 0,df_resid,ssr,df_diff,ss_diff,F,Pr(>F)
0,427.0,223.327451,0.0,,,
1,421.0,187.987636,6.0,35.339815,13.190639,1.056864e-13
