In [None]:
!pip install linearmodels
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from linearmodels.panel import PanelOLS
from linearmodels.panel import compare

Collecting linearmodels
  Downloading linearmodels-6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.9 kB)
Collecting mypy-extensions>=0.4 (from linearmodels)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl.metadata (1.1 kB)
Collecting pyhdfe>=0.1 (from linearmodels)
  Downloading pyhdfe-0.2.0-py3-none-any.whl.metadata (4.0 kB)
Collecting formulaic>=1.0.0 (from linearmodels)
  Downloading formulaic-1.0.2-py3-none-any.whl.metadata (6.8 kB)
Collecting setuptools-scm<9.0.0,>=8.0.0 (from setuptools-scm[toml]<9.0.0,>=8.0.0->linearmodels)
  Downloading setuptools_scm-8.1.0-py3-none-any.whl.metadata (6.6 kB)
Collecting interface-meta>=1.2.0 (from formulaic>=1.0.0->linearmodels)
  Downloading interface_meta-1.3.0-py3-none-any.whl.metadata (6.7 kB)
Downloading linearmodels-6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Source workbook; the file name and the `version` column in the preview
# suggest this is the QoG Basic time-series release of January 2024.
url = 'https://www.qogdata.pol.gu.se/data/qog_bas_ts_jan24.xlsx'

# Download the workbook and parse it into a DataFrame.
data = pd.read_excel(io=url)

# Preview the first five rows to sanity-check the load.
data.head(n=5)

Unnamed: 0,ccode,cname,year,ccode_qog,cname_qog,ccodealp,ccodecow,version,cname_year,ccodealp_year,...,wdi_trade,wdi_unempfilo,wdi_unempilo,wdi_unempmilo,wdi_unempyfilo,wdi_unempyilo,wdi_unempymilo,wdi_wip,who_sanittot,whr_hap
0,4,Afghanistan,1946,4,Afghanistan,AFG,700.0,QoGBasTSjan24,Afghanistan 1946,AFG46,...,,,,,,,,,,
1,4,Afghanistan,1947,4,Afghanistan,AFG,700.0,QoGBasTSjan24,Afghanistan 1947,AFG47,...,,,,,,,,,,
2,4,Afghanistan,1948,4,Afghanistan,AFG,700.0,QoGBasTSjan24,Afghanistan 1948,AFG48,...,,,,,,,,,,
3,4,Afghanistan,1949,4,Afghanistan,AFG,700.0,QoGBasTSjan24,Afghanistan 1949,AFG49,...,,,,,,,,,,
4,4,Afghanistan,1950,4,Afghanistan,AFG,700.0,QoGBasTSjan24,Afghanistan 1950,AFG50,...,,,,,,,,,,


In [None]:
# Build the estimation sample: keep only the variables used in the models and
# discard every row in which at least one of them is missing.
model_columns = ['wdi_birth', 'wdi_unempyfilo', 'year']
regression_data = data.loc[:, model_columns].dropna(axis=0, how='any')

1. Birth rate and female unemployment are both shaped by many economic, social, and institutional factors, so there is a high chance that a simple OLS model will either find nothing significant or produce estimates distorted by confounding factors that the model cannot account for.

In [None]:
# Assignment prompt: run a naive OLS regression on the time-series data, state how
# the Xs are expected to affect Y and why, then interpret the results.
#
# Pooled OLS of wdi_unempyfilo on wdi_birth, with one dummy variable per year.
ols_model = smf.ols(
    'wdi_unempyfilo ~ wdi_birth + C(year)',
    data=regression_data,
).fit()

# Render the regression table as the cell output.
ols_model.summary()

0,1,2,3
Dep. Variable:,wdi_unempyfilo,R-squared:,0.05
Model:,OLS,Adj. R-squared:,0.044
Method:,Least Squares,F-statistic:,9.095
Date:,"Mon, 18 Nov 2024",Prob (F-statistic):,5.08e-41
Time:,22:30:17,Log-Likelihood:,-21920.0
No. Observations:,5430,AIC:,43900.0
Df Residuals:,5398,BIC:,44110.0
Df Model:,31,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,23.7271,1.217,19.495,0.000,21.341,26.113
C(year)[T.1992],-0.6397,1.532,-0.418,0.676,-3.643,2.364
C(year)[T.1993],0.2892,1.524,0.190,0.849,-2.698,3.277
C(year)[T.1994],0.6213,1.524,0.408,0.684,-2.367,3.609
C(year)[T.1995],0.8799,1.525,0.577,0.564,-2.109,3.869
C(year)[T.1996],1.2269,1.525,0.805,0.421,-1.762,4.216
C(year)[T.1997],1.0214,1.525,0.670,0.503,-1.969,4.011
C(year)[T.1998],0.7866,1.526,0.516,0.606,-2.204,3.777
C(year)[T.1999],0.9716,1.526,0.637,0.524,-2.020,3.963

0,1,2,3
Omnibus:,1021.562,Durbin-Watson:,0.093
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1742.05
Skew:,1.237,Prob(JB):,0.0
Kurtosis:,4.257,Cond. No.,899.0


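As expected, none of the year dummies shown is significant at the 5% level. Note that these coefficients measure how average female unemployment in each year differs from the omitted base year; they do not measure the relationship between birth rate and female unemployment, which is captured by the coefficient on wdi_birth. The R-squared is also quite small (0.05), which further highlights the model's inadequacies as well as the lack of a clear relationship.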

In [None]:
# First-difference (FD) model.
#
# The data are a country-year panel, so differences must be taken WITHIN each
# country. A plain .diff() on the pooled frame would subtract the last row of
# one country from the first row of the next.
#
# regression_data was built from `data` by column selection + dropna, so it
# still carries the row labels of `data`; use them to recover the country code.
fd_panel = (
    regression_data[['wdi_birth', 'wdi_unempyfilo', 'year']]
    .assign(ccode=data.loc[regression_data.index, 'ccode'])
    .sort_values(['ccode', 'year'])
)
by_country = fd_panel.groupby('ccode')

# A first difference is only valid between consecutive years of one country.
# The earlier dropna() may have left gaps, so mask any step longer than a year.
consecutive = by_country['year'].diff().eq(1)

# Assignment aligns on the row index, so the sort above does not reorder
# regression_data itself.
regression_data['diff_wdi_unempyfilo'] = by_country['wdi_unempyfilo'].diff().where(consecutive)
regression_data['diff_wdi_birth'] = by_country['wdi_birth'].diff().where(consecutive)

# Drop rows without a valid difference: each country's first observation and
# any observation that follows a gap in the yearly series.
regression_data_diff = regression_data.dropna()

# Differencing removes time-invariant country effects. It does NOT remove
# common year shocks; those are left out of this specification and can be
# controlled for by adding `+ C(year)` to the formula.
fd_model = smf.ols(formula='diff_wdi_unempyfilo ~ diff_wdi_birth', data=regression_data_diff).fit()

# Display the summary
fd_model.summary()


0,1,2,3
Dep. Variable:,diff_wdi_unempyfilo,R-squared:,0.078
Model:,OLS,Adj. R-squared:,0.078
Method:,Least Squares,F-statistic:,457.4
Date:,"Mon, 18 Nov 2024",Prob (F-statistic):,1.68e-97
Time:,22:42:54,Log-Likelihood:,-15554.0
No. Observations:,5429,AIC:,31110.0
Df Residuals:,5427,BIC:,31130.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0022,0.058,-0.039,0.969,-0.115,0.111
diff_wdi_birth,-0.3494,0.016,-21.387,0.000,-0.381,-0.317

0,1,2,3
Omnibus:,1907.823,Durbin-Watson:,2.019
Prob(Omnibus):,0.0,Jarque-Bera (JB):,510075.723
Skew:,-0.368,Prob(JB):,0.0
Kurtosis:,50.48,Cond. No.,3.53


After first-differencing, the relationship between the two variables is statistically significant, in contrast to the previous naive OLS model. This is to a degree an expected outcome, because first-differencing can absorb serial correlation and other persistent patterns in the data (the Durbin-Watson statistic moves from 0.093 in the naive model to 2.019 here). The coefficient has also increased slightly in magnitude: a one-unit increase in the change in birth rate is associated with a decrease of about 0.35 percentage points in the change in the female unemployment rate. This isn't a substantial amount, but it is interesting because I would have expected an increase in birth rate to be associated with fewer women entering the workforce, or at least with a delay in their entry.