# Regressions

In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error

## Making the dataframe

### Data gender inequality

In [2]:
# Load the 'Data' sheet of the gender-statistics workbook.
df = pd.read_excel(io='./data/Gender_StatsEXCEL.xlsx', sheet_name='Data')

In [3]:
# Keep rows from position 54000 onward and drop the columns at positions 3-53.
# NOTE(review): both offsets are positional magic numbers tied to the current
# workbook layout — confirm they still select the intended rows/columns.
df = (
    df.iloc[54000:]
    .drop(columns=df.columns[3:54])
    .rename(columns={'Indicator Name': 'Indicator', 'Country Name': 'Country'})
)
# Retain only indicators whose name contains the literal text '1=yes; 0=no'.
# regex=False matches the text literally; na=False treats a missing indicator
# name as "no match" instead of producing an invalid boolean mask.
is_binary_indicator = df['Indicator'].str.contains('1=yes; 0=no', regex=False, na=False)
df = df.loc[is_binary_indicator].drop(columns='Country Code')

In [4]:
# Reshape to one row per (Country, Year) with one column per indicator:
# melt the remaining columns into a 'Year' column, then pivot indicators out.
df = df.melt(id_vars=['Country', 'Indicator'], var_name='Year')
df = df.pivot_table(index=['Country', 'Year'], columns='Indicator', values='value')

In [5]:
# 35 per-column weights, applied column-wise to the pivoted indicator frame.
# NOTE(review): presumably feature importances from a separately fitted model;
# their order must match the pivoted column order — confirm the source.
importances = np.array([
    0.01831044, 0.03237455, 0.0031967, 0.012108, 0.01552896, 0.00758614, 0.00181886,
    0.0546642, 0.01587967, 0.01247531, 0.02007783, 0.0364012, 0.03424009, 0.03465867,
    0.02593966, 0.01570586, 0.05781482, 0.01149065, 0.00752398, 0.05290953, 0.07132007,
    0.01305223, 0.03959353, 0.04099451, 0.02862905, 0.00649011, 0.00942123, 0.00436768,
    0.03333931, 0.02942839, 0.02490152, 0.02989014, 0.0245622, 0.05561928, 0.11768564,
])

In [6]:
# Scale every indicator column by its weight, then add the row-wise total
# of the weighted columns as 'Gender_Var'.
df = df.mul(importances, axis='columns')
df = df.assign(Gender_Var=lambda frame: frame.sum(axis=1, numeric_only=True))

In [7]:
# Drop the first 35 columns (the weighted indicators), leaving what follows them.
df = df.drop(columns=df.columns[:35])

In [8]:
# Turn the (Country, Year) index levels back into ordinary columns.
df = df.reset_index()

### Data subregions

In [9]:
# Load the 'Country' sheet of the gender-statistics workbook.
df_Continent = pd.read_excel(io='./data/Gender_StatsEXCEL.xlsx', sheet_name='Country')

In [10]:
# Build a Country -> Region lookup: keep the two columns, rename the country
# label, and discard rows where either value is missing.
df_Continent = (
    df_Continent[['Table Name', 'Region']]
    .rename(columns={'Table Name': 'Country'})
    .dropna(axis=0)
)

In [11]:
# Left-join the region lookup on Country, keeping every row of df.
df = df.merge(df_Continent, how='left', on='Country')

### Data further loading from the dataset

In [12]:
# Load the 'Data' sheet of the gender-statistics workbook into df1.
df1 = pd.read_excel(io='./data/Gender_StatsEXCEL.xlsx', sheet_name='Data')

In [13]:
# Indicator names retained from the gender-statistics sheet.
covariate_indicators = [
    'Gini index',
    'GDP growth (annual %)',
    'Government expenditure on education, total (% of GDP)',
    'Employment in agriculture (% of total employment) (modeled ILO estimate)',
    'Employment in industry (% of total employment) (modeled ILO estimate)',
    'Employment in services (% of total employment) (modeled ILO estimate)',
    'Human Capital Index (HCI) (scale 0-1)',
    'School enrollment, secondary (% net)',
    'Lower secondary completion rate, male (% of relevant age group)',
    'Lower secondary completion rate, female (% of relevant age group)',
    'GDP per capita (constant 2010 US$)',
    'Expected years of schooling',
    'Lower secondary completion rate, total (% of relevant age group)',
    'School enrollment, primary and secondary (gross), gender parity index (GPI)',
    'Expected years of schooling, male',
]
# Same positional trimming as for the binary indicators: rows from position
# 54000 onward, minus the columns at positions 3-53 and the country code.
df1 = (
    df1.iloc[54000:]
    .drop(columns=df1.columns[3:54])
    .rename(columns={'Indicator Name': 'Indicator', 'Country Name': 'Country'})
    .drop(columns='Country Code')
)
df1 = df1.loc[df1['Indicator'].isin(covariate_indicators)]

In [14]:
# Reshape to one row per (Country, Year) with one column per indicator.
df1 = df1.melt(id_vars=['Country', 'Indicator'], var_name='Year')
df1 = df1.pivot_table(index=['Country', 'Year'], columns='Indicator', values='value')

In [15]:
# Turn the (Country, Year) index levels back into ordinary columns.
df1 = df1.reset_index()

In [16]:
# Left-join the covariates on (Country, Year), keeping every row of df.
df = df.merge(df1, how='left', on=['Country', 'Year'])

### Data Trade

In [17]:
# Load the 'Data' sheet of the trade workbook.
df1 = pd.read_excel(io='./data/Trade.xlsx', sheet_name='Data')

In [18]:
# Drop the two code columns, then the columns at positions 2-51
# (NOTE(review): presumably the older year columns — confirm), and
# shorten the label column names.
df1 = df1.drop(columns=['Country Code', 'Indicator Code'])
df1 = df1.drop(columns=df1.columns[2:52]).rename(
    columns={'Indicator Name': 'Indicator', 'Country Name': 'Country'}
)

In [19]:
# Reshape to one row per (Country, Year) with one column per indicator.
df1 = df1.melt(id_vars=['Country', 'Indicator'], var_name='Year')
df1 = df1.pivot_table(index=['Country', 'Year'], columns='Indicator', values='value')

In [20]:
# Turn the (Country, Year) index levels back into ordinary columns.
df1 = df1.reset_index()

In [21]:
# Left-join the trade indicator on (Country, Year), keeping every row of df.
df = df.merge(df1, how='left', on=['Country', 'Year'])

### Data Investments

In [22]:
# Load the 'Data' sheet of the investment workbook.
df1 = pd.read_excel(io='./data/Investment.xlsx', sheet_name='Data')

In [23]:
# Drop the two code columns, then the columns at positions 2-51
# (NOTE(review): presumably the older year columns — confirm), and
# shorten the label column names.
df1 = df1.drop(columns=['Country Code', 'Indicator Code'])
df1 = df1.drop(columns=df1.columns[2:52]).rename(
    columns={'Indicator Name': 'Indicator', 'Country Name': 'Country'}
)

In [24]:
# Reshape to one row per (Country, Year) with one column per indicator.
df1 = df1.melt(id_vars=['Country', 'Indicator'], var_name='Year')
df1 = df1.pivot_table(index=['Country', 'Year'], columns='Indicator', values='value')

In [25]:
# Turn the (Country, Year) index levels back into ordinary columns.
df1 = df1.reset_index()

In [26]:
# Left-join the investment indicator on (Country, Year), keeping every row of df.
df = df.merge(df1, how='left', on=['Country', 'Year'])

### Data population growth

In [27]:
# Load the 'Data' sheet of the population-growth workbook.
df1 = pd.read_excel(io='./data/Pop_growth.xlsx', sheet_name='Data')

In [28]:
# Drop the two code columns, then the columns at positions 2-51
# (NOTE(review): presumably the older year columns — confirm), and
# shorten the label column names.
df1 = df1.drop(columns=['Country Code', 'Indicator Code'])
df1 = df1.drop(columns=df1.columns[2:52]).rename(
    columns={'Indicator Name': 'Indicator', 'Country Name': 'Country'}
)

In [29]:
# Reshape to one row per (Country, Year) with one column per indicator.
df1 = df1.melt(id_vars=['Country', 'Indicator'], var_name='Year')
df1 = df1.pivot_table(index=['Country', 'Year'], columns='Indicator', values='value')

In [30]:
# Turn the (Country, Year) index levels back into ordinary columns.
df1 = df1.reset_index()

In [31]:
# Left-join the population-growth indicator on (Country, Year), keeping every row of df.
df = df.merge(df1, how='left', on=['Country', 'Year'])

### Data Working population growth

In [32]:
# Load the 'Data' sheet of the working-age-population workbook.
df1 = pd.read_excel(io='./data/WorkingPop.xlsx', sheet_name='Data')

In [33]:
# Drop the two code columns, then the columns at positions 2-51
# (NOTE(review): presumably the older year columns — confirm), and
# shorten the label column names.
df1 = df1.drop(columns=['Country Code', 'Indicator Code'])
df1 = df1.drop(columns=df1.columns[2:52]).rename(
    columns={'Indicator Name': 'Indicator', 'Country Name': 'Country'}
)

In [34]:
# Reshape to one row per (Country, Year) with one column per indicator.
df1 = df1.melt(id_vars=['Country', 'Indicator'], var_name='Year')
df1 = df1.pivot_table(index=['Country', 'Year'], columns='Indicator', values='value')

In [35]:
# Turn the (Country, Year) index levels back into ordinary columns.
df1 = df1.reset_index()

In [36]:
# Left-join the working-age-population indicator on (Country, Year), keeping every row of df.
df = df.merge(df1, how='left', on=['Country', 'Year'])

In [37]:
# Express the weighted gender score on a 0-100 scale.
df = df.assign(**{'%Gender': df['Gender_Var'].mul(100)})

In [38]:
# Short aliases for the long World Bank indicator names used by the models.
# NOTE(review): no 'Trade_2' column is created earlier in the visible notebook,
# so that entry is presumably a no-op; it is kept to mirror the original renames.
short_names = {
    'Employment in agriculture (% of total employment) (modeled ILO estimate)': '%Agriculture',
    'Employment in industry (% of total employment) (modeled ILO estimate)': '%Industry',
    'Employment in services (% of total employment) (modeled ILO estimate)': '%Service',
    'GDP growth (annual %)': '%Growth',
    'GDP per capita (constant 2010 US$)': 'GDP_C',
    'Government expenditure on education, total (% of GDP)': '%Expenditures',
    'Human Capital Index (HCI) (scale 0-1)': 'HCI',
    'Lower secondary completion rate, female (% of relevant age group)': '%Completion_F',
    'Lower secondary completion rate, male (% of relevant age group)': '%Completion_M',
    'Lower secondary completion rate, total (% of relevant age group)': '%Completion_T',
    'School enrollment, secondary (% net)': '%Enrollment',
    'Trade (% of GDP)': '%Trade',
    'Foreign direct investment, net inflows (% of GDP)': '%Investments',
    'Population growth (annual %)': '%Pop_Growth',
    'Population ages 15-64 (% of total population)': '%WorkingPop',
    'Trade_2': '%Trade_2',
    'School enrollment, primary and secondary (gross), gender parity index (GPI)': 'GPI',
}
# Apply the aliases and discard the unscaled score now that '%Gender' exists.
df = df.rename(columns=short_names).drop(columns='Gender_Var')

In [39]:
# Squared trade share, used as the quadratic term in the trade regressions.
df['%Trade_2'] = df['%Trade'].pow(2)

In [40]:
# Exclude rows whose Year label is the string '2022'.
# NOTE(review): if the year labels were read as integers this comparison never
# matches and nothing is removed — confirm the dtype of 'Year'.
df = df.loc[df['Year'].ne('2022')]

## Europe and Central Asia

In [41]:
# Subset of rows whose Region is 'Europe & Central Asia'.
df_EU_CA = df[df['Region'].eq('Europe & Central Asia')]

### Regression 1 Klasen and Lamanna(2009)

In [42]:
# Separate the target from the predictors, then impute: forward-fill down the
# rows, with the column mean as fallback for anything still missing.
# .ffill() replaces fillna(method='ffill'), which pandas has deprecated.
# NOTE(review): the forward fill is not grouped by Country, so values can carry
# over from one country's rows into the next, and the means are computed before
# the train/test split — confirm both are intended.
X = df_EU_CA.drop(columns='%Growth').ffill()
y = df_EU_CA['%Growth'].ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
X = sm.add_constant(X)

In [43]:
# Keep the intercept column plus the regressors of this specification.
X = X[['const', '%Gender', '%Pop_Growth', '%WorkingPop', '%Investments', '%Trade', 'GPI', 'Expected years of schooling, male']]

In [44]:
# 70/30 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)

In [45]:
# Mean 5-fold cross-validated MSE of a linear regression on the training split.
mse_scorer = make_scorer(mean_squared_error)
regression_model = LinearRegression()
mse_scores = cross_val_score(
    estimator=regression_model,
    X=X_train,
    y=y_train,
    scoring=mse_scorer,
    cv=5,
)
mean_mse = mse_scores.mean()
mean_mse

15.432424264161728

### Regression 2 Klasen and Lamanna(2009) reduced form

In [46]:
# Separate the target from the predictors, then impute: forward-fill down the
# rows, with the column mean as fallback for anything still missing.
# .ffill() replaces fillna(method='ffill'), which pandas has deprecated.
# NOTE(review): the forward fill is not grouped by Country, so values can carry
# over from one country's rows into the next, and the means are computed before
# the train/test split — confirm both are intended.
X = df_EU_CA.drop(columns='%Growth').ffill()
y = df_EU_CA['%Growth'].ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
X = sm.add_constant(X)

In [47]:
X = X.loc[:,['const','%Gender', '%Trade', 'GPI', 'Expected years of schooling, male']]

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

In [49]:
regression_model = LinearRegression()
mse_scorer = make_scorer(mean_squared_error)
mse_scores = cross_val_score(regression_model, X_train, y_train, scoring=mse_scorer, cv=5)
mean_mse = np.mean(mse_scores)
mean_mse

14.78732112467587

### Regression 3 Seguino (2000)

In [50]:
# Separate the target from the predictors, then impute: forward-fill down the
# rows, with the column mean as fallback for anything still missing.
# .ffill() replaces fillna(method='ffill'), which pandas has deprecated.
# NOTE(review): the forward fill is not grouped by Country, so values can carry
# over from one country's rows into the next, and the means are computed before
# the train/test split — confirm both are intended.
X = df_EU_CA.drop(columns='%Growth').ffill()
y = df_EU_CA['%Growth'].ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
X = sm.add_constant(X)

In [51]:
X = X.loc[:,['const','%Gender', '%Completion_F', '%Completion_M', 'GDP_C']]

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

In [53]:
regression_model = LinearRegression()
mse_scorer = make_scorer(mean_squared_error)
mse_scores = cross_val_score(regression_model, X_train, y_train, scoring=mse_scorer, cv=5)
mean_mse = np.mean(mse_scores)
mean_mse

14.845484670257184

### Regression 4 Ghosh(2021)

In [54]:
# Separate the target from the predictors, then impute: forward-fill down the
# rows, with the column mean as fallback for anything still missing.
# .ffill() replaces fillna(method='ffill'), which pandas has deprecated.
# NOTE(review): the forward fill is not grouped by Country, so values can carry
# over from one country's rows into the next, and the means are computed before
# the train/test split — confirm both are intended.
X = df_EU_CA.drop(columns='%Growth').ffill()
y = df_EU_CA['%Growth'].ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
X = sm.add_constant(X)

In [55]:
X = X.loc[:,['const','%Gender', '%Trade', '%Trade_2']]

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

In [57]:
regression_model = LinearRegression()
mse_scorer = make_scorer(mean_squared_error)
mse_scores = cross_val_score(regression_model, X_train, y_train, scoring=mse_scorer, cv=5)
mean_mse = np.mean(mse_scores)
mean_mse

14.701355285122137

In [58]:
model4 = sm.OLS(y_train,X_train)
results4 = model4.fit()
results4.summary()

0,1,2,3
Dep. Variable:,%Growth,R-squared:,0.015
Model:,OLS,Adj. R-squared:,0.008
Method:,Least Squares,F-statistic:,2.051
Date:,"Fri, 23 Jun 2023",Prob (F-statistic):,0.106
Time:,09:06:22,Log-Likelihood:,-1109.1
No. Observations:,403,AIC:,2226.0
Df Residuals:,399,BIC:,2242.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.9212,1.435,1.339,0.181,-0.900,4.742
%Gender,-0.0194,0.015,-1.316,0.189,-0.048,0.010
%Trade,0.0243,0.011,2.115,0.035,0.002,0.047
%Trade_2,-5.662e-05,3.09e-05,-1.830,0.068,-0.000,4.22e-06

0,1,2,3
Omnibus:,57.621,Durbin-Watson:,2.052
Prob(Omnibus):,0.0,Jarque-Bera (JB):,367.778
Skew:,-0.363,Prob(JB):,1.37e-80
Kurtosis:,7.623,Cond. No.,205000.0


In [59]:
y_train_pred = results4.predict(X_train)
y_test_pred = results4.predict(X_test)

In [60]:
print('MSE on train data:',mean_squared_error(y_train, y_train_pred))

MSE on train data: 14.385741352708965


In [61]:
print('MSE on test data:',mean_squared_error(y_test, y_test_pred))

MSE on test data: 12.781924350259656


### Regression 5 Abida and Sghaier(2012)

In [62]:
# Separate the target from the predictors, then impute: forward-fill down the
# rows, with the column mean as fallback for anything still missing.
# .ffill() replaces fillna(method='ffill'), which pandas has deprecated.
# NOTE(review): the forward fill is not grouped by Country, so values can carry
# over from one country's rows into the next, and the means are computed before
# the train/test split — confirm both are intended.
X = df_EU_CA.drop(columns='%Growth').ffill()
y = df_EU_CA['%Growth'].ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
X = sm.add_constant(X)

In [63]:
X = X.loc[:,['const', 'Gini index', '%Investments', '%Enrollment', '%Trade', 'GDP_C']]

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

In [65]:
regression_model = LinearRegression()
mse_scorer = make_scorer(mean_squared_error)
mse_scores = cross_val_score(regression_model, X_train, y_train, scoring=mse_scorer, cv=5)
mean_mse = np.mean(mse_scores)
mean_mse

14.958747026987599

### Regression 6 Vo et al (2019)

In [66]:
# Separate the target from the predictors, then impute: forward-fill down the
# rows, with the column mean as fallback for anything still missing.
# .ffill() replaces fillna(method='ffill'), which pandas has deprecated.
# NOTE(review): the forward fill is not grouped by Country, so values can carry
# over from one country's rows into the next, and the means are computed before
# the train/test split — confirm both are intended.
X = df_EU_CA.drop(columns='%Growth').ffill()
y = df_EU_CA['%Growth'].ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
X = sm.add_constant(X)

In [67]:
# Keep the intercept column plus the regressors of this specification.
# NOTE(review): the three employment shares presumably sum to ~100 in each row,
# which would make them collinear with 'const' — the OLS summary recorded for
# this model shows near-identical coefficients on all three and a very large
# condition number. Consider dropping one share; confirm before changing.
X = X.loc[:,['const', 'Gini index', '%Agriculture', '%Industry', '%Service', '%Investments']]

In [68]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

In [69]:
regression_model = LinearRegression()
mse_scorer = make_scorer(mean_squared_error)
mse_scores = cross_val_score(regression_model, X_train, y_train, scoring=mse_scorer, cv=5)
mean_mse = np.mean(mse_scores)
mean_mse

14.383858816921693

In [70]:
model6 = sm.OLS(y_train,X_train)
results6 = model6.fit()
results6.summary()

0,1,2,3
Dep. Variable:,%Growth,R-squared:,0.06
Model:,OLS,Adj. R-squared:,0.048
Method:,Least Squares,F-statistic:,5.077
Date:,"Fri, 23 Jun 2023",Prob (F-statistic):,0.000159
Time:,09:06:23,Log-Likelihood:,-1099.7
No. Observations:,403,AIC:,2211.0
Df Residuals:,397,BIC:,2235.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-5063.8597,4257.021,-1.190,0.235,-1.34e+04,3305.262
Gini index,-0.0912,0.047,-1.944,0.053,-0.183,0.001
%Agriculture,50.7544,42.568,1.192,0.234,-32.932,134.440
%Industry,50.6631,42.571,1.190,0.235,-33.030,134.356
%Service,50.6853,42.568,1.191,0.234,-33.002,134.372
%Investments,0.0010,0.009,0.106,0.916,-0.017,0.019

0,1,2,3
Omnibus:,63.567,Durbin-Watson:,2.003
Prob(Omnibus):,0.0,Jarque-Bera (JB):,547.009
Skew:,-0.307,Prob(JB):,1.65e-119
Kurtosis:,8.674,Cond. No.,1780000.0


In [71]:
# X_train / X_test already contain exactly the columns this model was fitted
# on, so they are passed to predict() directly, as in the other predict cells.
y_train_pred = results6.predict(X_train)
y_test_pred = results6.predict(X_test)

In [72]:
print('MSE on train data:',mean_squared_error(y_train, y_train_pred))

MSE on train data: 13.729773360777045


In [73]:
print('MSE on test data:',mean_squared_error(y_test, y_test_pred))

MSE on test data: 13.351039662962458


### Regression 7 Knowles(2001)

In [74]:
# Separate the target from the predictors, then impute: forward-fill down the
# rows, with the column mean as fallback for anything still missing.
# .ffill() replaces fillna(method='ffill'), which pandas has deprecated.
# NOTE(review): the forward fill is not grouped by Country, so values can carry
# over from one country's rows into the next, and the means are computed before
# the train/test split — confirm both are intended.
X = df_EU_CA.drop(columns='%Growth').ffill()
y = df_EU_CA['%Growth'].ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
X = sm.add_constant(X)

In [75]:
X = X.loc[:,['const', 'Gini index', 'GDP_C', '%Completion_F', '%Completion_M', '%Investments']]

In [76]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

In [77]:
regression_model = LinearRegression()
mse_scorer = make_scorer(mean_squared_error)
mse_scores = cross_val_score(regression_model, X_train, y_train, scoring=mse_scorer, cv=5)
mean_mse = np.mean(mse_scores)
mean_mse

14.899345239997729

### Regression 8 Lee and Son(2016)

In [78]:
# Separate the target from the predictors, then impute: forward-fill down the
# rows, with the column mean as fallback for anything still missing.
# .ffill() replaces fillna(method='ffill'), which pandas has deprecated.
# NOTE(review): the forward fill is not grouped by Country, so values can carry
# over from one country's rows into the next, and the means are computed before
# the train/test split — confirm both are intended.
X = df_EU_CA.drop(columns='%Growth').ffill()
y = df_EU_CA['%Growth'].ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
X = sm.add_constant(X)

In [79]:
X = X.loc[:,['const', 'Gini index', 'Expected years of schooling', '%Investments', '%Trade', '%Expenditures', 'GDP_C']]

In [80]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

In [81]:
regression_model = LinearRegression()
mse_scorer = make_scorer(mean_squared_error)
mse_scores = cross_val_score(regression_model, X_train, y_train, scoring=mse_scorer, cv=5)
mean_mse = np.mean(mse_scores)
mean_mse

14.840276747332377

## North America, Latin America and the Caribbean

In [82]:
# Subset of rows whose Region is one of the two Americas regions.
americas_regions = ['North America', 'Latin America & Caribbean']
df_NA_LA = df[df['Region'].isin(americas_regions)]

### Regression 1 Klasen and Lamanna(2009)

In [83]:
# Separate the target from the predictors, then impute: forward-fill down the
# rows, with the column mean as fallback for anything still missing.
# .ffill() replaces fillna(method='ffill'), which pandas has deprecated.
# NOTE(review): the forward fill is not grouped by Country, so values can carry
# over from one country's rows into the next, and the means are computed before
# the train/test split — confirm both are intended.
X = df_NA_LA.drop(columns='%Growth').ffill()
y = df_NA_LA['%Growth'].ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
X = sm.add_constant(X)

In [84]:
X = X.loc[:,['const','%Gender', '%Pop_Growth', '%WorkingPop', '%Investments', '%Trade', 'GPI', 'Expected years of schooling, male']]

In [85]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

In [86]:
regression_model = LinearRegression()
mse_scorer = make_scorer(mean_squared_error)
mse_scores = cross_val_score(regression_model, X_train, y_train, scoring=mse_scorer, cv=5)
mean_mse = np.mean(mse_scores)
mean_mse

27.59964843682597

In [87]:
model1 = sm.OLS(y_train,X_train)
results1 = model1.fit()
results1.summary()

0,1,2,3
Dep. Variable:,%Growth,R-squared:,0.155
Model:,OLS,Adj. R-squared:,0.134
Method:,Least Squares,F-statistic:,7.488
Date:,"Fri, 23 Jun 2023",Prob (F-statistic):,2.79e-08
Time:,09:06:23,Log-Likelihood:,-889.18
No. Observations:,294,AIC:,1794.0
Df Residuals:,286,BIC:,1824.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.4820,11.378,0.394,0.694,-17.913,26.877
%Gender,0.0857,0.028,3.066,0.002,0.031,0.141
%Pop_Growth,0.5778,0.385,1.503,0.134,-0.179,1.335
%WorkingPop,-0.2347,0.111,-2.119,0.035,-0.453,-0.017
%Investments,0.3861,0.066,5.871,0.000,0.257,0.516
%Trade,-0.0074,0.013,-0.562,0.574,-0.033,0.018
GPI,9.6104,9.103,1.056,0.292,-8.307,27.528
"Expected years of schooling, male",-0.2685,0.167,-1.605,0.110,-0.598,0.061

0,1,2,3
Omnibus:,61.386,Durbin-Watson:,1.952
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1068.334
Skew:,0.045,Prob(JB):,1.03e-232
Kurtosis:,12.338,Cond. No.,5570.0


In [88]:
y_train_pred = results1.predict(X_train)
y_test_pred = results1.predict(X_test)

In [89]:
print('MSE on train data:',mean_squared_error(y_train, y_train_pred))

MSE on train data: 24.803157057684185


In [90]:
print('MSE on test data:',mean_squared_error(y_test, y_test_pred))

MSE on test data: 25.309770259440477


### Regression 2 Klasen and Lamanna(2009) reduced form

In [91]:
# Separate the target from the predictors, then impute: forward-fill down the
# rows, with the column mean as fallback for anything still missing.
# .ffill() replaces fillna(method='ffill'), which pandas has deprecated.
# NOTE(review): the forward fill is not grouped by Country, so values can carry
# over from one country's rows into the next, and the means are computed before
# the train/test split — confirm both are intended.
X = df_NA_LA.drop(columns='%Growth').ffill()
y = df_NA_LA['%Growth'].ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
X = sm.add_constant(X)

In [92]:
X = X.loc[:,['const','%Gender', '%Trade', 'GPI', 'Expected years of schooling, male']]

In [93]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

In [94]:
regression_model = LinearRegression()
mse_scorer = make_scorer(mean_squared_error)
mse_scores = cross_val_score(regression_model, X_train, y_train, scoring=mse_scorer, cv=5)
mean_mse = np.mean(mse_scores)
mean_mse

29.448668963144417

### Regression 3 Seguino (2000)

In [95]:
# Separate the target from the predictors, then impute: forward-fill down the
# rows, with the column mean as fallback for anything still missing.
# .ffill() replaces fillna(method='ffill'), which pandas has deprecated.
# NOTE(review): the forward fill is not grouped by Country, so values can carry
# over from one country's rows into the next, and the means are computed before
# the train/test split — confirm both are intended.
X = df_NA_LA.drop(columns='%Growth').ffill()
y = df_NA_LA['%Growth'].ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
X = sm.add_constant(X)

In [96]:
X = X.loc[:,['const','%Gender', '%Completion_F', '%Completion_M', 'GDP_C']]

In [97]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

In [98]:
regression_model = LinearRegression()
mse_scorer = make_scorer(mean_squared_error)
mse_scores = cross_val_score(regression_model, X_train, y_train, scoring=mse_scorer, cv=5)
mean_mse = np.mean(mse_scores)
mean_mse

28.451081847333445

### Regression 4 Ghosh(2021)

In [99]:
# Separate the target from the predictors, then impute: forward-fill down the
# rows, with the column mean as fallback for anything still missing.
# .ffill() replaces fillna(method='ffill'), which pandas has deprecated.
# NOTE(review): the forward fill is not grouped by Country, so values can carry
# over from one country's rows into the next, and the means are computed before
# the train/test split — confirm both are intended.
X = df_NA_LA.drop(columns='%Growth').ffill()
y = df_NA_LA['%Growth'].ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
X = sm.add_constant(X)

In [100]:
X = X.loc[:,['const','%Gender', '%Trade', '%Trade_2']]

In [101]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

In [102]:
regression_model = LinearRegression()
mse_scorer = make_scorer(mean_squared_error)
mse_scores = cross_val_score(regression_model, X_train, y_train, scoring=mse_scorer, cv=5)
mean_mse = np.mean(mse_scores)
mean_mse

28.981117289039652

### Regression 5 Abida and Sghaier(2012)

In [103]:
# Separate the target from the predictors, then impute: forward-fill down the
# rows, with the column mean as fallback for anything still missing.
# .ffill() replaces fillna(method='ffill'), which pandas has deprecated.
# NOTE(review): the forward fill is not grouped by Country, so values can carry
# over from one country's rows into the next, and the means are computed before
# the train/test split — confirm both are intended.
X = df_NA_LA.drop(columns='%Growth').ffill()
y = df_NA_LA['%Growth'].ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
X = sm.add_constant(X)

In [104]:
X = X.loc[:,['const', 'Gini index', '%Investments', '%Enrollment', '%Trade', 'GDP_C']]

In [105]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

In [106]:
regression_model = LinearRegression()
mse_scorer = make_scorer(mean_squared_error)
mse_scores = cross_val_score(regression_model, X_train, y_train, scoring=mse_scorer, cv=5)
mean_mse = np.mean(mse_scores)
mean_mse

28.75744625318992

### Regression 6 Vo et al (2019)

In [107]:
# Separate the target from the predictors, then impute: forward-fill down the
# rows, with the column mean as fallback for anything still missing.
# .ffill() replaces fillna(method='ffill'), which pandas has deprecated.
# NOTE(review): the forward fill is not grouped by Country, so values can carry
# over from one country's rows into the next, and the means are computed before
# the train/test split — confirm both are intended.
X = df_NA_LA.drop(columns='%Growth').ffill()
y = df_NA_LA['%Growth'].ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
X = sm.add_constant(X)

In [108]:
# Keep the intercept column plus the regressors of this specification.
# NOTE(review): the three employment shares presumably sum to ~100 in each row,
# which would make them collinear with 'const' — the OLS summary recorded for
# this model shows near-identical coefficients on all three and a very large
# condition number. Consider dropping one share; confirm before changing.
X = X.loc[:,['const', 'Gini index', '%Agriculture', '%Industry', '%Service', '%Investments']]

In [109]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

In [110]:
regression_model = LinearRegression()
mse_scorer = make_scorer(mean_squared_error)
mse_scores = cross_val_score(regression_model, X_train, y_train, scoring=mse_scorer, cv=5)
mean_mse = np.mean(mse_scores)
mean_mse

27.715203881676263

In [111]:
model6 = sm.OLS(y_train,X_train)
results6 = model6.fit()
results6.summary()

0,1,2,3
Dep. Variable:,%Growth,R-squared:,0.12
Model:,OLS,Adj. R-squared:,0.104
Method:,Least Squares,F-statistic:,7.829
Date:,"Fri, 23 Jun 2023",Prob (F-statistic):,6.29e-07
Time:,09:06:24,Log-Likelihood:,-895.18
No. Observations:,294,AIC:,1802.0
Df Residuals:,288,BIC:,1824.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.048e+04,5454.410,-1.921,0.056,-2.12e+04,255.097
Gini index,-0.0217,0.071,-0.305,0.760,-0.162,0.118
%Agriculture,104.8623,54.543,1.923,0.056,-2.491,212.216
%Industry,104.9960,54.537,1.925,0.055,-2.346,212.338
%Service,104.7629,54.543,1.921,0.056,-2.591,212.117
%Investments,0.2742,0.062,4.444,0.000,0.153,0.396

0,1,2,3
Omnibus:,63.289,Durbin-Watson:,1.929
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1193.295
Skew:,0.011,Prob(JB):,7.57e-260
Kurtosis:,12.87,Cond. No.,1530000.0


In [112]:
y_train_pred = results6.predict(X_train)
y_test_pred = results6.predict(X_test)

In [113]:
print('MSE on train data:',mean_squared_error(y_train, y_train_pred))

MSE on train data: 25.837011403242627


In [114]:
print('MSE on test data:',mean_squared_error(y_test, y_test_pred))

MSE on test data: 26.03066544436563


### Regression 7 Knowles(2001)

In [115]:
# Separate the target from the predictors, then impute: forward-fill down the
# rows, with the column mean as fallback for anything still missing.
# .ffill() replaces fillna(method='ffill'), which pandas has deprecated.
# NOTE(review): the forward fill is not grouped by Country, so values can carry
# over from one country's rows into the next, and the means are computed before
# the train/test split — confirm both are intended.
X = df_NA_LA.drop(columns='%Growth').ffill()
y = df_NA_LA['%Growth'].ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
X = sm.add_constant(X)

In [116]:
X = X.loc[:,['const', 'Gini index', 'GDP_C', '%Completion_F', '%Completion_M', '%Investments']]

In [117]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

In [118]:
regression_model = LinearRegression()
mse_scorer = make_scorer(mean_squared_error)
mse_scores = cross_val_score(regression_model, X_train, y_train, scoring=mse_scorer, cv=5)
mean_mse = np.mean(mse_scores)
mean_mse

28.159255696595608

### Regression 8 Lee and Son(2016)

In [119]:
# Separate the target from the predictors, then impute: forward-fill down the
# rows, with the column mean as fallback for anything still missing.
# .ffill() replaces fillna(method='ffill'), which pandas has deprecated.
# NOTE(review): the forward fill is not grouped by Country, so values can carry
# over from one country's rows into the next, and the means are computed before
# the train/test split — confirm both are intended.
X = df_NA_LA.drop(columns='%Growth').ffill()
y = df_NA_LA['%Growth'].ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
X = sm.add_constant(X)

In [120]:
X = X.loc[:,['const', 'Gini index', 'Expected years of schooling', '%Investments', '%Trade', '%Expenditures', 'GDP_C']]

In [121]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

In [122]:
regression_model = LinearRegression()
mse_scorer = make_scorer(mean_squared_error)
mse_scores = cross_val_score(regression_model, X_train, y_train, scoring=mse_scorer, cv=5)
mean_mse = np.mean(mse_scores)
mean_mse

28.87826423214734

## Sub-Saharan Africa

In [123]:
# Subset of rows whose Region is 'Sub-Saharan Africa'.
df_SSA = df[df['Region'].eq('Sub-Saharan Africa')]

### Regression 1 Klasen and Lamanna(2009)

In [124]:
# Separate the target from the predictors, then impute: forward-fill down the
# rows, with the column mean as fallback for anything still missing.
# .ffill() replaces fillna(method='ffill'), which pandas has deprecated.
# NOTE(review): the forward fill is not grouped by Country, so values can carry
# over from one country's rows into the next, and the means are computed before
# the train/test split — confirm both are intended.
X = df_SSA.drop(columns='%Growth').ffill()
y = df_SSA['%Growth'].ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
X = sm.add_constant(X)

In [125]:
X = X.loc[:,['const','%Gender', '%Pop_Growth', '%WorkingPop', '%Investments', '%Trade', 'GPI', 'Expected years of schooling, male']]

In [126]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

In [127]:
regression_model = LinearRegression()
mse_scorer = make_scorer(mean_squared_error)
mse_scores = cross_val_score(regression_model, X_train, y_train, scoring=mse_scorer, cv=5)
mean_mse = np.mean(mse_scores)
mean_mse

33.52747320747436

### Regression 2 Klasen and Lamanna(2009) reduced form

In [128]:
# Separate the target from the predictors, then impute: forward-fill down the
# rows, with the column mean as fallback for anything still missing.
# .ffill() replaces fillna(method='ffill'), which pandas has deprecated.
# NOTE(review): the forward fill is not grouped by Country, so values can carry
# over from one country's rows into the next, and the means are computed before
# the train/test split — confirm both are intended.
X = df_SSA.drop(columns='%Growth').ffill()
y = df_SSA['%Growth'].ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
X = sm.add_constant(X)

In [129]:
X = X.loc[:,['const','%Gender', '%Trade', 'GPI', 'Expected years of schooling, male']]

In [130]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

In [131]:
regression_model = LinearRegression()
mse_scorer = make_scorer(mean_squared_error)
mse_scores = cross_val_score(regression_model, X_train, y_train, scoring=mse_scorer, cv=5)
mean_mse = np.mean(mse_scores)
mean_mse

31.72446610967317

In [132]:
model2 = sm.OLS(y_train,X_train)
results2 = model2.fit()
results2.summary()

0,1,2,3
Dep. Variable:,%Growth,R-squared:,0.03
Model:,OLS,Adj. R-squared:,0.02
Method:,Least Squares,F-statistic:,2.975
Date:,"Fri, 23 Jun 2023",Prob (F-statistic):,0.0193
Time:,09:06:25,Log-Likelihood:,-1211.0
No. Observations:,386,AIC:,2432.0
Df Residuals:,381,BIC:,2452.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.5892,2.785,-0.930,0.353,-8.066,2.887
%Gender,0.0134,0.022,0.596,0.551,-0.031,0.057
%Trade,0.0019,0.008,0.228,0.819,-0.015,0.019
GPI,8.2485,2.713,3.041,0.003,2.915,13.582
"Expected years of schooling, male",-0.2385,0.158,-1.508,0.132,-0.550,0.072

0,1,2,3
Omnibus:,280.673,Durbin-Watson:,2.125
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6024.782
Skew:,-2.782,Prob(JB):,0.0
Kurtosis:,21.537,Cond. No.,1220.0


In [133]:
y_train_pred = results2.predict(X_train)
y_test_pred = results2.predict(X_test)

In [134]:
print('MSE on train data:',mean_squared_error(y_train, y_train_pred))

MSE on train data: 31.089047558966218


In [135]:
print('MSE on test data:',mean_squared_error(y_test, y_test_pred))

MSE on test data: 21.87153664548915


### Regression 3 Seguino (2000)

In [136]:
# Separate the target from the predictors, then impute: forward-fill down the
# rows, with the column mean as fallback for anything still missing.
# .ffill() replaces fillna(method='ffill'), which pandas has deprecated.
# NOTE(review): the forward fill is not grouped by Country, so values can carry
# over from one country's rows into the next, and the means are computed before
# the train/test split — confirm both are intended.
X = df_SSA.drop(columns='%Growth').ffill()
y = df_SSA['%Growth'].ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
X = sm.add_constant(X)

In [137]:
X = X.loc[:,['const','%Gender', '%Completion_F', '%Completion_M', 'GDP_C']]

In [138]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

In [139]:
regression_model = LinearRegression()
mse_scorer = make_scorer(mean_squared_error)
mse_scores = cross_val_score(regression_model, X_train, y_train, scoring=mse_scorer, cv=5)
mean_mse = np.mean(mse_scores)
mean_mse

32.02933479689084

### Regression 4 Ghosh(2021)

In [140]:
# Separate the target from the predictors, then impute: forward-fill down the
# rows, with the column mean as fallback for anything still missing.
# .ffill() replaces fillna(method='ffill'), which pandas has deprecated.
# NOTE(review): the forward fill is not grouped by Country, so values can carry
# over from one country's rows into the next, and the means are computed before
# the train/test split — confirm both are intended.
X = df_SSA.drop(columns='%Growth').ffill()
y = df_SSA['%Growth'].ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
X = sm.add_constant(X)

In [141]:
X = X.loc[:,['const','%Gender', '%Trade', '%Trade_2']]

In [142]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

In [143]:
regression_model = LinearRegression()
mse_scorer = make_scorer(mean_squared_error)
mse_scores = cross_val_score(regression_model, X_train, y_train, scoring=mse_scorer, cv=5)
mean_mse = np.mean(mse_scores)
mean_mse

32.28258902914633

### Regression 5 Abida and Sghaier(2012)

In [144]:
# Separate the target from the predictors, then impute: forward-fill down the
# rows, with the column mean as fallback for anything still missing.
# .ffill() replaces fillna(method='ffill'), which pandas has deprecated.
# NOTE(review): the forward fill is not grouped by Country, so values can carry
# over from one country's rows into the next, and the means are computed before
# the train/test split — confirm both are intended.
X = df_SSA.drop(columns='%Growth').ffill()
y = df_SSA['%Growth'].ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
X = sm.add_constant(X)

In [145]:
X = X.loc[:,['const', 'Gini index', '%Investments', '%Enrollment', '%Trade', 'GDP_C']]

In [146]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

In [147]:
regression_model = LinearRegression()
mse_scorer = make_scorer(mean_squared_error)
mse_scores = cross_val_score(regression_model, X_train, y_train, scoring=mse_scorer, cv=5)
mean_mse = np.mean(mse_scores)
mean_mse

30.31203183603669

In [148]:
model5 = sm.OLS(y_train,X_train)
results5 = model5.fit()
results5.summary()

0,1,2,3
Dep. Variable:,%Growth,R-squared:,0.075
Model:,OLS,Adj. R-squared:,0.063
Method:,Least Squares,F-statistic:,6.196
Date:,"Fri, 23 Jun 2023",Prob (F-statistic):,1.54e-05
Time:,09:06:26,Log-Likelihood:,-1201.8
No. Observations:,386,AIC:,2416.0
Df Residuals:,380,BIC:,2439.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,6.3896,1.812,3.525,0.000,2.826,9.953
Gini index,-0.1274,0.038,-3.312,0.001,-0.203,-0.052
%Investments,0.0395,0.031,1.256,0.210,-0.022,0.101
%Enrollment,0.0710,0.020,3.606,0.000,0.032,0.110
%Trade,0.0066,0.010,0.641,0.522,-0.014,0.027
GDP_C,-0.0004,0.000,-3.190,0.002,-0.001,-0.000

0,1,2,3
Omnibus:,282.512,Durbin-Watson:,2.189
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5980.223
Skew:,-2.816,Prob(JB):,0.0
Kurtosis:,21.442,Cond. No.,25800.0


In [149]:
y_train_pred = results5.predict(X_train)
y_test_pred = results5.predict(X_test)

In [150]:
print('MSE on train data:',mean_squared_error(y_train, y_train_pred))

MSE on train data: 29.64327690458431


In [151]:
# Error of the fitted OLS model on the held-out split.
print(
    'MSE on test data:',
    mean_squared_error(y_true=y_test, y_pred=y_test_pred),
)

MSE on test data: 21.715432393685887


### Regression 6 Vo et al (2019)

In [152]:
# Separate the regressors from the `%Growth` target.
X = df_SSA.drop('%Growth', axis=1)
y = df_SSA['%Growth']
# Impute gaps: carry the previous row forward, then fall back to the column mean.
# `.ffill()` replaces `fillna(method='ffill')`, which is deprecated in pandas >= 2.1.
# NOTE(review): the fill is not grouped by country and runs before the train/test
# split, so values may cross country and split boundaries - confirm this is intended.
X = X.ffill()
y = y.ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
# statsmodels does not add an intercept on its own, so add the `const` column here.
X = sm.add_constant(X)

In [153]:
# Restrict the design matrix to the intercept and the Vo et al (2019) regressors.
X = X[[
    'const',
    'Gini index',
    '%Agriculture',
    '%Industry',
    '%Service',
    '%Investments',
]]

In [154]:
# 70/30 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)

In [155]:
# Mean 5-fold cross-validated MSE of a linear regression on the training split.
mse_scorer = make_scorer(mean_squared_error)
regression_model = LinearRegression()
mse_scores = cross_val_score(
    estimator=regression_model,
    X=X_train,
    y=y_train,
    scoring=mse_scorer,
    cv=5,
)
mean_mse = mse_scores.mean()
mean_mse

31.324818395902184

### Regression 7 Knowles(2001)

In [156]:
# Separate the regressors from the `%Growth` target.
X = df_SSA.drop('%Growth', axis=1)
y = df_SSA['%Growth']
# Impute gaps: carry the previous row forward, then fall back to the column mean.
# `.ffill()` replaces `fillna(method='ffill')`, which is deprecated in pandas >= 2.1.
# NOTE(review): the fill is not grouped by country and runs before the train/test
# split, so values may cross country and split boundaries - confirm this is intended.
X = X.ffill()
y = y.ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
# statsmodels does not add an intercept on its own, so add the `const` column here.
X = sm.add_constant(X)

In [157]:
# Restrict the design matrix to the intercept and the Knowles (2001) regressors.
X = X[[
    'const',
    'Gini index',
    'GDP_C',
    '%Completion_F',
    '%Completion_M',
    '%Investments',
]]

In [158]:
# 70/30 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)

In [159]:
# Mean 5-fold cross-validated MSE of a linear regression on the training split.
mse_scorer = make_scorer(mean_squared_error)
regression_model = LinearRegression()
mse_scores = cross_val_score(
    estimator=regression_model,
    X=X_train,
    y=y_train,
    scoring=mse_scorer,
    cv=5,
)
mean_mse = mse_scores.mean()
mean_mse

31.145818008839864

### Regression 8 Lee and Son(2016)

In [160]:
# Separate the regressors from the `%Growth` target.
X = df_SSA.drop('%Growth', axis=1)
y = df_SSA['%Growth']
# Impute gaps: carry the previous row forward, then fall back to the column mean.
# `.ffill()` replaces `fillna(method='ffill')`, which is deprecated in pandas >= 2.1.
# NOTE(review): the fill is not grouped by country and runs before the train/test
# split, so values may cross country and split boundaries - confirm this is intended.
X = X.ffill()
y = y.ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
# statsmodels does not add an intercept on its own, so add the `const` column here.
X = sm.add_constant(X)

In [161]:
# Restrict the design matrix to the intercept and the Lee and Son (2016) regressors.
X = X[[
    'const',
    'Gini index',
    'Expected years of schooling',
    '%Investments',
    '%Trade',
    '%Expenditures',
    'GDP_C',
]]

In [162]:
# 70/30 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)

In [163]:
# Mean 5-fold cross-validated MSE of a linear regression on the training split.
mse_scorer = make_scorer(mean_squared_error)
regression_model = LinearRegression()
mse_scores = cross_val_score(
    estimator=regression_model,
    X=X_train,
    y=y_train,
    scoring=mse_scorer,
    cv=5,
)
mean_mse = mse_scores.mean()
mean_mse

30.937889419385066

## The Middle East and North Africa

In [164]:
# Keep only the rows whose Region is 'Middle East & North Africa'.
df_ME = df[df['Region'].eq('Middle East & North Africa')]

### Regression 1 Klasen and Lamanna(2009)

In [165]:
# Separate the regressors from the `%Growth` target.
X = df_ME.drop('%Growth', axis=1)
y = df_ME['%Growth']
# Impute gaps: carry the previous row forward, then fall back to the column mean.
# `.ffill()` replaces `fillna(method='ffill')`, which is deprecated in pandas >= 2.1.
# NOTE(review): the fill is not grouped by country and runs before the train/test
# split, so values may cross country and split boundaries - confirm this is intended.
X = X.ffill()
y = y.ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
# statsmodels does not add an intercept on its own, so add the `const` column here.
X = sm.add_constant(X)

In [166]:
# Restrict the design matrix to the intercept and the Klasen and Lamanna (2009) regressors.
X = X[[
    'const',
    '%Gender',
    '%Pop_Growth',
    '%WorkingPop',
    '%Investments',
    '%Trade',
    'GPI',
    'Expected years of schooling, male',
]]

In [167]:
# 70/30 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)

In [168]:
# Mean 5-fold cross-validated MSE of a linear regression on the training split.
mse_scorer = make_scorer(mean_squared_error)
regression_model = LinearRegression()
mse_scores = cross_val_score(
    estimator=regression_model,
    X=X_train,
    y=y_train,
    scoring=mse_scorer,
    cv=5,
)
mean_mse = mse_scores.mean()
mean_mse

121.4411585154974

### Regression 2 Klasen and Lamanna(2009) reduced form

In [169]:
# Separate the regressors from the `%Growth` target.
X = df_ME.drop('%Growth', axis=1)
y = df_ME['%Growth']
# Impute gaps: carry the previous row forward, then fall back to the column mean.
# `.ffill()` replaces `fillna(method='ffill')`, which is deprecated in pandas >= 2.1.
# NOTE(review): the fill is not grouped by country and runs before the train/test
# split, so values may cross country and split boundaries - confirm this is intended.
X = X.ffill()
y = y.ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
# statsmodels does not add an intercept on its own, so add the `const` column here.
X = sm.add_constant(X)

In [170]:
# Restrict the design matrix to the intercept and the reduced-form
# Klasen and Lamanna (2009) regressors.
X = X[[
    'const',
    '%Gender',
    '%Trade',
    'GPI',
    'Expected years of schooling, male',
]]

In [171]:
# 70/30 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)

In [172]:
# Mean 5-fold cross-validated MSE of a linear regression on the training split.
mse_scorer = make_scorer(mean_squared_error)
regression_model = LinearRegression()
mse_scores = cross_val_score(
    estimator=regression_model,
    X=X_train,
    y=y_train,
    scoring=mse_scorer,
    cv=5,
)
mean_mse = mse_scores.mean()
mean_mse

112.1879623984868

In [173]:
# Fit OLS on the training split and show the summary table.
# X_train already holds exactly the reduced-form columns chosen when X was subset
# before the split, so the duplicated column list is dropped; this matches the
# other OLS cells, which pass X_train directly.
model2 = sm.OLS(y_train, X_train)
results2 = model2.fit()
results2.summary()

0,1,2,3
Dep. Variable:,%Growth,R-squared:,0.032
Model:,OLS,Adj. R-squared:,0.009
Method:,Least Squares,F-statistic:,1.407
Date:,"Fri, 23 Jun 2023",Prob (F-statistic):,0.234
Time:,09:06:27,Log-Likelihood:,-656.81
No. Observations:,176,AIC:,1324.0
Df Residuals:,171,BIC:,1339.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-22.9364,13.342,-1.719,0.087,-49.273,3.400
%Gender,0.0045,0.047,0.096,0.923,-0.089,0.098
%Trade,0.0152,0.014,1.103,0.271,-0.012,0.042
GPI,21.2158,14.249,1.489,0.138,-6.912,49.343
"Expected years of schooling, male",0.1922,0.395,0.487,0.627,-0.587,0.971

0,1,2,3
Omnibus:,134.145,Durbin-Watson:,1.979
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6640.341
Skew:,2.124,Prob(JB):,0.0
Kurtosis:,32.79,Cond. No.,3020.0


In [174]:
# Fitted values for the training rows and predictions for the held-out rows.
# X_train / X_test already contain only the model's columns (X was subset before
# the split), so the repeated `.loc` column selection is unnecessary.
y_train_pred = results2.predict(X_train)
y_test_pred = results2.predict(X_test)

In [175]:
# Error of the fitted OLS model on the training split.
print(
    'MSE on train data:',
    mean_squared_error(y_true=y_train, y_pred=y_train_pred),
)

MSE on train data: 102.08935493789419


In [176]:
# Error of the fitted OLS model on the held-out split.
print(
    'MSE on test data:',
    mean_squared_error(y_true=y_test, y_pred=y_test_pred),
)

MSE on test data: 42.782030442894126


### Regression 3 Seguino (2000)

In [177]:
# Separate the regressors from the `%Growth` target.
X = df_ME.drop('%Growth', axis=1)
y = df_ME['%Growth']
# Impute gaps: carry the previous row forward, then fall back to the column mean.
# `.ffill()` replaces `fillna(method='ffill')`, which is deprecated in pandas >= 2.1.
# NOTE(review): the fill is not grouped by country and runs before the train/test
# split, so values may cross country and split boundaries - confirm this is intended.
X = X.ffill()
y = y.ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
# statsmodels does not add an intercept on its own, so add the `const` column here.
X = sm.add_constant(X)

In [178]:
# Restrict the design matrix to the intercept and the regressors of Regression 3.
X = X[[
    'const',
    '%Gender',
    '%Completion_F',
    '%Completion_M',
    'GDP_C',
]]

In [179]:
# 70/30 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)

In [180]:
# Mean 5-fold cross-validated MSE of a linear regression on the training split.
mse_scorer = make_scorer(mean_squared_error)
regression_model = LinearRegression()
mse_scores = cross_val_score(
    estimator=regression_model,
    X=X_train,
    y=y_train,
    scoring=mse_scorer,
    cv=5,
)
mean_mse = mse_scores.mean()
mean_mse

113.70079680883737

### Regression 4 Ghosh(2021)

In [181]:
# Separate the regressors from the `%Growth` target.
X = df_ME.drop('%Growth', axis=1)
y = df_ME['%Growth']
# Impute gaps: carry the previous row forward, then fall back to the column mean.
# `.ffill()` replaces `fillna(method='ffill')`, which is deprecated in pandas >= 2.1.
# NOTE(review): the fill is not grouped by country and runs before the train/test
# split, so values may cross country and split boundaries - confirm this is intended.
X = X.ffill()
y = y.ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
# statsmodels does not add an intercept on its own, so add the `const` column here.
X = sm.add_constant(X)

In [182]:
# Restrict the design matrix to the intercept and the Ghosh (2021) regressors.
X = X[[
    'const',
    '%Gender',
    '%Trade',
    '%Trade_2',
]]

In [183]:
# 70/30 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)

In [184]:
# Mean 5-fold cross-validated MSE of a linear regression on the training split.
mse_scorer = make_scorer(mean_squared_error)
regression_model = LinearRegression()
mse_scores = cross_val_score(
    estimator=regression_model,
    X=X_train,
    y=y_train,
    scoring=mse_scorer,
    cv=5,
)
mean_mse = mse_scores.mean()
mean_mse

109.1947295699833

In [185]:
# Estimate the specification by OLS on the training split and show the summary table.
model4 = sm.OLS(endog=y_train, exog=X_train)
results4 = model4.fit()
results4.summary()

0,1,2,3
Dep. Variable:,%Growth,R-squared:,0.019
Model:,OLS,Adj. R-squared:,0.002
Method:,Least Squares,F-statistic:,1.101
Date:,"Fri, 23 Jun 2023",Prob (F-statistic):,0.35
Time:,09:06:27,Log-Likelihood:,-657.98
No. Observations:,176,AIC:,1324.0
Df Residuals:,172,BIC:,1337.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.6583,3.489,-0.762,0.447,-9.544,4.228
%Gender,0.0256,0.046,0.554,0.580,-0.066,0.117
%Trade,0.0637,0.050,1.272,0.205,-0.035,0.162
%Trade_2,-0.0001,0.000,-0.988,0.324,-0.000,0.000

0,1,2,3
Omnibus:,136.682,Durbin-Watson:,1.962
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6451.682
Skew:,2.211,Prob(JB):,0.0
Kurtosis:,32.329,Cond. No.,120000.0


In [186]:
# Fitted values for the training rows and predictions for the held-out rows.
y_train_pred = results4.predict(exog=X_train)
y_test_pred = results4.predict(exog=X_test)

In [187]:
# Error of the fitted OLS model on the training split.
print(
    'MSE on train data:',
    mean_squared_error(y_true=y_train, y_pred=y_train_pred),
)

MSE on train data: 103.46253806463152


In [188]:
# Error of the fitted OLS model on the held-out split.
print(
    'MSE on test data:',
    mean_squared_error(y_true=y_test, y_pred=y_test_pred),
)

MSE on test data: 38.341659056694255


### Regression 5 Abida and Sghaier(2012)

In [189]:
# Separate the regressors from the `%Growth` target.
X = df_ME.drop('%Growth', axis=1)
y = df_ME['%Growth']
# Impute gaps: carry the previous row forward, then fall back to the column mean.
# `.ffill()` replaces `fillna(method='ffill')`, which is deprecated in pandas >= 2.1.
# NOTE(review): the fill is not grouped by country and runs before the train/test
# split, so values may cross country and split boundaries - confirm this is intended.
X = X.ffill()
y = y.ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
# statsmodels does not add an intercept on its own, so add the `const` column here.
X = sm.add_constant(X)

In [190]:
# Restrict the design matrix to the intercept and the Abida and Sghaier (2012) regressors.
X = X[[
    'const',
    'Gini index',
    '%Investments',
    '%Enrollment',
    '%Trade',
    'GDP_C',
]]

In [191]:
# 70/30 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)

In [192]:
# Mean 5-fold cross-validated MSE of a linear regression on the training split.
mse_scorer = make_scorer(mean_squared_error)
regression_model = LinearRegression()
mse_scores = cross_val_score(
    estimator=regression_model,
    X=X_train,
    y=y_train,
    scoring=mse_scorer,
    cv=5,
)
mean_mse = mse_scores.mean()
mean_mse

110.22253806949134

In [193]:
# Estimate the specification by OLS on the training split and show the summary table.
model5 = sm.OLS(endog=y_train, exog=X_train)
results5 = model5.fit()
results5.summary()

0,1,2,3
Dep. Variable:,%Growth,R-squared:,0.032
Model:,OLS,Adj. R-squared:,0.003
Method:,Least Squares,F-statistic:,1.11
Date:,"Fri, 23 Jun 2023",Prob (F-statistic):,0.357
Time:,09:06:28,Log-Likelihood:,-656.83
No. Observations:,176,AIC:,1326.0
Df Residuals:,170,BIC:,1345.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.5757,7.464,-0.077,0.939,-15.309,14.158
Gini index,-0.0834,0.170,-0.490,0.625,-0.420,0.253
%Investments,-0.0170,0.130,-0.131,0.896,-0.274,0.240
%Enrollment,0.0500,0.052,0.955,0.341,-0.053,0.153
%Trade,0.0155,0.016,0.986,0.326,-0.016,0.046
GDP_C,4.08e-05,5.94e-05,0.687,0.493,-7.65e-05,0.000

0,1,2,3
Omnibus:,137.247,Durbin-Watson:,1.965
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6790.452
Skew:,2.209,Prob(JB):,0.0
Kurtosis:,33.107,Cond. No.,208000.0


In [194]:
# Fitted values for the training rows and predictions for the held-out rows.
y_train_pred = results5.predict(exog=X_train)
y_test_pred = results5.predict(exog=X_test)

In [195]:
# Error of the fitted OLS model on the training split.
print(
    'MSE on train data:',
    mean_squared_error(y_true=y_train, y_pred=y_train_pred),
)

MSE on train data: 102.11751825737251


In [196]:
# Error of the fitted OLS model on the held-out split.
print(
    'MSE on test data:',
    mean_squared_error(y_true=y_test, y_pred=y_test_pred),
)

MSE on test data: 41.52287569940558


### Regression 6 Vo et al (2019)

In [197]:
# Separate the regressors from the `%Growth` target.
X = df_ME.drop('%Growth', axis=1)
y = df_ME['%Growth']
# Impute gaps: carry the previous row forward, then fall back to the column mean.
# `.ffill()` replaces `fillna(method='ffill')`, which is deprecated in pandas >= 2.1.
# NOTE(review): the fill is not grouped by country and runs before the train/test
# split, so values may cross country and split boundaries - confirm this is intended.
X = X.ffill()
y = y.ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
# statsmodels does not add an intercept on its own, so add the `const` column here.
X = sm.add_constant(X)

In [198]:
# Restrict the design matrix to the intercept and the Vo et al (2019) regressors.
X = X[[
    'const',
    'Gini index',
    '%Agriculture',
    '%Industry',
    '%Service',
    '%Investments',
]]

In [199]:
# 70/30 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)

In [200]:
# Mean 5-fold cross-validated MSE of a linear regression on the training split.
mse_scorer = make_scorer(mean_squared_error)
regression_model = LinearRegression()
mse_scores = cross_val_score(
    estimator=regression_model,
    X=X_train,
    y=y_train,
    scoring=mse_scorer,
    cv=5,
)
mean_mse = mse_scores.mean()
mean_mse

114.67800783429252

### Regression 7 Knowles(2001)

In [201]:
# Separate the regressors from the `%Growth` target.
X = df_ME.drop('%Growth', axis=1)
y = df_ME['%Growth']
# Impute gaps: carry the previous row forward, then fall back to the column mean.
# `.ffill()` replaces `fillna(method='ffill')`, which is deprecated in pandas >= 2.1.
# NOTE(review): the fill is not grouped by country and runs before the train/test
# split, so values may cross country and split boundaries - confirm this is intended.
X = X.ffill()
y = y.ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
# statsmodels does not add an intercept on its own, so add the `const` column here.
X = sm.add_constant(X)

In [202]:
# Restrict the design matrix to the intercept and the Knowles (2001) regressors.
X = X[[
    'const',
    'Gini index',
    'GDP_C',
    '%Completion_F',
    '%Completion_M',
    '%Investments',
]]

In [203]:
# 70/30 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)

In [204]:
# Mean 5-fold cross-validated MSE of a linear regression on the training split.
mse_scorer = make_scorer(mean_squared_error)
regression_model = LinearRegression()
mse_scores = cross_val_score(
    estimator=regression_model,
    X=X_train,
    y=y_train,
    scoring=mse_scorer,
    cv=5,
)
mean_mse = mse_scores.mean()
mean_mse

115.41572121925151

### Regression 8 Lee and Son(2016)

In [205]:
# Separate the regressors from the `%Growth` target.
X = df_ME.drop('%Growth', axis=1)
y = df_ME['%Growth']
# Impute gaps: carry the previous row forward, then fall back to the column mean.
# `.ffill()` replaces `fillna(method='ffill')`, which is deprecated in pandas >= 2.1.
# NOTE(review): the fill is not grouped by country and runs before the train/test
# split, so values may cross country and split boundaries - confirm this is intended.
X = X.ffill()
y = y.ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
# statsmodels does not add an intercept on its own, so add the `const` column here.
X = sm.add_constant(X)

In [206]:
# Restrict the design matrix to the intercept and the Lee and Son (2016) regressors.
X = X[[
    'const',
    'Gini index',
    'Expected years of schooling',
    '%Investments',
    '%Trade',
    '%Expenditures',
    'GDP_C',
]]

In [207]:
# 70/30 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)

In [208]:
# Mean 5-fold cross-validated MSE of a linear regression on the training split.
mse_scorer = make_scorer(mean_squared_error)
regression_model = LinearRegression()
mse_scores = cross_val_score(
    estimator=regression_model,
    X=X_train,
    y=y_train,
    scoring=mse_scorer,
    cv=5,
)
mean_mse = mse_scores.mean()
mean_mse

112.55250584670812

## South Asia, East Asia and the Pacific

In [209]:
# Keep only the rows whose Region is 'East Asia & Pacific' or 'South Asia'.
df_EA = df[
    df['Region'].isin([
        'East Asia & Pacific',
        'South Asia',
    ])
]

### Regression 1 Klasen and Lamanna(2009)

In [210]:
# Separate the regressors from the `%Growth` target.
X = df_EA.drop('%Growth', axis=1)
y = df_EA['%Growth']
# Impute gaps: carry the previous row forward, then fall back to the column mean.
# `.ffill()` replaces `fillna(method='ffill')`, which is deprecated in pandas >= 2.1.
# NOTE(review): the fill is not grouped by country and runs before the train/test
# split, so values may cross country and split boundaries - confirm this is intended.
X = X.ffill()
y = y.ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
# statsmodels does not add an intercept on its own, so add the `const` column here.
X = sm.add_constant(X)

In [211]:
# Restrict the design matrix to the intercept and the Klasen and Lamanna (2009) regressors.
X = X[[
    'const',
    '%Gender',
    '%Pop_Growth',
    '%WorkingPop',
    '%Investments',
    '%Trade',
    'GPI',
    'Expected years of schooling, male',
]]

In [212]:
# 70/30 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)

In [213]:
# Mean 5-fold cross-validated MSE of a linear regression on the training split.
mse_scorer = make_scorer(mean_squared_error)
regression_model = LinearRegression()
mse_scores = cross_val_score(
    estimator=regression_model,
    X=X_train,
    y=y_train,
    scoring=mse_scorer,
    cv=5,
)
mean_mse = mse_scores.mean()
mean_mse

20.085747873344182

In [214]:
# Estimate the specification by OLS on the training split and show the summary table.
model1 = sm.OLS(endog=y_train, exog=X_train)
results1 = model1.fit()
results1.summary()

0,1,2,3
Dep. Variable:,%Growth,R-squared:,0.036
Model:,OLS,Adj. R-squared:,0.013
Method:,Least Squares,F-statistic:,1.588
Date:,"Fri, 23 Jun 2023",Prob (F-statistic):,0.138
Time:,09:06:28,Log-Likelihood:,-876.27
No. Observations:,302,AIC:,1769.0
Df Residuals:,294,BIC:,1798.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-3.6255,4.181,-0.867,0.387,-11.855,4.604
%Gender,0.0098,0.018,0.558,0.577,-0.025,0.045
%Pop_Growth,0.6136,0.241,2.542,0.012,0.139,1.089
%WorkingPop,0.0077,0.052,0.146,0.884,-0.096,0.111
%Investments,0.0577,0.047,1.232,0.219,-0.034,0.150
%Trade,-0.0036,0.005,-0.755,0.451,-0.013,0.006
GPI,7.6461,3.515,2.175,0.030,0.727,14.565
"Expected years of schooling, male",-0.1565,0.124,-1.260,0.209,-0.401,0.088

0,1,2,3
Omnibus:,244.841,Durbin-Watson:,2.08
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6086.37
Skew:,-3.073,Prob(JB):,0.0
Kurtosis:,24.116,Cond. No.,2840.0


In [215]:
# Fitted values for the training rows and predictions for the held-out rows.
y_train_pred = results1.predict(exog=X_train)
y_test_pred = results1.predict(exog=X_test)

In [216]:
# Error of the fitted OLS model on the training split.
print(
    'MSE on train data:',
    mean_squared_error(y_true=y_train, y_pred=y_train_pred),
)

MSE on train data: 19.398809256438074


In [217]:
# Error of the fitted OLS model on the held-out split.
print(
    'MSE on test data:',
    mean_squared_error(y_true=y_test, y_pred=y_test_pred),
)

MSE on test data: 43.31395390191771


### Regression 2 Klasen and Lamanna(2009) reduced form

In [218]:
# Separate the regressors from the `%Growth` target.
X = df_EA.drop('%Growth', axis=1)
y = df_EA['%Growth']
# Impute gaps: carry the previous row forward, then fall back to the column mean.
# `.ffill()` replaces `fillna(method='ffill')`, which is deprecated in pandas >= 2.1.
# NOTE(review): the fill is not grouped by country and runs before the train/test
# split, so values may cross country and split boundaries - confirm this is intended.
X = X.ffill()
y = y.ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
# statsmodels does not add an intercept on its own, so add the `const` column here.
X = sm.add_constant(X)

In [219]:
# Restrict the design matrix to the intercept and the reduced-form
# Klasen and Lamanna (2009) regressors.
X = X[[
    'const',
    '%Gender',
    '%Trade',
    'GPI',
    'Expected years of schooling, male',
]]

In [220]:
# 70/30 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)

In [221]:
# Mean 5-fold cross-validated MSE of a linear regression on the training split.
mse_scorer = make_scorer(mean_squared_error)
regression_model = LinearRegression()
mse_scores = cross_val_score(
    estimator=regression_model,
    X=X_train,
    y=y_train,
    scoring=mse_scorer,
    cv=5,
)
mean_mse = mse_scores.mean()
mean_mse

20.281421253085995

### Regression 3 Seguino (2000)

In [222]:
# Separate the regressors from the `%Growth` target.
X = df_EA.drop('%Growth', axis=1)
y = df_EA['%Growth']
# Impute gaps: carry the previous row forward, then fall back to the column mean.
# `.ffill()` replaces `fillna(method='ffill')`, which is deprecated in pandas >= 2.1.
# NOTE(review): the fill is not grouped by country and runs before the train/test
# split, so values may cross country and split boundaries - confirm this is intended.
X = X.ffill()
y = y.ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
# statsmodels does not add an intercept on its own, so add the `const` column here.
X = sm.add_constant(X)

In [223]:
# Restrict the design matrix to the intercept and the regressors of Regression 3.
X = X[[
    'const',
    '%Gender',
    '%Completion_F',
    '%Completion_M',
    'GDP_C',
]]

In [224]:
# 70/30 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)

In [225]:
# Mean 5-fold cross-validated MSE of a linear regression on the training split.
mse_scorer = make_scorer(mean_squared_error)
regression_model = LinearRegression()
mse_scores = cross_val_score(
    estimator=regression_model,
    X=X_train,
    y=y_train,
    scoring=mse_scorer,
    cv=5,
)
mean_mse = mse_scores.mean()
mean_mse

20.08773368743411

### Regression 4 Ghosh(2021)

In [226]:
# Separate the regressors from the `%Growth` target.
X = df_EA.drop('%Growth', axis=1)
y = df_EA['%Growth']
# Impute gaps: carry the previous row forward, then fall back to the column mean.
# `.ffill()` replaces `fillna(method='ffill')`, which is deprecated in pandas >= 2.1.
# NOTE(review): the fill is not grouped by country and runs before the train/test
# split, so values may cross country and split boundaries - confirm this is intended.
X = X.ffill()
y = y.ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
# statsmodels does not add an intercept on its own, so add the `const` column here.
X = sm.add_constant(X)

In [227]:
# Restrict the design matrix to the intercept and the Ghosh (2021) regressors.
X = X[[
    'const',
    '%Gender',
    '%Trade',
    '%Trade_2',
]]

In [228]:
# 70/30 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)

In [229]:
# Mean 5-fold cross-validated MSE of a linear regression on the training split.
mse_scorer = make_scorer(mean_squared_error)
regression_model = LinearRegression()
mse_scores = cross_val_score(
    estimator=regression_model,
    X=X_train,
    y=y_train,
    scoring=mse_scorer,
    cv=5,
)
mean_mse = mse_scores.mean()
mean_mse

20.455742919487438

### Regression 5 Abida and Sghaier(2012)

In [230]:
# Separate the regressors from the `%Growth` target.
X = df_EA.drop('%Growth', axis=1)
y = df_EA['%Growth']
# Impute gaps: carry the previous row forward, then fall back to the column mean.
# `.ffill()` replaces `fillna(method='ffill')`, which is deprecated in pandas >= 2.1.
# NOTE(review): the fill is not grouped by country and runs before the train/test
# split, so values may cross country and split boundaries - confirm this is intended.
X = X.ffill()
y = y.ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
# statsmodels does not add an intercept on its own, so add the `const` column here.
X = sm.add_constant(X)

In [231]:
# Restrict the design matrix to the intercept and the Abida and Sghaier (2012) regressors.
X = X[[
    'const',
    'Gini index',
    '%Investments',
    '%Enrollment',
    '%Trade',
    'GDP_C',
]]

In [232]:
# 70/30 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)

In [233]:
# Mean 5-fold cross-validated MSE of a linear regression on the training split.
mse_scorer = make_scorer(mean_squared_error)
regression_model = LinearRegression()
mse_scores = cross_val_score(
    estimator=regression_model,
    X=X_train,
    y=y_train,
    scoring=mse_scorer,
    cv=5,
)
mean_mse = mse_scores.mean()
mean_mse

19.885842766068237

### Regression 6 Vo et al (2019)

In [234]:
# Separate the regressors from the `%Growth` target.
X = df_EA.drop('%Growth', axis=1)
y = df_EA['%Growth']
# Impute gaps: carry the previous row forward, then fall back to the column mean.
# `.ffill()` replaces `fillna(method='ffill')`, which is deprecated in pandas >= 2.1.
# NOTE(review): the fill is not grouped by country and runs before the train/test
# split, so values may cross country and split boundaries - confirm this is intended.
X = X.ffill()
y = y.ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
# statsmodels does not add an intercept on its own, so add the `const` column here.
X = sm.add_constant(X)

In [235]:
# Restrict the design matrix to the intercept and the Vo et al (2019) regressors.
X = X[[
    'const',
    'Gini index',
    '%Agriculture',
    '%Industry',
    '%Service',
    '%Investments',
]]

In [236]:
# 70/30 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)

In [237]:
# Mean 5-fold cross-validated MSE of a linear regression on the training split.
mse_scorer = make_scorer(mean_squared_error)
regression_model = LinearRegression()
mse_scores = cross_val_score(
    estimator=regression_model,
    X=X_train,
    y=y_train,
    scoring=mse_scorer,
    cv=5,
)
mean_mse = mse_scores.mean()
mean_mse

18.76765815136842

In [238]:
# Fit OLS on the training split and show the summary table.
# X_train already holds exactly the Vo et al (2019) columns chosen when X was
# subset before the split, so the duplicated column list is dropped; this matches
# the other OLS cells, which pass X_train directly.
# NOTE(review): the recorded summary shows near-identical coefficients (about -105)
# on %Agriculture, %Industry and %Service, an intercept of ~1e4 and a condition
# number of 1.51e+06. This suggests the three shares are almost collinear with the
# intercept - consider dropping one share; confirm against the data.
model6 = sm.OLS(y_train, X_train)
results6 = model6.fit()
results6.summary()

0,1,2,3
Dep. Variable:,%Growth,R-squared:,0.093
Model:,OLS,Adj. R-squared:,0.078
Method:,Least Squares,F-statistic:,6.105
Date:,"Fri, 23 Jun 2023",Prob (F-statistic):,2.12e-05
Time:,09:06:29,Log-Likelihood:,-867.05
No. Observations:,302,AIC:,1746.0
Df Residuals:,296,BIC:,1768.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.055e+04,5204.195,2.027,0.044,304.498,2.08e+04
Gini index,0.1533,0.063,2.442,0.015,0.030,0.277
%Agriculture,-105.4530,52.038,-2.026,0.044,-207.865,-3.041
%Industry,-105.4562,52.034,-2.027,0.044,-207.860,-3.053
%Service,-105.5169,52.041,-2.028,0.043,-207.934,-3.100
%Investments,0.0897,0.033,2.728,0.007,0.025,0.154

0,1,2,3
Omnibus:,239.823,Durbin-Watson:,2.051
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5221.308
Skew:,-3.034,Prob(JB):,0.0
Kurtosis:,22.445,Cond. No.,1510000.0


In [239]:
# Fitted values for the training rows and predictions for the held-out rows.
# X_train / X_test already contain only the model's columns (X was subset before
# the split), so the repeated `.loc` column selection is unnecessary.
y_train_pred = results6.predict(X_train)
y_test_pred = results6.predict(X_test)

In [240]:
# Error of the fitted OLS model on the training split.
print(
    'MSE on train data:',
    mean_squared_error(y_true=y_train, y_pred=y_train_pred),
)

MSE on train data: 18.250262913951303


In [241]:
# Error of the fitted OLS model on the held-out split.
print(
    'MSE on test data:',
    mean_squared_error(y_true=y_test, y_pred=y_test_pred),
)

MSE on test data: 42.189869296497754


### Regression 7 Knowles(2001)

In [242]:
# Separate the regressors from the `%Growth` target.
X = df_EA.drop('%Growth', axis=1)
y = df_EA['%Growth']
# Impute gaps: carry the previous row forward, then fall back to the column mean.
# `.ffill()` replaces `fillna(method='ffill')`, which is deprecated in pandas >= 2.1.
# NOTE(review): the fill is not grouped by country and runs before the train/test
# split, so values may cross country and split boundaries - confirm this is intended.
X = X.ffill()
y = y.ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
# statsmodels does not add an intercept on its own, so add the `const` column here.
X = sm.add_constant(X)

In [243]:
# Restrict the design matrix to the intercept and the Knowles (2001) regressors.
X = X[[
    'const',
    'Gini index',
    'GDP_C',
    '%Completion_F',
    '%Completion_M',
    '%Investments',
]]

In [244]:
# 70/30 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)

In [245]:
# Mean 5-fold cross-validated MSE of a linear regression on the training split.
mse_scorer = make_scorer(mean_squared_error)
regression_model = LinearRegression()
mse_scores = cross_val_score(
    estimator=regression_model,
    X=X_train,
    y=y_train,
    scoring=mse_scorer,
    cv=5,
)
mean_mse = mse_scores.mean()
mean_mse

20.042513253080536

### Regression 8 Lee and Son(2016)

In [246]:
# Separate the regressors from the `%Growth` target.
X = df_EA.drop('%Growth', axis=1)
y = df_EA['%Growth']
# Impute gaps: carry the previous row forward, then fall back to the column mean.
# `.ffill()` replaces `fillna(method='ffill')`, which is deprecated in pandas >= 2.1.
# NOTE(review): the fill is not grouped by country and runs before the train/test
# split, so values may cross country and split boundaries - confirm this is intended.
X = X.ffill()
y = y.ffill()
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())
# statsmodels does not add an intercept on its own, so add the `const` column here.
X = sm.add_constant(X)

In [247]:
# Restrict the design matrix to the intercept and the Lee and Son (2016) regressors.
X = X[[
    'const',
    'Gini index',
    'Expected years of schooling',
    '%Investments',
    '%Trade',
    '%Expenditures',
    'GDP_C',
]]

In [248]:
# 70/30 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)

In [249]:
# Mean 5-fold cross-validated MSE of a linear regression on the training split.
mse_scorer = make_scorer(mean_squared_error)
regression_model = LinearRegression()
mse_scores = cross_val_score(
    estimator=regression_model,
    X=X_train,
    y=y_train,
    scoring=mse_scorer,
    cv=5,
)
mean_mse = mse_scores.mean()
mean_mse

19.57564574934518

In [250]:
# Fit OLS on the training split and show the summary table.
# X_train already holds exactly the Lee and Son (2016) columns chosen when X was
# subset before the split, so the duplicated column list is dropped; this matches
# the other OLS cells, which pass X_train directly.
model8 = sm.OLS(y_train, X_train)
results8 = model8.fit()
results8.summary()

0,1,2,3
Dep. Variable:,%Growth,R-squared:,0.06
Model:,OLS,Adj. R-squared:,0.04
Method:,Least Squares,F-statistic:,3.113
Date:,"Fri, 23 Jun 2023",Prob (F-statistic):,0.00567
Time:,09:06:30,Log-Likelihood:,-872.6
No. Observations:,302,AIC:,1759.0
Df Residuals:,295,BIC:,1785.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.6747,2.962,-0.228,0.820,-6.504,5.155
Gini index,0.1186,0.066,1.806,0.072,-0.011,0.248
Expected years of schooling,0.0755,0.109,0.693,0.489,-0.139,0.290
%Investments,0.0466,0.046,1.003,0.317,-0.045,0.138
%Trade,0.0027,0.005,0.537,0.592,-0.007,0.013
%Expenditures,-0.2030,0.079,-2.571,0.011,-0.358,-0.048
GDP_C,-5.109e-05,1.98e-05,-2.582,0.010,-9e-05,-1.22e-05

0,1,2,3
Omnibus:,243.297,Durbin-Watson:,2.022
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5653.716
Skew:,-3.073,Prob(JB):,0.0
Kurtosis:,23.286,Cond. No.,229000.0


In [251]:
# Fitted values for the training rows and predictions for the held-out rows.
# X_train / X_test already contain only the model's columns (X was subset before
# the split), so the repeated `.loc` column selection is unnecessary.
y_train_pred = results8.predict(X_train)
y_test_pred = results8.predict(X_test)

In [252]:
# Error of the fitted OLS model on the training split.
print(
    'MSE on train data:',
    mean_squared_error(y_true=y_train, y_pred=y_train_pred),
)

MSE on train data: 18.933499568556552


In [253]:
# Error of the fitted OLS model on the held-out split.
print(
    'MSE on test data:',
    mean_squared_error(y_true=y_test, y_pred=y_test_pred),
)

MSE on test data: 42.910919088930676
