In [1]:
import evopreprocess as evp
import niapy.algorithms.basic as nia
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeRegressor

# Read the full dataset from the Excel workbook and preview its first rows.
DATASET_PATH = "../databases/final_dataset.xlsx"
dataset = pd.read_excel(DATASET_PATH)
dataset.head()

# dataset.loc[dataset['name'] == 'moldova']

Unnamed: 0.1,Unnamed: 0,name,code,year,pop_growth,rural_pop,agri_land,death_rate,life_exp_male,life_exp_female,...,gpd,inflation,grd_growth,gpd_growth_per_capita,oil_rents,col_rents,foreign_invest,purchasing_power,school_enrol,food_consumption
0,0,albania,ALB,1961,3.120855,69.057,44.963504,10.719,62.272,62.272,...,72276170000.0,3.700008,3.269687,1.156293,13.102271,0.368326,0.0,8379726000.0,119.329742,21.35
1,1,united arab emirates,ARE,1961,8.683733,25.617,2.928752,14.761,50.363,50.363,...,72276170000.0,3.700008,3.269687,1.156293,41.516091,0.0,7780000.0,159314800000.0,80.064751,4.64
2,2,argentina,ARG,1961,1.624717,25.783,50.363395,8.618,62.46,62.46,...,72276170000.0,20.310698,5.427843,3.728779,0.268367,0.0,89769999.9,234455300000.0,105.821312,31.45
3,3,australia,AUS,1961,1.98974,18.059,61.763332,8.5,67.9,67.9,...,19683060000.0,3.221021,2.483271,0.464273,0.013721,0.134279,893138535.5,296371500000.0,111.197807,32.31
4,4,austria,AUT,1961,0.548472,35.186,43.085313,12.1,66.47,66.47,...,7311750000.0,5.087154,5.537979,4.960717,0.012188,0.0,112650000.0,149426300000.0,103.295776,79.14


In [2]:
# Split the loaded frame into the regression target and the feature matrix.
#
# The cell is written to be idempotent: the original in-place drop removed
# 'pop_growth' from `dataset`, so running the cell a second time raised a
# KeyError when the target was extracted again.
TARGET_COLUMN = 'pop_growth'
NON_FEATURE_COLUMNS = ['Unnamed: 0', 'name', 'code', TARGET_COLUMN]

# Only (re)extract the target while the column is still present; on a re-run
# the previously extracted `target` is kept as is.
if TARGET_COLUMN in dataset.columns:
    target = dataset[TARGET_COLUMN].copy()

# Rebind instead of mutating in place; errors='ignore' makes the drop a no-op
# for columns that were already removed by an earlier run of this cell.
dataset = (
    dataset
    .drop(columns=NON_FEATURE_COLUMNS, errors='ignore')
    .rename(columns={'fertelity_rate': 'fertility_rate', 'moratality_rate': 'mortality_rate'})
)

# isna() works for any column dtype, whereas np.isnan() raises a TypeError as
# soon as a non-numeric column is left in the frame.
print(f'Empty cell check: {dataset.isna().to_numpy().any()}')

print(dataset.dtypes)


Empty cell check: False
year                       int64
rural_pop                float64
agri_land                float64
death_rate               float64
life_exp_male            float64
life_exp_female          float64
fertility_rate           float64
young_a_d                float64
old_a_d                  float64
mortality_rate           float64
gpd                      float64
inflation                float64
grd_growth               float64
gpd_growth_per_capita    float64
oil_rents                float64
col_rents                float64
foreign_invest           float64
purchasing_power         float64
school_enrol             float64
food_consumption         float64
dtype: object


## Evaluating regression models with k-fold cross-validation:

#### Ordinary Least Squares

In [3]:
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import get_scorer_names

# X_train, X_test, y_train, y_test = train_test_split(dataset, target, train_size=60)

# x1 = np.arange(100)
# y1 = np.zeros(100)
# for i in range(100):

# Shuffled 10-fold splitter with a fixed seed so both models see the same folds.
cv = KFold(n_splits=10, shuffle=True, random_state=1)

# Ordinary least squares without (reg1) and with (reg2) an intercept term.
reg1, reg2 = (
    linear_model.LinearRegression(fit_intercept=use_intercept, copy_X=True, positive=False)
    for use_intercept in (False, True)
)

# The 'neg_mean_squared_error' scorer returns the MSE negated; abs() turns the
# per-fold values back into positive errors.
scores1, scores2 = (
    abs(cross_val_score(model, dataset, target, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1))
    for model in (reg1, reg2)
)

# Per-fold errors first, then the median error of each model.
for fold_scores in (scores1, scores2):
    print(fold_scores)

for fold_scores in (scores1, scores2):
    print(np.median(fold_scores))



[0.1508704  0.08878651 0.10223447 0.07456343 0.15912567 0.12191264
 0.10331104 0.18519325 0.15294851 0.15295555]
[0.1508524  0.0887945  0.10239409 0.07462667 0.15922555 0.12204448
 0.10347359 0.18518438 0.15293724 0.15297433]
0.13639151526481155
0.13644843785016242


As we can see, the difference between fitting with or without an intercept is negligible.

#### Ridge Regression

In [4]:
# Sweep the Ridge regularisation strength and report the cross-validated MSE.
# NOTE(review): this splitter uses random_state=2 while the OLS cell uses
# random_state=1, so the folds differ between the two cells -- confirm whether
# the medians are meant to be compared directly.
cv = KFold(n_splits=10, random_state=2, shuffle=True)

for alpha in np.arange(0, 1, 0.1):
    # alpha = 0 is plain least squares. scikit-learn advises against
    # Ridge(alpha=0) for numerical reasons and recommends LinearRegression,
    # so that estimator is used for the unregularised baseline.
    if alpha > 0:
        reg = linear_model.Ridge(alpha=alpha)
    else:
        reg = linear_model.LinearRegression()

    # The scorer returns the negated MSE; flip the sign to get positive errors.
    scores = -cross_val_score(reg, dataset, target, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)

    # Label every result so each score can be matched to its alpha.
    print(f'alpha = {alpha:.1f}')
    print(scores)
    print(np.median(scores))

[0.09087325 0.10462387 0.22418435 0.07480145 0.1718964  0.20607426
 0.20650918 0.13508841 0.12955173 0.08633357]
0.1323200727859003
[0.0907773  0.09867422 0.18450994 0.07427687 0.1719554  0.12209658
 0.20541319 0.13508688 0.12650044 0.08560381]
0.12429851385327337
[0.09077525 0.09867375 0.184507   0.07427562 0.1719563  0.12209954
 0.20542262 0.13508536 0.12649751 0.08560246]
0.1242985257538297
[0.09077321 0.09867329 0.18450406 0.07427438 0.1719572  0.1221025
 0.20543205 0.13508384 0.12649458 0.08560112]
0.12429853929575291
[0.09077117 0.09867283 0.18450113 0.07427315 0.1719581  0.12210545
 0.20544149 0.13508232 0.12649166 0.08559977]
0.1242985544785864
[0.09076913 0.09867237 0.18449819 0.07427191 0.171959   0.12210841
 0.20545093 0.1350808  0.12648873 0.08559842]
0.12429857130187719
[0.0907671  0.09867191 0.18449526 0.07427067 0.17195991 0.12211137
 0.20546037 0.13507929 0.12648581 0.08559708]
0.12429858976517316
[0.09076506 0.09867146 0.18449233 0.07426944 0.17196082 0.12211433
 0.205

For alpha ≥ 0.1, Ridge Regression is in general slightly more accurate than plain Linear Regression (the alpha = 0 run above).

#### Lasso Regression

In [5]:
# Sweep the Lasso regularisation strength and report the cross-validated MSE.
# NOTE(review): this splitter uses random_state=3, which differs from the
# seeds of the OLS and Ridge cells, so the folds are not shared between the
# cells -- confirm whether the medians are meant to be compared directly.
cv = KFold(n_splits=10, random_state=3, shuffle=True)

for alpha in np.arange(0, 1, 0.1):
    # alpha = 0 is plain least squares. scikit-learn advises against
    # Lasso(alpha=0) for numerical reasons and recommends LinearRegression,
    # so that estimator is used for the unregularised baseline.
    if alpha > 0:
        reg = linear_model.Lasso(alpha=alpha)
    else:
        reg = linear_model.LinearRegression()

    # The scorer returns the negated MSE; flip the sign to get positive errors.
    scores = -cross_val_score(reg, dataset, target, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)

    # Label every result so each score can be matched to its alpha.
    print(f'alpha = {alpha:.1f}')
    print(scores)
    print(np.median(scores))

[0.10909756 0.11473856 0.10639578 0.16180886 0.15154614 0.14340827
 0.15388379 0.19628744 0.08179372 0.07529692]
0.12907341130661618
[0.17423527 0.17966181 0.14830682 0.23092919 0.17404721 0.18293267
 0.25137133 0.28559081 0.10604141 0.09768816]
0.1769485387179346
[0.32744256 0.3686987  0.25301192 0.35781944 0.26919151 0.27868097
 0.46617744 0.4631405  0.18359068 0.15959137]
0.3030617657120962
[0.57561463 0.68713344 0.43523344 0.55874974 0.44425057 0.44646135
 0.81215881 0.7414766  0.31827309 0.26851665]
0.5026055473811039
[0.91840872 1.13465084 0.69473099 0.83348267 0.69892203 0.68545175
 1.28890107 1.12059913 0.50994857 0.42455319]
0.7662023547209158
[1.03572525 1.22582717 0.80991992 0.95690369 0.81647688 0.80602663
 1.36731111 1.22114917 0.62188432 0.52925492]
0.886690287155528
[1.0461863  1.23939752 0.81234957 0.9623745  0.81992738 0.81130402
 1.3809424  1.2327337  0.62473716 0.53506092]
0.8911509383369666
[1.05704138 1.252582   0.81573343 0.96895215 0.82388553 0.81682906
 1.395371

The results show that Lasso Regression would be a terrible choice for our dataset for every alpha > 0 tested: the median error only grows as alpha increases.
