In [15]:
from bs4 import BeautifulSoup
import requests, re, time
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import statsmodels.api as sm
from itertools import combinations, chain

In [2]:
page = requests.get("https://es.wikipedia.org/wiki/Anexo:Pa%C3%ADses_de_América_Latina_por_%C3%ADndice_de_desarrollo_humano")
soup = BeautifulSoup(page.content, 'html.parser')
tables = soup.find_all("table")
data = tables[1]

In [3]:
# column names
col_names = []
for elem in data.find_all("th"):
    if elem.string[0]=="2" or elem.string[0]=="1":
        col_names.append(elem.string.replace("\n",""))
col_names

['2018', '2015', '2010', '2005', '2000', '1995', '1990']

In [4]:
# countries
countries = []
for elem in data.find_all(["a"]):
    countries.append(elem.text.replace("\n",""))
countries

['Chile',
 'Argentina',
 'Uruguay',
 'Costa Rica',
 'Cuba',
 'Panamá',
 'México',
 'Brasil',
 'Colombia',
 'Guatemala',
 'República Dominicana',
 'Paraguay',
 'Bolivia',
 'Venezuela',
 'El Salvador',
 'Nicaragua',
 'Honduras',
 'Haití']

In [5]:
# numerical data
values = []
for elem in data.find_all("td"):
    if elem.text[0]=="0":
        values.append(float(elem.text.replace(",",".")))
        
values = np.array(values)
values.resize((len(countries),len(col_names)+1))
values = values[:,0:len(col_names)]

In [6]:
df = pd.DataFrame(values, columns=col_names, index=countries)
df["Change"] = df.iloc[:,1] - df.iloc[:,len(df.columns)-1]
df

Unnamed: 0,2018,2015,2010,2005,2000,1995,1990,Change
Chile,0.847,0.839,0.8,0.788,0.753,0.726,0.703,0.136
Argentina,0.83,0.828,0.818,0.777,0.77,0.731,0.707,0.121
Uruguay,0.808,0.802,0.774,0.756,0.742,0.71,0.692,0.11
Costa Rica,0.795,0.782,0.758,0.739,0.719,0.688,0.659,0.123
Cuba,0.794,0.786,0.754,0.727,0.711,0.686,0.655,0.131
Panamá,0.779,0.768,0.777,0.73,0.686,0.654,0.676,0.092
México,0.767,0.759,0.739,0.729,0.705,0.672,0.652,0.107
Brasil,0.761,0.755,0.726,0.699,0.684,0.65,0.613,0.142
Colombia,0.761,0.753,0.729,0.692,0.662,0.633,0.599,0.154
Guatemala,0.745,0.733,0.7,0.675,0.653,0.624,0.593,0.14


### Feature Selection

In [49]:
def all_subsets(ss):
    return chain(*map(lambda x: combinations(ss, x), range(0, len(ss)+1)))

subsets = []
for subset in all_subsets(df.columns.to_list()[1:]):
    subsets.append(subset)

subsets = subsets[1:]

In [52]:
target = "2018"
Y = df[target]
aics = []
for subset in subsets:
    cols = list(subset)
    X = df[cols]
    est = sm.OLS(Y, X)
    est2 = est.fit()
    aics.append(est2.aic)

In [66]:
aics = np.array(aics)
df1 = pd.DataFrame({"Subset":subsets, "AIC":aics})
df1.sort_values(by="AIC", ascending=False).head(10)

Unnamed: 0,Subset,AIC
6,"(Change,)",-18.717198
5,"(1990,)",-70.000754
25,"(1995, 1990)",-79.813714
4,"(1995,)",-80.985618
16,"(2010, 1990)",-85.970078
15,"(2010, 1995)",-86.705486
50,"(2010, 1995, 1990)",-87.383788
1,"(2010,)",-87.752015
44,"(2010, 2005, 1995)",-88.01259
86,"(2010, 2005, 1995, 1990)",-88.022527


In [50]:
df.columns

Index(['2018', '2015', '2010', '2005', '2000', '1995', '1990', 'Change'], dtype='object')

In [63]:
X = df[['2015', 'Change']]
Y = df['2018']

In [70]:
lreg = LinearRegression()
lreg.fit(X,Y)
pred = lreg.predict(X)

# X2 = sm.add_constant(X)
est = sm.OLS(Y, X)
est2 = est.fit()
print(est2.summary())

                                 OLS Regression Results                                
Dep. Variable:                   2018   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          3.492e+04
Date:                Sun, 05 Jul 2020   Prob (F-statistic):                    7.57e-30
Time:                        12:29:55   Log-Likelihood:                          55.442
No. Observations:                  18   AIC:                                     -106.9
Df Residuals:                      16   BIC:                                     -105.1
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

  "anyway, n=%i" % int(n))


### Model fitting and cross validation

In [71]:
kf = KFold(n_splits=4)
kf.get_n_splits(X)

model = LinearRegression(fit_intercept=False)

results = []
# cross validation
for train_idx, test_idx in kf.split(X):
    X_train, X_test = X.iloc[train_idx,], X.iloc[test_idx,]
    y_train, y_test = Y[train_idx], Y[test_idx]
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    r2 = r2_score(y_test, predictions)
    print("R2: ", r2)
    results.append(r2)

print("Avg R2:", np.mean(results))

R2:  0.9543072255450861
R2:  0.7089753391667065
R2:  0.12786652519053332
R2:  0.9926641018010405
Avg R2: 0.6959532979258416
