In [12]:
from bs4 import BeautifulSoup
import requests, re, time
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import statsmodels.api as sm
from itertools import combinations

In [2]:
page = requests.get("https://es.wikipedia.org/wiki/Anexo:Pa%C3%ADses_de_América_Latina_por_%C3%ADndice_de_desarrollo_humano")
soup = BeautifulSoup(page.content, 'html.parser')
tables = soup.find_all("table")
data = tables[1]

In [3]:
# column names
col_names = []
for elem in data.find_all("th"):
    if elem.string[0]=="2" or elem.string[0]=="1":
        col_names.append(elem.string.replace("\n",""))
col_names

['2018', '2015', '2010', '2005', '2000', '1995', '1990']

In [4]:
# countries
countries = []
for elem in data.find_all(["a"]):
    countries.append(elem.text.replace("\n",""))
countries

['Chile',
 'Argentina',
 'Uruguay',
 'Costa Rica',
 'Cuba',
 'Panamá',
 'México',
 'Brasil',
 'Colombia',
 'Guatemala',
 'República Dominicana',
 'Paraguay',
 'Bolivia',
 'Venezuela',
 'El Salvador',
 'Nicaragua',
 'Honduras',
 'Haití']

In [5]:
# numerical data
values = []
for elem in data.find_all("td"):
    if elem.text[0]=="0":
        values.append(float(elem.text.replace(",",".")))
        
values = np.array(values)
values.resize((len(countries),len(col_names)+1))
values = values[:,0:len(col_names)]

In [6]:
df = pd.DataFrame(values, columns=col_names, index=countries)
df["Change"] = df.iloc[:,1] - df.iloc[:,len(df.columns)-1]
df

Unnamed: 0,2018,2015,2010,2005,2000,1995,1990,Change
Chile,0.847,0.839,0.8,0.788,0.753,0.726,0.703,0.136
Argentina,0.83,0.828,0.818,0.777,0.77,0.731,0.707,0.121
Uruguay,0.808,0.802,0.774,0.756,0.742,0.71,0.692,0.11
Costa Rica,0.795,0.782,0.758,0.739,0.719,0.688,0.659,0.123
Cuba,0.794,0.786,0.754,0.727,0.711,0.686,0.655,0.131
Panamá,0.779,0.768,0.777,0.73,0.686,0.654,0.676,0.092
México,0.767,0.759,0.739,0.729,0.705,0.672,0.652,0.107
Brasil,0.761,0.755,0.726,0.699,0.684,0.65,0.613,0.142
Colombia,0.761,0.753,0.729,0.692,0.662,0.633,0.599,0.154
Guatemala,0.745,0.733,0.7,0.675,0.653,0.624,0.593,0.14


### Feature Selection

In [13]:
comb = combinations(df.columns.to_list())

TypeError: combinations() missing required argument 'r' (pos 2)

In [10]:
df.columns

Index(['2018', '2015', '2010', '2005', '2000', '1995', '1990', 'Change'], dtype='object')

In [8]:
X = df[['2015']]
Y = df['2018']

In [9]:
lreg = LinearRegression()
lreg.fit(X,Y)
pred = lreg.predict(X)

#X2 = sm.add_constant(X)
est = sm.OLS(Y, X)
est2 = est.fit()
print(est2.summary())

                                 OLS Regression Results                                
Dep. Variable:                   2018   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          7.301e+04
Date:                Sun, 05 Jul 2020   Prob (F-statistic):                    2.51e-32
Time:                        12:10:06   Log-Likelihood:                          55.295
No. Observations:                  18   AIC:                                     -108.6
Df Residuals:                      17   BIC:                                     -107.7
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

  "anyway, n=%i" % int(n))


### Model fitting and cross validation

In [11]:
kf = KFold(n_splits=4)
kf.get_n_splits(X)

model = LinearRegression(fit_intercept=False)

results = []
# cross validation
for train_idx, test_idx in kf.split(X):
    X_train, X_test = X.iloc[train_idx,], X.iloc[test_idx,]
    y_train, y_test = Y[train_idx], Y[test_idx]
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    r2 = r2_score(y_test, predictions)
    print("R2: ", r2)
    results.append(r2)

print("Avg R2:", np.mean(results))

R2:  0.9663247535036669
R2:  0.8354477891596949
R2:  0.12900577350330267
R2:  0.9950791222954323
Avg R2: 0.7314643596155241


In [23]:
X_test

Unnamed: 0,2015,2010,2005,2000,1995,1990,Change
El Salvador,0.646,0.602,0.571,0.546,0.507,0.478,0.168
Nicaragua,0.644,0.614,0.593,0.568,0.523,0.494,0.15
Honduras,0.613,0.598,0.581,0.555,0.53,0.508,0.105
Haití,0.493,0.467,0.451,0.44,0.421,0.412,0.081


In [41]:
model.predict(X_test)

array([0.64699741, 0.64649301, 0.62639152, 0.50903709])

In [42]:
predictions

array([0.64699741, 0.64649301, 0.62639152, 0.50903709])

In [43]:
y_test

El Salvador    0.651
Nicaragua      0.651
Honduras       0.623
Haití          0.503
Name: 2018, dtype: float64