In [None]:
# ==========================================
# PROYECTO ABP - Dataset público (World Bank WDI)
# SECCIÓN 1: Descargar, unir, limpiar y guardar CSV
# ==========================================

import os
import time
import requests
import numpy as np
import pandas as pd
import warnings

# ---------------------------
# Output locations
# ---------------------------
OUT_DIR = "outputs"
DATA_DIR = os.path.join(OUT_DIR, "data")
# Create the data folder up front so the CSV written later has somewhere to go.
os.makedirs(DATA_DIR, exist_ok=True)

# Root endpoint of the World Bank API
BASE = "https://api.worldbank.org/v2"

# ---------------------------
# WDI indicators: short column name -> World Bank indicator code.
# One entry is the target variable (y) and the rest are explanatory (X).
# y = GDP per capita (current US$)
# ---------------------------
INDICATORS = dict(
    gdp_pc="NY.GDP.PCAP.CD",        # GDP per capita (current US$) -> target variable
    life_exp="SP.DYN.LE00.IN",      # Life expectancy at birth (years)
    urban_pct="SP.URB.TOTL.IN.ZS",  # Urban population (% of total)
    co2_pc="EN.GHG.CO2.PC.CE.AR5",  # CO2 emissions (metric tons per capita)
    unemp="SL.UEM.TOTL.ZS",         # Unemployment (% of labor force)
    infl="FP.CPI.TOTL.ZG",          # Inflation (CPI, annual %)
    trade_gdp="NE.TRD.GNFS.ZS",     # Trade (% of GDP)
    internet="IT.NET.USER.ZS",      # Internet users (% of population)
)

# First and last year of the country-year panel (both are passed to the API)
START_YEAR, END_YEAR = 2010, 2023

def _wb_parse_rows(items) -> list:
    """Turn the raw records of one API page into (iso3c, country, year, value) tuples.

    Records that are None, or whose "date" cannot be read as an integer year,
    are skipped instead of aborting the whole download.
    """
    parsed = []
    for item in items or []:
        if item is None:
            continue
        try:
            year = int(item.get("date"))
        except (TypeError, ValueError):
            continue
        # "country" may be present but null, so `.get("country", {})` is not enough.
        country = (item.get("country") or {}).get("value")
        parsed.append((item.get("countryiso3code"), country, year, item.get("value")))
    return parsed


def _wb_get_page(url: str, params: dict):
    """Request one page and return (meta, data); data is None when the page has no records."""
    r = requests.get(url, params=params, timeout=60)
    r.raise_for_status()
    payload = r.json()

    # Expected layout: payload[0] = metadata, payload[1] = list of records.
    # Anything shorter is treated as "no data".
    if not isinstance(payload, list) or len(payload) < 2:
        return None, None

    meta, data = payload[0], payload[1]
    # A null or empty data element also counts as "no data".
    if not isinstance(data, list) or not data:
        return meta, None
    return meta, data


def wb_fetch_indicator(ind_code: str, start_year: int, end_year: int, per_page: int = 20000) -> pd.DataFrame:
    """
    Download one indicator for "all countries" (country/all) over a range of years.

    Parameters:
      - ind_code: World Bank indicator code (e.g. "NY.GDP.PCAP.CD")
      - start_year, end_year: bounds sent to the API as "start:end"
      - per_page: page size requested from the API

    Returns a DataFrame with columns:
      - iso3c: value of "countryiso3code" in each record
      - country: country name
      - year: year (int)
      - value: numeric value of the indicator (float, NaN when not parseable)

    If the first page carries no data, an empty DataFrame with those columns is
    returned. HTTP errors propagate through `raise_for_status()`.
    """
    columns = ["iso3c", "country", "year", "value"]
    url = f"{BASE}/country/all/indicator/{ind_code}"
    params = {
        "format": "json",
        "per_page": per_page,
        "date": f"{start_year}:{end_year}",
        "page": 1
    }

    # First call: also tells us how many pages there are
    meta, data = _wb_get_page(url, params)
    if data is None:
        print(f"Warning: No data found for indicator {ind_code} between {start_year}-{end_year}. Returning empty DataFrame.")
        return pd.DataFrame(columns=columns)

    pages = int(meta.get("pages", 1)) if isinstance(meta, dict) else 1
    rows = _wb_parse_rows(data)

    # Remaining pages, if any
    for p in range(2, pages + 1):
        # Short pause between consecutive requests to be gentle with the API
        time.sleep(0.2)
        _, data = _wb_get_page(url, {**params, "page": p})
        if data is None:
            print(f"Warning: No data found for indicator {ind_code} on page {p}. Stopping further fetching for this indicator.")
            break
        rows.extend(_wb_parse_rows(data))

    df = pd.DataFrame(rows, columns=columns)

    # Coerce to numeric (anything that cannot be converted becomes NaN)
    df["value"] = pd.to_numeric(df["value"], errors="coerce")
    return df

# ---------------------------
# 1) Download every indicator and rename "value" to the indicator's short name
# ---------------------------
dfs = []
for name, code in INDICATORS.items():
    print(f"Descargando indicador: {name} ({code})...")
    dfi = wb_fetch_indicator(code, START_YEAR, END_YEAR)
    # Indicators that came back empty are skipped
    if dfi.empty:
        print(f"Skipping {name} due to no data.")
        continue
    dfs.append(dfi.rename(columns={"value": name}))

# Without any downloaded indicator there is nothing to merge or save: stop here
# instead of failing further down with a NameError on `df`.
if not dfs:
    print("Error: No data frames were successfully downloaded. Cannot proceed with merging.")
    raise SystemExit(1)

# ---------------------------
# 2) Minimal cleaning of the base table, done BEFORE merging:
#    - drop rows without iso3c
#    - keep only codes of length 3
#    Rows with a missing/blank code share the same (iso3c, year) key, so leaving
#    them in would multiply them on every merge only to discard them afterwards.
# NOTE(review): the length-3 filter keeps regional aggregates (e.g. "AFE",
# "Africa Eastern and Southern"); confirm whether they belong in the panel.
# ---------------------------
df = dfs[0].dropna(subset=["iso3c"])
df = df[df["iso3c"].astype(str).str.len() == 3].copy()

# ---------------------------
# 3) Merge by country-year to build the final panel
#    Start from the first indicator and left-join the others onto it
# ---------------------------
for dfi in dfs[1:]:
    # Bring in only the new indicator column to avoid duplicating country/iso3c
    indicador_col = dfi.columns[-1]
    df = df.merge(dfi[["iso3c", "year", indicador_col]], on=["iso3c", "year"], how="left")

# ---------------------------
# 4) The regression needs the target variable (gdp_pc) to be observed,
#    so rows where gdp_pc is NaN are dropped
# ---------------------------

# WARNING: if the target column is missing for any reason, stop the script
if "gdp_pc" not in df.columns:
    warnings.warn(
        "No se descargó la columna 'gdp_pc'. No se puede continuar. "
        f"Columnas disponibles: {list(df.columns)}",
        category=UserWarning
    )
    raise SystemExit(1)

df = df.dropna(subset=["gdp_pc"]).copy()

# ---------------------------
# 5) Save the final dataset as CSV (reproducible)
# ---------------------------
out_path = os.path.join(DATA_DIR, "wdi_dataset.csv")
df.to_csv(out_path, index=False)

print(f"\n[OK] Dataset guardado en: {out_path}")
print("\nPrimeras filas:")
print(df.head())

print("\nNulos por columna (ordenado):")
print(df.isna().sum().sort_values(ascending=False))

print("\nDimensión final (filas, columnas):", df.shape)

Descargando indicador: gdp_pc (NY.GDP.PCAP.CD)...
Descargando indicador: life_exp (SP.DYN.LE00.IN)...
Descargando indicador: urban_pct (SP.URB.TOTL.IN.ZS)...
Descargando indicador: co2_pc (EN.GHG.CO2.PC.CE.AR5)...
Descargando indicador: unemp (SL.UEM.TOTL.ZS)...
Descargando indicador: infl (FP.CPI.TOTL.ZG)...
Descargando indicador: trade_gdp (NE.TRD.GNFS.ZS)...
Descargando indicador: internet (IT.NET.USER.ZS)...

[OK] Dataset guardado en: outputs/data/wdi_dataset.csv

Primeras filas:
  iso3c                      country  year       gdp_pc   life_exp  urban_pct  \
0   AFE  Africa Eastern and Southern  2023  1571.449189  65.146154  37.772301   
1   AFE  Africa Eastern and Southern  2022  1679.327622  64.487020  37.360578   
2   AFE  Africa Eastern and Southern  2021  1562.416175  62.979999  36.908543   
3   AFE  Africa Eastern and Southern  2020  1351.591669  63.766484  36.488322   
4   AFE  Africa Eastern and Southern  2019  1507.085600  63.857261  36.097331   

     co2_pc     unemp   

In [None]:
# ==========================================
# SECCIÓN 2: EDA (tipos de variables, descriptiva, missing, outliers + gráficos)
# ==========================================

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

FIG_DIR = os.path.join("outputs", "figures")
os.makedirs(FIG_DIR, exist_ok=True)

# ---------------------------
# 2.1) Load the dataset built in section 1 (for safety/reproducibility)
# ---------------------------
df = pd.read_csv("outputs/data/wdi_dataset.csv")

print("\n--- INFO GENERAL ---")
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
df.info()

# ---------------------------
# 2.2) Identify variable types
# ---------------------------
cat_cols = ["iso3c", "country"]
num_cols = [c for c in df.columns if c not in cat_cols]

print("\n--- COLUMNAS CATEGÓRICAS ---")
print(cat_cols)

print("\n--- COLUMNAS NUMÉRICAS ---")
print(num_cols)

# ---------------------------
# 2.3) Missing values (count and percentage)
# ---------------------------
missing_count = df.isna().sum()
missing_pct = (missing_count / len(df) * 100).round(2)
missing_table = pd.DataFrame({"nulos": missing_count, "pct_nulos": missing_pct}).sort_values("nulos", ascending=False)

print("\n--- NULOS (conteo y %) ---")
print(missing_table)

# Save the missing-values table (handy for the report)
missing_table.to_csv("outputs/data/missing_table.csv")

# ---------------------------
# 2.4) Descriptive statistics (numeric columns)
# ---------------------------
desc = df[num_cols].describe().T
desc["missing"] = df[num_cols].isna().sum()
desc["missing_pct"] = (desc["missing"] / len(df) * 100).round(2)

print("\n--- DESCRIPTIVA NUMÉRICAS ---")
print(desc)

desc.to_csv("outputs/data/descriptiva_numericas.csv")

# ---------------------------
# 2.5) Plots: histograms (distributions)
# ---------------------------
sns.set(style="whitegrid")

vars_to_plot = ["gdp_pc", "life_exp", "urban_pct", "co2_pc", "unemp", "infl", "trade_gdp", "internet"]

for v in vars_to_plot:
    # Columns absent from the CSV are skipped
    if v not in df.columns:
        continue
    plt.figure(figsize=(8, 4))
    sns.histplot(df[v], kde=True)
    plt.title(f"Distribución de {v}")
    plt.xlabel(v)
    plt.ylabel("Frecuencia")
    plt.tight_layout()
    plt.savefig(os.path.join(FIG_DIR, f"hist_{v}.png"), dpi=200)
    # Close each figure so they do not accumulate in memory across the loop
    plt.close()

print(f"\n[OK] Histogramas guardados en: {FIG_DIR}")

# ---------------------------
# 2.6) Outliers: boxplots + detección por IQR (solo numéricas)
# ---------------------------
def iqr_outliers(s: pd.Series):
    """Flag outliers with the 1.5*IQR rule.

    Quartiles are computed on the non-missing values of `s`. Returns a tuple
    `(mask, low, high)`: `mask` is a boolean Series aligned with `s` that is
    True where the value lies strictly below `low` or strictly above `high`
    (missing values compare False), and `low` / `high` are the two fences.
    """
    observed = s.dropna()
    first_quartile = observed.quantile(0.25)
    third_quartile = observed.quantile(0.75)
    spread = third_quartile - first_quartile

    lower_fence = first_quartile - 1.5 * spread
    upper_fence = third_quartile + 1.5 * spread

    below = s < lower_fence
    above = s > upper_fence
    return below | above, lower_fence, upper_fence

outlier_summary = []

for v in vars_to_plot:
    # Columns absent from the dataset are skipped
    if v not in df.columns:
        continue

    # Boxplot
    plt.figure(figsize=(8, 3))
    sns.boxplot(x=df[v])
    plt.title(f"Boxplot de {v} (outliers visuales)")
    plt.xlabel(v)
    plt.tight_layout()
    plt.savefig(os.path.join(FIG_DIR, f"box_{v}.png"), dpi=200)
    plt.close()

    # Outlier count according to the IQR rule
    mask, low, high = iqr_outliers(df[v])
    n_out = int(mask.sum())
    outlier_summary.append({"variable": v, "n_outliers_IQR": n_out, "lim_inf_IQR": low, "lim_sup_IQR": high})

# Passing the columns explicitly keeps sort_values from raising KeyError when
# no variable was summarized (empty list -> empty table with the right header).
outlier_df = pd.DataFrame(
    outlier_summary,
    columns=["variable", "n_outliers_IQR", "lim_inf_IQR", "lim_sup_IQR"],
).sort_values("n_outliers_IQR", ascending=False)
print("\n--- OUTLIERS (regla IQR) ---")
print(outlier_df)

outlier_df.to_csv("outputs/data/outliers_iqr.csv", index=False)
print(f"\n[OK] Boxplots guardados en: {FIG_DIR}")

# ---------------------------
# 2.7) Practical suggestion: gdp_pc is usually very skewed -> use its log for analysis/modelling
# ---------------------------
# The log is only defined for positive values. Non-positive entries are turned
# into NaN first, because np.log would otherwise produce -inf (which a later
# dropna() does not remove).
df["log_gdp_pc"] = np.log(df["gdp_pc"].where(df["gdp_pc"] > 0))
plt.figure(figsize=(8, 4))
sns.histplot(df["log_gdp_pc"], kde=True)
plt.title("Distribución de log_gdp_pc (log del PIB per cápita)")
plt.xlabel("log_gdp_pc")
plt.ylabel("Frecuencia")
plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, "hist_log_gdp_pc.png"), dpi=200)
plt.close()

df.to_csv("outputs/data/wdi_dataset_con_log.csv", index=False)
print("\n[OK] Guardado: outputs/data/wdi_dataset_con_log.csv (incluye log_gdp_pc)")


--- INFO GENERAL ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3568 entries, 0 to 3567
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   iso3c      3568 non-null   object 
 1   country    3568 non-null   object 
 2   year       3568 non-null   int64  
 3   gdp_pc     3568 non-null   float64
 4   life_exp   3568 non-null   float64
 5   urban_pct  3568 non-null   float64
 6   co2_pc     3391 non-null   float64
 7   unemp      3185 non-null   float64
 8   infl       3197 non-null   float64
 9   trade_gdp  3055 non-null   float64
 10  internet   3081 non-null   float64
dtypes: float64(8), int64(1), object(2)
memory usage: 306.8+ KB
None

--- COLUMNAS CATEGÓRICAS ---
['iso3c', 'country']

--- COLUMNAS NUMÉRICAS ---
['year', 'gdp_pc', 'life_exp', 'urban_pct', 'co2_pc', 'unemp', 'infl', 'trade_gdp', 'internet']

--- NULOS (conteo y %) ---
           nulos  pct_nulos
trade_gdp    513      14.38
internet     487      1

In [None]:
# ==========================================
# SECCIÓN 3: Correlaciones + visualizaciones
# ==========================================

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

FIG_DIR = os.path.join("outputs", "figures")
os.makedirs(FIG_DIR, exist_ok=True)


def _save_corr_heatmap(matrix, title, filename):
    """Draw `matrix` as a diverging heatmap centred at 0 and save it under FIG_DIR."""
    plt.figure(figsize=(10, 7))
    sns.heatmap(matrix, annot=False, cmap="coolwarm", center=0, linewidths=0.5)
    plt.title(title)
    plt.tight_layout()
    plt.savefig(os.path.join(FIG_DIR, filename), dpi=200)
    plt.close()


# Load the dataset that already contains log_gdp_pc
df = pd.read_csv("outputs/data/wdi_dataset_con_log.csv")

# Numeric columns used for the correlations (iso3c/country are left out)
num_cols = ["year", "gdp_pc", "log_gdp_pc", "life_exp", "urban_pct", "co2_pc", "unemp", "infl", "trade_gdp", "internet"]

# ---------------------------
# 3.1) Pearson correlation matrix
#      min_periods=200 leaves NaN for pairs with fewer than 200 joint observations
# ---------------------------
numeric_frame = df[num_cols]
corr = numeric_frame.corr(method="pearson", min_periods=200)

print("\n--- CORRELACIÓN (Pearson) ---")
print(corr)

# Heatmap of the full matrix
_save_corr_heatmap(corr, "Matriz de correlación (Pearson) - WDI", "corr_heatmap.png")

heatmap_path = os.path.join(FIG_DIR, 'corr_heatmap.png')
print(f"\n[OK] Heatmap guardado en: {heatmap_path}")

# ---------------------------
# 3.2) Correlations against the target variable (log_gdp_pc)
# ---------------------------
target = "log_gdp_pc"
# Drop the target itself and gdp_pc, then rank what is left by absolute correlation.
corr_target = corr[target].drop(labels=[target, "gdp_pc"])
corr_target = corr_target.sort_values(key=lambda s: s.abs(), ascending=False)

print("\n--- CORRELACIONES con log_gdp_pc (ordenadas por magnitud) ---")
print(corr_target)

corr_target.to_csv("outputs/data/correlaciones_con_target.csv")

# ---------------------------
# 3.3) Scatterplots with a regression line for the 4 most correlated X variables
# ---------------------------
top4 = corr_target.index[:4].tolist()
print("\nVariables top4 para scatter/regplot:", top4)

for x in top4:
    # Keep only rows where both the target and this regressor are observed
    tmp = df[[target, x]].dropna()
    scatter_path = os.path.join(FIG_DIR, f"scatter_{target}_vs_{x}.png")
    plt.figure(figsize=(6, 4))
    sns.regplot(data=tmp, x=x, y=target, scatter_kws={"s": 12}, line_kws={"linewidth": 2})
    plt.title(f"{target} vs {x}")
    plt.tight_layout()
    plt.savefig(scatter_path, dpi=200)
    plt.close()

print(f"\n[OK] Scatters guardados en: {FIG_DIR}")

# ---------------------------
# 3.4) (Optional, useful) correlation without 'year' so the time trend is not mixed in
# ---------------------------
cols_no_year = [c for c in num_cols if c != "year"]
corr_no_year = df[cols_no_year].corr(method="pearson", min_periods=200)

_save_corr_heatmap(corr_no_year, "Matriz de correlación (Pearson) - sin year", "corr_heatmap_sin_year.png")

heatmap_no_year_path = os.path.join(FIG_DIR, 'corr_heatmap_sin_year.png')
print(f"[OK] Heatmap sin year guardado en: {heatmap_no_year_path}")


--- CORRELACIÓN (Pearson) ---
                year    gdp_pc  log_gdp_pc  life_exp  urban_pct    co2_pc  \
year        1.000000  0.041602    0.051597  0.092076   0.056708 -0.025899   
gdp_pc      0.041602  1.000000    0.778487  0.607038   0.458189  0.365904   
log_gdp_pc  0.051597  0.778487    1.000000  0.837712   0.686195  0.432432   
life_exp    0.092076  0.607038    0.837712  1.000000   0.607606  0.329202   
urban_pct   0.056708  0.458189    0.686195  0.607606   1.000000  0.366550   
co2_pc     -0.025899  0.365904    0.432432  0.329202   0.366550  1.000000   
unemp      -0.068143 -0.110206    0.039293 -0.007783   0.121685 -0.111580   
infl        0.089704 -0.109356   -0.138008 -0.101978  -0.040124 -0.057791   
trade_gdp   0.000307  0.365196    0.371004  0.288261   0.289461  0.153287   
internet    0.417445  0.592472    0.836081  0.801740   0.662747  0.540775   

               unemp      infl  trade_gdp  internet  
year       -0.068143  0.089704   0.000307  0.417445  
gdp_pc     -0

In [None]:
# ==========================================
# SECCIÓN 4: Regresión lineal (OLS) + métricas (R², MSE, MAE)
# ==========================================

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

FIG_DIR = os.path.join("outputs", "figures")
os.makedirs(FIG_DIR, exist_ok=True)

# 4.1) Load the dataset that includes log_gdp_pc
df = pd.read_csv("outputs/data/wdi_dataset_con_log.csv")

# Target variable (y) and explanatory variables (X)
y_col = "log_gdp_pc"
x_cols = ["life_exp", "internet", "urban_pct", "co2_pc", "trade_gdp", "infl", "unemp"]  # base model (no year)

# 4.2) Build the modelling table (dropna restricted to the model's columns)
df_model = df[[y_col] + x_cols].dropna().copy()

print("\n--- REGRESIÓN OLS: datos usados ---")
print("Observaciones totales:", df.shape[0])
print("Observaciones usadas en regresión (sin NaN):", df_model.shape[0])
print("Observaciones descartadas por NaN:", df.shape[0] - df_model.shape[0])

# 4.3) Train / test split (out-of-sample evaluation)
# NOTE(review): the split is by row; if rows are country-year observations, the
# same country can land in both train and test. Consider a grouped split.
X = df_model[x_cols]
y = df_model[y_col]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# 4.4) Fit OLS with an intercept
# has_constant="add" forces the "const" column on both matrices. With the
# default ("skip") add_constant silently omits it when some column happens to
# be constant in that subset, leaving train and test with different columns.
X_train_sm = sm.add_constant(X_train, has_constant="add")
X_test_sm = sm.add_constant(X_test, has_constant="add")

model = sm.OLS(y_train, X_train_sm).fit()

print("\n--- RESUMEN DEL MODELO (OLS) ---")
print(model.summary())

# Save the summary as text (useful for the report)
with open("outputs/data/ols_summary.txt", "w", encoding="utf-8") as f:
    f.write(model.summary().as_text())

# 4.5) Prediction and metrics on the held-out set
y_pred = model.predict(X_test_sm)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("\n--- MÉTRICAS (test) ---")
print(f"R²:  {r2:.4f}")
print(f"MSE: {mse:.4f}")
print(f"MAE: {mae:.4f}")

metrics_df = pd.DataFrame([{"R2_test": r2, "MSE_test": mse, "MAE_test": mae, "n_train": len(X_train), "n_test": len(X_test)}])
metrics_df.to_csv("outputs/data/ols_metrics.csv", index=False)

# 4.6) Basic diagnostic plots (residuals)
residuals = y_test - y_pred

# Residual vs prediction
plt.figure(figsize=(6, 4))
plt.scatter(y_pred, residuals, s=12)
plt.axhline(0)  # zero-residual reference line
plt.title("Residuos vs Predicción (test)")
plt.xlabel("Predicción (log_gdp_pc)")
plt.ylabel("Residuo")
plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, "residuos_vs_pred.png"), dpi=200)
plt.close()

# Histogram of residuals
plt.figure(figsize=(6, 4))
plt.hist(residuals, bins=30)
plt.title("Distribución de residuos (test)")
plt.xlabel("Residuo")
plt.ylabel("Frecuencia")
plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, "hist_residuos.png"), dpi=200)
plt.close()

print("\n[OK] Guardado: outputs/data/ols_summary.txt y outputs/data/ols_metrics.csv")
print("[OK] Gráficos: residuos_vs_pred.png e hist_residuos.png en outputs/figures")


--- REGRESIÓN OLS: datos usados ---
Observaciones totales: 3568
Observaciones usadas en regresión (sin NaN): 2441
Observaciones descartadas por NaN: 1127

--- RESUMEN DEL MODELO (OLS) ---
                            OLS Regression Results                            
Dep. Variable:             log_gdp_pc   R-squared:                       0.852
Model:                            OLS   Adj. R-squared:                  0.851
Method:                 Least Squares   F-statistic:                     1497.
Date:                Fri, 13 Feb 2026   Prob (F-statistic):               0.00
Time:                        19:37:43   Log-Likelihood:                -1458.4
No. Observations:                1830   AIC:                             2933.
Df Residuals:                    1822   BIC:                             2977.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef

In [None]:
# ==========================================
# SECCIÓN 5: Tablas y gráficos finales (para el informe) + VIF
# ==========================================

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

FIG_DIR = os.path.join("outputs", "figures")
DATA_DIR = os.path.join("outputs", "data")
os.makedirs(FIG_DIR, exist_ok=True)
os.makedirs(DATA_DIR, exist_ok=True)

df = pd.read_csv("outputs/data/wdi_dataset_con_log.csv")

y_col = "log_gdp_pc"
x_cols = ["life_exp", "internet", "urban_pct", "co2_pc", "trade_gdp", "infl", "unemp"]

# Only rows with every model column observed take part in the fit
df_model = df[[y_col] + x_cols].dropna().copy()

X = df_model[x_cols]
y = df_model[y_col]

# Same test_size and random_state as the OLS section's split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# has_constant="add" forces the "const" column on both matrices. With the
# default ("skip") add_constant silently omits it when some column happens to
# be constant in that subset, leaving train and test with different columns.
X_train_sm = sm.add_constant(X_train, has_constant="add")
X_test_sm = sm.add_constant(X_test, has_constant="add")

model = sm.OLS(y_train, X_train_sm).fit()
y_pred = model.predict(X_test_sm)

# ---- 5.1) Coefficient table (to paste into the report)
coef_table = pd.DataFrame({
    "coef": model.params,
    "std_err": model.bse,
    "t": model.tvalues,
    "p_value": model.pvalues
})
coef_table.to_csv(os.path.join(DATA_DIR, "tabla_coeficientes.csv"))

# ---- 5.2) Metrics (test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

metrics_df = pd.DataFrame([{
    "R2_test": r2, "MSE_test": mse, "MAE_test": mae,
    "n_total_model": len(df_model), "n_train": len(X_train), "n_test": len(X_test)
}])
metrics_df.to_csv(os.path.join(DATA_DIR, "metricas_finales.csv"), index=False)

# ---- 5.3) Predicted vs actual (scatter)
plt.figure(figsize=(5.5, 4.5))
plt.scatter(y_test, y_pred, s=12)
plt.title("Predicho vs Real (log_gdp_pc) - Test")
plt.xlabel("Real (log_gdp_pc)")
plt.ylabel("Predicho (log_gdp_pc)")
plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, "predicho_vs_real.png"), dpi=200)
plt.close()

# ---- 5.4) VIF (para revisar multicolinealidad)
# VIF = 1/(1-R2_j) donde R2_j es al regredir X_j contra las demás X
def compute_vif(X_df: pd.DataFrame) -> pd.DataFrame:
    """Compute the variance inflation factor of every column of `X_df`.

    Rows with any missing value are dropped first. Each column is then regressed
    (OLS with a constant) on the remaining columns and VIF = 1 / (1 - R²) is
    taken from that fit; when R² is not below 0.999999 the VIF is reported as
    infinity. Returns a DataFrame with columns "variable" and "VIF", sorted by
    VIF in descending order.
    """
    complete_rows = X_df.dropna().copy()
    records = []
    for name in complete_rows.columns:
        regressors = sm.add_constant(complete_rows.drop(columns=[name]))
        r_squared = sm.OLS(complete_rows[name], regressors).fit().rsquared
        if r_squared < 0.999999:
            factor = 1.0 / (1.0 - r_squared)
        else:
            # (Near-)perfect fit: avoid dividing by ~0
            factor = np.inf
        records.append({"variable": name, "VIF": factor})
    vif_table = pd.DataFrame(records)
    return vif_table.sort_values("VIF", ascending=False)

# VIF of the model's regressors, stored next to the other report tables
vif_df = compute_vif(X)
vif_path = os.path.join(DATA_DIR, "vif.csv")
vif_df.to_csv(vif_path, index=False)

# List the artefacts produced for the report
report_files = (
    "- outputs/data/tabla_coeficientes.csv",
    "- outputs/data/metricas_finales.csv",
    "- outputs/data/vif.csv",
    "- outputs/figures/predicho_vs_real.png",
)
print("\n[OK] Guardado para informe:")
for entry in report_files:
    print(entry)

test_metrics = {"R2": r2, "MSE": mse, "MAE": mae}
print("\nMétricas test:", test_metrics)

print("\nVIF (top):")
print(vif_df.head(10))


[OK] Guardado para informe:
- outputs/data/tabla_coeficientes.csv
- outputs/data/metricas_finales.csv
- outputs/data/vif.csv
- outputs/figures/predicho_vs_real.png

Métricas test: {'R2': 0.8494009801711508, 'MSE': 0.29058636489396805, 'MAE': 0.42443435263921175}

VIF (top):
    variable       VIF
1   internet  3.655510
0   life_exp  3.261560
2  urban_pct  2.583787
3     co2_pc  1.678970
4  trade_gdp  1.146824
6      unemp  1.091880
5       infl  1.018628


In [None]:
import shutil
import os
from google.colab import files

# Folder that holds the figures generated by the previous sections
FIG_DIR = os.path.join("outputs", "figures")

# Archive file name and its location inside "outputs"
zip_filename = "wdi_figures.zip"
zip_path = os.path.join("outputs", zip_filename)

# make_archive expects the target name without the extension and appends it
# itself. root_dir is where zipping starts ('outputs') and base_dir is the
# folder inside it that gets archived ('figures').
archive_base = zip_path.replace('.zip', '')
shutil.make_archive(archive_base, 'zip', root_dir='outputs', base_dir='figures')

print(f"[OK] Archivo ZIP creado: {zip_path}")

# Trigger the browser download of the archive (Colab helper)
files.download(zip_path)

[OK] Archivo ZIP creado: outputs/wdi_figures.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>