In [75]:
import numpy as np
import pandas as pd
import tensorflow as tf
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from sklearn.metrics import mean_absolute_error

# Fijar la semilla para reproducibilidad
seed = 5
np.random.seed(seed)
tf.random.set_seed(seed)
random.seed(seed)

# Cargar el dataset
df = pd.read_csv('Life Expectancy Data.csv')
df

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.259210,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.470,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959000,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2933,Zimbabwe,2004,Developing,44.3,723.0,27,4.36,0.000000,68.0,31,...,67.0,7.13,65.0,33.6,454.366654,12777511.0,9.4,9.4,0.407,9.2
2934,Zimbabwe,2003,Developing,44.5,715.0,26,4.06,0.000000,7.0,998,...,7.0,6.52,68.0,36.7,453.351155,12633897.0,9.8,9.9,0.418,9.5
2935,Zimbabwe,2002,Developing,44.8,73.0,25,4.43,0.000000,73.0,304,...,73.0,6.53,71.0,39.8,57.348340,125525.0,1.2,1.3,0.427,10.0
2936,Zimbabwe,2001,Developing,45.3,686.0,25,1.72,0.000000,76.0,529,...,76.0,6.16,75.0,42.1,548.587312,12366165.0,1.6,1.7,0.427,9.8


In [76]:
# Estandarizar los nombres de columnas (en minúsculas y sin espacios extremos)
df.columns = [col.strip().lower() for col in df.columns]

# Asumimos que las columnas relevantes son: 'country', 'year' y 'life expectancy'
# Elimina filas con valores nulos en las columnas de interés
df = df.dropna(subset=['country', 'year', 'life expectancy'])

# Convertir la columna 'country' a índices numéricos
le = LabelEncoder()
df['country_encoded'] = le.fit_transform(df['country'])
num_countries = df['country_encoded'].nunique()

# Definir las variables de entrada y la variable objetivo
X_country = df['country_encoded'].values        # Entrada categórica para el país
X_year = df['year'].values.reshape(-1, 1)         # Entrada numérica para el año
y = df['life expectancy'].values.reshape(-1, 1)     # Esperanza de vida (target)

# Escalar la salida (target)
scaler_y = StandardScaler()
y_scaled = scaler_y.fit_transform(y)

# (Opcional) Escalar la variable 'year' para mejorar el entrenamiento
scaler_year = StandardScaler()
X_year_scaled = scaler_year.fit_transform(X_year)

# Dividir el dataset en entrenamiento y prueba (80%-20%)
X_country_train, X_country_test, X_year_train, X_year_test, y_train, y_test = train_test_split(
    X_country, X_year_scaled, y_scaled, test_size=0.2, random_state=seed
)

# Definir el modelo con dos entradas

# Entrada para el país (con embedding)
input_country = Input(shape=(1,), name='country')
embedding_dim = 15  # Dimensión del embedding (puedes ajustar este valor)
embed = Embedding(input_dim=num_countries, output_dim=embedding_dim, name='country_embedding')(input_country)
flat = Flatten()(embed)

# Entrada para el año
input_year = Input(shape=(1,), name='year')

# Concatenar ambas entradas
x = Concatenate()([flat, input_year])
x = Dense(128, activation='relu')(x)
x = Dense(64, activation='relu')(x)
x = Dense(32, activation='relu')(x)
output = Dense(1, activation='linear')(x)

# Construir y compilar el modelo
model = Model(inputs=[input_country, input_year], outputs=output)
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.summary()

# Entrenar el modelo
history = model.fit(
    {'country': X_country_train, 'year': X_year_train},
    y_train,
    epochs=200,
    batch_size=64,
    validation_split=0.1,
    verbose=0  # Puedes cambiar a 1 para ver el progreso
)

# Evaluar el modelo en el set de prueba (la métrica 'mae' está en la escala del target escalado)
loss, mae_scaled = model.evaluate(
    {'country': X_country_test, 'year': X_year_test},
    y_test,
    verbose=0
)
print("MAE escalado en test:", mae_scaled)

# Predecir en el set de prueba y desescalar las predicciones para obtener el MAE en la escala original
y_pred_scaled = model.predict({'country': X_country_test, 'year': X_year_test})
y_pred = scaler_y.inverse_transform(y_pred_scaled)
y_test_original = scaler_y.inverse_transform(y_test)

mae_descaled = mean_absolute_error(y_test_original, y_pred)
print("MAE desescalado en test:", mae_descaled)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['country_encoded'] = le.fit_transform(df['country'])


MAE escalado en test: 0.09617716073989868
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
MAE desescalado en test: 0.9158222563030776


MAE escalado en test: 0.09617716073989868
19/19 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step
MAE desescalado en test: 0.9158222563030776