## PASO 1: Plantear la pregunta.
        ¿Puede predecirse el consumo energético de un país en función de su población y su nivel de industrialización?
## PASOS 2 y 3: Concretar y buscar los datos necesarios.
        Cargamos los ficheros .csv utilizando las librerías y funciones correspondientes.           

In [1]:
import pandas as pd

# Feature data files.
# Paths use forward slashes so they resolve on Windows, Linux and macOS alike;
# a backslash is only treated as a path separator on Windows.
pop_den = pd.read_csv("Datos/population_density.csv")
urb_gro = pd.read_csv("Datos/urban_growth.csv")
pop = pd.read_csv("Datos/population.csv")
pop_gro = pd.read_csv("Datos/population_growth.csv")
life_exp = pd.read_csv("Datos/life_expectancy.csv")
co2_emi = pd.read_csv("Datos/co2_emissions.csv")

# Verification file (energy per person), kept separate from the feature files.
target = pd.read_csv("Datos/energy_person_ratio.csv")

## PASOS 4 y 5: Limpieza y exploración de los datos.
        1) Transponemos las tablas utilizando la función data.melt_df().
        2) Juntamos las tablas mediante la función data.merge_all(), describiendo nuestro propio diccionario.
        3) Visualizamos la información básica de nuestra tabla global mediante las funciones .head() y .describe() 

In [2]:
import data  # Project-local helper module (see data.py) with functions to reshape and merge the DataFrames.

# Run data.melt_df over every feature table, in the same order as the names
# on the left-hand side. The second argument is the label that shows up as
# that table's value column in the merged result.
(
    pop_den_melt,
    urb_gro_melt,
    pop_melt,
    pop_gro_melt,
    co2_emi_melt,
    life_exp_melt,
) = [
    data.melt_df(frame, value_label)
    for frame, value_label in (
        (pop_den, "population_density"),
        (urb_gro, "urban_growth"),
        (pop, "population"),
        (pop_gro, "population_growth"),
        (co2_emi, "co2_emissions"),
        (life_exp, "life_expectancy"),
    )
]

# The verification table gets the same treatment, labelled "target".
target_melt = data.melt_df(target, "target")

In [41]:
# Dictionary of melted tables, passed as the feature_dict argument of data.merge_all().
feature_dict = {
    "pop_den_melt": pop_den_melt,
    "urb_gro_melt": urb_gro_melt,
    "pop_melt": pop_melt,
    "pop_gro_melt": pop_gro_melt,
    "co2_emi_melt": co2_emi_melt,
    "life_exp_melt": life_exp_melt,
    "target": target_melt,
}

# Combine all tables using "country" and "year" as keys, then replace the
# resulting index with a fresh positional one.
merged_data = data.merge_all(feature_dict=feature_dict, keys=["country", "year"])
merged_data = merged_data.reset_index(drop=True)

In [42]:
# Preview the first rows of the merged table.
merged_data.head()

Unnamed: 0,country,year,population_density,urban_growth,population,population_growth,co2_emissions,life_expectancy,target
0,Australia,1961,1.36,0.0249,10400000.0,1.99,8.67,71.4,3120.0
1,Austria,1961,86.3,0.00694,7110000.0,0.548,4.48,70.0,1550.0
2,Belgium,1961,305.0,0.00434,9230000.0,0.332,10.1,70.6,2570.0
3,Canada,1961,2.01,0.0288,18200000.0,2.0,10.6,71.3,4310.0
4,Denmark,1961,109.0,0.0168,4610000.0,0.698,6.88,72.3,2020.0


In [43]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
merged_data.describe()

Unnamed: 0,population_density,urban_growth,population,population_growth,co2_emissions,life_expectancy,target
count,5880.0,5880.0,5880.0,5880.0,5880.0,5880.0,5880.0
mean,151.510255,0.02559,40285360.0,1.651167,5.871009,68.819014,2242.902859
std,467.544295,0.022368,133605300.0,1.616812,7.943471,8.40614,2630.882603
min,1.24,-0.0651,54500.0,-9.08,0.00465,32.5,9.55
25%,21.8,0.009955,3700000.0,0.584,0.8385,63.9,505.75
50%,65.8,0.0225,9825000.0,1.51,3.36,70.9,1180.0
75%,128.0,0.0384,29725000.0,2.5325,8.17,74.8,3040.0
max,7890.0,0.178,1400000000.0,17.6,87.7,84.2,22100.0


## PASO 6: Preprocesamiento de datos.
        1) Generamos el objeto que realiza la normalización a través de Sklearn.
        2) Normalización de los datos.
        3) Visualización de los mismos.  
        4) Reordenación final del DataFrame.

In [45]:
from sklearn.preprocessing import MinMaxScaler
# Create the scaling object (MinMaxScaler with its default settings).
scaler = MinMaxScaler()

In [65]:
# Work on a copy so the normalisation leaves merged_data untouched.
scaled_data = merged_data.copy()

# Columns to normalise, listed once so the selection passed to the scaler and
# the assignment target cannot drift apart. "country" and "year" are not
# scaled and keep their original values.
columns_to_scale = [
    "population_density",
    "urban_growth",
    "population",
    "population_growth",
    "co2_emissions",
    "life_expectancy",
    "target",
]

# NOTE(review): the scaler is fitted on the whole table, target included.
# If a train/test split happens later, confirm this is intended.
scaled_data[columns_to_scale] = scaler.fit_transform(scaled_data[columns_to_scale])

# Preview the normalised data.
scaled_data.head()

Unnamed: 0,country,year,population_density,urban_growth,population,population_growth,co2_emissions,life_expectancy,target
0,Australia,1961,1.5e-05,0.370218,0.00739,0.414918,0.098812,0.752418,0.140805
1,Austria,1961,0.010782,0.296339,0.00504,0.36087,0.051033,0.725338,0.069734
2,Belgium,1961,0.038505,0.285644,0.006554,0.352774,0.115118,0.736944,0.115908
3,Canada,1961,9.8e-05,0.386261,0.012962,0.415292,0.12082,0.750484,0.194675
4,Denmark,1961,0.01366,0.336898,0.003254,0.366492,0.0784,0.769826,0.09101


In [68]:
# Build a "<country>_<year>" label for every row and install it as the index
# (named "country_year"); the two source columns are then no longer needed.
scaled_data.set_index(
    (scaled_data["country"] + "_" + scaled_data["year"]).rename("country_year"),
    inplace=True,
)
scaled_data.drop(columns=["country", "year"], inplace=True)

# Preview the final layout of the table.
scaled_data.head()

Unnamed: 0_level_0,population_density,urban_growth,population,population_growth,co2_emissions,life_expectancy,target
country_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Australia_1961,1.5e-05,0.370218,0.00739,0.414918,0.098812,0.752418,0.140805
Austria_1961,0.010782,0.296339,0.00504,0.36087,0.051033,0.725338,0.069734
Belgium_1961,0.038505,0.285644,0.006554,0.352774,0.115118,0.736944,0.115908
Canada_1961,9.8e-05,0.386261,0.012962,0.415292,0.12082,0.750484,0.194675
Denmark_1961,0.01366,0.336898,0.003254,0.366492,0.0784,0.769826,0.09101


## PASO 7: Elección y generación del modelo de Machine Learning.
        1) Generamos el objeto que realiza la normalización a través de Sklearn.
        2) Normalización de los datos.
        3) Visualización de los mismos.  
        4) Reordenación final del DataFrame.