In [42]:
# Para el tratamiento de datos:
import pandas as pd
import numpy as np
from datetime import datetime
import re
import unicodedata
import os
pd.set_option('display.max_columns', None) #Para ver todas las columnas
from functools import reduce
import pycountry
from rapidfuzz import process
from iso3166 import countries as iso_countries
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer


### Paso 1: Carga y visualizacion del csv

In [43]:
df = pd.read_csv("final_powerbi_data/worldbank_data_combined.csv")

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91872 entries, 0 to 91871
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country         91872 non-null  object 
 1   Country Code    91872 non-null  object 
 2   Indicator       91872 non-null  object 
 3   Indicator Code  91872 non-null  object 
 4   Year            91872 non-null  int64  
 5   Value           62647 non-null  float64
 6   Category        91872 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 4.9+ MB


In [45]:
df.head()

Unnamed: 0,Country,Country Code,Indicator,Indicator Code,Year,Value,Category
0,Africa Eastern and Southern,AFE,"Renewable internal freshwater resources, total...",ER.H2O.INTR.K3,2021,1985.321,availability
1,Africa Eastern and Southern,AFE,"Renewable internal freshwater resources, total...",ER.H2O.INTR.K3,2020,1985.321,availability
2,Africa Eastern and Southern,AFE,"Renewable internal freshwater resources, total...",ER.H2O.INTR.K3,2019,1985.321,availability
3,Africa Eastern and Southern,AFE,"Renewable internal freshwater resources, total...",ER.H2O.INTR.K3,2018,1985.321,availability
4,Africa Eastern and Southern,AFE,"Renewable internal freshwater resources, total...",ER.H2O.INTR.K3,2017,1985.321,availability


In [47]:
#veamos los nombres y el numero de los indicadores
print(df["Indicator"].unique())
print(df["Indicator"].nunique())

# Ver cuántos valores hay por indicador
print(df["Indicator"].value_counts())

# Ver datos para un año específico
df[df["Year"] == 2010].head(10)

# Ver datos para un país específico
df[df["Country"] == "Spain"].head(10)



['Renewable internal freshwater resources, total (billion cubic meters)'
 'Renewable internal freshwater resources per capita (cubic meters)'
 'Level of water stress: freshwater withdrawal as a proportion of available freshwater resources'
 'Annual freshwater withdrawals, total (billion cubic meters)'
 'Annual freshwater withdrawals, domestic (% of total freshwater withdrawal)'
 'Annual freshwater withdrawals, industry (% of total freshwater withdrawal)'
 'Annual freshwater withdrawals, agriculture (% of total freshwater withdrawal)'
 'Water productivity, total (constant 2015 US$ GDP per cubic meter of total freshwater withdrawal)'
 'People using safely managed drinking water services (% of population)'
 'People using at least basic drinking water services (% of population)'
 'Average precipitation in depth (mm per year)'
 'Agricultural irrigated land (% of total agricultural land)'
 'Manufacturing, value added (% of GDP)'
 'Industry (including construction), value added (% of GDP)' 'G

Unnamed: 0,Country,Country Code,Indicator,Indicator Code,Year,Value,Category
4906,Spain,ESP,"Renewable internal freshwater resources, total...",ER.H2O.INTR.K3,2021,111.2,availability
4907,Spain,ESP,"Renewable internal freshwater resources, total...",ER.H2O.INTR.K3,2020,111.2,availability
4908,Spain,ESP,"Renewable internal freshwater resources, total...",ER.H2O.INTR.K3,2019,111.2,availability
4909,Spain,ESP,"Renewable internal freshwater resources, total...",ER.H2O.INTR.K3,2018,111.2,availability
4910,Spain,ESP,"Renewable internal freshwater resources, total...",ER.H2O.INTR.K3,2017,111.2,availability
4911,Spain,ESP,"Renewable internal freshwater resources, total...",ER.H2O.INTR.K3,2016,111.2,availability
4912,Spain,ESP,"Renewable internal freshwater resources, total...",ER.H2O.INTR.K3,2015,111.2,availability
4913,Spain,ESP,"Renewable internal freshwater resources, total...",ER.H2O.INTR.K3,2014,111.2,availability
4914,Spain,ESP,"Renewable internal freshwater resources, total...",ER.H2O.INTR.K3,2013,111.2,availability
4915,Spain,ESP,"Renewable internal freshwater resources, total...",ER.H2O.INTR.K3,2012,111.2,availability


In [48]:
df[df["Indicator"] == 'Gini index']

Unnamed: 0,Country,Country Code,Indicator,Indicator Code,Year,Value,Category
80388,Africa Eastern and Southern,AFE,Gini index,SI.POV.GINI,2021,,poverty_development
80389,Africa Eastern and Southern,AFE,Gini index,SI.POV.GINI,2020,,poverty_development
80390,Africa Eastern and Southern,AFE,Gini index,SI.POV.GINI,2019,,poverty_development
80391,Africa Eastern and Southern,AFE,Gini index,SI.POV.GINI,2018,,poverty_development
80392,Africa Eastern and Southern,AFE,Gini index,SI.POV.GINI,2017,,poverty_development
...,...,...,...,...,...,...,...
86125,Zimbabwe,ZWE,Gini index,SI.POV.GINI,2004,,poverty_development
86126,Zimbabwe,ZWE,Gini index,SI.POV.GINI,2003,,poverty_development
86127,Zimbabwe,ZWE,Gini index,SI.POV.GINI,2002,,poverty_development
86128,Zimbabwe,ZWE,Gini index,SI.POV.GINI,2001,,poverty_development


### Paso 2: Limpieza

In [49]:
#Vemos las columnas únicas por indicadores, los años que analizamos y cuantos paises
print("Indicadores únicos:")
print(df["Indicator"].unique())
print("Rango de años:")
print(df["Year"].min(), "a", df["Year"].max())
print("Número de países únicos:", df["Country"].nunique())


Indicadores únicos:
['Renewable internal freshwater resources, total (billion cubic meters)'
 'Renewable internal freshwater resources per capita (cubic meters)'
 'Level of water stress: freshwater withdrawal as a proportion of available freshwater resources'
 'Annual freshwater withdrawals, total (billion cubic meters)'
 'Annual freshwater withdrawals, domestic (% of total freshwater withdrawal)'
 'Annual freshwater withdrawals, industry (% of total freshwater withdrawal)'
 'Annual freshwater withdrawals, agriculture (% of total freshwater withdrawal)'
 'Water productivity, total (constant 2015 US$ GDP per cubic meter of total freshwater withdrawal)'
 'People using safely managed drinking water services (% of population)'
 'People using at least basic drinking water services (% of population)'
 'Average precipitation in depth (mm per year)'
 'Agricultural irrigated land (% of total agricultural land)'
 'Manufacturing, value added (% of GDP)'
 'Industry (including construction), value 

In [50]:
#Los nombres de los indicadores son algo dificultosos, vamos a cambiarlos por algo más legible y simple para que sea más facil leer las gráficas despues en powerBi
nombres_indicadores = {
    'Renewable internal freshwater resources, total (billion cubic meters)': 'Total renewable water (bn m³)',
    'Renewable internal freshwater resources per capita (cubic meters)': 'Renewable water per capita (m³)',
    'Level of water stress: freshwater withdrawal as a proportion of available freshwater resources': 'Water stress (%)',
    'Annual freshwater withdrawals, total (billion cubic meters)': 'Total water withdrawal (bn m³)',
    'Annual freshwater withdrawals, domestic (% of total freshwater withdrawal)': 'Domestic use (%)',
    'Annual freshwater withdrawals, industry (% of total freshwater withdrawal)': 'Industrial use (%)',
    'Annual freshwater withdrawals, agriculture (% of total freshwater withdrawal)': 'Agricultural use (%)',
    'Water productivity, total (constant 2015 US$ GDP per cubic meter of total freshwater withdrawal)': 'Water productivity (GDP/m³)',
    'People using safely managed drinking water services (% of population)': 'Safe drinking water (%)',
    'People using at least basic drinking water services (% of population)': 'Basic drinking water (%)',
    'Average precipitation in depth (mm per year)': 'Average rainfall (mm/year)',
    'Agricultural irrigated land (% of total agricultural land)': 'Irrigated land (%)',
    'Manufacturing, value added (% of GDP)': 'Manufacturing (% GDP)',
    'Industry (including construction), value added (% of GDP)': 'Industry (% GDP)',
    'Gini index': 'Gini index',
    'Poverty headcount ratio at $2.15 a day (2017 PPP) (% of population)': 'Extreme poverty (%)'}

df["Indicator"] = df["Indicator"].replace(nombres_indicadores)

#verificamos el cambio
print(df["Indicator"].nunique())
print(df["Indicator"].unique())



16
['Total renewable water (bn m³)' 'Renewable water per capita (m³)'
 'Water stress (%)' 'Total water withdrawal (bn m³)' 'Domestic use (%)'
 'Industrial use (%)' 'Agricultural use (%)' 'Water productivity (GDP/m³)'
 'Safe drinking water (%)' 'Basic drinking water (%)'
 'Average rainfall (mm/year)' 'Irrigated land (%)' 'Manufacturing (% GDP)'
 'Industry (% GDP)' 'Gini index' 'Extreme poverty (%)']


In [51]:
#sacamos los valores unicos de Country
print(df["Country"].nunique())
print(df["Country"].unique())

261
['Africa Eastern and Southern' 'Africa Western and Central' 'Arab World'
 'Caribbean small states' 'Central Europe and the Baltics'
 'Early-demographic dividend' 'East Asia & Pacific'
 'East Asia & Pacific (excluding high income)'
 'East Asia & Pacific (IDA & IBRD countries)' 'Euro area'
 'Europe & Central Asia' 'Europe & Central Asia (excluding high income)'
 'Europe & Central Asia (IDA & IBRD countries)' 'European Union'
 'Fragile and conflict affected situations'
 'Heavily indebted poor countries (HIPC)' 'IBRD only' 'IDA & IBRD total'
 'IDA blend' 'IDA only' 'IDA total' 'Late-demographic dividend'
 'Latin America & Caribbean'
 'Latin America & Caribbean (excluding high income)'
 'Latin America & the Caribbean (IDA & IBRD countries)'
 'Least developed countries: UN classification' 'Low & middle income'
 'Middle East & North Africa'
 'Middle East & North Africa (excluding high income)'
 'Middle East & North Africa (IDA & IBRD countries)' 'Middle income'
 'North America' 'OECD memb

In [52]:
#recuperamos nuestra super funcion de limpiar y normalizar valores
def limpiar_nombres(nombre):
    if pd.isnull(nombre):
        return nombre
    nombre = nombre.strip()
    nombre = re.sub(r'["\'].*?["\']', '', nombre)  # quitamos apodos entre comillas
    nombre = re.sub(r'\(.*?\)', '', nombre)  # quitamos texto entre paréntesis
    nombre = re.sub(r"[-/]", " ", nombre)  #quitamos guiones y barras rarunas
    nombre = " ".join(nombre.split())   # quitamos los multiespacios
    nombre = unicodedata.normalize("NFKD", nombre).encode("ASCII", "ignore").decode("utf-8")  # quitamos acentos y caracteres especiales
    return nombre.title()  # formato título


#aplicamos la funcion a los valores de la columna "Country Name"
df["Country"] = df["Country"].apply(limpiar_nombres)


In [53]:
# El dataset contiene información tanto de países soberanos como de regiones y agrupaciones económicas.
# Para poder quedarnos con el listado de países soberanos vamos a usar la librería iso3166,
# que nos facilita todos los países y territorios oficialmente definidos por la métrica ISO3166-1 y su código alpha-3 oficial.
# ⚠️ Importante: la ISO no diferencia entre país soberano y territorio dependiente. Ver después para el análisis.

# Primero sacamos los países oficiales y no incluir las agrupaciones
paises_soberanos = {c.name for c in iso_countries}

# Creamos la función usando fuzzy matching que nos ayuda a limpiar nombres de países escritos de forma distinta
def normalizar_a_soberano(nombre):
    if pd.isnull(nombre):
        return None
    match, score, _ = process.extractOne(nombre, paises_soberanos)
    return match if score > 95 else None

# Aplicamos la función para normalizar nombres
df["Country"] = df["Country"].apply(normalizar_a_soberano)

# Filtramos los nulos resultantes
df = df[df["Country"].notnull()]

# Al listado de países con los que nos quedamos le pasamos el código alpha-3 para evitar duplicaciones
diccionario_name_to_code = {c.name: c.alpha3 for c in iso_countries}

#Usamos .loc para evitar SettingWithCopyWarning
df.loc[:, "Country Code"] = df["Country"].map(diccionario_name_to_code)

# Eliminamos duplicados por combinación única de país, indicador y año
df = df.drop_duplicates(subset=["Country", "Indicator", "Year"], keep="first")


In [54]:
#sacamos los valores unicos de Country
print(df["Country"].nunique())
print(df["Country"].unique())

179
['Afghanistan' 'Albania' 'Algeria' 'American Samoa' 'Andorra' 'Angola'
 'Argentina' 'Armenia' 'Aruba' 'Australia' 'Austria' 'Azerbaijan'
 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus' 'Belgium' 'Belize' 'Benin'
 'Bermuda' 'Bhutan' 'Bosnia and Herzegovina' 'Botswana' 'Brazil'
 'Brunei Darussalam' 'Bulgaria' 'Burkina Faso' 'Burundi' 'Cabo Verde'
 'Cambodia' 'Cameroon' 'Canada' 'Cayman Islands'
 'Central African Republic' 'Chad' 'Chile' 'China' 'Colombia' 'Comoros'
 'Costa Rica' 'Croatia' 'Cuba' 'Cyprus' 'Czechia' 'Denmark' 'Djibouti'
 'Dominica' 'Dominican Republic' 'Ecuador' 'El Salvador'
 'Equatorial Guinea' 'Eritrea' 'Estonia' 'Eswatini' 'Ethiopia'
 'Faroe Islands' 'Fiji' 'Finland' 'France' 'French Polynesia' 'Gabon'
 'Georgia' 'Germany' 'Ghana' 'Gibraltar' 'Greece' 'Greenland' 'Grenada'
 'Guam' 'Guatemala' 'Guinea' 'Guyana' 'Haiti' 'Honduras' 'Hungary'
 'Iceland' 'India' 'Indonesia' 'Iraq' 'Ireland' 'Israel' 'Italy' 'Jamaica'
 'Japan' 'Jordan' 'Kazakhstan' 'Kenya' 'Kiribati' 'Kosov

In [55]:
#vemos que aun pasando la iso nos quedan algunas zonas que excluimos manualmente
excluir_manual = [
    'American Samoa',
    'Aruba',
    'Bermuda',
    'Cayman Islands',
    'Faroe Islands',
    'French Polynesia',
    'Gibraltar',
    'Greenland',
    'Guam',
    'New Caledonia',
    'Northern Mariana Islands',
    'Puerto Rico',
    'Turks and Caicos Islands']

df = df[~df["Country"].isin(excluir_manual)]
print(f"Países únicos tras el filtrado: {df['Country'].nunique()}")
print(f"Países únicos tras el filtrado: {df['Country'].unique()}")

Países únicos tras el filtrado: 166
Países únicos tras el filtrado: ['Afghanistan' 'Albania' 'Algeria' 'Andorra' 'Angola' 'Argentina'
 'Armenia' 'Australia' 'Austria' 'Azerbaijan' 'Bahrain' 'Bangladesh'
 'Barbados' 'Belarus' 'Belgium' 'Belize' 'Benin' 'Bhutan'
 'Bosnia and Herzegovina' 'Botswana' 'Brazil' 'Brunei Darussalam'
 'Bulgaria' 'Burkina Faso' 'Burundi' 'Cabo Verde' 'Cambodia' 'Cameroon'
 'Canada' 'Central African Republic' 'Chad' 'Chile' 'China' 'Colombia'
 'Comoros' 'Costa Rica' 'Croatia' 'Cuba' 'Cyprus' 'Czechia' 'Denmark'
 'Djibouti' 'Dominica' 'Dominican Republic' 'Ecuador' 'El Salvador'
 'Equatorial Guinea' 'Eritrea' 'Estonia' 'Eswatini' 'Ethiopia' 'Fiji'
 'Finland' 'France' 'Gabon' 'Georgia' 'Germany' 'Ghana' 'Greece' 'Grenada'
 'Guatemala' 'Guinea' 'Guyana' 'Haiti' 'Honduras' 'Hungary' 'Iceland'
 'India' 'Indonesia' 'Iraq' 'Ireland' 'Israel' 'Italy' 'Jamaica' 'Japan'
 'Jordan' 'Kazakhstan' 'Kenya' 'Kiribati' 'Kosovo' 'Kuwait' 'Latvia'
 'Lebanon' 'Lesotho' 'Liberia' 'Lib

In [56]:
#verificamos si hay duplicados completos y si hay duplicados con nuestra super combinacion mágica unica
print(df.duplicated().sum())
df.duplicated(subset=["Country", "Indicator", "Year"]).sum()


0


0

In [57]:
#Para facilitar la lectura de los gráficos en PowerBi, vamos a agrupar los paises en macro regiones.
#asi podremos ver tanto los datos de cada pais como los de su región y sub-región segun escogamos el filtro.

#Cargamos el listado de la ONU
df_m49 = pd.read_csv("raw_worldbank_data/ONU-M49.csv")

df_m49.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249 entries, 0 to 248
Data columns (total 56 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   FIFA                                     239 non-null    object 
 1   Dial                                     249 non-null    object 
 2   ISO3166-1-Alpha-3                        249 non-null    object 
 3   MARC                                     249 non-null    object 
 4   is_independent                           249 non-null    object 
 5   ISO3166-1-numeric                        249 non-null    int64  
 6   GAUL                                     243 non-null    float64
 7   FIPS                                     249 non-null    object 
 8   WMO                                      246 non-null    object 
 9   ISO3166-1-Alpha-2                        248 non-null    object 
 10  ITU                                      247 non-n

In [58]:
# Seleccionamos las columnas necesarias
df_m49_simple = df_m49[["ISO3166-1-Alpha-3", "Region Name", "Sub-region Name"]]

# Hacemos el merge usando Country Code que es ISO alpha-3 del 
# nuevo dataset y nepues eliminanos las culumnas no necesarias
df = df.merge(df_m49_simple, left_on="Country Code", right_on="ISO3166-1-Alpha-3", how="left")
df.drop(columns=["ISO3166-1-Alpha-3"], inplace=True)

In [59]:
#ORdenamos las columnas para que tenga más sentido
columnas_ordenadas = ['Country', 'Country Code', 'Region Name', 'Sub-region Name', 'Year', 'Indicator', 'Indicator Code', 'Category', 'Value']
df = df[columnas_ordenadas]


In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58432 entries, 0 to 58431
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Country          58432 non-null  object 
 1   Country Code     58432 non-null  object 
 2   Region Name      58080 non-null  object 
 3   Sub-region Name  58080 non-null  object 
 4   Year             58432 non-null  int64  
 5   Indicator        58432 non-null  object 
 6   Indicator Code   58432 non-null  object 
 7   Category         58432 non-null  object 
 8   Value            45514 non-null  float64
dtypes: float64(1), int64(1), object(7)
memory usage: 4.0+ MB


In [61]:
#Comprobamos los valores unicos
print("Valores únicos de 'Region Name':")
print(df["Region Name"].nunique())
print(df["Region Name"].unique())
print("\nValores únicos de 'Sub-region Name':")
print(df["Sub-region Name"].nunique())
print(df["Sub-region Name"].unique())

Valores únicos de 'Region Name':
5
['Asia' 'Europe' 'Africa' 'Americas' 'Oceania' nan]

Valores únicos de 'Sub-region Name':
17
['Southern Asia' 'Southern Europe' 'Northern Africa' 'Sub-Saharan Africa'
 'Latin America and the Caribbean' 'Western Asia'
 'Australia and New Zealand' 'Western Europe' 'Eastern Europe'
 'South-eastern Asia' 'Northern America' 'Eastern Asia' 'Northern Europe'
 'Melanesia' 'Central Asia' 'Micronesia' nan 'Polynesia']


In [62]:
#vemos las estadisticas basicas
df.groupby("Indicator")["Value"].describe().round(2)


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Agricultural use (%),3195.0,52.93,32.36,0.0,21.54,60.76,81.71,99.7
Average rainfall (mm/year),3342.0,1174.88,830.03,56.0,560.0,1020.0,1755.25,3240.0
Basic drinking water (%),3551.0,85.06,18.34,18.68,75.72,93.45,99.23,100.0
Domestic use (%),3179.0,27.89,24.51,0.3,10.55,19.75,39.7,167.93
Extreme poverty (%),1383.0,7.2,14.51,0.0,0.2,0.9,6.3,81.5
Gini index,1383.0,37.04,8.4,23.7,30.7,35.1,42.3,64.8
Industrial use (%),3143.0,19.07,23.56,0.0,3.0,8.27,24.24,99.56
Industry (% GDP),3429.0,26.82,12.15,2.76,19.38,24.71,31.46,86.67
Irrigated land (%),917.0,11.39,15.38,0.0,0.92,5.16,16.22,79.36
Manufacturing (% GDP),3228.0,12.45,6.59,0.23,7.58,12.1,16.54,44.98


In [63]:
#water stress deberia estar entre 0 y 10, debe haberse producido un error de unidad, limitamos el valor a 100
df.loc[(df["Indicator"] == "Water stress (%)") & (df["Value"] > 100), "Value"] = 100


In [64]:
### PASO 3: Gestion de nulos

In [65]:
#vemos los datos nulos
total = len(df)
nulos_df = pd.DataFrame({
    'Nulos': df.isnull().sum(),
    'Porcentaje': (df.isnull().sum() / total * 100).round(2)}).sort_values(by='Nulos', ascending=False)
print("Nulos y porcentaje por columna:")
print(nulos_df)

Nulos y porcentaje por columna:
                 Nulos  Porcentaje
Value            12918       22.11
Region Name        352        0.60
Sub-region Name    352        0.60
Country              0        0.00
Country Code         0        0.00
Year                 0        0.00
Indicator            0        0.00
Indicator Code       0        0.00
Category             0        0.00


In [66]:
paises_nan_region = df[df["Region Name"].isnull()]["Country"].unique()
print("Países sin región asignada:")
print(paises_nan_region)


Países sin región asignada:
['Kosovo']


In [67]:
#Kosovo no tiene un reconocimiento unanimo como pais soberano por parte de otros paises, por lo que no tiene region en los listados internacionales como la M49.
#Lo imputamos manualmente y comprobamos.
df.loc[df["Country"] == "Kosovo", ["Region Name", "Sub-region Name"]] = ["Europe", "Southern Europe"]
print(df[["Region Name", "Sub-region Name"]].isnull().sum())



Region Name        0
Sub-region Name    0
dtype: int64


In [70]:
#para ubicar donde estan los valores nulos vamos a sacar el numero de nulos por indicador y por pais.
nulos_indicador = df[df["Value"].isnull()].groupby("Indicator")["Country"].nunique().sort_values(ascending=False)
print("numero países con nulos por indicador:", nulos_indicador)

nulos_pais = df[df["Value"].isnull()].groupby("Country")["Indicator"].nunique().sort_values(ascending=False)
print("\nnumero indicadores con nulos por pais:", nulos_pais)


numero países con nulos por indicador: Indicator
Irrigated land (%)                 166
Extreme poverty (%)                157
Gini index                         157
Safe drinking water (%)             61
Industrial use (%)                  40
Manufacturing (% GDP)               40
Domestic use (%)                    39
Agricultural use (%)                36
Water productivity (GDP/m³)         34
Water stress (%)                    34
Total water withdrawal (bn m³)      32
Industry (% GDP)                    26
Average rainfall (mm/year)          15
Basic drinking water (%)            15
Renewable water per capita (m³)     15
Total renewable water (bn m³)       15
Name: Country, dtype: int64

numero indicadores con nulos por pais: Country
Kosovo                16
South Sudan           16
Tuvalu                14
Liechtenstein         14
San Marino            14
                      ..
Georgia                1
France                 1
Dominican Republic     1
Luxembourg             1
S

In [71]:
#tambien vemos que hay 3 indicadores (Irrigated land, Extreme poverty y Gini index) de los que casi la totalidad de los paises no tienen datos, por lo que tambien pasamos a eliminarlos.
indicadores_a_eliminar = ["Irrigated land (%)", "Extreme poverty (%)", "Gini index"]
df = df[~df["Indicator"].isin(indicadores_a_eliminar)]


In [72]:
#vemos que hay paises que no tienendatos en ninguno de los indicadores, por lo que no nos sirven para el analisis y los eliminamos
paises_todos_nulos = df[df["Value"].isnull()].groupby("Country")["Indicator"].nunique()
paises_sin_datos = paises_todos_nulos[paises_todos_nulos == df["Indicator"].nunique()].index
df = df[~df["Country"].isin(paises_sin_datos)]

In [73]:
#vemos de nuevo los datos nulos
total = len(df)
nulos_df = pd.DataFrame({
    'Nulos': df.isnull().sum(),
    'Porcentaje': (df.isnull().sum() / total * 100).round(2)}).sort_values(by='Nulos', ascending=False)
print("Nulos y porcentaje por columna:")
print(nulos_df)

Nulos y porcentaje por columna:
                 Nulos  Porcentaje
Value             5212       11.11
Country              0        0.00
Country Code         0        0.00
Region Name          0        0.00
Sub-region Name      0        0.00
Year                 0        0.00
Indicator            0        0.00
Indicator Code       0        0.00
Category             0        0.00


In [74]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 46904 entries, 0 to 51127
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Country          46904 non-null  object 
 1   Country Code     46904 non-null  object 
 2   Region Name      46904 non-null  object 
 3   Sub-region Name  46904 non-null  object 
 4   Year             46904 non-null  int64  
 5   Indicator        46904 non-null  object 
 6   Indicator Code   46904 non-null  object 
 7   Category         46904 non-null  object 
 8   Value            41692 non-null  float64
dtypes: float64(1), int64(1), object(7)
memory usage: 3.6+ MB


In [75]:
df.sample(10)

Unnamed: 0,Country,Country Code,Region Name,Sub-region Name,Year,Indicator,Indicator Code,Category,Value
45806,Malawi,MWI,Africa,Sub-Saharan Africa,2019,Manufacturing (% GDP),NV.IND.MANF.ZS,industry_economy,11.529419
15335,Colombia,COL,Americas,Latin America and the Caribbean,2020,Domestic use (%),ER.H2O.FWDM.ZS,use_by_sector,12.491907
25101,Sri Lanka,LKA,Asia,Southern Asia,2000,Agricultural use (%),ER.H2O.FWAG.ZS,use_by_sector,92.232487
35099,Montenegro,MNE,Europe,Southern Europe,2012,Basic drinking water (%),SH.H2O.BASW.ZS,access_to_water,96.65431
7254,Viet Nam,VNM,Asia,South-eastern Asia,2005,Renewable water per capita (m³),ER.H2O.INTR.PC,availability,4432.45132
50978,United Arab Emirates,ARE,Asia,Western Asia,2017,Industry (% GDP),NV.IND.TOTL.ZS,industry_economy,42.528035
27107,Ireland,IRL,Europe,Northern Europe,2018,Water productivity (GDP/m³),ER.GDP.FWTL.M3.KD,efficiency,253.86294
15257,Central African Republic,CAF,Africa,Sub-Saharan Africa,2010,Domestic use (%),ER.H2O.FWDM.ZS,use_by_sector,82.896552
11002,Algeria,DZA,Africa,Northern Africa,2019,Total water withdrawal (bn m³),ER.H2O.FWTL.K3,water_stress,9.802
9682,Netherlands,NLD,Europe,Western Europe,2019,Water stress (%),ER.H2O.FWST.ZS,water_stress,17.000607


tenemos un 11% de nulos que viene directamente desde la fuente, que es el Banco Mundial la que nos da los datos oficiales. La ausencia de iertos datos no se trata de un error, si no que nos da a entender que algo paso en ese momento para que no se pudieran obtener los datos: conflictos armados, crisis institucionales, falta de recursos, cambio de regimen y censura,...........
Imputar esos datos pueden llevarnos a maquillar el contexto geopolitico. Se abre el debate de imputar los nulos o no?
Imputar esos datos en estos contextos puede resultar engañoso o poco ético pues la falta de esos valores ya en si es informacion.
Por contrario si queremos ver las tendencias y facilitar los gráficos en powerbi, creo que se debe imputar con tecnicas avanzadas.

In [None]:
#guardamos el dataframe limpio
#os.makedirs("final_powerbi_data", exist_ok=True)
#output_path = "final_powerbi_data/clean_final_data.csv"
#df.to_csv(output_path, index=False)
#print(f"Archivo guardado correctamente en: {output_path}")

Frente a este conflicto ético, hemos guardado un CSV limpio y sin imputar, con una gestion de los datos estricta. Dado que nuestro interes mayoritario es hacer gráficas y ver las tendencias y patrones decicimos utilizar un metodo de imputacion avazanda para completar ese porcentaje de nulos.

In [77]:
df.groupby("Indicator")["Value"].describe().round(2)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Agricultural use (%),3185.0,52.98,32.39,0.0,21.54,60.87,81.74,99.7
Average rainfall (mm/year),3332.0,1175.71,831.14,56.0,560.0,1020.0,1761.0,3240.0
Basic drinking water (%),3540.0,85.19,18.2,18.68,75.96,93.49,99.23,100.0
Domestic use (%),3169.0,27.89,24.55,0.3,10.55,19.6,39.79,167.93
Industrial use (%),3133.0,19.02,23.58,0.0,3.0,8.27,23.73,99.56
Industry (% GDP),3407.0,26.76,12.12,2.76,19.36,24.63,31.46,86.67
Manufacturing (% GDP),3206.0,12.47,6.59,0.23,7.58,12.09,16.55,44.98
Renewable water per capita (m³),3348.0,18444.85,54782.14,0.0,974.41,2918.81,13162.39,604541.17
Safe drinking water (%),2390.0,66.88,30.93,2.3,43.47,75.07,96.34,100.0
Total renewable water (bn m³),3348.0,240.77,709.3,0.0,6.16,34.8,136.6,5661.0


In [84]:

# 1. Pivoteamos a formato ancho (indicadores como columnas)
df_wide = df.pivot_table(index=["Country", "Year"], columns="Indicator", values="Value")

# 2. Guardamos medias y desviaciones para desescalar después
means = df_wide.mean()
stds = df_wide.std()

# 3. Escalamos para que KNN funcione correctamente
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_wide)

# 4. Aplicamos imputación con KNN
imputer = KNNImputer(n_neighbors=10)
df_imputed_scaled = imputer.fit_transform(df_scaled)

# 5. Desescalamos manualmente para volver a la escala original
df_imputed = pd.DataFrame(df_imputed_scaled, index=df_wide.index, columns=df_wide.columns)
df_imputed = df_imputed * stds + means

# 6. Restablecemos índice para convertir Country y Year en columnas
df_imputed = df_imputed.reset_index()

# 7. Añadimos columnas auxiliares: código de país, región, subregión
columnas_auxiliares = [
    "Country", "Country Code", "Region Name", "Sub-region Name"
]
df_aux = df[columnas_auxiliares].drop_duplicates()

# 8. Hacemos el merge
df_imputed = df_imputed.merge(df_aux, on="Country", how="left")

# 9. Reordenamos columnas para dejarlo bonito
columnas_indicadores = [col for col in df_imputed.columns if col not in columnas_auxiliares + ["Year"]]
df_imputed = df_imputed[["Country", "Country Code", "Region Name", "Sub-region Name", "Year"] + columnas_indicadores]



In [85]:
df_imputed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3608 entries, 0 to 3607
Data columns (total 18 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          3608 non-null   object 
 1   Country Code                     3608 non-null   object 
 2   Region Name                      3608 non-null   object 
 3   Sub-region Name                  3608 non-null   object 
 4   Year                             3608 non-null   int64  
 5   Agricultural use (%)             3608 non-null   float64
 6   Average rainfall (mm/year)       3608 non-null   float64
 7   Basic drinking water (%)         3608 non-null   float64
 8   Domestic use (%)                 3608 non-null   float64
 9   Industrial use (%)               3608 non-null   float64
 10  Industry (% GDP)                 3608 non-null   float64
 11  Manufacturing (% GDP)            3608 non-null   float64
 12  Renewable water per 

In [88]:
df_imputed.describe().round(2)

Unnamed: 0,Year,Agricultural use (%),Average rainfall (mm/year),Basic drinking water (%),Domestic use (%),Industrial use (%),Industry (% GDP),Manufacturing (% GDP),Renewable water per capita (m³),Safe drinking water (%),Total renewable water (bn m³),Total water withdrawal (bn m³),Water productivity (GDP/m³),Water stress (%)
count,3608.0,3608.0,3608.0,3608.0,3608.0,3608.0,3608.0,3608.0,3608.0,3608.0,3608.0,3608.0,3608.0,3608.0
mean,2010.5,51.96,1169.62,85.28,30.1,18.41,26.71,12.31,17576.15,58.66,226.73,18.69,78.96,26.18
std,6.35,32.05,817.14,18.13,25.82,22.85,12.09,6.72,52955.46,32.87,686.01,70.71,182.94,32.08
min,2000.0,-0.01,55.83,18.67,0.3,-0.0,2.76,0.23,-2.76,2.28,-0.04,-0.0,0.21,0.09
25%,2005.0,20.47,559.91,76.11,11.2,2.93,19.16,7.2,929.09,26.87,5.19,0.39,7.79,3.84
50%,2010.5,59.01,1025.98,93.54,22.0,8.27,24.55,11.73,2766.51,59.42,30.27,1.63,21.1,10.19
75%,2016.0,80.59,1738.08,99.22,44.35,22.72,31.66,16.35,11742.04,92.48,121.98,9.93,66.45,36.51
max,2021.0,99.7,3240.31,100.0,167.96,99.58,86.68,44.99,604628.72,100.01,5661.81,655.68,3056.07,100.01


In [None]:
#arreglamos los pequeños desajustes que se han producido tras la imputacion
vars_no_negativos = [
    "Agricultural use (%)", "Domestic use (%)", "Industrial use (%)",
    "Renewable water per capita (m³)", "Total renewable water (bn m³)",
    "Total water withdrawal (bn m³)", "Water productivity (GDP/m³)"]
for col in vars_no_negativos:
    df_imputed.loc[df_imputed[col] < 0, col] = 0

# Recortar al 100% los indicadores que no pueden pasar ese límite
vars_max_100 = [
    "Water stress (%)", "Basic drinking water (%)", "Safe drinking water (%)",
    "Agricultural use (%)", "Industrial use (%)", "Domestic use (%)"
]
for col in vars_max_100:
    df_imputed.loc[df_imputed[col] > 100, col] = 100


In [90]:
df_imputed.describe().round(2)

Unnamed: 0,Year,Agricultural use (%),Average rainfall (mm/year),Basic drinking water (%),Domestic use (%),Industrial use (%),Industry (% GDP),Manufacturing (% GDP),Renewable water per capita (m³),Safe drinking water (%),Total renewable water (bn m³),Total water withdrawal (bn m³),Water productivity (GDP/m³),Water stress (%)
count,3608.0,3608.0,3608.0,3608.0,3608.0,3608.0,3608.0,3608.0,3608.0,3608.0,3608.0,3608.0,3608.0,3608.0
mean,2010.5,51.96,1169.62,85.28,29.71,18.41,26.71,12.31,17576.19,58.66,226.73,18.69,78.96,26.18
std,6.35,32.05,817.14,18.13,24.26,22.85,12.09,6.72,52955.44,32.87,686.01,70.71,182.94,32.08
min,2000.0,0.0,55.83,18.67,0.3,0.0,2.76,0.23,0.0,2.28,0.0,0.0,0.21,0.09
25%,2005.0,20.47,559.91,76.11,11.2,2.93,19.16,7.2,929.09,26.87,5.19,0.39,7.79,3.84
50%,2010.5,59.01,1025.98,93.54,22.0,8.27,24.55,11.73,2766.51,59.42,30.27,1.63,21.1,10.19
75%,2016.0,80.59,1738.08,99.22,44.35,22.72,31.66,16.35,11742.04,92.48,121.98,9.93,66.45,36.51
max,2021.0,99.7,3240.31,100.0,100.0,99.58,86.68,44.99,604628.72,100.0,5661.81,655.68,3056.07,100.0


In [91]:
#guardamos el archivo sin nulos
os.makedirs("final_powerbi_data", exist_ok=True)
output_path = "final_powerbi_data/water_data_wide.csv"
df_imputed.to_csv(output_path, index=False)
print(f"Archivo final guardado en: {output_path}")

Archivo final guardado en: final_powerbi_data/water_data_wide.csv
