In [1]:
#Instalar dependencias
!pip install pandas
!pip install numpy



In [2]:
import pandas as pd
import numpy as np

# Show every column when a frame is displayed, so the final cleanup can be verified
pd.options.display.max_columns = None


In [14]:
# Load the CSV
from pathlib import Path

# Plain relative path: a hard-coded ".\\" prefix is only understood on Windows
# (on POSIX the backslash would become part of the file name).
file = Path("Life_Expectancy_Data.csv")

if not file.exists():
    # Stop here with an actionable message instead of printing a hint and then
    # letting read_csv fail anyway.
    raise FileNotFoundError(
        f"Debes cambiar el nombre del archivo en el código o renombrar el tuyo a: {file}"
    )

print("Cargando el archivo:", file)
df = pd.read_csv(file)

# Quick look at the size of the loaded frame
print("(rows, columns):", df.shape)


Cargando el archivo: Life_Expectancy_Data.csv
(rows, columns): (2938, 22)


In [4]:
# Count the fully duplicated rows before removing them
n_duplicates = df.duplicated().sum()
print("Duplicados encontrados:", n_duplicates)

# Drop duplicated rows (first occurrence is kept) and report the resulting size
df = df.drop_duplicates(keep="first")
print("(rows, columns):", df.shape)

Duplicados encontrados: 0
(rows, columns): (2938, 22)


In [5]:
# Count the missing values in each column
df.isnull().sum()

country                            0
year                               0
status                             0
life_expectancy                    0
adult_mortality                    0
infant_deaths                      0
alcohol                            0
percentage_expenditure             0
hepatitis_b                        0
measles                            0
bmi                                0
under_five_deaths                  0
polio                              0
total_expenditure                  0
diphtheria                         0
hiv/aids                           0
gdp                                0
population                         0
thinness__1_19_years               0
thinness_5_9_years                 0
income_composition_of_resources    0
schooling                          0
dtype: int64

In [6]:
# Impute missing values
# Numeric columns -> column mean
num_cols = df.select_dtypes(include=[np.number]).columns
for col in num_cols:
    df[col] = df[col].fillna(df[col].mean())

# Non-numeric columns -> most frequent value (mode)
cat_cols = df.select_dtypes(exclude=[np.number]).columns
for col in cat_cols:
    modes = df[col].mode()
    if modes.empty:
        # A column with no non-null values has no mode; taking element 0 of the
        # empty result would raise, so the column is left untouched.
        continue
    df[col] = df[col].fillna(modes.iloc[0])

# Check whether the fill is complete
df.isna().sum()

country                            0
year                               0
status                             0
life_expectancy                    0
adult_mortality                    0
infant_deaths                      0
alcohol                            0
percentage_expenditure             0
hepatitis_b                        0
measles                            0
bmi                                0
under_five_deaths                  0
polio                              0
total_expenditure                  0
diphtheria                         0
hiv/aids                           0
gdp                                0
population                         0
thinness__1_19_years               0
thinness_5_9_years                 0
income_composition_of_resources    0
schooling                          0
dtype: int64

In [7]:
# Normalise the column names and the format of selected columns
# Lower-case the names and replace spaces/hyphens with underscores
df.columns = (
    df.columns.str.strip()                          # drop leading/trailing whitespace
              .str.lower()                          # lower-case
              .str.replace(" ", "_", regex=False)   # spaces -> _ (literal match)
              .str.replace("-", "_", regex=False)   # hyphens -> _ (literal match)
)

# dates -> datetime (values that cannot be parsed become NaT)
if "date" in df.columns:
    df["date"] = pd.to_datetime(df["date"], errors="coerce")

# district codes -> uniform string, left-padded with zeros to 3 characters.
# The column tested in the guard is the one that is read and written; the
# previous code read "codigo_distrito" and raised KeyError whenever the guard
# was true.
if "district_code" in df.columns:
    df["district_code"] = df["district_code"].astype(str).str.zfill(3)

# units -> lower-case with surrounding whitespace removed
if "units" in df.columns:
    df["units"] = df["units"].str.lower().str.strip()

# Inspect the result
df.head()

Unnamed: 0,country,year,status,life_expectancy,adult_mortality,infant_deaths,alcohol,percentage_expenditure,hepatitis_b,measles,bmi,under_five_deaths,polio,total_expenditure,diphtheria,hiv/aids,gdp,population,thinness__1_19_years,thinness_5_9_years,income_composition_of_resources,schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,19.1,83,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,18.6,86,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,18.1,89,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,17.6,93,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,17.2,97,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [8]:
# Persist the cleaned dataset; the row index is not written to the file
df.to_csv(path_or_buf="clean_data.csv", index=False)
print("Archivo limpio guardado como clean_data.csv")

Archivo limpio guardado como clean_data.csv
