In [11]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [12]:
# Read the raw CSV into `df`, the frame every later cell works on, then show
# its first rows under a banner as a quick sanity check of the load.
df = pd.read_csv("world_bank_data_2025.csv")
print("--- Vista previa inicial ---", df.head(), sep="\n")

--- Vista previa inicial ---
  country_name country_id  year  Inflation (CPI %)  GDP (Current USD)  \
0        Aruba         aw  2010           2.078141       2.453597e+09   
1        Aruba         aw  2011           4.316297       2.637859e+09   
2        Aruba         aw  2012           0.627472       2.615208e+09   
3        Aruba         aw  2013          -2.372065       2.727850e+09   
4        Aruba         aw  2014           0.421441       2.790850e+09   

   GDP per Capita (Current USD)  Unemployment Rate (%)  \
0                  24093.140151                    NaN   
1                  25712.384302                    NaN   
2                  25119.665545                    NaN   
3                  25813.576727                    NaN   
4                  26129.839062                    NaN   

   Interest Rate (Real, %)  Inflation (GDP Deflator, %)  \
0                11.666131                    -1.223407   
1                 4.801974                     4.005674   
2     

In [13]:
# Structural summary of the freshly loaded frame, followed by the number of
# missing values in each column.
print("--- Información inicial del dataset ---")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
df.info()
print("\nValores faltantes por columna:")
print(df.isnull().sum())

--- Información inicial del dataset ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3472 entries, 0 to 3471
Data columns (total 16 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   country_name                     3472 non-null   object 
 1   country_id                       3472 non-null   object 
 2   year                             3472 non-null   int64  
 3   Inflation (CPI %)                2694 non-null   float64
 4   GDP (Current USD)                2933 non-null   float64
 5   GDP per Capita (Current USD)     2938 non-null   float64
 6   Unemployment Rate (%)            2795 non-null   float64
 7   Interest Rate (Real, %)          1735 non-null   float64
 8   Inflation (GDP Deflator, %)      2904 non-null   float64
 9   GDP Growth (% Annual)            2912 non-null   float64
 10  Current Account Balance (% GDP)  2563 non-null   float64
 11  Government Expense (% of GDP)    1820 non-

1. Tratar valores faltantes (mediana en columnas numéricas, moda en el resto)

In [14]:
# Impute missing values one column at a time, writing the result back to `df`:
#   * numeric columns   -> column median
#   * all other columns -> most frequent value (first mode)
# Columns without gaps are left untouched. A non-numeric column with no
# observed values at all is skipped: it has no mode, and indexing the empty
# mode result would raise.
for col in df.columns:
    column = df[col]
    if not column.isna().any():
        continue  # nothing to impute in this column

    # Check the dtype family rather than the exact names 'int64'/'float64' so
    # other numeric widths are median-filled too; booleans are treated as labels.
    is_number = (
        pd.api.types.is_numeric_dtype(column)
        and not pd.api.types.is_bool_dtype(column)
    )
    if is_number:
        fill_value = column.median()
    else:
        modes = column.mode()  # missing values are excluded by default
        if modes.empty:
            continue  # no observed value to fill with
        fill_value = modes.iloc[0]

    df[col] = column.fillna(fill_value)

2. Normalizar texto en columnas categóricas

In [15]:
# Canonicalise every object-dtype column: cast the values to str, trim
# surrounding whitespace, and lower-case them.
object_columns = df.select_dtypes(include=['object']).columns
for col in object_columns:
    as_text = df[col].astype(str)
    trimmed = as_text.str.strip()
    df[col] = trimmed.str.lower()

3. Eliminar duplicados

In [16]:
# Remove rows that repeat an earlier row across all columns, keeping the first
# occurrence (the defaults, spelled out explicitly).
df = df.drop_duplicates(subset=None, keep='first')

4. Detectar y eliminar filas con valores fuera del rango 0–100 en columnas de %

In [17]:
# For each listed percentage column that the frame actually contains, keep
# only the rows whose value lies in the closed interval [0, 100]; columns that
# are absent are skipped.
# NOTE(review): none of these names appear among the columns visible in the
# df.info() output earlier in the notebook, so this step may be a no-op on
# this dataset — confirm which columns were intended.
for pct_col in (
    'gender diversity ratio (%)',
    'remote work ratio (%)',
    'automation risk (%)',
):
    if pct_col not in df.columns:
        continue
    in_range = df[pct_col].between(0, 100)  # inclusive on both ends
    df = df[in_range]

In [18]:
# Structural summary of the frame after the cleaning steps.
print("--- Información después de limpieza ---")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
df.info()

--- Información después de limpieza ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3472 entries, 0 to 3471
Data columns (total 16 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   country_name                     3472 non-null   object 
 1   country_id                       3472 non-null   object 
 2   year                             3472 non-null   int64  
 3   Inflation (CPI %)                3472 non-null   float64
 4   GDP (Current USD)                3472 non-null   float64
 5   GDP per Capita (Current USD)     3472 non-null   float64
 6   Unemployment Rate (%)            3472 non-null   float64
 7   Interest Rate (Real, %)          3472 non-null   float64
 8   Inflation (GDP Deflator, %)      3472 non-null   float64
 9   GDP Growth (% Annual)            3472 non-null   float64
 10  Current Account Balance (% GDP)  3472 non-null   float64
 11  Government Expense (% of GDP)    3472 non-

Guardar dataset limpio

In [20]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the
# file. Then report where the file was saved.
clean_csv_path = "world_bank_data_2025_clean.csv"
df.to_csv(clean_csv_path, index=False)
print("Dataset limpio guardado en: world_bank_data_2025_clean.csv")

Dataset limpio guardado en: world_bank_data_2025_clean.csv
