In [3]:
# Load Kedro's IPython extension. When it is already loaded, IPython only
# prints a reminder (suggesting %reload_ext) and execution continues.
# NOTE(review): presumably this extension is what provides the `catalog`
# object used by the following cells - confirm against the Kedro docs.
%load_ext kedro.ipython
print("Kedro extension loaded!")

The kedro.ipython extension is already loaded. To reload it, use:
  %reload_ext kedro.ipython
Kedro extension loaded!


In [4]:
# Print a header, then show the catalog's entry names. The bare last
# expression is what the notebook renders as the cell output.
print("Datasets disponibles en el catálogo:")
catalog.keys()


Datasets disponibles en el catálogo:



[1m[[0m
    [32m'animal_charity_donation_records'[0m,
    [32m'synthetic_dog_breed_health_data'[0m,
    [32m'aac_intakes_outcomes'[0m,
    [32m'cleaned_dog_breed'[0m,
    [32m'cleaned_charity_donations_notebook'[0m,
    [32m'parameters'[0m,
    [32m'params:data_engineering'[0m,
    [32m'params:data_engineering.imputation_strategy'[0m,
    [32m'params:data_engineering.imputation_strategy.numerical'[0m,
    [32m'params:data_engineering.imputation_strategy.categorical'[0m,
    [32m'params:data_engineering.scaling'[0m,
    [32m'params:data_engineering.scaling.standard_scaler'[0m,
    [32m'params:data_engineering.scaling.robust_scaler'[0m,
    [32m'params:data_engineering.outlier_limits'[0m,
    [32m'params:data_engineering.outlier_limits.lower_percentile'[0m,
    [32m'params:data_engineering.outlier_limits.upper_percentile'[0m,
    [32m'params:data_engineering.outlier_limits.iqr_multiplier'[0m,
    [32m'params:data_engineering.encode_columns'[0m
[1m]

In [5]:
# Load the donation records dataset through the Kedro catalog.
df_charity_donation = catalog.load("animal_charity_donation_records")



# Print a label, then display the frame's shape as the cell output.
print("df_charity_donation raw y columnas")
df_charity_donation.shape


df_charity_donation raw y columnas


[1m([0m[1;36m10000[0m, [1;36m14[0m[1m)[0m

In [6]:
# NOTEBOOK 03: PREPARACIÓN DATOS DE DONACIONES
# =============================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, RobustScaler

# Load the donation records from the Kedro catalog and report their shape.
df = catalog.load("animal_charity_donation_records")
print(f"📊 Dataset original: {df.shape}")

# 1. INITIAL ANALYSIS
# Per-column count of missing values, printed under its header line.
print("🔍 MISSING VALUES:", df.isna().sum(), sep="\n")
# Column names as a plain Python list.
print(f"\n📝 Columnas: {df.columns.tolist()}")

📊 Dataset original: (10000, 14)
🔍 MISSING VALUES:
donor_id             0
age_group            0
gender               0
name                 0
email                0
country              0
donation_type        0
donation_amount      0
donation_date        0
payment_method       0
newsletter_opt_in    0
referral_channel     0
sector               0
campaign             0
dtype: int64

📝 Columnas: ['donor_id', 'age_group', 'gender', 'name', 'email', 'country', 'donation_type', 'donation_amount', 'donation_date', 'payment_method', 'newsletter_opt_in', 'referral_channel', 'sector', 'campaign']


In [7]:
# 2. DROP IRRELEVANT COLUMNS
# These three columns are removed before any further processing.
columns_to_drop = ['donor_id', 'name', 'email']  # Personal info
# drop() is called without inplace, so it returns a new frame and `df` is
# left untouched.
df_clean = df.drop(columns=columns_to_drop)
print(f"✅ Columnas eliminadas: {columns_to_drop}")
print(f"📊 Nuevo shape: {df_clean.shape}")

✅ Columnas eliminadas: ['donor_id', 'name', 'email']
📊 Nuevo shape: (10000, 11)


In [8]:
# 3. OUTLIER HANDLING - DONATION_AMOUNT
print("🎯 TRATAMIENTO DE OUTLIERS:")

# Winsorization limits: the 1st and 99th percentiles of donation_amount.
# NOTE(review): the catalog lists
# params:data_engineering.outlier_limits.lower_percentile / upper_percentile.
# Consider reading the cut-offs from there, after confirming the configured
# values match the 0.01 / 0.99 used here.
lower_bound = df_clean['donation_amount'].quantile(0.01)
upper_bound = df_clean['donation_amount'].quantile(0.99)

print(f"Límites winsorization: [{lower_bound:.2f}, {upper_bound:.2f}]")

# Apply the winsorization: values below lower_bound are raised to it, values
# above upper_bound are lowered to it, everything in between is kept.
# Series.clip expresses this clamp directly instead of a nested np.where.
df_clean['donation_amount'] = df_clean['donation_amount'].clip(
    lower=lower_bound, upper=upper_bound
)

print(f"✅ Outliers tratados - Nuevo rango: [{df_clean['donation_amount'].min():.2f}, {df_clean['donation_amount'].max():.2f}]")

🎯 TRATAMIENTO DE OUTLIERS:
Límites winsorization: [1.94, 224.99]
✅ Outliers tratados - Nuevo rango: [1.94, 224.99]


In [9]:
# 4. TRANSFORMATION TOWARDS NORMALITY
print("📈 TRANSFORMACIÓN LOGARÍTMICA:")
# np.log1p computes log(1 + x) element-wise; the result is stored as a new
# column next to the untransformed amount.
df_clean['donation_amount_log'] = np.log1p(df_clean['donation_amount'])
# The "original" skewness is read from the raw `df` column, while the log
# skewness comes from the column derived from `df_clean` above.
# NOTE(review): if `df_clean['donation_amount']` was winsorized earlier, the
# two figures differ by both steps (winsorization + log) - confirm that this
# is the intended comparison.
print(f"Skewness original: {df['donation_amount'].skew():.3f}")
print(f"Skewness log: {df_clean['donation_amount_log'].skew():.3f}")

📈 TRANSFORMACIÓN LOGARÍTMICA:
Skewness original: 1.870
Skewness log: -0.025


In [10]:
# 5. TEMPORAL FEATURE ENGINEERING
# Parse donation_date into datetimes (overwriting the column in df_clean).
df_clean['donation_date'] = pd.to_datetime(df_clean['donation_date'])
# Derive calendar components from the parsed date as separate columns.
df_clean['donation_year'] = df_clean['donation_date'].dt.year
df_clean['donation_month'] = df_clean['donation_date'].dt.month
df_clean['donation_day'] = df_clean['donation_date'].dt.day

print("📅 Variables temporales creadas: year, month, day")

📅 Variables temporales creadas: year, month, day


In [11]:
# 6. CATEGORICAL VARIABLE ENCODING
# Only the columns listed here are one-hot encoded; every other column of
# df_clean is carried over unchanged.
# NOTE(review): age_group, gender, donation_type and newsletter_opt_in are
# not in this list - confirm that leaving them unencoded is intended.
categorical_cols = ['country', 'payment_method', 'referral_channel', 'sector', 'campaign']
# drop_first=True omits the first level of each encoded column.
df_encoded = pd.get_dummies(df_clean, columns=categorical_cols, prefix_sep='_', drop_first=True)

print(f"✅ Dataset encoded: {df_encoded.shape}")
# This prints the total number of columns in df_encoded, not only the
# columns added by the encoding.
print(f"📊 Nuevas columnas: {len(df_encoded.columns)}")

✅ Dataset encoded: (10000, 55)
📊 Nuevas columnas: 55


In [12]:
# SAVE THE ENCODED DATASET
import os
from pathlib import Path

# Target CSV. The path is relative, so it resolves against the kernel's
# current working directory.
# NOTE(review): the catalog also lists a 'cleaned_charity_donations_notebook'
# entry - check whether catalog.save(...) should be used instead of writing
# the file directly.
output_path = 'data/02_intermediate/cleaned_charity_donations_notebook.csv'
file_path = Path(output_path)

# 1. Create the destination folder (derived from the output path) if needed.
os.makedirs(file_path.parent, exist_ok=True)

# 2. Write the DataFrame as CSV without the index column.
df_encoded.to_csv(output_path, index=False)

print(f"💾 Dataset guardado como: {output_path}")

# 3. Confirm the file exists on disk and report its size.
if not file_path.exists():
    print("❌ Error: El archivo no se creó")
else:
    print(f"✅ Archivo verificado: {file_path} ({file_path.stat().st_size} bytes)")

💾 Dataset guardado como: data/02_intermediate/cleaned_charity_donations_notebook.csv
✅ Archivo verificado: data\02_intermediate\cleaned_charity_donations_notebook.csv (3381823 bytes)
