In [None]:
# Feature Engineering (Fase 4 - CRISP-DM)

En este notebook se crean nuevas variables (**features**) a partir de los datos 
limpios guardados en `03_primary/`.  
Estos features permitirán mejorar el desempeño de los modelos en la fase de **Modelado**.


In [28]:
import os
from pathlib import Path

import pandas as pd

# Project root. Defaults to the original author's local checkout; set the
# PROJECT_PATH environment variable to run the notebook on another machine
# without editing this cell.
project_path = Path(
    os.environ.get("PROJECT_PATH", "C:/Users/Ricardo/ricardo-ojeda-machine")
)
data_path = project_path / "data/03_primary"

# Load the cleaned dataset
merged = pd.read_csv(data_path / "intakes_outcomes_clean.csv")

print("âœ… Dataset cargado:", merged.shape)
display(merged.head())



âœ… Dataset cargado: (3261, 40)


Unnamed: 0,Animal ID,Name_intake,DateTime_intake,MonthYear_intake,Found Location,Intake Type,Intake Condition,Animal Type_intake,Sex upon Intake,Age upon Intake,...,Month_outcome,age_days_outcome,age_years_outcome,sex_outcome,status_outcome,length_of_stay_days,intake_year,outcome_year,stay_weekday_intake,stay_weekday_outcome
0,A006100,Scamp,2014-03-07 14:26:00,March 2014,8700 Research in Austin (TX),Public Assist,Normal,Dog,Neutered Male,6 years,...,12.0,2556.75,7.0,Male,Neutered,1370.0,2014,2017,Friday,Thursday
1,A433746,Evette,2015-10-28 18:41:00,October 2015,Austin (TX),Owner Surrender,Normal,Dog,Spayed Female,9 years,...,11.0,3287.25,9.0,Female,Spayed,6.0,2015,2015,Wednesday,Wednesday
2,A458233,Cinnamon,2017-05-29 09:32:00,May 2017,9401 S 1St St in Austin (TX),Owner Surrender,Normal,Dog,Spayed Female,11 years,...,6.0,4383.0,12.0,Female,Spayed,374.0,2017,2018,Monday,Friday
3,A459161,Dugan,2015-04-18 16:02:00,April 2015,W Parmer Ln & Mcneil Dr in Austin (TX),Stray,Normal,Dog,Neutered Male,9 years,...,4.0,3287.25,9.0,Male,Neutered,0.0,2015,2015,Saturday,Sunday
4,A465637,*Commander,2017-06-04 08:17:00,June 2017,7201 Levander Loop in Austin (TX),Stray,Injured,Dog,Neutered Male,10 years,...,6.0,3652.5,10.0,Male,Neutered,15.0,2017,2017,Sunday,Tuesday


In [29]:
# Bucket the intake age (in years) into life-stage categories.
# pd.cut builds right-closed intervals here, so the buckets are
# [0, 1], (1, 3], (3, 7] and (7, inf): an age of exactly 7.0 is "Adulto (3-7a)".
if "age_years_intake" in merged.columns:
    # Open-ended last edge: ages above 25 years still fall in "Senior (>7a)"
    # instead of silently becoming NaN. Negative ages remain NaN.
    bins = [0, 1, 3, 7, float("inf")]
    labels = ["Cachorro (<1a)", "Joven (1-3a)", "Adulto (3-7a)", "Senior (>7a)"]
    merged["age_category_intake"] = pd.cut(
        merged["age_years_intake"],
        bins=bins,
        labels=labels,
        include_lowest=True
    )

    # Report only when the column was actually created.
    print("ðŸ“Œ Nuevas categorÃ­as de edad creadas.")
    display(merged[["age_years_intake", "age_category_intake"]].head(10))
else:
    print("Columna age_years_intake no encontrada; se omite age_category_intake.")


ðŸ“Œ Nuevas categorÃ­as de edad creadas.


Unnamed: 0,age_years_intake,age_category_intake
0,6.0,Adulto (3-7a)
1,9.0,Senior (>7a)
2,11.0,Senior (>7a)
3,9.0,Senior (>7a)
4,10.0,Senior (>7a)
5,9.0,Senior (>7a)
6,7.0,Adulto (3-7a)
7,10.0,Senior (>7a)
8,7.0,Adulto (3-7a)
9,7.0,Adulto (3-7a)


In [30]:
# Bucket the length of stay (in days) into duration categories.
# pd.cut builds right-closed intervals here, so the buckets are
# [0, 7], (7, 30], (30, 180] and (180, inf).
if "length_of_stay_days" in merged.columns:
    # Edges: 1 week, 1 month, 6 months, then open-ended. The open upper edge
    # keeps stays longer than 3650 days in "Muy largo (>6m)" instead of
    # silently turning them into NaN. Negative stays remain NaN.
    bins = [0, 7, 30, 180, float("inf")]
    labels = ["Corto (<1 sem)", "Medio (1 sem-1 mes)", "Largo (1-6 meses)", "Muy largo (>6m)"]
    merged["stay_category"] = pd.cut(
        merged["length_of_stay_days"],
        bins=bins,
        labels=labels,
        include_lowest=True
    )

    # Report only when the column was actually created.
    print("ðŸ“Œ Nuevas categorÃ­as de estancia creadas.")
    display(merged[["length_of_stay_days", "stay_category"]].head(10))
else:
    print("Columna length_of_stay_days no encontrada; se omite stay_category.")



ðŸ“Œ Nuevas categorÃ­as de estancia creadas.


Unnamed: 0,length_of_stay_days,stay_category
0,1370.0,Muy largo (>6m)
1,6.0,Corto (<1 sem)
2,374.0,Muy largo (>6m)
3,0.0,Corto (<1 sem)
4,15.0,Medio (1 sem-1 mes)
5,2.0,Corto (<1 sem)
6,50.0,Largo (1-6 meses)
7,33.0,Largo (1-6 meses)
8,6.0,Corto (<1 sem)
9,504.0,Muy largo (>6m)


In [32]:
# Derive the intake month number from the intake timestamp.
# Note: this overwrites any existing intake_month column.
if "DateTime_intake" in merged.columns:
    intake_timestamps = pd.to_datetime(merged["DateTime_intake"])
    merged["intake_month"] = intake_timestamps.dt.month
    print("âœ… Columna intake_month creada.")

# Month-number -> season lookup (USA, northern hemisphere), expanded from a
# season -> months table.
months_by_season = {
    "Winter": (12, 1, 2),
    "Spring": (3, 4, 5),
    "Summer": (6, 7, 8),
    "Fall": (9, 10, 11),
}
season_map = {
    month: season
    for season, months in months_by_season.items()
    for month in months
}

# Translate each intake month into its season label.
has_intake_month = "intake_month" in merged.columns
if has_intake_month:
    merged["intake_season"] = merged["intake_month"].map(season_map)

print("ðŸ“Œ Columna intake_season creada.")
print(merged["intake_season"].value_counts())


âœ… Columna intake_month creada.
ðŸ“Œ Columna intake_season creada.
intake_season
Summer    1016
Spring     955
Fall       758
Winter     532
Name: count, dtype: int64


In [33]:
from sklearn.preprocessing import LabelEncoder

# Categorical columns to turn into integer codes; each one present in the
# dataset gets a new "<column>_enc" column.
categorical_cols = [
    "Animal Type_intake", "Breed_intake", "Color_intake",
    "sex_intake", "status_intake", "Outcome Type_outcome"
]

# Keep one fitted encoder per column so the integer codes can be mapped back
# to their labels later (encoders[col].classes_ / inverse_transform). A single
# shared encoder would only remember the last column it was fitted on.
encoders = {}
missing_cols = []
for col in categorical_cols:
    if col not in merged.columns:
        # Remember skipped columns so a wrong or absent name is visible.
        missing_cols.append(col)
        continue
    encoder = LabelEncoder()
    # astype(str) turns missing values into the string "nan", which then
    # receives its own integer code.
    merged[f"{col}_enc"] = encoder.fit_transform(merged[col].astype(str))
    encoders[col] = encoder

if missing_cols:
    print("Columnas no encontradas (omitidas):", missing_cols)

print("ðŸ“Œ Variables categÃ³ricas codificadas.")
display(merged.head())


ðŸ“Œ Variables categÃ³ricas codificadas.


Unnamed: 0,Animal ID,Name_intake,DateTime_intake,MonthYear_intake,Found Location,Intake Type,Intake Condition,Animal Type_intake,Sex upon Intake,Age upon Intake,...,stay_weekday_outcome,age_category_intake,stay_category,intake_month,intake_season,Animal Type_intake_enc,Breed_intake_enc,Color_intake_enc,sex_intake_enc,status_intake_enc
0,A006100,Scamp,2014-03-07 14:26:00,March 2014,8700 Research in Austin (TX),Public Assist,Normal,Dog,Neutered Male,6 years,...,Thursday,Adulto (3-7a),Muy largo (>6m),3,Spring,2,367,177,1,1
1,A433746,Evette,2015-10-28 18:41:00,October 2015,Austin (TX),Owner Surrender,Normal,Dog,Spayed Female,9 years,...,Wednesday,Senior (>7a),Corto (<1 sem),10,Fall,2,31,157,0,2
2,A458233,Cinnamon,2017-05-29 09:32:00,May 2017,9401 S 1St St in Austin (TX),Owner Surrender,Normal,Dog,Spayed Female,11 years,...,Friday,Senior (>7a),Muy largo (>6m),5,Spring,2,44,145,0,2
3,A459161,Dugan,2015-04-18 16:02:00,April 2015,W Parmer Ln & Mcneil Dr in Austin (TX),Stray,Normal,Dog,Neutered Male,9 years,...,Sunday,Senior (>7a),Corto (<1 sem),4,Spring,2,355,21,1,1
4,A465637,*Commander,2017-06-04 08:17:00,June 2017,7201 Levander Loop in Austin (TX),Stray,Injured,Dog,Neutered Male,10 years,...,Tuesday,Senior (>7a),Medio (1 sem-1 mes),6,Summer,2,298,70,1,1


In [34]:
# Make sure the output folder for the engineered features exists
feature_path = project_path / "data/04_feature"
feature_path.mkdir(parents=True, exist_ok=True)

# Write the dataset with the new feature columns to CSV, without the index
features_file = feature_path / "features_dataset.csv"
merged.to_csv(features_file, index=False)

print("âœ… Dataset con features guardado en:", features_file)



âœ… Dataset con features guardado en: C:\Users\Ricardo\ricardo-ojeda-machine\data\04_feature\features_dataset.csv
