# 03 — Data Preparation (Kedro + CRISP-DM)

In [1]:
# --- Inicializar Kedro sin magics ---
from pathlib import Path
from kedro.framework.session import KedroSession
from kedro.framework.startup import bootstrap_project

# localizar la raíz del proyecto subiendo hasta encontrar pyproject.toml
project_path = Path.cwd()
while project_path != project_path.parent and not (project_path / "pyproject.toml").exists():
    project_path = project_path.parent
assert (project_path / "pyproject.toml").exists(), f"No encuentro pyproject.toml desde {Path.cwd()}"

bootstrap_project(str(project_path))
session = KedroSession.create(project_path=str(project_path))
ctx = session.load_context()
catalog = ctx.catalog

print("Proyecto:", project_path)

Proyecto: c:\Users\lttlk\Documents\Nueo\machinegame


In [2]:
# Diagnóstico rápido
import sys
try:
    import kedro
    print("Python:", sys.executable)
    print("Kedro:", kedro.__version__)
    print("Datasets (primeros 15):", list(catalog.list())[:15])
except Exception as e:
    print("No se pudo inspeccionar kedro:", e)

Python: c:\Users\lttlk\Documents\Nueo\.venv\Scripts\python.exe
Kedro: 1.0.0
No se pudo inspeccionar kedro: 'DataCatalogWithCatalogCommandsMixin' object has no attribute 'list'


In [3]:
# Helpers: cargar o reconstruir y reparación de encoding
from pathlib import Path
import pandas as pd

def load_or_build(name: str, pipe: str):
    """Intenta catalog.load(name); si falla, ejecuta el pipeline y vuelve a cargar."""
    try:
        return catalog.load(name)
    except Exception as e:
        print(f"[load_or_build] {name} no disponible o ilegible ({e}). Corriendo pipeline: {pipe} …")
        session.run(pipeline_name=pipe)
        return catalog.load(name)

def fix_reporting_csv_encoding(report_dir: Path | str = None, encoding_from: str = "latin1", encoding_to: str = "utf-8"):
    """Repara CSVs en 08_reporting que estén en ANSI/latin-1: reescribe en UTF-8.
    Si ya son UTF-8 válidos, los deja intactos.
    """
    if report_dir is None:
        report_dir = project_path / "data" / "08_reporting"
    report_dir = Path(report_dir)
    if not report_dir.exists():
        print("[fix_reporting_csv_encoding] No existe la carpeta:", report_dir)
        return

    fixed, skipped = [], []
    for p in report_dir.glob("*.csv"):
        try:
            _ = pd.read_csv(p, encoding=encoding_to, nrows=5)
            skipped.append(p.name)
        except Exception:
            try:
                df = pd.read_csv(p, encoding=encoding_from)
                df.to_csv(p, index=False, encoding=encoding_to)
                fixed.append(p.name)
            except Exception as e:
                print(f"[fix_reporting_csv_encoding] No se pudo reparar {p.name}: {e}")
    print("[fix_reporting_csv_encoding] Reparados (latin-1 -> utf-8):", fixed)
    print("[fix_reporting_csv_encoding] Ya estaban en utf-8:", skipped)

### Ejecutar pipeline de Data Preparation

In [4]:
session.run(pipeline_name="data_preparation")


[1m{[0m
    [32m'games_prepared'[0m: [1;35mkedro_datasets.pandas.parquet_dataset.ParquetDataset[0m[1m([0m[33mfilepath[0m=[1;35mPurePosixPath[0m[1m([0m[32m'C:/Users/lttlk/Documents/Nueo/machinegame/data/05_model_input/games_prepared.parquet'[0m[1m)[0m, [33mprotocol[0m=[32m'file'[0m, [33mload_args[0m=[1m{[0m[1m}[0m, [33msave_args[0m=[1m{[0m[1m}[0m[1m)[0m,
    [32m'steam_prepared'[0m: [1;35mkedro_datasets.pandas.parquet_dataset.ParquetDataset[0m[1m([0m[33mfilepath[0m=[1;35mPurePosixPath[0m[1m([0m[32m'C:/Users/lttlk/Documents/Nueo/machinegame/data/05_model_input/steam_prepared.parquet'[0m[1m)[0m, [33mprotocol[0m=[32m'file'[0m, [33mload_args[0m=[1m{[0m[1m}[0m, [33msave_args[0m=[1m{[0m[1m}[0m[1m)[0m,
    [32m'vg_sales_prepared'[0m: [1;35mkedro_datasets.pandas.parquet_dataset.ParquetDataset[0m[1m([0m[33mfilepath[0m=[1;35mPurePosixPath[0m[1m([0m[32m'C:/Users/lttlk/Documents/Nueo/machinegame/data/05_model_input/

### Cargar y comparar RAW vs CLEAN vs PREPARED

In [5]:
games_raw = load_or_build("games_raw", "data_preparation")
games_clean = load_or_build("games_clean", "data_preparation")
games_prep = load_or_build("games_prepared", "data_preparation")

steam_raw = load_or_build("steam_raw", "data_preparation")
steam_clean = load_or_build("steam_clean", "data_preparation")
steam_prep = load_or_build("steam_prepared", "data_preparation")

vg_raw = load_or_build("vg_sales_raw", "data_preparation")
vg_clean = load_or_build("vg_sales_clean", "data_preparation")
vg_prep = load_or_build("vg_sales_prepared", "data_preparation")

def compare(df_raw, df_clean, name="dataset"):
    before = int(df_raw.isna().sum().sum())
    after = int(df_clean.isna().sum().sum())
    print(f"""== {name} ==
shape raw -> clean: {df_raw.shape} -> {df_clean.shape}
nulos  raw -> clean: {before} -> {after}
""".rstrip())

compare(games_raw, games_clean, "games")
compare(steam_raw, steam_clean, "steam")
compare(vg_raw, vg_clean, "vg_sales")

from IPython.display import display
display(games_clean.head())
display(steam_clean.head())
display(vg_clean.head())

print("prepared shapes:",
      games_prep.shape, steam_prep.shape, vg_prep.shape)

== games ==
shape raw -> clean: (14801, 4) -> (14785, 4)
nulos  raw -> clean: 0 -> 0
== steam ==
shape raw -> clean: (199999, 5) -> (199292, 5)
nulos  raw -> clean: 0 -> 0
== vg_sales ==
shape raw -> clean: (16719, 16) -> (16719, 16)
nulos  raw -> clean: 49141 -> 0


Unnamed: 0,Console,GameName,Review,Score
0,PC,Baldur's Gate 3 Early Access Review,Early Access,6
1,NS,Control: Ultimate Edition Cloud Version Review,Good,7
2,"XONE, PC, PS4",Doom Eternal: The Ancient Gods Part 1 Review,Good,7
3,"XONE, PC, PS4",Watch Dogs: Legion Review,Great,8
4,PC,Ring Of Pain Review,Great,8


Unnamed: 0,151603712,The Elder Scrolls V Skyrim,purchase,1.0,0
0,151603712,The Elder Scrolls V Skyrim,play,273.0,0
1,151603712,Fallout 4,purchase,1.0,0
2,151603712,Fallout 4,play,87.0,0
3,151603712,Spore,purchase,1.0,0
4,151603712,Spore,play,14.9,0


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8.0,322.0,Nintendo,E
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,0.0,0.0,0.0,0.0,,
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8.0,192.0,Nintendo,E
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,0.0,0.0,0.0,0.0,,


prepared shapes: (14785, 6) (199292, 7) (16719, 18)


### Checklist Fase 03
- [ ] Columnas 100% nulas eliminadas  
- [ ] Duplicados tratados  
- [ ] Nulos numéricos → 0 o mediana (según parámetros)  
- [ ] Nulos texto → cadena vacía  
- [ ] *Features* básicas agregadas (`row_na_count`, `row_na_ratio`)  
- [ ] Artefactos en `data/03_primary/` y/o `data/05_model_input/`