# Configuración de paths. 
Busco y agrego el 'root' del Proyecto en sys.path, para evitar problemas al importar mis scripts.

In [None]:
from pathlib import Path
import sys

# Walk up from the current working directory until a folder containing
# config.py is found; that folder is treated as the project root.
base = Path().resolve()
while not (base / "config.py").exists() and base != base.parent:
    base = base.parent

if (base / "config.py").exists():
    project_root = str(base)
    if project_root not in sys.path:
        # Index 0 gives the project root the highest import priority.
        sys.path.insert(0, project_root)
        print(f"✔️ sys.path configurado con raíz del proyecto: {base}")
    else:
        print(f"✔️ sys.path ya está configurado con raíz del proyecto: {base}")
    print(f"✔️ Root detectado: {project_root}")
else:
    # Always bind project_root so code that reads it can test for None
    # instead of failing with NameError when config.py was not found.
    project_root = None
    print(f"❌ No se encontró config.py en la jerarquía de carpetas desde: {Path().resolve()}")
    print("➜ sys.path no fue modificado.")

# Fraccionar DF muy pesado.

In [None]:
import ipywidgets as widgets
from IPython.display import display
#from scripts.splitter import run_split
#from scripts import splitter
import subprocess

# Input controls whose values are read by the button callback.
input_path_text = widgets.Text(
    value="dataset.csv",
    description="Input file:",
)
mode_selector = widgets.Dropdown(
    options=["rows", "dynamic"],
    value="rows",
    description="Modo:",
)
mb_slider = widgets.IntSlider(
    value=100,
    min=10,
    max=200,
    step=10,
    description="Max MB:",
)
format_selector = widgets.Dropdown(
    options=["csv", "parquet"],
    value="csv",
    description="Formato:",
)
output_dir_text = widgets.Text(
    value="splits",
    description="Output dir:",
)
base_name_text = widgets.Text(
    value="part",
    description="Base name:",
)

# Trigger button and the area that receives the callback's printed output.
run_button = widgets.Button(
    description="Dividir Dataset",
    button_style="success",
)
out = widgets.Output()

def on_run_clicked(b):
    """Run scripts/splitter.py with the current widget values and stream its output.

    The command line is built from the widget values, executed with
    ``project_root`` as working directory, and every line the child process
    writes to stdout/stderr is printed into the ``out`` widget.

    Args:
        b: The button instance passed by ipywidgets; unused.
    """
    # Local import so the callback does not depend on another cell's imports.
    import sys

    out.clear_output()
    cmd = [
        # Use the kernel's own interpreter so the script sees the same
        # installed packages as the notebook; fall back to PATH lookup only
        # if the interpreter path is unavailable.
        sys.executable or "python", "scripts/splitter.py",
        "--input", input_path_text.value,
        "--mode", mode_selector.value,
        "--max-mb", str(mb_slider.value),
        "--output-dir", output_dir_text.value,
        "--base-name", base_name_text.value,
        "--fmt", format_selector.value,
    ]
    with out:
        # The context manager closes the pipe and waits for the child, so no
        # process or file handle is left behind after the click.
        with subprocess.Popen(
            cmd,
            cwd=project_root,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
        ) as process:
            for line in process.stdout:
                # rstrip keeps the child's indentation and drops only the newline.
                print(line.rstrip())
        if process.returncode != 0:
            print(f"❌ splitter.py terminó con código {process.returncode}")
# Hook the callback to the button, then render all controls followed by the output area.
run_button.on_click(on_run_clicked)
display(
    input_path_text,
    mode_selector,
    mb_slider,
    format_selector,
    output_dir_text,
    base_name_text,
    run_button,
    out,
)


D:\CHardyE-Projects\Python\DataAnalitics\DataAnalitics2025\NYC_Taxi_Lab\data\raw\yellow_tripdata_2025-01.parquet


In [None]:
import os
# Diagnostic: show the kernel's current working directory.
print(os.getcwd())

In [None]:
import sys
# Diagnostic: show the first three entries of the module search path.
print(sys.path[:3])


In [None]:
import os
# Diagnostic: list the entries of the scripts folder.
# NOTE(review): the absolute path is machine-specific — presumably it is the
# project's scripts directory; confirm, and consider deriving it from the
# detected project root instead.
print(os.listdir(r"D:\CHardyE-Projects\Python\DataAnalitics\Repositorios\DA_NYC_Taxis_Yellow_LAB\scripts"))


In [None]:
import ipywidgets as widgets
from IPython.display import display
import subprocess

# Widgets
mode_selector = widgets.Dropdown(options=["rows", "dynamic"], value="rows", description="Modo:")
mb_slider = widgets.IntSlider(value=100, min=10, max=200, step=10, description="Max MB:")
format_selector = widgets.Dropdown(options=["csv", "parquet"], value="csv", description="Formato:")
output_dir_text = widgets.Text(value="splits", description="Output dir:")
base_name_text = widgets.Text(value="part", description="Base name:")
input_path_text = widgets.Text(value="dataset.csv", description="Input file:")

run_button = widgets.Button(description="Dividir Dataset", button_style="success")

# Depending on the front end, text printed from a widget callback is not shown
# in the cell; routing it through an Output widget makes it visible.
out = widgets.Output()


def on_run_clicked(b):
    """Run splitter.py with the current widget values and stream its output.

    Args:
        b: The button instance passed by ipywidgets; unused.
    """
    # Local import so the callback does not depend on another cell's imports.
    import sys

    out.clear_output()
    cmd = [
        # Use the kernel's own interpreter so the script sees the same
        # installed packages as the notebook.
        # NOTE(review): "splitter.py" is resolved against the kernel's working
        # directory, whereas the earlier cell runs "scripts/splitter.py" from
        # the project root — confirm which location is intended.
        sys.executable or "python", "splitter.py",
        "--input", input_path_text.value,
        "--mode", mode_selector.value,
        "--max-mb", str(mb_slider.value),
        "--output-dir", output_dir_text.value,
        "--base-name", base_name_text.value,
        "--fmt", format_selector.value,
    ]
    with out:
        # The context manager closes the pipe and waits for the child process.
        with subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
        ) as process:
            for line in process.stdout:
                # rstrip keeps the child's indentation and drops only the newline.
                print(line.rstrip())
        if process.returncode != 0:
            print(f"❌ splitter.py terminó con código {process.returncode}")


run_button.on_click(on_run_clicked)

# Show the complete interface, with the output area last.
display(input_path_text, mode_selector, mb_slider, format_selector, output_dir_text, base_name_text, run_button, out)


In [None]:
import pandas as pd
import math
import os
import ipywidgets as widgets
from IPython.display import display

def split_by_rows(df, max_mb=100, output_dir="splits", base_name="part", fmt="csv"):
    """Split *df* into files with an equal number of rows each.

    The rows-per-file figure is derived from ``df.memory_usage(deep=True)``,
    so ``max_mb`` bounds the estimated in-memory size of each chunk; the size
    of the written files is not measured here.

    Args:
        df: DataFrame to split.
        max_mb: Target size per chunk in MB (in-memory estimate).
        output_dir: Directory for the output files; created if missing.
        base_name: Prefix of the output files (``{base_name}_rows_{i}.{fmt}``).
        fmt: Output format, ``"csv"`` or ``"parquet"``.

    Raises:
        ValueError: If *fmt* is not ``"csv"`` or ``"parquet"``.
    """
    # Fail early instead of reporting files as saved without writing them.
    if fmt not in ("csv", "parquet"):
        raise ValueError(f"Formato no soportado: {fmt!r}. Usa 'csv' o 'parquet'.")

    os.makedirs(output_dir, exist_ok=True)

    total_rows = len(df)
    if total_rows == 0:
        print("[rows] DataFrame vacío; no se generaron archivos")
        return

    total_mb = df.memory_usage(deep=True).sum() / (1024**2)
    if total_mb > 0:
        rows_per_chunk = math.floor(total_rows * (max_mb / total_mb))
    else:
        rows_per_chunk = total_rows
    # Clamp to [1, total_rows]: a result of 0 (rows larger than max_mb) used to
    # cause a division by zero when computing the number of chunks.
    rows_per_chunk = max(1, min(rows_per_chunk, total_rows))

    for part, start in enumerate(range(0, total_rows, rows_per_chunk), start=1):
        chunk = df.iloc[start:start + rows_per_chunk]
        filename = os.path.join(output_dir, f"{base_name}_rows_{part}.{fmt}")
        if fmt == "csv":
            chunk.to_csv(filename, index=False)
        else:
            chunk.to_parquet(filename, index=False)
        print(f"[rows] Guardado {filename} con {len(chunk)} filas")


def _write_chunk(chunk, filename, fmt):
    """Write *chunk* to *filename* in *fmt* and return the file size in MB."""
    if fmt == "csv":
        chunk.to_csv(filename, index=False)
    elif fmt == "parquet":
        chunk.to_parquet(filename, index=False)
    else:
        raise ValueError(f"Formato no soportado: {fmt!r}. Usa 'csv' o 'parquet'.")
    return os.path.getsize(filename) / (1024**2)


def split_dynamic(df, max_mb=100, output_dir="splits", base_name="part", fmt="csv",
                  grow_step=10000, shrink_step=1000):
    """Split *df* into files whose size on disk does not exceed *max_mb*.

    Each part is sized by trial: the candidate slice is written, its file size
    is measured, and the slice is grown by *grow_step* rows while it fits,
    then shrunk by *shrink_step* rows (halving once fewer rows than that
    remain) until it fits. Every probe rewrites the part file, so the total
    I/O is larger than the final output.

    A part holding a single row is kept even if that row alone exceeds
    *max_mb*, since it cannot be split further.

    Args:
        df: DataFrame to split.
        max_mb: Maximum size of each output file in MB.
        output_dir: Directory for the output files; created if missing.
        base_name: Prefix of the output files (``{base_name}_dyn_{i}.{fmt}``).
        fmt: Output format, ``"csv"`` or ``"parquet"``.
        grow_step: Rows added per probe while the file still fits.
        shrink_step: Rows removed per probe once the file is too large.

    Raises:
        ValueError: If *fmt* is unsupported or a step is smaller than 1.
    """
    if grow_step < 1 or shrink_step < 1:
        raise ValueError("grow_step y shrink_step deben ser >= 1")

    os.makedirs(output_dir, exist_ok=True)
    total_rows = len(df)
    start = 0
    part = 1
    while start < total_rows:
        filename = os.path.join(output_dir, f"{base_name}_dyn_{part}.{fmt}")

        end = min(start + grow_step, total_rows)
        size_mb = _write_chunk(df.iloc[start:end], filename, fmt)

        # Grow while the file fits and rows remain.
        while size_mb <= max_mb and end < total_rows:
            end = min(end + grow_step, total_rows)
            size_mb = _write_chunk(df.iloc[start:end], filename, fmt)

        # Shrink until the file fits. This also covers the final remainder and
        # re-checks after every reduction, so no oversized file is left behind.
        while size_mb > max_mb and end - start > 1:
            span = end - start
            end = start + (span - shrink_step if span > shrink_step else span // 2)
            size_mb = _write_chunk(df.iloc[start:end], filename, fmt)

        print(f"[dyn] Guardado {filename} con {end - start} filas ({size_mb:.2f} MB)")
        start = end
        part += 1


# ============================
# Widget interactivo
# ============================

def run_split(input_path):
    """Load the dataset at *input_path* and split it using the current widget values.

    The file type is chosen from the extension (``.csv`` or ``.parquet``,
    matched case-insensitively). Mode, size limit, output format, output
    directory and base name are read from the module-level widgets.

    Args:
        input_path: Path to the CSV or Parquet file to split.

    Returns:
        None. Prints a message and returns early when the extension is not
        supported.
    """
    # Pasted paths often carry surrounding whitespace or double quotes.
    input_path = input_path.strip().strip('"')
    # Compare in lower case so extensions such as ".CSV" are accepted.
    lowered = input_path.lower()

    if lowered.endswith(".csv"):
        df = pd.read_csv(input_path)
    elif lowered.endswith(".parquet"):
        df = pd.read_parquet(input_path, engine="pyarrow")
    else:
        print("Formato no soportado. Usa CSV o Parquet.")
        return

    # Any mode other than "rows" goes to the dynamic splitter.
    splitter = split_by_rows if mode_selector.value == "rows" else split_dynamic
    splitter(
        df,
        max_mb=mb_slider.value,
        output_dir=output_dir_text.value,
        base_name=base_name_text.value,
        fmt=format_selector.value,
    )


# Widgets
mode_selector = widgets.Dropdown(options=["rows", "dynamic"], value="rows", description="Modo:")
mb_slider = widgets.IntSlider(value=100, min=10, max=200, step=10, description="Max MB:")
format_selector = widgets.Dropdown(options=["csv", "parquet"], value="csv", description="Formato:")
output_dir_text = widgets.Text(value="splits", description="Output dir:")
base_name_text = widgets.Text(value="part", description="Base name:")
input_path_text = widgets.Text(value="dataset.csv", description="Input file:")

run_button = widgets.Button(description="Dividir Dataset", button_style="success")

# Depending on the front end, text printed from a widget callback is not shown
# in the cell; routing it through an Output widget makes it visible.
out = widgets.Output()


def on_run_clicked(b):
    """Split the file named in the input box, showing progress in ``out``.

    Args:
        b: The button instance passed by ipywidgets; unused.
    """
    out.clear_output()
    with out:
        run_split(input_path_text.value)


run_button.on_click(on_run_clicked)

# Show the interface, with the output area last.
display(input_path_text, mode_selector, mb_slider, format_selector, output_dir_text, base_name_text, run_button, out)

# D:\CHardyE-Projects\Python\DataAnalitics\DataAnalitics2025\NYC_Taxi_Lab\data\raw\yellow_tripdata_2025-01.parquet
# D:\CHardyE-Projects\Python\DataAnalitics\DataAnalitics2025\NYC_Taxi_Lab\data\splits


In [None]:
import pyarrow
# Diagnostic: show the pyarrow version visible to this kernel.
print(pyarrow.__version__)


In [None]:
import sys
# Diagnostic: show the path of the Python interpreter running this kernel.
print(sys.executable)
