# Proyecto: Impacto del Riego en la Productividad Agrícola en México
## ETL reproducible
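Este notebook contiene el flujo ETL (extracción, transformación y carga) base para preparar los datos: lectura de archivos CSV, normalización de los nombres de columnas y cálculo del rendimiento (producción / superficie).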

In [0]:
import pandas as pd
import numpy as np
import os

# Directory that holds the raw CSV inputs for the ETL.
data_path = '/mnt/data'
# os.listdir returns entries in arbitrary order, so sort the listing to keep
# runs reproducible; fall back to an empty list when the directory is absent
# so later cells can still run instead of failing here.
files = sorted(os.listdir(data_path)) if os.path.isdir(data_path) else []
files

## Funciones ETL

In [0]:
def load_dataset(path, encoding='utf-8', **read_csv_kwargs):
    """Read a CSV file into a DataFrame.

    Args:
        path: File path or file-like object accepted by ``pd.read_csv``.
        encoding: Text encoding of the file. Defaults to ``'utf-8'``; pass
            e.g. ``'latin-1'`` for files that are not UTF-8 encoded.
        **read_csv_kwargs: Extra keyword arguments forwarded unchanged to
            ``pd.read_csv`` (``sep``, ``dtype``, ``usecols``, ...).

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    # low_memory is only understood by the C parser; keep the historical
    # default (False) there, but do not force it onto other engines.
    if read_csv_kwargs.get('engine') in (None, 'c'):
        read_csv_kwargs.setdefault('low_memory', False)
    return pd.read_csv(path, encoding=encoding, **read_csv_kwargs)

def clean_columns(df):
    """Normalize the column names of ``df`` to ASCII snake_case, in place.

    Each label is converted to ``str``, stripped of accents (so
    ``'Producción'`` becomes ``'produccion'`` instead of losing the accented
    letter), trimmed, lower-cased, has its spaces replaced with underscores,
    and finally loses every character outside ``[a-z0-9_]``.

    Args:
        df: DataFrame whose ``columns`` are rewritten. It is modified in
            place.

    Returns:
        The same ``df`` object, to allow chaining.
    """
    df.columns = (
        df.columns
        .astype(str)                           # tolerate non-string labels
        # Decompose accented letters into base letter + combining mark, then
        # drop the non-ASCII marks so the base letter survives the regex.
        .str.normalize('NFKD')
        .str.encode('ascii', errors='ignore')
        .str.decode('ascii')
        .str.strip()                           # avoid leading/trailing '_'
        .str.lower()
        .str.replace(' ', '_', regex=False)
        .str.replace(r'[^a-z0-9_]', '', regex=True)
    )
    return df

def add_variables(df, surface_col='superficie', production_col='produccion'):
    """Add a ``rendimiento`` column (production divided by surface) to ``df``.

    The column is only created when both source columns exist; otherwise
    ``df`` is returned untouched. Surface values equal to 0 are treated as
    missing so the division produces NaN for those rows.

    Args:
        df: DataFrame to extend. It is modified in place.
        surface_col: Name of the column used as the divisor.
        production_col: Name of the column used as the dividend.

    Returns:
        The same ``df`` object.
    """
    present = df.columns
    if surface_col not in present or production_col not in present:
        return df

    # Swap zero surfaces for NaN before dividing.
    usable_surface = df[surface_col].replace(0, np.nan)
    df['rendimiento'] = df[production_col] / usable_surface
    return df

## Pipeline de ejemplo (placeholders)

In [0]:
# Pick the first CSV in the listing to demonstrate the pipeline. None (rather
# than 0) marks "no CSV found", and the extension check is case-insensitive
# so names such as 'DATOS.CSV' are also picked up.
example_file = next(
    (os.path.join(data_path, f) for f in files if f.lower().endswith('.csv')),
    None,
)

example_file

In [0]:
# Run the example ETL on the selected CSV. Output is printed explicitly: a
# bare expression nested inside an if/else branch is evaluated and discarded,
# so neither the preview nor the fallback message would otherwise be shown.
if example_file:
    df = load_dataset(example_file)
    df = clean_columns(df)
    df = add_variables(df)
    print(df.head())
else:
    print('No CSV files found to demonstrate ETL.')