## Configuration

In [50]:
# Run-wide settings: passed below to setup_logging, preprocess_data and GabedaContext.
base_cfg = {
    # Project settings
    'input_file': 'data/test_client/comercializadora_transactions_10.csv',  # Input CSV file (loaded with pd.read_csv)
    'out_dir': 'data',          # Joined with 'client' to form the output directory
    'client': 'test_client',
    'fidx_config': {'type': 'local', 'path': 'feature_store'},  # Not read in this notebook; presumably consumed inside src.* (TODO confirm)
    
    # Analysis settings
    'analysis_dt': '2024-12-02',  # Shows up as the analysis_dt column in the preprocessed preview
    'language': 'EN',
    
    # Logging and performance
    'log_level': 'INFO',               # 'DEBUG', 'INFO', 'WARNING','ERROR', 'CRITICAL'
    
    # Column Mappings: canonical in_* name -> column name in the input CSV
    # NOTE(review): the raw preview shows no 'precio' column, so 'in_price' looks unmapped
    # for this file; confirm preprocess_data tolerates a missing source column.
    'column_mappings': {
        'in_dt': 'fecha',
        'in_trans_id': 'trans_id',
        'in_product': 'producto',
        'in_description': 'glosa',
        'in_cost': 'costo',
        'in_price': 'precio',
        'in_quantity': 'cantidad',
        'in_total_price': 'total',
        'in_customer_id': 'customer_id',
        'in_customer_name': 'customer_name',
        'in_customer_location': 'customer_location',
    },
    
    # Column Types: type tag ('date' / 'str' / 'float') per canonical column.
    # Keys mirror column_mappings; presumably applied by preprocess_data (TODO confirm).
    'column_types' : {
        'in_dt': 'date',
        'in_trans_id': 'str',
        'in_product': 'str',
        'in_description': 'str',
        'in_cost': 'float',
        'in_price': 'float',
        'in_quantity': 'float',
        'in_total_price': 'float',
        'in_customer_id': 'str',
        'in_customer_name': 'str',
        'in_customer_location': 'str',
    },
}

# General imports
import os
import pandas as pd
import numpy as np

## Logging and output

In [51]:
# Configure logging for this run; INFO is the fallback when no level is configured
from src.logger import setup_logging
setup_logging(
    log_level=base_cfg.get('log_level', 'INFO'),
    config=base_cfg,
)

# Make sure <out_dir>/<client> exists before any cell writes results into it
out_path = "/".join([base_cfg['out_dir'], base_cfg['client']])
os.makedirs(out_path, exist_ok=True)

📝 Run instance ID: test_client_20251008_155238 - Logging [INFO] to: logs\test_client_20251008_155238.log


## GabedaContext + ModelExecutor

In [52]:
# Step 1: Import new architecture components
from src.gabeda_context import GabedaContext
from src.model_executor import ModelExecutor, ModelOrchestrator
from src.preprocessing import preprocess_data

In [53]:
# Step 2: Create GabedaContext with user configuration
# Initialize context
# The context is built from the full base_cfg; datasets are registered on it by later cells
# (the repr printed below reports datasets=0, models=0 right after construction).
ctx = GabedaContext(base_cfg)
print(f"✓ Context initialized: {ctx}")

✓ Context initialized: GabedaContext(run_id='test_client_20251008_155238', datasets=0, models=0)


In [54]:
# Step 3: Load and preprocess data
# The CSV is read with pandas defaults; preprocess_data (src.preprocessing) receives the raw
# frame plus the whole config. Judging by the previews further down it renames columns to
# the in_* names and adds an analysis_dt column — NOTE(review): confirm in src.preprocessing.
raw_data = pd.read_csv(base_cfg['input_file'])
preprocessed_df = preprocess_data(raw_data, base_cfg)

# Store preprocessed data in context
# Both frames are registered under the names that later cells look up ('raw', 'preprocessed').
ctx.set_dataset('raw', raw_data)
ctx.set_dataset('preprocessed', preprocessed_df)

print(f"✓ Data loaded: {preprocessed_df.shape}")
print(f"✓ Datasets in context: {ctx.list_datasets()}")

✓ Data loaded: (8, 11)
✓ Datasets in context: ['raw', 'preprocessed']


In [55]:
# Preview the first rows of the 'raw' dataset as stored in the context (original CSV column names).
ctx.get_dataset('raw').head()

Unnamed: 0,trans_id,fecha,producto,glosa,costo,total,cantidad,customer_id,customer_name,customer_location
0,AS000001_1,12/01/2024 10:58:00 AM,DRY007,HARINA TEMPURA ESPECIAL 1KG,2200,6641,2,B017,PENSION VOLCAN,Temuco
1,AS000001_2,12/01/2024 10:58:00 AM,FROZ007,PAPAS FRITAS CORTE BASTÓN 2.5KG,2200,3075,1,B017,PENSION VOLCAN,Temuco
2,AS000001_3,12/01/2024 10:58:00 AM,FROZ003,BARRITAS MERLUZA EMPANADAS 1KG,4200,5966,1,B017,PENSION VOLCAN,Temuco
3,AS000001_4,12/01/2024 10:58:00 AM,SUSH008,SALSA SOYA KIKKOMAN 1L,3200,4320,1,B017,PENSION VOLCAN,Temuco
4,AS000001_5,12/01/2024 10:58:00 AM,SUSH008,SALSA SOYA KIKKOMAN 1L,3200,4327,1,B017,PENSION VOLCAN,Temuco


In [56]:
# Preview the first rows of the 'preprocessed' dataset stored in the context (in_* column names).
ctx.get_dataset('preprocessed').head()

Unnamed: 0,in_trans_id,in_dt,in_product,in_description,in_cost,in_total_price,in_quantity,in_customer_id,in_customer_name,in_customer_location,analysis_dt
0,AS000001_1,2024-12-01 10:58:00,DRY007,HARINA TEMPURA ESPECIAL 1KG,2200.0,6641.0,2.0,B017,PENSION VOLCAN,Temuco,2024-12-02
1,AS000001_2,2024-12-01 10:58:00,FROZ007,PAPAS FRITAS CORTE BASTÓN 2.5KG,2200.0,3075.0,1.0,B017,PENSION VOLCAN,Temuco,2024-12-02
2,AS000001_3,2024-12-01 10:58:00,FROZ003,BARRITAS MERLUZA EMPANADAS 1KG,4200.0,5966.0,1.0,B017,PENSION VOLCAN,Temuco,2024-12-02
3,AS000001_4,2024-12-01 10:58:00,SUSH008,SALSA SOYA KIKKOMAN 1L,3200.0,4320.0,1.0,B017,PENSION VOLCAN,Temuco,2024-12-02
4,AS000001_5,2024-12-01 10:58:00,SUSH008,SALSA SOYA KIKKOMAN 1L,3200.0,4327.0,1.0,B017,PENSION VOLCAN,Temuco,2024-12-02


# Features

## Filters

In [57]:
# Small float tolerance (1e-7). Not referenced by any function in the visible cells;
# presumably intended for float comparisons — TODO confirm it is still needed.
FLOATPRECISION = 0.0000001
# Conversion helper: everything else in this section works on pandas Timestamps
def timestamp(in_dt):
    """Convert ``in_dt`` to a ``pandas.Timestamp`` and return it."""
    converted = pd.Timestamp(in_dt)
    return converted

# Time-of-day features. Each one re-wraps its input in pd.Timestamp first, because the
# value may come back as numpy.datetime64.
def hour(timestamp):
    """Return the hour component of ``timestamp``."""
    ts = pd.Timestamp(timestamp)
    return ts.hour

def minute(timestamp):
    """Return the minute component of ``timestamp``."""
    ts = pd.Timestamp(timestamp)
    return ts.minute

def second(timestamp):
    """Return the second component of ``timestamp``."""
    ts = pd.Timestamp(timestamp)
    return ts.second

def microsecond(timestamp):
    """Return the microsecond component of ``timestamp``."""
    ts = pd.Timestamp(timestamp)
    return ts.microsecond

# Calendar features
def dt_date(timestamp):
    """Return the calendar date of ``timestamp`` (time of day dropped)."""
    ts = pd.Timestamp(timestamp)
    return ts.date()

def dt_year(timestamp):
    """Return the year of ``timestamp``."""
    ts = pd.Timestamp(timestamp)
    return ts.year

def dt_month(timestamp):
    """Return the month number of ``timestamp``."""
    ts = pd.Timestamp(timestamp)
    return ts.month

def dt_day(timestamp):
    """Return the day of the month of ``timestamp``."""
    ts = pd.Timestamp(timestamp)
    return ts.day

def dt_weekday(timestamp):
    """Return pandas' ``dayofweek`` for ``timestamp`` (Monday is 0, Sunday is 6)."""
    ts = pd.Timestamp(timestamp)
    return ts.dayofweek

def dt_weekday_name(timestamp):
    """Return the weekday name of ``timestamp`` as produced by ``Timestamp.day_name()``."""
    ts = pd.Timestamp(timestamp)
    return ts.day_name()

def dt_weekofyear(timestamp):
    """Return the ISO week number of ``timestamp`` (the ``week`` field of ``isocalendar()``)."""
    iso_parts = pd.Timestamp(timestamp).isocalendar()
    return iso_parts.week

def dt_quarter(timestamp):
    """Return the calendar quarter (1-4) of ``timestamp``."""
    ts = pd.Timestamp(timestamp)
    return ts.quarter

def dayofyear(timestamp):
    """Return the ordinal day within the year of ``timestamp`` (January 1st is 1)."""
    ts = pd.Timestamp(timestamp)
    return ts.dayofyear

def is_leap_year(timestamp):
    """Return whether the year of ``timestamp`` is a leap year."""
    ts = pd.Timestamp(timestamp)
    return ts.is_leap_year

def is_month_start(timestamp):
    """Return whether ``timestamp`` falls on the first day of its month."""
    ts = pd.Timestamp(timestamp)
    return ts.is_month_start

def is_month_end(timestamp):
    """Return whether ``timestamp`` falls on the last day of its month."""
    ts = pd.Timestamp(timestamp)
    return ts.is_month_end

def is_quarter_start(timestamp):
    """Return whether ``timestamp`` falls on the first day of a calendar quarter."""
    ts = pd.Timestamp(timestamp)
    return ts.is_quarter_start

def is_quarter_end(timestamp):
    """Return whether ``timestamp`` falls on the last day of a calendar quarter."""
    ts = pd.Timestamp(timestamp)
    return ts.is_quarter_end

def is_year_start(timestamp):
    """Return whether ``timestamp`` falls on January 1st."""
    ts = pd.Timestamp(timestamp)
    return ts.is_year_start

def is_year_end(timestamp):
    """Return whether ``timestamp`` falls on December 31st."""
    ts = pd.Timestamp(timestamp)
    return ts.is_year_end

def is_weekend(timestamp):
    """Return whether ``timestamp`` is a Saturday or Sunday (``dayofweek`` of 5 or more)."""
    weekday_index = pd.Timestamp(timestamp).dayofweek
    return 5 <= weekday_index

def is_holiday(timestamp):
    """Placeholder holiday flag: always returns ``False`` and ignores ``timestamp``.

    No holiday calendar is consulted yet.
    """
    # Placeholder for holiday logic
    return False

def days_since_analysis_dt(timestamp, analysis_dt):
    """Return the whole days elapsed from ``timestamp`` to ``analysis_dt``.

    Both arguments are converted to ``pandas.Timestamp``. The result is the
    ``days`` component of their difference, so time of day matters: less than
    24 hours apart yields 0, and a ``timestamp`` after ``analysis_dt`` yields a
    negative number.
    """
    moment = pd.Timestamp(timestamp)
    elapsed = pd.Timestamp(analysis_dt) - moment
    return elapsed.days

def months_since_analysis_dt(timestamp, analysis_dt):
    """Return the calendar-month distance from ``timestamp`` to ``analysis_dt``.

    Only the year and month fields are compared; the day of the month is
    ignored, so Nov 30 -> Dec 1 counts as one month.
    """
    moment = pd.Timestamp(timestamp)
    reference = pd.Timestamp(analysis_dt)
    # Flatten (year, month) into a running month index, then subtract
    reference_index = reference.year * 12 + reference.month
    moment_index = moment.year * 12 + moment.month
    return reference_index - moment_index

def time_unix(timestamp):
    """Return the seconds elapsed since midnight for ``timestamp``.

    Despite the name this is not an epoch value: it is built only from the
    hour, minute and second fields (sub-second precision is ignored).
    """
    moment = pd.Timestamp(timestamp)
    hours_part = moment.hour * 3600
    minutes_part = moment.minute * 60
    return hours_part + minutes_part + moment.second

def date_unix(timestamp):
    """Return the whole days between 1970-01-01 and ``timestamp``."""
    moment = pd.Timestamp(timestamp)
    since_epoch = moment - pd.Timestamp('1970-01-01')
    return since_epoch.days

## Attributes

In [None]:
def analysis_dt_agg(analysis_dt):
    """Return the first distinct value of the ``analysis_dt`` values.

    The argument must expose ``.unique()`` (e.g. a pandas Series); an empty
    input raises ``IndexError``.
    """
    distinct_values = analysis_dt.unique()
    return distinct_values[0]

def description(in_description):
    """Return the first distinct value of the ``in_description`` values.

    The argument must expose ``.unique()`` (e.g. a pandas Series); an empty
    input raises ``IndexError``.
    """
    distinct_values = in_description.unique()
    return distinct_values[0]

def total_transactions(in_trans_id):
    """Return the number of transaction rows in ``in_trans_id``.

    Every row is counted, whatever its ID value. The previous implementation
    used ``np.count_nonzero``, which counts *truthy* values and therefore
    silently skipped IDs such as ``0``, ``''`` or ``None``.

    Args:
        in_trans_id: Sized collection of transaction IDs (Series, array, list).

    Returns:
        int: The row count; 0 for an empty input.
    """
    return len(in_trans_id)

def total_units_sold(in_quantity):
    """Return the sum of the ``in_quantity`` values, computed with ``np.sum``."""
    units = np.sum(in_quantity)
    return units

def total_revenue(in_total_price):
    """Return the sum of the ``in_total_price`` values, computed with ``np.sum``."""
    revenue = np.sum(in_total_price)
    return revenue

def total_cost(in_cost):
    """Return the sum of the ``in_cost`` values, computed with ``np.sum``.

    Costs are summed as given; they are not multiplied by quantity here.
    """
    cost = np.sum(in_cost)
    return cost

def first_sale(in_dt):
    """Return the calendar date of the earliest value in ``in_dt``.

    ``in_dt`` must provide ``.min()`` returning an object with ``.date()``
    (e.g. a datetime Series).
    """
    earliest = in_dt.min()
    return earliest.date()

def last_sale(in_dt):
    """Return the calendar date of the latest value in ``in_dt``.

    ``in_dt`` must provide ``.max()`` returning an object with ``.date()``
    (e.g. a datetime Series).
    """
    latest = in_dt.max()
    return latest.date()

def active_days(first_sale, last_sale):
    """Return the inclusive day span between first and last sale.

    Both arguments go through ``pd.to_datetime``. The difference is read via
    ``.dt.days``, so the inputs must be Series-like rather than scalars. One is
    added so that a first and last sale on the same day counts as 1.
    """
    span = pd.to_datetime(last_sale) - pd.to_datetime(first_sale)
    return span.dt.days + 1

def days_since_last_sale(last_sale, analysis_dt_agg):
    """Return the whole days from ``last_sale`` to ``analysis_dt_agg``.

    Both arguments go through ``pd.to_datetime``. The difference is read via
    ``.dt.days``, so the inputs must be Series-like rather than scalars.
    """
    gap = pd.to_datetime(analysis_dt_agg) - pd.to_datetime(last_sale)
    return gap.dt.days

def days_since_first_sale(first_sale, analysis_dt_agg):
    """Return the whole days from ``first_sale`` to ``analysis_dt_agg``.

    Both arguments go through ``pd.to_datetime``. The difference is read via
    ``.dt.days``, so the inputs must be Series-like rather than scalars.
    """
    gap = pd.to_datetime(analysis_dt_agg) - pd.to_datetime(first_sale)
    return gap.dt.days

### Execute Product Model with ModelExecutor

In [59]:
# Step 4: Model configuration for the per-product statistics.
# Each feature is registered under its function's own name, so the mapping is derived from
# the function objects instead of repeating every name twice. Order is preserved.
cfg_product = {
    'model_name': 'product_stats',
    'group_by': ['in_product'],
    'features': {
        feature_fn.__name__: feature_fn
        for feature_fn in (
            # Filters
            timestamp,
            hour,
            minute,
            second,
            microsecond,
            dt_date,
            dt_year,
            dt_month,
            dt_day,
            dt_weekday,
            dt_weekday_name,
            dt_weekofyear,
            dt_quarter,
            dayofyear,
            is_leap_year,
            is_month_start,
            is_month_end,
            is_quarter_start,
            is_quarter_end,
            is_year_start,
            is_year_end,
            is_weekend,
            is_holiday,
            days_since_analysis_dt,
            months_since_analysis_dt,
            time_unix,
            date_unix,
            # Attrs
            analysis_dt_agg,
            description,
            total_transactions,
            total_units_sold,
            total_revenue,
            total_cost,
            first_sale,
            last_sale,
            active_days,
            days_since_last_sale,
            days_since_first_sale,
        )
    },
    # 'output_cols': []
}

In [60]:
# Step 5: Build the executor for the product model and run it on the 'preprocessed' dataset
executor = ModelExecutor(cfg_product)
results = executor.execute(ctx, input_dataset_name='preprocessed')

print(f"\n✓ Model executed: {executor.model_name}")
print(f"  - Filters shape: {results['filters'].shape}")
# The attrs entry may be None; in that case the literal text 'None' is printed instead of a shape
print(f"  - Attrs shape: {'None' if results['attrs'] is None else results['attrs'].shape}")

TypeError: Cannot convert input [in_product
DRY007      2024-12-02
FRESH004    2024-12-02
FROZ003     2024-12-02
FROZ007     2024-12-02
SUSH004     2024-12-02
SUSH008     2024-12-02
dtype: object] of type <class 'pandas.core.series.Series'> to Timestamp

In [None]:
# Step 6: Three equivalent routes to the model outputs
print("\n--- Accessing Results ---")

# Route 1: the dictionary returned by executor.execute()
filters_df, attrs_df = results['filters'], results['attrs']

# Route 2: look the datasets up in the context by their stored names
filters_from_ctx, attrs_from_ctx = (
    ctx.get_dataset('product_stats_filters'),
    ctx.get_dataset('product_stats_attrs'),
)

# Route 3: per-model convenience accessors on the context
filters_method, attrs_method = (
    ctx.get_model_filters('product_stats'),
    ctx.get_model_attrs('product_stats'),
)

# Only the filters frames are compared here; the attrs frames are fetched but not checked
print(f"All methods return same data: {all(filters_df.equals(other) for other in (filters_from_ctx, filters_method))}")


--- Accessing Results ---
All methods return same data: True


In [None]:
# Preview the row-level (filters) output of the product_stats model.
ctx.get_model_filters('product_stats').head()

Unnamed: 0,in_trans_id,in_dt,in_product,in_description,in_cost,in_total_price,in_quantity,in_customer_id,in_customer_name,in_customer_location,...,is_quarter_start,is_quarter_end,is_year_start,is_year_end,is_weekend,is_holiday,days_since_analysis_dt,months_since_analysis_dt,time_unix,date_unix
0,AS000001_1,2024-12-01 10:58:00,DRY007,HARINA TEMPURA ESPECIAL 1KG,2200.0,6641.0,2.0,B017,PENSION VOLCAN,Temuco,...,False,False,False,False,True,False,0,0,39480,20058
1,AS000001_2,2024-12-01 10:58:00,FROZ007,PAPAS FRITAS CORTE BASTÓN 2.5KG,2200.0,3075.0,1.0,B017,PENSION VOLCAN,Temuco,...,False,False,False,False,True,False,0,0,39480,20058
2,AS000001_3,2024-12-01 10:58:00,FROZ003,BARRITAS MERLUZA EMPANADAS 1KG,4200.0,5966.0,1.0,B017,PENSION VOLCAN,Temuco,...,False,False,False,False,True,False,0,0,39480,20058
3,AS000001_4,2024-12-01 10:58:00,SUSH008,SALSA SOYA KIKKOMAN 1L,3200.0,4320.0,1.0,B017,PENSION VOLCAN,Temuco,...,False,False,False,False,True,False,0,0,39480,20058
4,AS000001_5,2024-12-01 10:58:00,SUSH008,SALSA SOYA KIKKOMAN 1L,3200.0,4327.0,1.0,B017,PENSION VOLCAN,Temuco,...,False,False,False,False,True,False,0,0,39480,20058


In [None]:
# Preview the per-product (attrs) output of the product_stats model.
ctx.get_model_attrs('product_stats').head()

Unnamed: 0,in_product,description,total_transactions,total_units_sold,total_revenue,total_cost,first_sale,last_sale,active_days
0,DRY007,HARINA TEMPURA ESPECIAL 1KG,1,2.0,6641.0,2200.0,2024-12-01,2024-12-01,1
1,FRESH004,CENTOLLA FRESCA KG,1,3.0,60772.0,15000.0,2024-12-01,2024-12-01,1
2,FROZ003,BARRITAS MERLUZA EMPANADAS 1KG,2,6.0,35343.0,8400.0,2024-12-01,2024-12-01,1
3,FROZ007,PAPAS FRITAS CORTE BASTÓN 2.5KG,1,1.0,3075.0,2200.0,2024-12-01,2024-12-01,1
4,SUSH004,ARROZ SUSHI KOSHIHIKARI 5KG,1,1.0,16929.0,12000.0,2024-12-01,2024-12-01,1


## Orchestrate

In [None]:
# Example: Running multiple models with orchestrator
# (You can add more models later: customer_stats, time_period, etc.)

# Create fresh context for orchestrator example
# ctx2 is seeded with the same preprocessed_df object that the first context received.
# NOTE(review): the summary printed below lists 'preprocessed' with the same shape as the
# filters output (8, 38); this suggests the frame may have been extended in place by an
# earlier execution — confirm whether ModelExecutor mutates its input.
ctx2 = GabedaContext(base_cfg)
ctx2.set_dataset('preprocessed', preprocessed_df)

# Initialize orchestrator
orchestrator = ModelOrchestrator(ctx2)

# Register models
# A new ModelExecutor is built from cfg_product rather than reusing `executor`.
orchestrator.register_model(ModelExecutor(cfg_product))
# orchestrator.register_model(ModelExecutor(cfg_customer))  # Add when ready
# orchestrator.register_model(ModelExecutor(cfg_time))       # Add when ready

# Execute all registered models
orchestrator.execute_all(input_dataset_name='preprocessed')

# View summary
ctx2.print_summary() 


GabeDA Execution Summary - Run ID: test_client_20251008_155203

Datasets (3):
  - preprocessed: (8, 38)
  - product_stats_filters: (8, 38)
  - product_stats_attrs: (6, 9)

Models Executed (1):
  - product_stats: ['product_stats_filters', 'product_stats_attrs']

Total Steps: 4



In [None]:
# Optional: Save to disk
# The datasets are taken from `ctx` (the first context), not from the orchestrator's `ctx2`.
# Target paths carry no file extension; presumably save_dataset derives it from `format`
# — TODO confirm.
ctx.save_dataset('product_stats_filters', f"{out_path}/product_filters", format='csv')
ctx.save_dataset('product_stats_attrs', f"{out_path}/product_attrs", format='csv')
print(f"\n✓ Results saved to {out_path}/")


✓ Results saved to data/test_client/
