## Configuration

In [1]:
# Single configuration dictionary for the run. It is passed to setup_logging,
# GabedaContext and preprocess_data in the cells that follow.
base_cfg = {
    # Project settings
    'input_file': 'data/test_client/comercializadora_transactions_10.csv',  # Input CSV file
    'out_dir': 'data',                # combined with 'client' to build the output folder
    'client': 'test_client',
    'fidx_config': {'type': 'local', 'path': 'feature_store'},
    
    # Analysis settings
    'analysis_dt': '2024-12-02',      # reference date; appears as the `analysis_dt` column after preprocessing
    'language': 'EN',
    
    # Logging
    'log_level': 'INFO',            # 'DEBUG', 'INFO', 'WARNING','ERROR', 'CRITICAL'
    
    # Column Mappings: internal `in_*` name -> column name in the input CSV.
    # NOTE(review): 'costo_total' and 'precio' do not appear in the sample CSV
    # preview shown later in this notebook, and the matching `in_cost_total` /
    # `in_price_unit` columns are absent from the preprocessed preview -- confirm
    # that unmapped columns are meant to be derived by the feature functions.
    'column_mappings': {
        'in_dt': 'fecha',
        'in_trans_id': 'trans_id',
        'in_product': 'producto',
        'in_description': 'glosa',
        'in_cost_unit': 'costo',
        'in_cost_total': 'costo_total',
        'in_price_unit': 'precio',
        'in_price_total': 'total',
        'in_quantity': 'cantidad',
        'in_customer_id': 'customer_id',
        'in_customer_name': 'customer_name',
        'in_customer_location': 'customer_location',
    },
    
    # Column Types: declared type for each internal `in_*` column
    'column_types' : {
        'in_dt': 'date',
        'in_trans_id': 'str',
        'in_product': 'str',
        'in_description': 'str',
        'in_cost_unit': 'float',
        'in_cost_total': 'float',
        'in_price_unit': 'float',
        'in_price_total': 'float',
        'in_quantity': 'float',
        'in_customer_id': 'str',
        'in_customer_name': 'str',
        'in_customer_location': 'str',
    },
}

# General imports
import os
import pandas as pd
import numpy as np

## Logging and output

In [2]:
# Initialize logging from config; falls back to 'INFO' when 'log_level' is not set.
# The cell output shows that this reports a run instance ID and the log file path.
from src.logger import setup_logging
setup_logging(log_level=base_cfg.get('log_level', 'INFO'), config=base_cfg)

# Create the per-client output directory (<out_dir>/<client>) if it doesn't exist.
# `out_path` is reused by the save cell at the end of the notebook.
out_path = f"{base_cfg['out_dir']}/{base_cfg['client']}"
os.makedirs(out_path, exist_ok=True)

📝 Run instance ID: test_client_20251008_204239 - Logging [INFO] to: logs\test_client_20251008_204239.log


## GabedaContext + ModelExecutor

In [3]:
# Step 1: Import new architecture components
from src.gabeda_context import GabedaContext
from src.model_executor import ModelExecutor, ModelOrchestrator
from src.preprocessing import preprocess_data

In [4]:
# Step 2: Create GabedaContext with user configuration
# The context is the shared store used below for datasets (set_dataset /
# get_dataset) and model results (get_model_filters / get_model_attrs).
ctx = GabedaContext(base_cfg)
print(f"✓ Context initialized: {ctx}")

✓ Context initialized: GabedaContext(run_id='test_client_20251008_204239', datasets=0, models=0)


In [5]:
# Step 3: Load and preprocess data
raw_data = pd.read_csv(base_cfg['input_file'])
# NOTE(review): `base_cfg` is rebound to the config returned by preprocess_data,
# so re-running this cell feeds the already-updated config back in -- confirm
# that preprocess_data is safe to call repeatedly with its own output.
preprocessed_df, base_cfg = preprocess_data(raw_data, base_cfg)

# Update context with the modified config (includes available_cols)
ctx.user_config.update(base_cfg)

# Store both the untouched CSV frame and the preprocessed frame in the context
ctx.set_dataset('raw', raw_data)
ctx.set_dataset('preprocessed', preprocessed_df)

print(f"✓ Data loaded: {preprocessed_df.shape}")
print(f"✓ Datasets in context: {ctx.list_datasets()}")

✓ Data loaded: (8, 11)
✓ Datasets in context: ['raw', 'preprocessed']


In [6]:
# Preview the first rows of the raw CSV data as stored in the context
ctx.get_dataset('raw').head()

Unnamed: 0,trans_id,fecha,producto,glosa,costo,total,cantidad,customer_id,customer_name,customer_location
0,AS000001_1,11/01/2024 10:58:00 AM,DRY007,HARINA TEMPURA ESPECIAL 1KG,2200,6641,2,B017,PENSION VOLCAN,Temuco
1,AS000001_2,11/29/2024 10:58:00 AM,FROZ007,PAPAS FRITAS CORTE BASTÓN 2.5KG,2200,3075,1,B017,PENSION VOLCAN,Temuco
2,AS000001_3,11/15/2024 10:58:00 AM,FROZ003,BARRITAS MERLUZA EMPANADAS 1KG,4200,5966,1,B017,PENSION VOLCAN,Temuco
3,AS000001_4,10/01/2024 10:58:00 AM,SUSH008,SALSA SOYA KIKKOMAN 1L,3200,4320,1,B017,PENSION VOLCAN,Temuco
4,AS000001_5,12/01/2024 10:58:00 AM,SUSH008,SALSA SOYA KIKKOMAN 1L,3200,4327,1,B017,PENSION VOLCAN,Temuco


In [7]:
# Preview the preprocessed data: columns carry the internal `in_*` names plus `analysis_dt`
ctx.get_dataset('preprocessed').head()

Unnamed: 0,in_trans_id,in_dt,in_product,in_description,in_cost_unit,in_price_total,in_quantity,in_customer_id,in_customer_name,in_customer_location,analysis_dt
0,AS000001_1,2024-11-01 10:58:00,DRY007,HARINA TEMPURA ESPECIAL 1KG,2200.0,6641.0,2.0,B017,PENSION VOLCAN,Temuco,2024-12-02
1,AS000001_2,2024-11-29 10:58:00,FROZ007,PAPAS FRITAS CORTE BASTÓN 2.5KG,2200.0,3075.0,1.0,B017,PENSION VOLCAN,Temuco,2024-12-02
2,AS000001_3,2024-11-15 10:58:00,FROZ003,BARRITAS MERLUZA EMPANADAS 1KG,4200.0,5966.0,1.0,B017,PENSION VOLCAN,Temuco,2024-12-02
3,AS000001_4,2024-10-01 10:58:00,SUSH008,SALSA SOYA KIKKOMAN 1L,3200.0,4320.0,1.0,B017,PENSION VOLCAN,Temuco,2024-12-02
4,AS000001_5,2024-12-01 10:58:00,SUSH008,SALSA SOYA KIKKOMAN 1L,3200.0,4327.0,1.0,B017,PENSION VOLCAN,Temuco,2024-12-02


# Features

## Filters

In [8]:
# Tolerance under which a unit price is treated as zero; margin_unit_pct uses it
# to avoid dividing by a (near-)zero price.
FLOATPRECISION = 0.0000001

def in_cost_unit(in_quantity, in_cost_total):
    """Per-unit cost: total cost divided by quantity, or 0.0 when quantity is zero."""
    if in_quantity != 0:
        return in_cost_total / in_quantity
    return 0.0

def in_cost_total(in_cost_unit, in_quantity):
    """Total cost of a line: unit cost multiplied by quantity."""
    line_cost = in_cost_unit * in_quantity
    return line_cost

def in_price_unit(in_quantity, in_price_total):
    """Per-unit price: total price divided by quantity, or 0.0 when quantity is zero."""
    if in_quantity != 0:
        return in_price_total / in_quantity
    return 0.0

def in_price_total(in_price_unit, in_quantity):
    """Total price of a line: unit price multiplied by quantity."""
    line_price = in_price_unit * in_quantity
    return line_price

# Time conversion utility
def timestamp(in_dt):
    """Wrap the transaction datetime in a ``pd.Timestamp``."""
    as_timestamp = pd.Timestamp(in_dt)
    return as_timestamp

# Time features (convert back from numpy.datetime64)
def hour(timestamp):
    """Hour component of ``timestamp`` after coercion to ``pd.Timestamp``."""
    ts = pd.Timestamp(timestamp)
    return ts.hour

def minute(timestamp):
    """Minute component of ``timestamp`` after coercion to ``pd.Timestamp``."""
    ts = pd.Timestamp(timestamp)
    return ts.minute

def second(timestamp):
    """Second component of ``timestamp`` after coercion to ``pd.Timestamp``."""
    ts = pd.Timestamp(timestamp)
    return ts.second

def microsecond(timestamp):
    """Microsecond component of ``timestamp`` after coercion to ``pd.Timestamp``."""
    ts = pd.Timestamp(timestamp)
    return ts.microsecond

# Date features
def dt_date(timestamp):
    """Calendar date (time of day dropped) of ``timestamp``."""
    ts = pd.Timestamp(timestamp)
    return ts.date()

def dt_year(timestamp):
    """Year of ``timestamp``."""
    ts = pd.Timestamp(timestamp)
    return ts.year

def dt_month(timestamp):
    """Month number (1-12) of ``timestamp``."""
    ts = pd.Timestamp(timestamp)
    return ts.month

def dt_day(timestamp):
    """Day of the month of ``timestamp``."""
    ts = pd.Timestamp(timestamp)
    return ts.day

def dt_weekday(timestamp):
    """Day of week of ``timestamp`` as pandas' ``dayofweek`` (Monday=0 ... Sunday=6)."""
    ts = pd.Timestamp(timestamp)
    return ts.dayofweek

def dt_weekday_name(timestamp):
    """Weekday name of ``timestamp`` as returned by ``Timestamp.day_name()``."""
    ts = pd.Timestamp(timestamp)
    return ts.day_name()

def dt_weekofyear(timestamp):
    """ISO week number of ``timestamp`` (the ``week`` field of ``isocalendar()``)."""
    iso_parts = pd.Timestamp(timestamp).isocalendar()
    return iso_parts.week

def dt_quarter(timestamp):
    """Calendar quarter (1-4) of ``timestamp``."""
    ts = pd.Timestamp(timestamp)
    return ts.quarter

def dayofyear(timestamp):
    """Ordinal day within the year of ``timestamp`` (1 for January 1st)."""
    ts = pd.Timestamp(timestamp)
    return ts.dayofyear

def is_leap_year(timestamp):
    """Whether the year of ``timestamp`` is a leap year."""
    ts = pd.Timestamp(timestamp)
    return ts.is_leap_year

def is_month_start(timestamp):
    """Whether ``timestamp`` falls on the first day of its month."""
    ts = pd.Timestamp(timestamp)
    return ts.is_month_start

def is_month_end(timestamp):
    """Whether ``timestamp`` falls on the last day of its month."""
    ts = pd.Timestamp(timestamp)
    return ts.is_month_end

def is_quarter_start(timestamp):
    """Whether ``timestamp`` falls on the first day of a calendar quarter."""
    ts = pd.Timestamp(timestamp)
    return ts.is_quarter_start

def is_quarter_end(timestamp):
    """Whether ``timestamp`` falls on the last day of a calendar quarter."""
    ts = pd.Timestamp(timestamp)
    return ts.is_quarter_end

def is_year_start(timestamp):
    """Whether ``timestamp`` falls on the first day of its year."""
    ts = pd.Timestamp(timestamp)
    return ts.is_year_start

def is_year_end(timestamp):
    """Whether ``timestamp`` falls on the last day of its year."""
    ts = pd.Timestamp(timestamp)
    return ts.is_year_end

def is_weekend(timestamp):
    """Whether ``timestamp`` is a Saturday or Sunday (``dayofweek`` of 5 or more)."""
    weekday_index = pd.Timestamp(timestamp).dayofweek
    return weekday_index >= 5

def is_holiday(timestamp):
    """Placeholder holiday flag: no holiday logic exists yet, so this is always ``False``.

    ``timestamp`` is accepted but not inspected.
    """
    holiday_flag = False  # placeholder until real holiday logic is added
    return holiday_flag

def days_since_analysis_dt(timestamp, analysis_dt):
    """Calendar days elapsed from ``timestamp`` to ``analysis_dt``.

    Both values are coerced to ``pd.Timestamp`` and truncated to midnight
    before subtracting, so the time of day of the transaction does not affect
    the result. Without the truncation ``Timedelta.days`` floors the
    difference: a sale at 10:58 on the day before the analysis date would
    report 0 days, and a sale on the analysis date itself would report -1.
    Counting calendar days also keeps this row-level feature consistent with
    the date-based ``prod_days_since_prod_sale_*`` attributes.

    Parameters
    ----------
    timestamp : value accepted by ``pd.Timestamp``
        Transaction datetime.
    analysis_dt : value accepted by ``pd.Timestamp``
        Reference (analysis) date.

    Returns
    -------
    Number of calendar days; negative when ``timestamp`` is on a later date
    than ``analysis_dt``.
    """
    ts_day = pd.Timestamp(timestamp).normalize()
    adt_day = pd.Timestamp(analysis_dt).normalize()
    return (adt_day - ts_day).days

def months_since_analysis_dt(timestamp, analysis_dt):
    """Calendar-month difference between ``analysis_dt`` and ``timestamp``.

    Only the year and month fields are compared; the day of month is ignored.
    """
    ts = pd.Timestamp(timestamp)
    adt = pd.Timestamp(analysis_dt)
    year_gap = adt.year - ts.year
    month_gap = adt.month - ts.month
    return year_gap * 12 + month_gap

def time_unix(timestamp):
    """Seconds elapsed since midnight for the time of day in ``timestamp``."""
    ts = pd.Timestamp(timestamp)
    from_hours = ts.hour * 3600
    from_minutes = ts.minute * 60
    return from_hours + from_minutes + ts.second

def date_unix(timestamp):
    """Whole days between 1970-01-01 and ``timestamp``."""
    unix_epoch = pd.Timestamp('1970-01-01')
    elapsed = pd.Timestamp(timestamp) - unix_epoch
    return elapsed.days

def margin_unit(in_price_unit, in_cost_unit):
    """Per-unit margin: unit price minus unit cost."""
    unit_margin = in_price_unit - in_cost_unit
    return unit_margin

def margin_unit_pct(in_price_unit, in_cost_unit):
    """Per-unit margin as a percentage of the unit price.

    Returns 0.0 when the absolute unit price is below ``FLOATPRECISION``, which
    avoids dividing by a (near-)zero price.
    """
    price_is_negligible = abs(in_price_unit) < FLOATPRECISION
    if price_is_negligible:
        return 0.0
    unit_margin = in_price_unit - in_cost_unit
    return (unit_margin / in_price_unit) * 100.0

def margin_unit_valid(margin_unit_pct, in_cost_unit, in_price_unit):
    """True when the margin percentage, unit cost and unit price are all non-negative."""
    return (
        margin_unit_pct >= 0.0
        and in_cost_unit >= 0.0
        and in_price_unit >= 0.0
    )

def margin_total(in_price_total, in_cost_total):
    """Line margin: total price minus total cost."""
    line_margin = in_price_total - in_cost_total
    return line_margin

def ticket_size_cat(in_price_total):
    """Bucket a line total into 'Low' (< 10000), 'Medium' (< 50000) or 'High'."""
    # Upper bounds are exclusive and checked in ascending order.
    for upper_bound, label in ((10000, 'Low'), (50000, 'Medium')):
        if in_price_total < upper_bound:
            return label
    return 'High'
    
def volume_cat(in_quantity):
    """Bucket a quantity into 'Unit' (< 3), 'Multiple' (< 6) or 'Massive'."""
    # Upper bounds are exclusive and checked in ascending order.
    for upper_bound, label in ((3, 'Unit'), (6, 'Multiple')):
        if in_quantity < upper_bound:
            return label
    return 'Massive'

def day_segment(timestamp):
    """Label the part of the day that the hour of ``timestamp`` falls in.

    05-11 -> 'Morning', 12-16 -> 'Afternoon', 17-20 -> 'Evening', anything
    else -> 'Night'.
    """
    hour_of_day = pd.Timestamp(timestamp).hour
    # (inclusive start, exclusive end, label)
    segments = (
        (5, 12, 'Morning'),
        (12, 17, 'Afternoon'),
        (17, 21, 'Evening'),
    )
    for start, end, label in segments:
        if start <= hour_of_day < end:
            return label
    return 'Night'
    


## Attributes

In [9]:
def analysis_dt_agg(analysis_dt):
    """First distinct value of the group's ``analysis_dt`` column (via ``.unique()``)."""
    distinct_values = analysis_dt.unique()
    return distinct_values[0]

def prod_description(in_description):
    """First distinct description found in the group (via ``.unique()``)."""
    distinct_descriptions = in_description.unique()
    return distinct_descriptions[0]

def prod_transactions_total(in_trans_id):
    """Count the group's transaction ids using ``np.count_nonzero``.

    NOTE(review): ``np.count_nonzero`` skips zero/empty values, so an id of 0
    or '' would not be counted -- confirm that ids are never falsy.
    """
    nonzero_ids = np.count_nonzero(in_trans_id)
    return nonzero_ids

def prod_quantity_total(in_quantity):
    """Sum of the group's quantities."""
    quantity_sum = np.sum(in_quantity)
    return quantity_sum

# Price
def prod_price_sum(in_price_total):
    """Sum of the group's line totals."""
    price_sum = np.sum(in_price_total)
    return price_sum

def prod_price_unit_max(in_price_unit):
    """Largest unit price in the group (via the input's own ``.max()``)."""
    highest_price = in_price_unit.max()
    return highest_price

def prod_price_unit_min(in_price_unit):
    """Smallest unit price in the group (via the input's own ``.min()``)."""
    lowest_price = in_price_unit.min()
    return lowest_price

def prod_price_unit_avg(in_price_unit):
    """Arithmetic mean of the group's unit prices."""
    mean_price = np.mean(in_price_unit)
    return mean_price

def prod_price_unit_volatility(prod_price_unit_min, prod_price_unit_max):
    """Range of unit prices: maximum minus minimum."""
    price_range = prod_price_unit_max - prod_price_unit_min
    return price_range

# Cost
def prod_cost_sum(in_cost_total):
    """Sum of the group's total costs."""
    cost_sum = np.sum(in_cost_total)
    return cost_sum

def prod_cost_unit_max(in_cost_unit):
    """Largest unit cost in the group (via the input's own ``.max()``)."""
    highest_cost = in_cost_unit.max()
    return highest_cost

def prod_cost_unit_min(in_cost_unit):
    """Smallest unit cost in the group (via the input's own ``.min()``)."""
    lowest_cost = in_cost_unit.min()
    return lowest_cost

def prod_cost_unit_avg(in_cost_unit):
    """Arithmetic mean of the group's unit costs."""
    mean_cost = np.mean(in_cost_unit)
    return mean_cost

def prod_cost_unit_volatility(prod_cost_unit_min, prod_cost_unit_max):
    """Range of unit costs: maximum minus minimum."""
    cost_range = prod_cost_unit_max - prod_cost_unit_min
    return cost_range

# Margin
def prod_margin_sum(margin_total):
    """Sum of the group's line margins."""
    margin_sum = np.sum(margin_total)
    return margin_sum

def prod_margin_unit_max(margin_unit):
    """Largest unit margin in the group."""
    highest_margin = np.max(margin_unit)
    return highest_margin

def prod_margin_unit_min(margin_unit):
    """Smallest unit margin in the group."""
    lowest_margin = np.min(margin_unit)
    return lowest_margin

def prod_margin_unit_avg(margin_unit):
    """Arithmetic mean of the group's unit margins."""
    mean_margin = np.mean(margin_unit)
    return mean_margin

def prod_margin_unit_pct_avg(margin_unit_pct):
    """Arithmetic mean of the group's per-row margin percentages."""
    mean_margin_pct = np.mean(margin_unit_pct)
    return mean_margin_pct

def prod_margin_unit_volatility(prod_margin_unit_min, prod_margin_unit_max):
    """Range of unit margins: maximum minus minimum."""
    margin_range = prod_margin_unit_max - prod_margin_unit_min
    return margin_range

# date features
def prod_sale_first(in_dt):
    """Calendar date of the earliest transaction datetime in the group."""
    earliest = in_dt.min()
    return earliest.date()

def prod_sale_last(in_dt):
    """Calendar date of the latest transaction datetime in the group."""
    latest = in_dt.max()
    return latest.date()

def prod_days_active(prod_sale_first, prod_sale_last):
    """Inclusive number of days between first and last sale (difference + 1).

    Uses the ``.dt`` accessor on the difference, so the inputs must convert to
    a Series-like datetime object.
    """
    last_sale = pd.to_datetime(prod_sale_last)
    first_sale = pd.to_datetime(prod_sale_first)
    active_span = last_sale - first_sale
    return active_span.dt.days + 1

def prod_days_since_prod_sale_last(prod_sale_last, analysis_dt_agg):
    """Days between the last sale date and the analysis date.

    Uses the ``.dt`` accessor on the difference, so the inputs must convert to
    a Series-like datetime object.
    """
    analysis_day = pd.to_datetime(analysis_dt_agg)
    last_sale = pd.to_datetime(prod_sale_last)
    elapsed = analysis_day - last_sale
    return elapsed.dt.days

def prod_days_since_prod_sale_first(prod_sale_first, analysis_dt_agg):
    """Days between the first sale date and the analysis date.

    Uses the ``.dt`` accessor on the difference, so the inputs must convert to
    a Series-like datetime object.
    """
    analysis_day = pd.to_datetime(analysis_dt_agg)
    first_sale = pd.to_datetime(prod_sale_first)
    elapsed = analysis_day - first_sale
    return elapsed.dt.days




## Execute Product Model with ModelExecutor

In [10]:
# Step 4: Create model configuration
# 'features' maps each output column name to the function that computes it.
# Every function is registered under its own name, and function parameters are
# named after other columns/features.
# NOTE(review): presumably ModelExecutor resolves inputs by parameter name --
# confirm before renaming any function or parameter.
cfg_product = {
    'model_name': 'product_stats',
    'group_by': ['in_product'],
    'features': {
        # Filters: features defined in the "Filters" section of this notebook
        'in_cost_unit': in_cost_unit,
        'in_cost_total': in_cost_total,
        'in_price_unit': in_price_unit,
        'in_price_total': in_price_total,
        'timestamp': timestamp,
        'hour': hour,
        'minute': minute,
        'second': second,
        'microsecond': microsecond,
        'dt_date': dt_date,
        'dt_year': dt_year,
        'dt_month': dt_month,
        'dt_day': dt_day,
        'dt_weekday': dt_weekday,
        'dt_weekday_name': dt_weekday_name,
        'dt_weekofyear': dt_weekofyear,
        'dt_quarter': dt_quarter,
        'dayofyear': dayofyear,
        'is_leap_year': is_leap_year,
        'is_month_start': is_month_start,
        'is_month_end': is_month_end,
        'is_quarter_start': is_quarter_start,
        'is_quarter_end': is_quarter_end,
        'is_year_start': is_year_start,
        'is_year_end': is_year_end,
        'is_weekend': is_weekend,
        'is_holiday': is_holiday,
        'days_since_analysis_dt': days_since_analysis_dt,
        'months_since_analysis_dt': months_since_analysis_dt,
        'time_unix': time_unix,
        'date_unix': date_unix,
        'margin_unit': margin_unit,
        'margin_unit_pct': margin_unit_pct,
        'margin_unit_valid': margin_unit_valid,
        'margin_total': margin_total,
        'ticket_size_cat': ticket_size_cat,
        'volume_cat': volume_cat,
        'day_segment': day_segment,
        # Attributes: features defined in the "Attributes" section of this notebook
        'analysis_dt_agg': analysis_dt_agg,
        'prod_description': prod_description,
        'prod_transactions_total': prod_transactions_total,
        'prod_quantity_total': prod_quantity_total,
        'prod_price_sum': prod_price_sum,
        'prod_price_unit_max': prod_price_unit_max,
        'prod_price_unit_min': prod_price_unit_min,
        'prod_price_unit_avg': prod_price_unit_avg,
        'prod_price_unit_volatility': prod_price_unit_volatility,
        'prod_cost_sum': prod_cost_sum,
        'prod_cost_unit_max': prod_cost_unit_max,
        'prod_cost_unit_min': prod_cost_unit_min,
        'prod_cost_unit_avg': prod_cost_unit_avg,
        'prod_cost_unit_volatility': prod_cost_unit_volatility,
        'prod_margin_sum': prod_margin_sum,
        'prod_margin_unit_max': prod_margin_unit_max,
        'prod_margin_unit_min': prod_margin_unit_min,
        'prod_margin_unit_avg': prod_margin_unit_avg,
        'prod_margin_unit_pct_avg': prod_margin_unit_pct_avg,
        'prod_margin_unit_volatility': prod_margin_unit_volatility,
        'prod_sale_first': prod_sale_first,
        'prod_sale_last': prod_sale_last,
        'prod_days_active': prod_days_active,
        'prod_days_since_prod_sale_last': prod_days_since_prod_sale_last,
        'prod_days_since_prod_sale_first': prod_days_since_prod_sale_first,
    },
    # 'output_cols': []
}

In [11]:
# Step 5: Create ModelExecutor and execute
# `results` is indexed by 'filters' and 'attrs' below; 'attrs' may be None,
# which is why its shape is printed conditionally.
executor = ModelExecutor(cfg_product)
results = executor.execute(ctx, input_dataset_name='preprocessed')

print(f"\n✓ Model executed: {executor.model_name}")
print(f"  - Filters shape: {results['filters'].shape}")
print(f"  - Attrs shape: {results['attrs'].shape if results['attrs'] is not None else 'None'}")


✓ Model executed: product_stats
  - Filters shape: (8, 47)
  - Attrs shape: (6, 26)


In [12]:
# Step 6: Access results from context (multiple ways)
print("\n--- Accessing Results ---")

# Method 1: Direct from results dictionary
filters_df = results['filters']
attrs_df = results['attrs']

# Method 2: From context by dataset name
filters_from_ctx = ctx.get_dataset('product_stats_filters')
attrs_from_ctx = ctx.get_dataset('product_stats_attrs')

# Method 3: Convenience methods
filters_method = ctx.get_model_filters('product_stats')
attrs_method = ctx.get_model_attrs('product_stats')

# Compare BOTH result sets across the three access paths. Checking only the
# filters would leave the attrs lookups unverified while the message still
# claims that all methods agree.
filters_match = filters_df.equals(filters_from_ctx) and filters_df.equals(filters_method)
if attrs_df is None:
    # The executor may return no attrs; the other paths must then agree on None.
    attrs_match = attrs_from_ctx is None and attrs_method is None
else:
    attrs_match = attrs_df.equals(attrs_from_ctx) and attrs_df.equals(attrs_method)

print(f"All methods return same data: {filters_match and attrs_match}")


--- Accessing Results ---
All methods return same data: True


In [13]:
# Preview the row-level output of the product_stats model (input columns plus computed filter columns)
ctx.get_model_filters('product_stats').head()

Unnamed: 0,in_trans_id,in_dt,in_product,in_description,in_cost_unit,in_price_total,in_quantity,in_customer_id,in_customer_name,in_customer_location,...,months_since_analysis_dt,time_unix,date_unix,margin_unit,margin_unit_pct,margin_unit_valid,margin_total,ticket_size_cat,volume_cat,day_segment
0,AS000001_1,2024-11-01 10:58:00,DRY007,HARINA TEMPURA ESPECIAL 1KG,2200.0,6641.0,2.0,B017,PENSION VOLCAN,Temuco,...,1,39480,20028,1120.5,33.744919,True,2241.0,Low,Unit,Morning
1,AS000001_2,2024-11-29 10:58:00,FROZ007,PAPAS FRITAS CORTE BASTÓN 2.5KG,2200.0,3075.0,1.0,B017,PENSION VOLCAN,Temuco,...,1,39480,20056,875.0,28.455284,True,875.0,Low,Unit,Morning
2,AS000001_3,2024-11-15 10:58:00,FROZ003,BARRITAS MERLUZA EMPANADAS 1KG,4200.0,5966.0,1.0,B017,PENSION VOLCAN,Temuco,...,1,39480,20042,1766.0,29.601072,True,1766.0,Low,Unit,Morning
3,AS000001_4,2024-10-01 10:58:00,SUSH008,SALSA SOYA KIKKOMAN 1L,3200.0,4320.0,1.0,B017,PENSION VOLCAN,Temuco,...,2,39480,19997,1120.0,25.925926,True,1120.0,Low,Unit,Morning
4,AS000001_5,2024-12-01 10:58:00,SUSH008,SALSA SOYA KIKKOMAN 1L,3200.0,4327.0,1.0,B017,PENSION VOLCAN,Temuco,...,0,39480,20058,1127.0,26.045759,True,1127.0,Low,Unit,Morning


In [14]:
# Preview the aggregated output of the product_stats model (one row per in_product)
ctx.get_model_attrs('product_stats').head()

Unnamed: 0,in_product,analysis_dt_agg,prod_description,prod_transactions_total,prod_quantity_total,prod_price_sum,prod_price_unit_max,prod_price_unit_min,prod_price_unit_avg,prod_price_unit_volatility,...,prod_margin_unit_max,prod_margin_unit_min,prod_margin_unit_avg,prod_margin_unit_pct_avg,prod_margin_unit_volatility,prod_sale_first,prod_sale_last,prod_days_active,prod_days_since_prod_sale_last,prod_days_since_prod_sale_first
0,DRY007,2024-12-02,HARINA TEMPURA ESPECIAL 1KG,1,2.0,6641.0,3320.5,3320.5,3320.5,0.0,...,1120.5,1120.5,1120.5,33.744919,0.0,2024-11-01,2024-11-01,1,31,31
1,FRESH004,2024-12-02,CENTOLLA FRESCA KG,1,3.0,60772.0,20257.333984,20257.333984,20257.333984,0.0,...,5257.333984,5257.333984,5257.333984,25.952744,0.0,2024-12-01,2024-12-01,1,1,1
2,FROZ003,2024-12-02,BARRITAS MERLUZA EMPANADAS 1KG,2,6.0,35343.0,5966.0,5875.399902,5920.700195,90.600098,...,1766.0,1675.399902,1720.699951,29.058289,90.600098,2024-09-01,2024-11-15,76,17,92
3,FROZ007,2024-12-02,PAPAS FRITAS CORTE BASTÓN 2.5KG,1,1.0,3075.0,3075.0,3075.0,3075.0,0.0,...,875.0,875.0,875.0,28.455284,0.0,2024-11-29,2024-11-29,1,3,3
4,SUSH004,2024-12-02,ARROZ SUSHI KOSHIHIKARI 5KG,1,1.0,16929.0,16929.0,16929.0,16929.0,0.0,...,4929.0,4929.0,4929.0,29.115719,0.0,2024-11-03,2024-11-03,1,29,29


## Save

In [15]:
# Optional: Save to disk
# Both model outputs are written under `out_path` (created in the logging cell).
# The target paths carry no file extension.
# NOTE(review): presumably save_dataset appends one based on `format` -- confirm.
ctx.save_dataset('product_stats_filters', f"{out_path}/product_filters", format='csv')
ctx.save_dataset('product_stats_attrs', f"{out_path}/product_attrs", format='csv')
print(f"\n✓ Results saved to {out_path}/")


✓ Results saved to data/test_client/


## Orchestrate

In [16]:
# # Example: Running multiple models with orchestrator
# # (You can add more models later: customer_stats, time_period, etc.)

# # Create fresh context for orchestrator example
# ctx2 = GabedaContext(base_cfg)
# ctx2.set_dataset('preprocessed', preprocessed_df)

# # Initialize orchestrator
# orchestrator = ModelOrchestrator(ctx2)

# # Register models
# orchestrator.register_model(ModelExecutor(cfg_product))
# # orchestrator.register_model(ModelExecutor(cfg_customer))  # Add when ready
# # orchestrator.register_model(ModelExecutor(cfg_time))       # Add when ready

# # Execute all registered models
# orchestrator.execute_all(input_dataset_name='preprocessed')

# # View summary
# ctx2.print_summary() 