## Configuration

In [None]:
# Notebook-wide configuration dictionary. Later cells pass it to
# setup_logging, GabedaContext and preprocess_data, and read 'input_file',
# 'out_dir' and 'client' from it directly.
khujta_cfg = {
    # Project settings
    'input_file': 'data/test_client/comercializadora_transactions_10.csv',  # Input CSV file
    'out_dir': 'data',          # joined with 'client' to build the output directory
    'client': 'test_client',
    'fidx_config': {'type': 'local', 'path': 'feature_store'},
    
    # Analysis settings
    'analysis_dt': '2024-12-02',
    'language': 'EN',
    
    # Logging and performance
    'log_level': 'DEBUG',               # 'DEBUG', 'INFO', 'WARNING','ERROR', 'CRITICAL'
    
    # Column Mappings
    # Internal `in_*` column name -> column name in the input CSV.
    # NOTE(review): 'precio' is mapped here, but the raw and preprocessed
    # frames displayed further down contain no 'precio' / 'in_price' column —
    # confirm whether the sample file is expected to provide it.
    'column_mappings': {
        'in_dt': 'fecha',
        'in_trans_id': 'trans_id',
        'in_product': 'producto',
        'in_description': 'glosa',
        'in_cost': 'costo',
        'in_price': 'precio',
        'in_quantity': 'cantidad',
        'in_total': 'total',
        'in_customer_id': 'customer_id',
        'in_customer_name': 'customer_name',
        'in_customer_location': 'customer_location',
    },
    
    # Column Types
    # Target type per internal column; presumably applied by preprocess_data
    # (TODO confirm against src.preprocessing).
    'column_types' : {
        'in_dt': 'date',
        'in_trans_id': 'str',
        'in_product': 'str',
        'in_description': 'str',
        'in_cost': 'float',
        'in_price': 'float',
        'in_quantity': 'float',
        'in_total': 'float',
        'in_customer_id': 'str',
        'in_customer_name': 'str',
        'in_customer_location': 'str',
    },
}

# General imports
import os
import pandas as pd
import numpy as np

## Logging and output

In [18]:
# Initialize logging from config
# The level falls back to 'INFO' when 'log_level' is missing from the config;
# the whole config dict is also handed to setup_logging.
from src.logger import setup_logging
setup_logging(log_level=khujta_cfg.get('log_level', 'INFO'), config=khujta_cfg)

# Create output directory if it doesn't exist
# out_path is "<out_dir>/<client>" and is reused by later cells when saving.
out_path = f"{khujta_cfg['out_dir']}/{khujta_cfg['client']}"
os.makedirs(out_path, exist_ok=True)

📝 Run instance ID: test_client_20251008_123303 - Logging [DEBUG] to: logs\test_client_20251008_123303.log


## GabedaContext + ModelExecutor

In [None]:
# Step 1: Import new architecture components
from src.gabeda_context import GabedaContext
from src.model_executor import ModelExecutor, ModelOrchestrator
from src.preprocessing import preprocess_data

In [33]:
# Step 2: Create GabedaContext with user configuration
# Initialize context
# `ctx` is the object the following cells use to store and retrieve datasets.
ctx = GabedaContext(khujta_cfg)
print(f"✓ Context initialized: {ctx}")

✓ Context initialized: GabedaContext(run_id='test_client_20251008_123333', datasets=0, models=0)


In [34]:
# Step 3: Load and preprocess data
# raw_data keeps the CSV as read; preprocessed_df is what preprocess_data
# returns for it (the displayed frames show `in_*` column names and parsed dates).
raw_data = pd.read_csv(khujta_cfg['input_file'])
preprocessed_df = preprocess_data(raw_data, khujta_cfg)

# Store preprocessed data in context
# Both frames are registered so later cells can fetch them by name.
ctx.set_dataset('raw', raw_data)
ctx.set_dataset('preprocessed', preprocessed_df)

print(f"✓ Data loaded: {preprocessed_df.shape}")
print(f"✓ Datasets in context: {ctx.list_datasets()}")

✓ Data loaded: (8, 10)
✓ Datasets in context: ['raw', 'preprocessed']


In [36]:
# Show the first rows of the dataset stored in the context under 'raw'.
ctx.get_dataset('raw').head()

Unnamed: 0,trans_id,fecha,producto,glosa,costo,total,cantidad,customer_id,customer_name,customer_location
0,AS000001_1,12/01/2024 10:58:00 AM,DRY007,HARINA TEMPURA ESPECIAL 1KG,2200,6641,2,B017,PENSION VOLCAN,Temuco
1,AS000001_2,12/01/2024 10:58:00 AM,FROZ007,PAPAS FRITAS CORTE BASTÓN 2.5KG,2200,3075,1,B017,PENSION VOLCAN,Temuco
2,AS000001_3,12/01/2024 10:58:00 AM,FROZ003,BARRITAS MERLUZA EMPANADAS 1KG,4200,5966,1,B017,PENSION VOLCAN,Temuco
3,AS000001_4,12/01/2024 10:58:00 AM,SUSH008,SALSA SOYA KIKKOMAN 1L,3200,4320,1,B017,PENSION VOLCAN,Temuco
4,AS000001_5,12/01/2024 10:58:00 AM,SUSH008,SALSA SOYA KIKKOMAN 1L,3200,4327,1,B017,PENSION VOLCAN,Temuco


In [35]:
# Show the first rows of the dataset stored in the context under 'preprocessed'.
ctx.get_dataset('preprocessed').head()

Unnamed: 0,in_trans_id,in_dt,in_product,in_description,in_cost,in_total,in_quantity,in_customer_id,in_customer_name,in_customer_location
0,AS000001_1,2024-12-01 10:58:00,DRY007,HARINA TEMPURA ESPECIAL 1KG,2200.0,6641.0,2.0,B017,PENSION VOLCAN,Temuco
1,AS000001_2,2024-12-01 10:58:00,FROZ007,PAPAS FRITAS CORTE BASTÓN 2.5KG,2200.0,3075.0,1.0,B017,PENSION VOLCAN,Temuco
2,AS000001_3,2024-12-01 10:58:00,FROZ003,BARRITAS MERLUZA EMPANADAS 1KG,4200.0,5966.0,1.0,B017,PENSION VOLCAN,Temuco
3,AS000001_4,2024-12-01 10:58:00,SUSH008,SALSA SOYA KIKKOMAN 1L,3200.0,4320.0,1.0,B017,PENSION VOLCAN,Temuco
4,AS000001_5,2024-12-01 10:58:00,SUSH008,SALSA SOYA KIKKOMAN 1L,3200.0,4327.0,1.0,B017,PENSION VOLCAN,Temuco


# Features

## Filters

In [38]:
def hour(in_dt):
    """Return the hour-of-day component of a single timestamp-like value.

    Args:
        in_dt: Anything ``pd.Timestamp`` accepts (string, datetime, Timestamp).

    Returns:
        The ``hour`` attribute of the resulting ``pd.Timestamp``.
    """
    timestamp = pd.Timestamp(in_dt)
    return timestamp.hour

## Attributes

In [37]:
def total_transacciones(in_trans_id):
    """Count the entries of ``in_trans_id`` that numpy treats as non-zero.

    Every non-zero entry is counted, so repeated ids are counted once per
    occurrence.
    NOTE(review): this is a row count, not a count of distinct transaction
    ids — confirm that is the intended meaning of the metric.

    Args:
        in_trans_id: Array-like of transaction identifiers.

    Returns:
        The result of ``np.count_nonzero`` over the input.
    """
    non_empty_count = np.count_nonzero(in_trans_id)
    return non_empty_count

def total_unidades_vendidas(in_quantity):
    """Add up all quantities in ``in_quantity``.

    Args:
        in_quantity: Array-like of numeric quantities.

    Returns:
        The result of ``np.sum`` over the input.
    """
    units_sold = np.sum(in_quantity)
    return units_sold

### Execute Product Model with ModelExecutor

In [None]:
# Step 4: Create model configuration
# Describes the 'product_stats' model: rows are grouped by 'in_product' and
# each entry in 'features' maps an output name to the function computing it.
# The feature functions take `in_*` columns as parameter names.
# NOTE(review): how ModelExecutor resolves those parameters to columns is not
# visible in this notebook — confirm against src.model_executor.
cfg_product = {
    'model_name': 'product_stats',
    'group_by': ['in_product'],
    'features': {
        'hour': hour,
        'total_transacciones': total_transacciones,
        'total_unidades_vendidas': total_unidades_vendidas,
    },
    'output_cols': ['hour', 'total_transacciones', 'total_unidades_vendidas']
}

In [45]:
# Step 5: Create ModelExecutor and execute
# The executor runs against the 'preprocessed' dataset held by ctx and returns
# a dict with 'filters' and 'attrs' entries.
executor = ModelExecutor(cfg_product)
results = executor.execute(ctx, input_dataset_name='preprocessed')

print(f"\n✓ Model executed: {executor.model_name}")
print(f"  - Filters shape: {results['filters'].shape}")
# 'attrs' may be None, hence the guard before reading .shape.
print(f"  - Attrs shape: {results['attrs'].shape if results['attrs'] is not None else 'None'}")


✓ Model executed: product_stats
  - Filters shape: (8, 11)
  - Attrs shape: (6, 3)


In [46]:
# Step 6: Access results from context (multiple ways)
print("\n--- Accessing Results ---")

# Method 1: Direct from results dictionary
filters_df = results['filters']
attrs_df = results['attrs']

# Method 2: From context by dataset name
filters_from_ctx = ctx.get_dataset('product_stats_filters')
attrs_from_ctx = ctx.get_dataset('product_stats_attrs')

# Method 3: Convenience methods
filters_method = ctx.get_model_filters('product_stats')
attrs_method = ctx.get_model_attrs('product_stats')

# Compare BOTH result frames across the three access paths; previously only
# the filters were compared although the message claims all data matches.
filters_match = filters_df.equals(filters_from_ctx) and filters_df.equals(filters_method)
# results['attrs'] can be None (see the guard in the execution cell), in which
# case there is no attrs frame to compare.
attrs_match = attrs_df is None or (
    attrs_df.equals(attrs_from_ctx) and attrs_df.equals(attrs_method)
)

print(f"All methods return same data: {filters_match and attrs_match}")


--- Accessing Results ---
All methods return same data: True


In [47]:
# First rows of the product_stats filters frame (input columns plus 'hour').
ctx.get_model_filters('product_stats').head()

Unnamed: 0,in_trans_id,in_dt,in_product,in_description,in_cost,in_total,in_quantity,in_customer_id,in_customer_name,in_customer_location,hour
0,AS000001_1,2024-12-01 10:58:00,DRY007,HARINA TEMPURA ESPECIAL 1KG,2200.0,6641.0,2.0,B017,PENSION VOLCAN,Temuco,10
1,AS000001_2,2024-12-01 10:58:00,FROZ007,PAPAS FRITAS CORTE BASTÓN 2.5KG,2200.0,3075.0,1.0,B017,PENSION VOLCAN,Temuco,10
2,AS000001_3,2024-12-01 10:58:00,FROZ003,BARRITAS MERLUZA EMPANADAS 1KG,4200.0,5966.0,1.0,B017,PENSION VOLCAN,Temuco,10
3,AS000001_4,2024-12-01 10:58:00,SUSH008,SALSA SOYA KIKKOMAN 1L,3200.0,4320.0,1.0,B017,PENSION VOLCAN,Temuco,10
4,AS000001_5,2024-12-01 10:58:00,SUSH008,SALSA SOYA KIKKOMAN 1L,3200.0,4327.0,1.0,B017,PENSION VOLCAN,Temuco,10


In [48]:
# First rows of the product_stats attrs frame (one row per in_product).
ctx.get_model_attrs('product_stats').head()

Unnamed: 0,in_product,total_transacciones,total_unidades_vendidas
0,DRY007,1,2.0
1,FRESH004,1,3.0
2,FROZ003,2,6.0
3,FROZ007,1,1.0
4,SUSH004,1,1.0


## Preprocess

### View Results: Filters and Attrs DataFrames

In [None]:
# Example: Running multiple models with orchestrator
# (You can add more models later: customer_stats, time_period, etc.)

# Create fresh context for orchestrator example
# ctx2 is separate from ctx; only the preprocessed frame is registered in it.
ctx2 = GabedaContext(khujta_cfg)
ctx2.set_dataset('preprocessed', preprocessed_df)

# Initialize orchestrator
orchestrator = ModelOrchestrator(ctx2)

# Register models
orchestrator.register_model(ModelExecutor(cfg_product))
# orchestrator.register_model(ModelExecutor(cfg_customer))  # Add when ready
# orchestrator.register_model(ModelExecutor(cfg_time))       # Add when ready

# Execute all registered models
# Every registered model reads the 'preprocessed' dataset from ctx2.
orchestrator.execute_all(input_dataset_name='preprocessed')

# View summary
ctx2.print_summary() 


GabeDA Execution Summary - Run ID: test_client_20251008_123303

Datasets (3):
  - preprocessed: (8, 11)
  - product_stats_filters: (8, 11)
  - product_stats_attrs: (6, 3)

Models Executed (1):
  - product_stats: ['product_stats_filters', 'product_stats_attrs']

Total Steps: 4



In [50]:
# Optional: Save to disk
# Saves from `ctx` (the first context, not ctx2) under out_path.
# NOTE(review): the target paths carry no extension; presumably save_dataset
# appends one based on `format` — confirm against GabedaContext.
ctx.save_dataset('product_stats_filters', f"{out_path}/product_filters", format='csv')
ctx.save_dataset('product_stats_attrs', f"{out_path}/product_attrs", format='csv')
print(f"\n✓ Results saved to {out_path}/")


✓ Results saved to data/test_client/


### Using ModelOrchestrator for Multiple Models

### Attribn

In [26]:
# Write the preprocessed DataFrame to the output directory and display it.
# `preprocess_data` is the function imported from src.preprocessing, so calling
# `.to_csv` on it fails on a fresh kernel; the DataFrame it produced is
# `preprocessed_df`.
preprocessed_df.to_csv(f"{out_path}/preprocessed_data.csv", index=False)
preprocessed_df

Unnamed: 0,in_trans_id,in_dt,in_product,in_description,in_cost,in_total,in_quantity,in_customer_id,in_customer_name,in_customer_location
0,AS000001_1,2024-12-01 10:58:00,DRY007,HARINA TEMPURA ESPECIAL 1KG,2200.0,6641.0,2.0,B017,PENSION VOLCAN,Temuco
1,AS000001_2,2024-12-01 10:58:00,FROZ007,PAPAS FRITAS CORTE BASTÓN 2.5KG,2200.0,3075.0,1.0,B017,PENSION VOLCAN,Temuco
2,AS000001_3,2024-12-01 10:58:00,FROZ003,BARRITAS MERLUZA EMPANADAS 1KG,4200.0,5966.0,1.0,B017,PENSION VOLCAN,Temuco
3,AS000001_4,2024-12-01 10:58:00,SUSH008,SALSA SOYA KIKKOMAN 1L,3200.0,4320.0,1.0,B017,PENSION VOLCAN,Temuco
4,AS000001_5,2024-12-01 10:58:00,SUSH008,SALSA SOYA KIKKOMAN 1L,3200.0,4327.0,1.0,B017,PENSION VOLCAN,Temuco
5,AS000003_1,2024-12-01 11:14:00,SUSH004,ARROZ SUSHI KOSHIHIKARI 5KG,12000.0,16929.0,1.0,R010,PARRILLA DEL LAGO PREMIUM,Villarrica
6,AS000023_12,2024-12-01 11:44:00,FROZ003,BARRITAS MERLUZA EMPANADAS 1KG,4200.0,29377.0,5.0,R004,MARISQUERIA EL MUELLE,Villarrica
7,AS000024_1,2024-12-01 10:06:00,FRESH004,CENTOLLA FRESCA KG,15000.0,60772.0,3.0,R005,SUSHI TOKYO TEMUCO,Temuco


## Features

### Filters

In [27]:
import numpy as np
import pandas as pd

## CUSTOM - INI ***
# Small float constant; it is not referenced by any cell visible in this notebook.
# NOTE(review): presumably a tolerance for float comparisons inside feature
# functions — confirm it is still needed.
FLOATPRECISION = 0.0000001
## CUSTOM - END ***

def hour(in_dt):
    """Return the hour-of-day component of a single timestamp-like value.

    Args:
        in_dt: Anything ``pd.Timestamp`` accepts (string, datetime, Timestamp).

    Returns:
        The ``hour`` attribute of the resulting ``pd.Timestamp``.
    """
    timestamp = pd.Timestamp(in_dt)
    return timestamp.hour

### Attributes

In [28]:
import numpy as np
import pandas as pd

## CUSTOM - INI ***
# Small float constant; it is not referenced by any cell visible in this notebook.
# NOTE(review): this "INI" marker has no matching "END" marker in this cell.
FLOATPRECISION = 0.0000001

def total_transacciones(in_trans_id):
    """Count the entries of ``in_trans_id`` that numpy treats as non-zero.

    Every non-zero entry is counted, so repeated ids are counted once per
    occurrence.
    NOTE(review): this is a row count, not a count of distinct transaction
    ids — confirm that is the intended meaning of the metric.

    Args:
        in_trans_id: Array-like of transaction identifiers.

    Returns:
        The result of ``np.count_nonzero`` over the input.
    """
    non_empty_count = np.count_nonzero(in_trans_id)
    return non_empty_count

def total_unidades_vendidas(in_quantity):
    """Add up all quantities in ``in_quantity``.

    Args:
        in_quantity: Array-like of numeric quantities.

    Returns:
        The result of ``np.sum`` over the input.
    """
    units_sold = np.sum(in_quantity)
    return units_sold


# product_metrics = df.groupby('producto_clean').agg({
#     'trans_id': 'count',
#     'cantidad': 'sum',
#     'total': 'sum',
#     'costo': 'sum',
#     'fecha': ['min', 'max']
# }).reset_index()

# product_metrics.columns = [
#     'producto', 'total_transacciones', 'total_unidades_vendidas',
#     'total_ingresos', 'total_costo', 'primera_venta', 'ultima_venta'
# ]

# product_metrics['dias_activo'] = (
#     product_metrics['ultima_venta'] - product_metrics['primera_venta']
# ).dt.days + 1

## Execution

In [29]:
from src.fidx import get_dependencies

# Product-model configuration for the calc_datasets execution path.
# It carries the same model name, grouping key and feature functions as
# cfg_product defined earlier in the notebook.
cfg_PM = {
    'model_name': 'product_stats',
    'group_by': ['in_product'],
    'features': {
        'hour': hour,
        'total_transacciones': total_transacciones,
        'total_unidades_vendidas': total_unidades_vendidas,
        # 'total_quantity': (['in_quantity'], sum, True),
        # 'total_sales': (['in_total'], sum, True),
        # 'avg_price': (['in_price'], lambda x: sum(x)/len(x) if len(x) > 0 else 0, True),
        # 'min_price': (['in_price'], min, True),
        # 'max_price': (['in_price'], max, True),
        # 'num_transactions':
    }
}

# Features requested as outputs of the model.
PM_output_cols = ['hour','total_transacciones','total_unidades_vendidas']
# Same value as khujta_cfg['fidx_config'].
cfg_fidx = {'type': 'local', 'path': 'feature_store'}
# cfg_PM is rebound to the dict returned by get_dependencies; the displayed
# result additionally contains 'in_cols', 'exec_seq' and 'out_cols'.
cfg_PM = get_dependencies(cfg_fidx, cfg_PM['model_name'], cfg_PM, PM_output_cols)
cfg_PM

{'model_name': 'product_stats',
 'group_by': ['in_product'],
 'features': {'hour': <function __main__.hour(in_dt)>,
  'total_transacciones': <function __main__.total_transacciones(in_trans_id)>,
  'total_unidades_vendidas': <function __main__.total_unidades_vendidas(in_quantity)>},
 'in_cols': ['in_dt', 'in_trans_id', 'in_quantity'],
 'exec_seq': ['hour', 'total_transacciones', 'total_unidades_vendidas'],
 'out_cols': ['hour', 'total_transacciones', 'total_unidades_vendidas']}

In [30]:
from src.modeling import calc_datasets

# Compute the filters and attrs frames for the product model.
# The first argument has to be the preprocessed DataFrame (`preprocessed_df`);
# `preprocess_data` is the function imported from src.preprocessing, not data.
filtrs_data, attrs_data = calc_datasets(preprocessed_df, cfg_PM)
filtrs_data


Unnamed: 0,in_trans_id,in_dt,in_product,in_description,in_cost,in_total,in_quantity,in_customer_id,in_customer_name,in_customer_location,hour
0,AS000001_1,2024-12-01 10:58:00,DRY007,HARINA TEMPURA ESPECIAL 1KG,2200.0,6641.0,2.0,B017,PENSION VOLCAN,Temuco,10
1,AS000001_2,2024-12-01 10:58:00,FROZ007,PAPAS FRITAS CORTE BASTÓN 2.5KG,2200.0,3075.0,1.0,B017,PENSION VOLCAN,Temuco,10
2,AS000001_3,2024-12-01 10:58:00,FROZ003,BARRITAS MERLUZA EMPANADAS 1KG,4200.0,5966.0,1.0,B017,PENSION VOLCAN,Temuco,10
3,AS000001_4,2024-12-01 10:58:00,SUSH008,SALSA SOYA KIKKOMAN 1L,3200.0,4320.0,1.0,B017,PENSION VOLCAN,Temuco,10
4,AS000001_5,2024-12-01 10:58:00,SUSH008,SALSA SOYA KIKKOMAN 1L,3200.0,4327.0,1.0,B017,PENSION VOLCAN,Temuco,10
5,AS000003_1,2024-12-01 11:14:00,SUSH004,ARROZ SUSHI KOSHIHIKARI 5KG,12000.0,16929.0,1.0,R010,PARRILLA DEL LAGO PREMIUM,Villarrica,11
6,AS000023_12,2024-12-01 11:44:00,FROZ003,BARRITAS MERLUZA EMPANADAS 1KG,4200.0,29377.0,5.0,R004,MARISQUERIA EL MUELLE,Villarrica,11
7,AS000024_1,2024-12-01 10:06:00,FRESH004,CENTOLLA FRESCA KG,15000.0,60772.0,3.0,R005,SUSHI TOKYO TEMUCO,Temuco,10


In [31]:
# Plain-text dump of the attrs frame returned by calc_datasets.
print(attrs_data)

  in_product  total_transacciones  total_unidades_vendidas
0     DRY007                    1                      2.0
1   FRESH004                    1                      3.0
2    FROZ003                    2                      6.0
3    FROZ007                    1                      1.0
4    SUSH004                    1                      1.0
5    SUSH008                    2                      2.0
