## Configuration

In [12]:
# Run configuration shared by logging setup, GabedaContext and preprocess_data.
khujta_cfg = dict(
    # --- Project settings ---
    input_file='data/test_client/comercializadora_transactions_10.csv',  # input CSV file
    out_dir='data',
    client='test_client',
    fidx_config=dict(type='local', path='feature_store'),

    # --- Analysis settings ---
    analysis_dt='2024-12-02',
    language='EN',

    # --- Logging ---
    log_level='INFO',  # one of 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'

    # --- Column mappings: internal name -> column name in the input CSV ---
    # NOTE(review): the sample input displayed in this notebook has no 'precio'
    # column; confirm that preprocess_data tolerates mapped columns that are absent.
    column_mappings=dict(
        in_dt='fecha',
        in_trans_id='trans_id',
        in_product='producto',
        in_description='glosa',
        in_cost='costo',
        in_price='precio',
        in_quantity='cantidad',
        in_total_price='total',
        in_customer_id='customer_id',
        in_customer_name='customer_name',
        in_customer_location='customer_location',
    ),

    # --- Column types: internal name -> type label ---
    column_types=dict(
        in_dt='date',
        in_trans_id='str',
        in_product='str',
        in_description='str',
        in_cost='float',
        in_price='float',
        in_quantity='float',
        in_total_price='float',
        in_customer_id='str',
        in_customer_name='str',
        in_customer_location='str',
    ),
)

# General imports
import os
import pandas as pd
import numpy as np

## Logging and output

In [13]:
# Initialize logging from config; the level falls back to 'INFO' when
# 'log_level' is not present in khujta_cfg.
from src.logger import setup_logging
setup_logging(log_level=khujta_cfg.get('log_level', 'INFO'), config=khujta_cfg)

# Create the per-client output directory (<out_dir>/<client>) if it doesn't exist.
# out_path is reused by the final cell when saving results to disk.
out_path = f"{khujta_cfg['out_dir']}/{khujta_cfg['client']}"
os.makedirs(out_path, exist_ok=True)

📝 Run instance ID: test_client_20251008_133649 - Logging [INFO] to: logs\test_client_20251008_133649.log


## GabedaContext + ModelExecutor

In [14]:
# Step 1: Import new architecture components
from src.gabeda_context import GabedaContext
from src.model_executor import ModelExecutor, ModelOrchestrator
from src.preprocessing import preprocess_data

In [15]:
# Step 2: Create GabedaContext with user configuration
# The context is the shared store that later cells use to register datasets
# (set_dataset) and read them back (get_dataset).
ctx = GabedaContext(khujta_cfg)
print(f"✓ Context initialized: {ctx}")

✓ Context initialized: GabedaContext(run_id='test_client_20251008_133649', datasets=0, models=0)


In [16]:
# Step 3: Load and preprocess data
# Read the CSV named in the config, then hand it to preprocess_data together
# with the config (the displayed previews show columns renamed to the
# internal 'in_*' names afterwards).
raw_data = pd.read_csv(khujta_cfg['input_file'])
preprocessed_df = preprocess_data(raw_data, khujta_cfg)

# Store both the untouched and the preprocessed frames in the context so that
# later cells can fetch them by name.
ctx.set_dataset('raw', raw_data)
ctx.set_dataset('preprocessed', preprocessed_df)

# Report the shape of the preprocessed frame and the dataset names now registered.
print(f"✓ Data loaded: {preprocessed_df.shape}")
print(f"✓ Datasets in context: {ctx.list_datasets()}")

✓ Data loaded: (8, 10)
✓ Datasets in context: ['raw', 'preprocessed']


In [17]:
# Preview the first rows of the raw input as stored in the context.
ctx.get_dataset('raw').head()

Unnamed: 0,trans_id,fecha,producto,glosa,costo,total,cantidad,customer_id,customer_name,customer_location
0,AS000001_1,12/01/2024 10:58:00 AM,DRY007,HARINA TEMPURA ESPECIAL 1KG,2200,6641,2,B017,PENSION VOLCAN,Temuco
1,AS000001_2,12/01/2024 10:58:00 AM,FROZ007,PAPAS FRITAS CORTE BASTÓN 2.5KG,2200,3075,1,B017,PENSION VOLCAN,Temuco
2,AS000001_3,12/01/2024 10:58:00 AM,FROZ003,BARRITAS MERLUZA EMPANADAS 1KG,4200,5966,1,B017,PENSION VOLCAN,Temuco
3,AS000001_4,12/01/2024 10:58:00 AM,SUSH008,SALSA SOYA KIKKOMAN 1L,3200,4320,1,B017,PENSION VOLCAN,Temuco
4,AS000001_5,12/01/2024 10:58:00 AM,SUSH008,SALSA SOYA KIKKOMAN 1L,3200,4327,1,B017,PENSION VOLCAN,Temuco


In [18]:
# Preview the first rows of the preprocessed data as stored in the context.
ctx.get_dataset('preprocessed').head()

Unnamed: 0,in_trans_id,in_dt,in_product,in_description,in_cost,in_total_price,in_quantity,in_customer_id,in_customer_name,in_customer_location
0,AS000001_1,2024-12-01 10:58:00,DRY007,HARINA TEMPURA ESPECIAL 1KG,2200.0,6641.0,2.0,B017,PENSION VOLCAN,Temuco
1,AS000001_2,2024-12-01 10:58:00,FROZ007,PAPAS FRITAS CORTE BASTÓN 2.5KG,2200.0,3075.0,1.0,B017,PENSION VOLCAN,Temuco
2,AS000001_3,2024-12-01 10:58:00,FROZ003,BARRITAS MERLUZA EMPANADAS 1KG,4200.0,5966.0,1.0,B017,PENSION VOLCAN,Temuco
3,AS000001_4,2024-12-01 10:58:00,SUSH008,SALSA SOYA KIKKOMAN 1L,3200.0,4320.0,1.0,B017,PENSION VOLCAN,Temuco
4,AS000001_5,2024-12-01 10:58:00,SUSH008,SALSA SOYA KIKKOMAN 1L,3200.0,4327.0,1.0,B017,PENSION VOLCAN,Temuco


# Features

## Filters

In [19]:
# Small float tolerance. NOTE(review): not referenced in the visible part of
# this notebook — confirm whether it is still needed.
FLOATPRECISION = 1e-7
def hour(in_dt):
    """Return the hour component of a single timestamp-like value.

    Args:
        in_dt: Anything ``pd.Timestamp`` accepts (timestamp, datetime, string).

    Returns:
        The ``hour`` attribute of the resulting ``pd.Timestamp``.
    """
    stamp = pd.Timestamp(in_dt)
    return stamp.hour

## Attributes

In [None]:
def description(in_description):
    """Return the first distinct description found in the given values.

    Args:
        in_description: Object exposing ``.unique()`` (e.g. a pandas Series)
            holding the description text of the rows being summarised.

    Returns:
        The first entry of ``in_description.unique()``; when several distinct
        descriptions exist, the others are ignored.

    Raises:
        IndexError: If there are no values at all.
    """
    distinct_values = in_description.unique()
    return distinct_values[0]

def total_transactions(in_trans_id):
    """Count the transaction ids that evaluate as non-zero.

    Uses ``np.count_nonzero``, so falsy entries (such as an empty string or
    a numeric 0) are not counted.

    Args:
        in_trans_id: Array-like of transaction identifiers.

    Returns:
        Number of non-zero (truthy) identifiers.
    """
    non_empty_ids = np.count_nonzero(in_trans_id)
    return non_empty_ids

def total_units_sold(in_quantity):
    """Sum the quantities of the given rows.

    Args:
        in_quantity: Array-like of quantities.

    Returns:
        ``np.sum`` of the quantities.
    """
    units = np.sum(in_quantity)
    return units

def total_revenue(in_total_price):
    """Sum the line totals of the given rows.

    Args:
        in_total_price: Array-like of per-row total prices.

    Returns:
        ``np.sum`` of the totals.
    """
    revenue = np.sum(in_total_price)
    return revenue

def total_cost(in_cost):
    """Sum the cost values of the given rows.

    NOTE(review): this adds ``in_cost`` as-is, without weighting by quantity.
    If ``in_cost`` is a per-unit cost, rows with quantity > 1 are understated
    — confirm the meaning of the column with the data owner.

    Args:
        in_cost: Array-like of per-row cost values.

    Returns:
        ``np.sum`` of the costs.
    """
    cost = np.sum(in_cost)
    return cost

def first_sale(in_dt):
    """Return the earliest value among the given sale timestamps.

    Args:
        in_dt: Object exposing ``.min()`` (e.g. a datetime Series).

    Returns:
        The result of ``in_dt.min()``.
    """
    earliest = in_dt.min()
    return earliest

def last_sale(in_dt):
    """Return the latest value among the given sale timestamps.

    Args:
        in_dt: Object exposing ``.max()`` (e.g. a datetime Series).

    Returns:
        The result of ``in_dt.max()``.
    """
    latest = in_dt.max()
    return latest

def active_days(first_sale, last_sale):
    """Length of the sales window in days, counting the first day.

    Computed as the whole-day component of ``last_sale - first_sale`` plus
    one, so identical first and last timestamps yield 1. This measures
    elapsed 24-hour periods, not the number of distinct calendar dates.

    Works for both datetime Series (element-wise) and single timestamps;
    the original implementation relied on the ``.dt`` accessor and therefore
    only accepted Series.

    Args:
        first_sale: Earliest sale timestamp(s): a datetime Series or a
            single timestamp.
        last_sale: Latest sale timestamp(s), of the same kind as
            ``first_sale``.

    Returns:
        A Series of day counts for Series input, or a single number for
        scalar input.
    """
    span = pd.to_timedelta(last_sale - first_sale)
    # A Series exposes the day component through the .dt accessor, whereas a
    # scalar Timedelta exposes it directly as .days.
    whole_days = span.dt.days if isinstance(span, pd.Series) else span.days
    return whole_days + 1

### Execute Product Model with ModelExecutor

In [21]:
# Step 4: Create model configuration
# Each feature is registered under the name of the function that computes it.
cfg_product = {
    'model_name': 'product_stats',
    'group_by': ['in_product'],
    'features': {
        # Filters
        'hour': hour,
        # Attributes
        'description': description,
        'total_transactions': total_transactions,
        'total_units_sold': total_units_sold,
        'total_revenue': total_revenue,
        'total_cost': total_cost,
        'first_sale': first_sale,
        'last_sale': last_sale,
        'active_days': active_days,
    },
}
# Output every registered feature, in registration order, so the column list
# cannot drift from the feature mapping.
cfg_product['output_cols'] = list(cfg_product['features'])

In [22]:
# Step 5: Create ModelExecutor and execute
# Build an executor from the product model config and run it against the
# 'preprocessed' dataset held in the context.
# NOTE(review): the saved output of this cell ends in
# "AttributeError: module 'numpy' has no attribute 'first'". The cause is not
# visible in this notebook — investigate ModelExecutor and how it handles the
# feature functions before relying on the cells below.
executor = ModelExecutor(cfg_product)
results = executor.execute(ctx, input_dataset_name='preprocessed')

# 'attrs' is treated as optional (may be None); 'filters' is accessed
# unguarded — presumably always present, confirm against ModelExecutor.
print(f"\n✓ Model executed: {executor.model_name}")
print(f"  - Filters shape: {results['filters'].shape}")
print(f"  - Attrs shape: {results['attrs'].shape if results['attrs'] is not None else 'None'}")

0    HARINA TEMPURA ESPECIAL 1KG
Name: in_description, dtype: object


AttributeError: module 'numpy' has no attribute 'first'

In [None]:
# Step 6: Access results from context (multiple ways)
print("\n--- Accessing Results ---")

# Method 1: Direct from the results dictionary returned by execute()
filters_df = results['filters']
attrs_df = results['attrs']

# Method 2: From context by dataset name
filters_from_ctx = ctx.get_dataset('product_stats_filters')
attrs_from_ctx = ctx.get_dataset('product_stats_attrs')

# Method 3: Convenience methods keyed by model name
filters_method = ctx.get_model_filters('product_stats')
attrs_method = ctx.get_model_attrs('product_stats')

# Compare BOTH outputs across the three access paths. Previously only the
# filters were compared, so the message could claim agreement while the attrs
# differed.
filters_match = filters_df.equals(filters_from_ctx) and filters_df.equals(filters_method)
if attrs_df is None:
    # The attrs output is optional; "same data" then means every path yields None.
    attrs_match = attrs_from_ctx is None and attrs_method is None
else:
    attrs_match = attrs_df.equals(attrs_from_ctx) and attrs_df.equals(attrs_method)

print(f"All methods return same data: {filters_match and attrs_match}")

In [None]:
# Preview the first rows of the filters output of the 'product_stats' model.
ctx.get_model_filters('product_stats').head()

In [None]:
# Preview the first rows of the attrs output of the 'product_stats' model.
# NOTE(review): an earlier cell treats attrs as possibly None — .head() would
# fail in that case.
ctx.get_model_attrs('product_stats').head()

## Run Multiple Models with ModelOrchestrator

### View Results: Filters and Attrs DataFrames

In [None]:
# Example: Running multiple models with orchestrator
# (You can add more models later: customer_stats, time_period, etc.)

# Create a fresh context for the orchestrator example so it does not share
# state with `ctx`; only the preprocessed frame is registered in it.
ctx2 = GabedaContext(khujta_cfg)
ctx2.set_dataset('preprocessed', preprocessed_df)

# Initialize orchestrator bound to the fresh context
orchestrator = ModelOrchestrator(ctx2)

# Register models (each one wrapped in its own ModelExecutor)
orchestrator.register_model(ModelExecutor(cfg_product))
# TODO: orchestrator.register_model(ModelExecutor(cfg_customer))  # Add when ready
# TODO: orchestrator.register_model(ModelExecutor(cfg_time))       # Add when ready

# Execute all registered models against the 'preprocessed' dataset
orchestrator.execute_all(input_dataset_name='preprocessed')

# View summary
ctx2.print_summary() 

In [None]:
# Optional: Save to disk
# Writes both outputs of the 'product_stats' model under the per-client
# output directory.
# NOTE(review): the target paths carry no file extension — presumably
# save_dataset appends one based on `format`; confirm.
ctx.save_dataset('product_stats_filters', f"{out_path}/product_filters", format='csv')
ctx.save_dataset('product_stats_attrs', f"{out_path}/product_attrs", format='csv')
print(f"\n✓ Results saved to {out_path}/")