# BitByBit Datathon 2025 - Final Submission Notebook

## Team: BitByBit

This notebook contains the complete pipeline for:
- **Task 1**: Service Processing Time Prediction  
- **Task 2**: Staffing Requirements Prediction

### Competition Tasks Summary
1. **Task 1**: Predict `expected_completion_time_minutes` given date, time, and task_id
2. **Task 2**: Predict `predicted_employee_count` given date and section_id

## 1. Environment Setup and Data Loading

In [None]:
# Import necessary libraries
import sys
import os
import pandas as pd
import numpy as np
from pathlib import Path
import logging
from datetime import datetime
import pickle

# Project layout: every path is relative to ./code under the working directory
code_dir = Path('./code')
artifacts_dir = code_dir.joinpath('artifacts')
data_dir = code_dir.joinpath('data', 'raw')

# Make the project's own modules under code/src importable from later cells
sys.path.append(str(code_dir.joinpath('src')))

# INFO-level logging with timestamp, level and message on each record
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

print("Environment setup complete")

In [None]:
# Read the three training CSVs from data_dir; any failure is reported and re-raised
try:
    bookings_df = pd.read_csv(data_dir.joinpath('bookings_train.csv'))
    tasks_df = pd.read_csv(data_dir.joinpath('tasks.csv'))
    staffing_df = pd.read_csv(data_dir.joinpath('staffing_train.csv'))

    # Label -> frame mapping, iterated in insertion order for the summaries below
    loaded_frames = {
        'Bookings': bookings_df,
        'Tasks': tasks_df,
        'Staffing': staffing_df,
    }

    print("Training datasets loaded successfully:")
    for label, frame in loaded_frames.items():
        print(f"- {label}: {len(frame)} records")

    # Preview the first rows of each dataset
    for label, frame in loaded_frames.items():
        print(f"\n{label} Dataset Sample:")
        print(frame.head())

except Exception as load_error:
    print(f"Error loading datasets: {load_error}")
    raise

## 2. Data Preprocessing

### Task 1: Processing Time Target Creation

In [None]:
# Import preprocessing functions
from preprocessing import clean_bookings_data, merge_training_data
from time_utils import parse_datetime_safe, calculate_duration_minutes

# Build the Task 1 training target: minutes elapsed between check-in and check-out
logger.info("Preprocessing bookings data for Task 1...")

# Work on a copy so the raw bookings frame stays untouched; values that
# cannot be parsed become NaT instead of raising
bookings_clean = bookings_df.copy()
for timestamp_col in ('check_in_time', 'check_out_time', 'appointment_date'):
    bookings_clean[timestamp_col] = pd.to_datetime(bookings_clean[timestamp_col], errors='coerce')

# A row missing either timestamp produces NaN here, which is the same result
# as only filling rows where both timestamps are present
visit_length = bookings_clean['check_out_time'] - bookings_clean['check_in_time']
bookings_clean['processing_time_minutes'] = visit_length.dt.total_seconds() / 60

# Keep durations from 1 to 480 minutes (8 hours), both ends inclusive;
# NaN durations fail the range test and are dropped too
within_limits = bookings_clean['processing_time_minutes'].between(1, 480)
bookings_clean = bookings_clean[within_limits]

print(f"Valid bookings for Task 1: {len(bookings_clean)}")
print(f"Processing time statistics:")
print(bookings_clean['processing_time_minutes'].describe())

### Task 2: Staffing Target Creation

In [None]:
# Clean the staffing records that serve as the Task 2 training data
logger.info("Preprocessing staffing data for Task 2...")

# New frame with 'date' parsed to datetime (unparseable values become NaT);
# the raw staffing frame is left unchanged
staffing_clean = staffing_df.assign(
    date=pd.to_datetime(staffing_df['date'], errors='coerce')
)

# Keep rows whose headcount lies in [1, 50]; missing values fail the test
plausible_headcount = staffing_clean['employees_on_duty'].between(1, 50)
staffing_clean = staffing_clean[plausible_headcount]

print(f"Valid staffing records for Task 2: {len(staffing_clean)}")
print(f"Employees on duty statistics:")
print(staffing_clean['employees_on_duty'].describe())

# Number of remaining records per section
print("\nSection distribution:")
print(staffing_clean['section_id'].value_counts())

## 3. Feature Engineering

In [None]:
# Import feature engineering functions
from features_task1 import extract_training_features_task1
from features_task2 import extract_training_features_task2

# ---- Task 1 features -------------------------------------------------------
logger.info("Extracting features for Task 1...")

# Left-join the task -> section lookup onto every cleaned booking
# NOTE(review): assumes task_id is unique in tasks_df and that bookings carry
# no section_id column of their own; verify against the raw data
section_lookup = tasks_df[['task_id', 'section_id']]
task1_data = bookings_clean.merge(section_lookup, on='task_id', how='left')

# Time-of-day and calendar features; appointment times that do not match
# HH:MM are coerced to NaT, so their hour is missing
parsed_appt_time = pd.to_datetime(task1_data['appointment_time'], format='%H:%M', errors='coerce')
appt_calendar = task1_data['appointment_date'].dt
task1_data = task1_data.assign(
    appt_hour=parsed_appt_time.dt.hour,
    appt_weekday=appt_calendar.weekday,
    appt_month=appt_calendar.month,
    # weekday 5 and 6 are Saturday and Sunday
    is_weekend=lambda frame: frame['appt_weekday'].isin([5, 6]).astype(int),
)

# ---- Task 2 features -------------------------------------------------------
logger.info("Extracting features for Task 2...")

# Calendar features derived from the staffing date; assign() returns a new
# frame, so staffing_clean itself is not modified
staffing_calendar = staffing_clean['date'].dt
task2_data = staffing_clean.assign(
    weekday=staffing_calendar.weekday,
    month=staffing_calendar.month,
    quarter=staffing_calendar.quarter,
    is_weekend=lambda frame: frame['weekday'].isin([5, 6]).astype(int),
)

print("Feature engineering completed")
print(f"Task 1 features shape: {task1_data.shape}")
print(f"Task 2 features shape: {task2_data.shape}")

## 4. Model Training

### Task 1: Processing Time Prediction Model

In [None]:
# Import baseline models
from baselines import MedianBaselinePredictor, train_median_baseline

# Fit the Task 1 (processing time) median baseline
logger.info("Training Task 1 model...")

# Column being predicted and the columns the model may group on
task1_target = 'processing_time_minutes'
task1_features = ['task_id', 'appt_hour', 'appt_weekday', 'section_id']

# Keep only rows where the target and every feature are present
task1_train = task1_data.dropna(subset=[task1_target, *task1_features])

# Grouping keys listed from most to least specific, ending with the empty
# grouping that stands for the global median
# NOTE(review): presumably the predictor tries these in order as fallbacks;
# confirm against baselines.MedianBaselinePredictor
grouping_strategies = [
    ['task_id', 'appt_hour', 'appt_weekday'],
    ['task_id', 'appt_hour'],
    ['task_id', 'appt_weekday'],
    ['task_id'],
    ['appt_hour', 'appt_weekday'],
    ['appt_hour'],
    [],
]

task1_model = MedianBaselinePredictor()
task1_model.fit(
    task1_train[task1_features],
    task1_train[task1_target],
    grouping_strategies,
)

print(f"Task 1 model trained on {len(task1_train)} samples")
print(f"Global median: {task1_model.global_median_:.1f} minutes")

### Task 2: Staffing Prediction Model

In [None]:
# Fit the Task 2 (staffing) median baseline
logger.info("Training Task 2 model...")

# Column being predicted and the columns the model may group on
task2_target = 'employees_on_duty'
task2_features = ['section_id', 'weekday', 'month', 'is_weekend']

# Keep only rows where the target and every feature are present
task2_train = task2_data.dropna(subset=[task2_target, *task2_features])

# Grouping keys listed from most to least specific, ending with the empty
# grouping that stands for the global median
# NOTE(review): presumably the predictor tries these in order as fallbacks;
# confirm against baselines.MedianBaselinePredictor
grouping_strategies_task2 = [
    ['section_id', 'weekday'],
    ['section_id', 'is_weekend'],
    ['section_id'],
    ['weekday'],
    [],
]

task2_model = MedianBaselinePredictor()
task2_model.fit(
    task2_train[task2_features],
    task2_train[task2_target],
    grouping_strategies_task2,
)

print(f"Task 2 model trained on {len(task2_train)} samples")
print(f"Global median: {task2_model.global_median_:.1f} employees")

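## 5. Model Evaluation

**Note:** the metrics below are computed on the same rows the models were fitted on, so they are in-sample (training) errors and will likely understate the error on unseen data.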

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

def _mae_and_rmse(actual, predicted):
    """Return the (MAE, RMSE) pair for one set of predictions."""
    mae = mean_absolute_error(actual, predicted)
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    return mae, rmse


# NOTE(review): both models are scored on the rows they were fitted on, so
# these figures are in-sample errors rather than hold-out estimates.

# ---- Task 1 ----------------------------------------------------------------
logger.info("Evaluating Task 1 model...")

task1_predictions = task1_model.predict(task1_train[task1_features])
task1_mae, task1_rmse = _mae_and_rmse(task1_train[task1_target], task1_predictions)

print(f"Task 1 Performance:")
print(f"  MAE: {task1_mae:.2f} minutes")
print(f"  RMSE: {task1_rmse:.2f} minutes")

# ---- Task 2 ----------------------------------------------------------------
logger.info("Evaluating Task 2 model...")

task2_predictions = task2_model.predict(task2_train[task2_features])
task2_mae, task2_rmse = _mae_and_rmse(task2_train[task2_target], task2_predictions)

print(f"Task 2 Performance:")
print(f"  MAE: {task2_mae:.2f} employees")
print(f"  RMSE: {task2_rmse:.2f} employees")

# Collected so the persistence step can store them next to each model
training_metrics = {
    'task1': dict(mae=task1_mae, rmse=task1_rmse, samples=len(task1_train)),
    'task2': dict(mae=task2_mae, rmse=task2_rmse, samples=len(task2_train)),
}

print("\nModel evaluation completed")

## 6. Model Persistence

Save the trained models, together with their feature lists and training metrics, to the artifacts directory for later inference.

In [None]:
# Ensure the output directory exists (no error if it is already there)
artifacts_dir.mkdir(exist_ok=True)

task1_model_path = artifacts_dir / 'task1_model.pkl'
task2_model_path = artifacts_dir / 'task2_model.pkl'

# Task 1 bundle: the model plus what inference needs to rebuild its inputs
# (feature order and the tasks table used for the task -> section lookup)
task1_artifacts = dict(
    model=task1_model,
    model_type='median_baseline',
    feature_columns=task1_features,
    tasks_df=tasks_df,
    training_samples=len(task1_train),
    metrics=training_metrics['task1'],
)

# Task 2 bundle: same layout, without the tasks table
task2_artifacts = dict(
    model=task2_model,
    model_type='median_baseline',
    feature_columns=task2_features,
    training_samples=len(task2_train),
    metrics=training_metrics['task2'],
)

# Pickle each bundle to its own file
for target_path, bundle in (
    (task1_model_path, task1_artifacts),
    (task2_model_path, task2_artifacts),
):
    with target_path.open('wb') as sink:
        pickle.dump(bundle, sink)

print("Models saved to artifacts directory")
print(f"Task 1 model: {task1_model_path}")
print(f"Task 2 model: {task2_model_path}")

---

# FINAL INFERENCE CELL

## Competition Requirements: Model Loading and Demonstration

This cell demonstrates the final model by:
1. Loading the saved models (.pkl format)
2. Running inference demos for both tasks
3. Showing clear input/output examples

In [None]:
# ============================================================================
# FINAL INFERENCE DEMONSTRATION
# ============================================================================
# Reloads both pickled model bundles from disk and prints example predictions
# for each task. Relies on `pd`, `np`, `pickle`, `Path` and `artifacts_dir`
# from the setup cell at the top of the notebook.

# Index matches pandas' weekday numbering (0 = Monday ... 6 = Sunday)
DAY_NAMES = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']


def _locate_model_file(filename):
    """Return the path of a saved model bundle.

    The persistence cell writes the pickles into ``artifacts_dir``, so that
    location is checked first. The current working directory is kept as a
    fallback for setups where the .pkl files sit next to the notebook.

    Raises:
        FileNotFoundError: if the file exists in neither location.
    """
    for candidate in (artifacts_dir / filename, Path(filename)):
        if candidate.is_file():
            return candidate
    raise FileNotFoundError(
        f"{filename} not found in {artifacts_dir} or the current working directory"
    )


def _load_artifacts(filename):
    """Unpickle and return the artifact dictionary stored under `filename`."""
    # NOTE: pickle.load can execute arbitrary code; only load bundles that
    # were produced by this notebook's persistence cell.
    with open(_locate_model_file(filename), 'rb') as f:
        return pickle.load(f)


def _first_value(frame, column, default):
    """Return the first value of `column` in `frame`.

    Falls back to `default` when the frame has no rows or lacks the column,
    so a missing descriptive column does not abort the demo.
    """
    if len(frame) > 0 and column in frame.columns:
        return frame[column].iloc[0]
    return default


print("="*70)
print("FINAL MODEL INFERENCE DEMONSTRATION")
print("="*70)

# Load trained models from .pkl files
print("\n1. Loading trained models...")

# Load Task 1 model
task1_artifacts = _load_artifacts('task1_model.pkl')
task1_model_loaded = task1_artifacts['model']
task1_features = task1_artifacts['feature_columns']
tasks_df_loaded = task1_artifacts['tasks_df']

print(f"✓ Task 1 model loaded: {task1_artifacts['model_type']}")
print(f"  - Trained on {task1_artifacts['training_samples']} samples")
print(f"  - MAE: {task1_artifacts['metrics']['mae']:.2f} minutes")

# Load Task 2 model
task2_artifacts = _load_artifacts('task2_model.pkl')
task2_model_loaded = task2_artifacts['model']
task2_features = task2_artifacts['feature_columns']

print(f"✓ Task 2 model loaded: {task2_artifacts['model_type']}")
print(f"  - Trained on {task2_artifacts['training_samples']} samples")
print(f"  - MAE: {task2_artifacts['metrics']['mae']:.2f} employees")

print("\n" + "="*70)
print("TASK 1: SERVICE PROCESSING TIME PREDICTION")
print("="*70)

# Task 1 Inference Demo
print("\n2. Task 1 Inference Demo:")
print("Input: date, time, task_id")
print("Output: expected_completion_time_minutes")
print("-" * 50)

# Demo examples for Task 1
task1_demo_inputs = [
    {'date': '2025-08-29', 'time': '10:30', 'task_id': 'TASK-001'},
    {'date': '2025-08-29', 'time': '14:15', 'task_id': 'TASK-008'},
    {'date': '2025-08-30', 'time': '09:00', 'task_id': 'TASK-015'}
]

for i, demo_input in enumerate(task1_demo_inputs, 1):
    # Extract features for prediction
    appt_datetime = pd.to_datetime(f"{demo_input['date']} {demo_input['time']}", errors='coerce')
    appt_hour = appt_datetime.hour
    appt_weekday = pd.to_datetime(demo_input['date']).weekday()

    # Look up the task's section; unknown task ids fall back to 'SEC-001'
    task_info = tasks_df_loaded[tasks_df_loaded['task_id'] == demo_input['task_id']]
    section_id = _first_value(task_info, 'section_id', 'SEC-001')
    task_name = _first_value(task_info, 'task_name', 'Unknown Task')

    # Create feature vector
    feature_row = pd.DataFrame({
        'task_id': [demo_input['task_id']],
        'appt_hour': [appt_hour],
        'appt_weekday': [appt_weekday],
        'section_id': [section_id]
    })

    # Select columns in the order stored with the model, then round the
    # prediction to whole minutes
    prediction = task1_model_loaded.predict(feature_row[task1_features])[0]
    prediction = int(np.round(prediction))

    print(f"\nExample {i}:")
    print(f"  Input: date={demo_input['date']}, time={demo_input['time']}, task_id={demo_input['task_id']}")
    print(f"  Task: {task_name}")
    print(f"  Predicted completion time: {prediction} minutes")

print("\n" + "="*70)
print("TASK 2: STAFFING REQUIREMENTS PREDICTION")
print("="*70)

# Task 2 Inference Demo
print("\n3. Task 2 Inference Demo:")
print("Input: date, section_id")
print("Output: predicted_employee_count")
print("-" * 50)

# Demo examples for Task 2
task2_demo_inputs = [
    {'date': '2025-08-29', 'section_id': 'SEC-001'},  # Friday
    {'date': '2025-08-30', 'section_id': 'SEC-003'},  # Saturday
    {'date': '2025-09-01', 'section_id': 'SEC-005'}   # Monday
]

for i, demo_input in enumerate(task2_demo_inputs, 1):
    # Extract features for prediction
    demo_date = pd.to_datetime(demo_input['date'])
    weekday = demo_date.weekday()
    month = demo_date.month
    is_weekend = int(weekday in [5, 6])

    # Section name is display-only; it comes from the tasks table bundled
    # with the Task 1 artifacts
    section_info = tasks_df_loaded[tasks_df_loaded['section_id'] == demo_input['section_id']]
    section_name = _first_value(section_info, 'section_name', 'Unknown Section')

    # Create feature vector
    feature_row = pd.DataFrame({
        'section_id': [demo_input['section_id']],
        'weekday': [weekday],
        'month': [month],
        'is_weekend': [is_weekend]
    })

    # Select columns in the order stored with the model, then round the
    # prediction to a whole number of employees
    prediction = task2_model_loaded.predict(feature_row[task2_features])[0]
    prediction = int(np.round(prediction))

    # Day name for clarity
    day_name = DAY_NAMES[weekday]

    print(f"\nExample {i}:")
    print(f"  Input: date={demo_input['date']}, section_id={demo_input['section_id']}")
    print(f"  Section: {section_name}")
    print(f"  Day: {day_name} ({'Weekend' if is_weekend else 'Weekday'})")
    print(f"  Predicted employees needed: {prediction}")

print("\n" + "="*70)
print("INFERENCE DEMONSTRATION COMPLETED")
print("Models successfully loaded and demonstrated for both tasks")
print("="*70)