# Dynamic Workforce Shift & Exception Analytics - Demo Notebook

This notebook demonstrates:
- Synthetic data generation
- ETL processing (simulated in pandas here; production would use PySpark)
- Exception detection
- Anomaly detection
- Visualizations


In [None]:
import sys
import os

# Make the project's `src` package importable from this notebook.
# A notebook has no __file__, so os.path.abspath('') is used; it resolves to
# the current working directory.  The directory two levels above it is added
# first (the original behaviour, so it keeps precedence), followed by the
# directory one level above it.
# NOTE(review): the '../output/...' path used later in this notebook suggests
# the project root is the *parent* of the working directory - confirm the
# layout and drop whichever entry is not needed.
_notebook_dir = os.path.abspath('')
for _levels_up in (2, 1):
    _candidate = _notebook_dir
    for _ in range(_levels_up):
        _candidate = os.path.dirname(_candidate)
    # Skip entries already present so re-running the cell does not grow sys.path.
    if _candidate not in sys.path:
        sys.path.append(_candidate)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime, timedelta
import warnings
# Suppress every warning for the rest of the session to keep the demo output
# clean.  This also hides deprecation warnings from the libraries used below.
warnings.filterwarnings('ignore')

# Set style
# NOTE(review): the 'seaborn-v0_8' style name presumably needs a fairly recent
# matplotlib release - confirm against the pinned version.
plt.style.use('seaborn-v0_8')
%matplotlib inline


## 1. Generate Synthetic Data


In [None]:
from src.data.synthetic_generator import SyntheticDataGenerator

# Generator constructed with a fixed seed (42).
generator = SyntheticDataGenerator(seed=42)

# Employee roster (50 requested).
employees = generator.generate_employees(count=50)
print(f"Generated {len(employees)} employees")

# Shifts derived from the roster.
shifts = generator.generate_shifts(employees)
print(f"Generated {len(shifts)} shifts")

# Attendance events: 2000 rows requested over 14 days, starting 14 days
# before the moment this cell runs.
start_date = datetime.now() - timedelta(days=14)
event_options = {'days': 14, 'rows': 2000}
events = generator.generate_attendance_events(
    employees, shifts, start_date, **event_options
)
print(f"Generated {len(events)} attendance events")

# Tabular form; timestamps are parsed into real datetime values.
events_df = pd.DataFrame(events)
parsed_timestamps = pd.to_datetime(events_df['event_timestamp'])
events_df['event_timestamp'] = parsed_timestamps

# Header line and preview of the first ten rows.
print("\nSample events:", events_df.head(10), sep="\n")


## 4. Apply Exception Rules


## 2. Visualize Raw Events


In [None]:
# Event timeline: one marker per attendance event, plotted as
# employee id against event time, with check-ins and check-outs as
# separate traces.
check_ins = events_df.loc[events_df['event_type'] == 'CHECK_IN']
check_outs = events_df.loc[events_df['event_type'] == 'CHECK_OUT']

fig = go.Figure()

# (legend name, rows to plot, marker symbol, marker colour) per trace,
# added in the order listed.
trace_specs = (
    ('Check-In', check_ins, 'triangle-up', 'green'),
    ('Check-Out', check_outs, 'triangle-down', 'red'),
)
for trace_name, subset, marker_symbol, marker_colour in trace_specs:
    fig.add_trace(
        go.Scatter(
            x=subset['event_timestamp'],
            y=subset['employee_id'],
            mode='markers',
            name=trace_name,
            marker={'symbol': marker_symbol, 'size': 8, 'color': marker_colour},
        )
    )

layout_options = {
    'title': "Attendance Events Timeline",
    'xaxis_title': "Time",
    'yaxis_title': "Employee ID",
    'height': 600,
}
fig.update_layout(**layout_options)

fig.show()


## 3. Run ETL Processing (Simplified)


In [None]:
# For demo purposes, we'll simulate ETL processing
# In production, this would use PySpark

from src.rules.exception_engine import ExceptionEngine

# Demo parameters (previously inline literals).
MAX_DEMO_EMPLOYEES = 20     # only the first N distinct employee ids are processed
STANDARD_SHIFT_HOURS = 8    # worked hours beyond this are counted as overtime
PARTIAL_SHIFT_HOURS = 6     # sessions shorter than this are flagged as partial

# Column order of the resulting sessions table.
SESSION_COLUMNS = [
    'session_id', 'employee_id', 'actual_start', 'actual_end',
    'worked_hours', 'overtime_hours', 'is_partial', 'facility',
]


def pair_check_events(emp_events):
    """Yield ``(check_in_time, check_out_time, facility)`` for one employee.

    ``emp_events`` must already be sorted by ``event_timestamp``.  The events
    are walked once:

    * the first CHECK_IN seen while no session is open opens a session;
    * further CHECK_INs before the next CHECK_OUT are ignored;
    * a CHECK_OUT closes the open session, and is ignored when none is open;
    * a CHECK_IN that is never followed by a CHECK_OUT yields nothing;
    * any other event type is ignored.

    ``facility`` is the facility recorded on the opening CHECK_IN.
    """
    open_check_in = None  # (timestamp, facility) of the pending CHECK_IN
    for event_type, timestamp, facility in zip(
        emp_events['event_type'],
        emp_events['event_timestamp'],
        emp_events['facility'],
    ):
        if event_type == 'CHECK_IN':
            if open_check_in is None:
                open_check_in = (timestamp, facility)
        elif event_type == 'CHECK_OUT' and open_check_in is not None:
            yield open_check_in[0], timestamp, open_check_in[1]
            open_check_in = None


# Group events into sessions
sessions = []
session_id = 1

# Employee ids are taken in order of first appearance in events_df.
for emp_id in events_df['employee_id'].unique()[:MAX_DEMO_EMPLOYEES]:
    emp_events = events_df[events_df['employee_id'] == emp_id].sort_values('event_timestamp')

    for actual_start, actual_end, facility in pair_check_events(emp_events):
        # Calculate worked hours
        worked_hours = (actual_end - actual_start).total_seconds() / 3600

        sessions.append({
            'session_id': session_id,
            'employee_id': emp_id,
            'actual_start': actual_start,
            'actual_end': actual_end,
            'worked_hours': worked_hours,
            'overtime_hours': max(0, worked_hours - STANDARD_SHIFT_HOURS),
            'is_partial': worked_hours < PARTIAL_SHIFT_HOURS,
            'facility': facility,
        })
        session_id += 1

# Passing the column list keeps the expected columns even when no session
# could be paired, so the preview below still shows the table layout.
sessions_df = pd.DataFrame(sessions, columns=SESSION_COLUMNS)
print(f"Computed {len(sessions_df)} work sessions")
print("\nSample sessions:")
print(sessions_df.head())


In [None]:
# Step 4 of the demo (Apply Exception Rules): run the rule engine over every
# session and store the resulting codes and explanations on sessions_df.
exception_engine = ExceptionEngine()

# Add shift times for demo (simplified): every session gets a 09:00-17:00
# shift on the calendar day of its actual start.
shift_day = sessions_df['actual_start'].dt.normalize()
sessions_df['shift_start'] = shift_day + pd.Timedelta(hours=9)
sessions_df['shift_end'] = shift_day + pd.Timedelta(hours=17)

# One entry per session, in row order; None marks "no exception".
exception_codes_list = []
exception_explanations_list = []

for _, session_row in sessions_df.iterrows():
    exceptions = exception_engine.evaluate_session(session_row.to_dict())

    codes = []
    explanations = {}
    for found in exceptions:
        codes.append(found['code'])
        explanations[found['code']] = found['explanation']

    if codes:
        exception_codes_list.append(','.join(codes))
    else:
        exception_codes_list.append(None)

    if explanations:
        exception_explanations_list.append(explanations)
    else:
        exception_explanations_list.append(None)

sessions_df['exception_codes'] = exception_codes_list
sessions_df['exception_explanations'] = exception_explanations_list

# Count exceptions
has_exception = sessions_df['exception_codes'].notna()
exceptions_with_data = sessions_df[has_exception]
print(f"Sessions with exceptions: {len(exceptions_with_data)}")

# Print up to five flagged sessions with their per-code explanations.
if len(exceptions_with_data) > 0:
    print("\nException examples:")
    for _, flagged in exceptions_with_data.head(5).iterrows():
        print(f"\nSession {flagged['session_id']} (Employee {flagged['employee_id']}):")
        print(f"  Worked: {flagged['worked_hours']:.1f} hours")
        print(f"  Exceptions: {flagged['exception_codes']}")
        details = flagged['exception_explanations']
        if details:
            for code, expl in details.items():
                print(f"    {code}: {expl}")


## 5. Anomaly Detection


In [None]:
from src.models.anomaly_detector import AnomalyDetector

# Detector configured with contamination=0.1 and a fixed random_state.
detector = AnomalyDetector(contamination=0.1, random_state=42)

# The detector is fitted on, and then applied to, the same sessions table.
detector.fit(sessions_df)
anomaly_scores, is_anomaly = detector.predict(sessions_df)

sessions_df['anomaly_score'] = anomaly_scores
sessions_df['is_anomaly'] = is_anomaly

# NOTE(review): indexing with the is_anomaly column assumes predict() returns
# boolean flags - confirm against AnomalyDetector.
anomalies = sessions_df[sessions_df['is_anomaly']]
anomaly_count = len(anomalies)
print(f"Detected {anomaly_count} anomalous sessions ({anomaly_count/len(sessions_df)*100:.1f}%)")

# Report the five lowest-scoring anomalous sessions with the detector's
# explanation for each.
if anomaly_count > 0:
    print("\nTop anomalous sessions:")
    top_anomalies = anomalies.nsmallest(5, 'anomaly_score')
    for _, flagged in top_anomalies.iterrows():
        explanation = detector.explain_anomaly(flagged.to_dict(), flagged['anomaly_score'])
        feature_names = [f['feature'] for f in explanation['top_features']]
        print(f"\nSession {flagged['session_id']} (Employee {flagged['employee_id']}):")
        print(f"  Score: {flagged['anomaly_score']:.2f}")
        print(f"  Explanation: {explanation['explanation']}")
        print(f"  Top features: {feature_names}")


## 6. Visualizations


In [None]:
# Histogram of session length in hours (30 bins).
worked_hours_hist_options = {
    'x': 'worked_hours',
    'nbins': 30,
    'labels': {'worked_hours': 'Worked Hours', 'count': 'Frequency'},
    'title': "Distribution of Worked Hours",
}
fig = px.histogram(sessions_df, **worked_hours_hist_options)
fig.show()


In [None]:
# Workforce heatmap by hour
# `hour` is the hour of day (0-23) in which each session started.
sessions_df['hour'] = sessions_df['actual_start'].dt.hour

# The heatmap x-axis always shows all 24 hours, so the data has to be aligned
# to the same 24 positions before plotting.
hours_of_day = list(range(24))

if 'facility' in sessions_df.columns:
    # Long form: number of sessions started per (facility, hour) pair.
    heatmap_data = sessions_df.groupby(['facility', 'hour']).size().reset_index(name='count')

    # Wide form: rows = hour, columns = facility.  Every (facility, hour) pair
    # occurs exactly once in heatmap_data, so aggfunc='mean' just carries the
    # count over.
    pivot_data = heatmap_data.pivot_table(
        index='hour',
        columns='facility',
        values='count',
        aggfunc='mean'
    ).fillna(0)

    # Hours in which no session started are missing from the pivot.  Without
    # this step the matrix is narrower than the 24 x-axis labels and
    # px.imshow rejects the mismatch.
    pivot_data = pivot_data.reindex(hours_of_day, fill_value=0)

    fig = px.imshow(
        pivot_data.T,
        labels=dict(x="Hour of Day", y="Facility", color="Employee Count"),
        x=[f"{h:02d}:00" for h in hours_of_day],
        y=pivot_data.columns,
        color_continuous_scale="YlOrRd",
        title="Average Workforce by Hour and Facility"
    )
    fig.update_layout(height=400)
    fig.show()
else:
    # No facility information: fall back to a bar chart of sessions per
    # starting hour (only hours that occur in the data are shown).
    hourly_counts = sessions_df.groupby('hour').size()
    fig = go.Figure(data=go.Bar(
        x=[f"{h:02d}:00" for h in hourly_counts.index],
        y=hourly_counts.values,
        marker_color='steelblue'
    ))
    fig.update_layout(
        title="Average Workforce by Hour",
        xaxis_title="Hour of Day",
        yaxis_title="Employee Count",
        height=400
    )
    fig.show()


In [None]:
# Exception summary: bar chart of how many times each exception code occurs
# across all sessions (a session with several codes counts once per code).
if 'exception_codes' in sessions_df.columns:
    flagged_mask = sessions_df['exception_codes'].notna()
    exceptions_df = sessions_df[flagged_mask].copy()

    if len(exceptions_df) > 0:
        # 'A,B' -> ['A', 'B'] -> one row per code, with surrounding blanks removed.
        split_codes = exceptions_df['exception_codes'].str.split(',')
        exceptions_df['exception_list'] = split_codes
        exceptions_df = exceptions_df.explode('exception_list')
        exceptions_df['exception_list'] = exceptions_df['exception_list'].str.strip()

        code_frequencies = exceptions_df['exception_list'].value_counts()
        exception_counts = code_frequencies.reset_index()
        exception_counts.columns = ['Exception Type', 'Count']

        fig = px.bar(
            exception_counts,
            x='Exception Type',
            y='Count',
            title="Exception Summary",
        )
        # Slanted tick labels.
        fig.update_layout(xaxis_tickangle=-45)
        fig.show()


In [None]:
# Histogram of anomaly scores (30 bins), coloured by the is_anomaly flag.
anomaly_hist_options = {
    'x': 'anomaly_score',
    'color': 'is_anomaly',
    'nbins': 30,
    'labels': {'anomaly_score': 'Anomaly Score', 'count': 'Frequency'},
    'title': "Anomaly Score Distribution",
}
fig = px.histogram(sessions_df, **anomaly_hist_options)
fig.show()


## 7. Save Summary


In [None]:
# Save summary statistics
summary = {
    'total_sessions': len(sessions_df),
    'total_hours': sessions_df['worked_hours'].sum(),
    'avg_hours': sessions_df['worked_hours'].mean(),
    'sessions_with_exceptions': len(sessions_df[sessions_df['exception_codes'].notna()]),
    'anomalous_sessions': len(sessions_df[sessions_df['is_anomaly']]),
    'total_overtime': sessions_df['overtime_hours'].sum()
}

print("Summary Statistics:")
for key, value in summary.items():
    print(f"  {key}: {value}")

# Save heatmap as PNG (if kaleido is available)
if 'facility' in sessions_df.columns:
    hours_of_day = list(range(24))

    # Hour of day is derived from actual_start here, so this cell does not
    # depend on an `hour` column created by another cell.
    start_hour = sessions_df['actual_start'].dt.hour.rename('hour')

    # Long form: number of sessions started per (facility, hour) pair.
    heatmap_data = sessions_df.groupby(['facility', start_hour]).size().reset_index(name='count')

    # Wide form: rows = hour, columns = facility.  Each pair occurs once in
    # heatmap_data, so aggfunc='mean' just carries the count over.
    pivot_data = heatmap_data.pivot_table(
        index='hour',
        columns='facility',
        values='count',
        aggfunc='mean'
    ).fillna(0)

    # Add the hours with no sessions so the matrix is exactly 24 wide and
    # matches the 24 x-axis labels; px.imshow rejects a length mismatch.
    pivot_data = pivot_data.reindex(hours_of_day, fill_value=0)

    fig = px.imshow(
        pivot_data.T,
        labels=dict(x="Hour of Day", y="Facility", color="Employee Count"),
        x=[f"{h:02d}:00" for h in hours_of_day],
        y=pivot_data.columns,
        color_continuous_scale="YlOrRd",
        title="Average Workforce by Hour and Facility"
    )
    fig.update_layout(height=600, width=1000)

    # Save to output directory
    output_path = '../output/demo_summary.png'
    try:
        # Create the output directory if it is missing.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        fig.write_image(output_path)
    except OSError as e:
        # Filesystem problems (permissions, bad path) are reported as such
        # instead of being attributed to kaleido.
        print(f"\nCould not save PNG to {output_path}: {e}")
    except Exception as e:
        # Export stays best-effort: any other failure is reported, not raised.
        print(f"\nCould not save PNG (kaleido may not be installed): {e}")
        print("Install with: pip install kaleido")
    else:
        print(f"\nSaved heatmap to {output_path}")
