# Complete Workflow: Pandas DataFrame ↔ TimeDB

This notebook demonstrates a complete workflow using pandas DataFrames with TimeDB:
1. Generate sample forecast data in a DataFrame
2. Write it to TimeDB
3. Read it back
4. Visualize the forecast
5. Insert a revised forecast
6. Compare forecast revisions
7. Analyze forecast accuracy against simulated actual values

This is a realistic example that combines all the concepts from the previous notebooks.


In [None]:
import os
import uuid
import pandas as pd
import numpy as np
from datetime import datetime, timezone, timedelta
from dotenv import load_dotenv

from timedb.db import create, insert, read

# Load variables through python-dotenv before looking up the connection string
load_dotenv()

# Connection string: TIMEDB_DSN takes precedence, DATABASE_URL is the fallback
conninfo = os.getenv("TIMEDB_DSN") or os.getenv("DATABASE_URL")
if not conninfo:
    raise ValueError("Set TIMEDB_DSN or DATABASE_URL environment variable")

# Random identifiers used for every insert and read in this notebook
tenant_id, entity_id = uuid.uuid4(), uuid.uuid4()

# Create the TimeDB schema on the target database
print("Setting up database...")
create.create_schema(conninfo)
print("✓ Database ready")


## Step 1: Generate Forecast Data

Let's simulate a forecast workflow. We'll create a DataFrame with forecast values including mean and quantiles.


In [None]:
# Generate 48 hours of hourly forecast data.
# valid times start at a fixed UTC instant; forecast_time is the wall-clock
# moment this cell runs and is later used as the run's known_time.
base_time = datetime(2025, 1, 15, 0, 0, tzinfo=timezone.utc)
forecast_time = datetime.now(timezone.utc)  # When the forecast is made (timezone-aware, UTC)

# Create hourly timestamps: base_time, base_time + 1h, ..., base_time + 47h
timestamps = [base_time + timedelta(hours=i) for i in range(48)]

# Generate forecast values with some realistic patterns.
# NOTE: the three np.random.normal calls below draw from the global NumPy RNG
# in sequence, so their order determines the generated numbers.
np.random.seed(42)  # For reproducibility
base_value = 100.0
trend = np.linspace(0, 10, 48)  # Gradual upward trend: 0 -> 10 over the 48 points
seasonality = 5 * np.sin(2 * np.pi * np.arange(48) / 24)  # Daily cycle: period 24 steps, amplitude 5
noise = np.random.normal(0, 2, 48)  # Random noise: mean 0, standard deviation 2

mean_values = base_value + trend + seasonality + noise
# np.abs keeps the extra spread non-negative, so each band edge is at least
# 5 units below / above the mean.
quantile_01 = mean_values - 5 - np.abs(np.random.normal(0, 1, 48))
quantile_09 = mean_values + 5 + np.abs(np.random.normal(0, 1, 48))

# Create DataFrame: one row per valid time, one column per value key
df_forecast = pd.DataFrame({
    'valid_time': timestamps,
    'mean': mean_values,
    'quantile:0.1': quantile_01,
    'quantile:0.9': quantile_09,
})

# Quick sanity output: first rows, shape and covered time range
print("Forecast DataFrame:")
print(df_forecast.head(10))
print(f"\nShape: {df_forecast.shape}")
print(f"Time range: {df_forecast['valid_time'].min()} to {df_forecast['valid_time'].max()}")


## Step 2: Convert and Write to TimeDB

Now we'll convert the DataFrame to TimeDB format and insert it:


In [None]:
# Helper function to convert DataFrame to TimeDB format
def dataframe_multi_keys_to_timedb_rows(
    df: pd.DataFrame,
    tenant_id: uuid.UUID,
    entity_id: uuid.UUID,
    valid_time_col: str = 'valid_time',
    value_columns: list = None,
) -> list:
    """Convert a wide DataFrame with multiple value columns to TimeDB rows.

    Every (row, value column) pair of ``df`` becomes one tuple
    ``(tenant_id, valid_time, entity_id, value_key, value)``, where
    ``value_key`` is the name of the value column.

    Rows are emitted column by column: all rows of the first value column
    (in DataFrame row order), then all rows of the second, and so on.

    Args:
        df: Wide-format DataFrame holding one valid-time column and one
            column per value key.
        tenant_id: Tenant identifier copied into every output tuple.
        entity_id: Entity identifier copied into every output tuple.
        valid_time_col: Name of the column holding the valid times.
        value_columns: Columns to export. Defaults to every column except
            ``valid_time_col``.

    Returns:
        A list of 5-tuples, one per (row, value column) pair. Empty when
        ``df`` has no rows or there are no value columns.

    Raises:
        KeyError: If ``valid_time_col`` or one of ``value_columns`` is not a
            column of ``df``.
    """
    if value_columns is None:
        value_columns = [col for col in df.columns if col != valid_time_col]

    # Iterate the columns directly instead of melt() + iterrows():
    #  * recent pandas versions make melt() raise when the frame already has
    #    a column named like ``value_name`` (here 'value'), which would break
    #    a plain single-column frame such as ['valid_time', 'value'];
    #  * iterrows() builds a Series per row, which is needlessly slow.
    # tolist() yields pandas Timestamps / plain Python scalars.
    valid_times = df[valid_time_col].tolist()

    return [
        (tenant_id, valid_time, entity_id, value_key, value)
        for value_key in value_columns
        for valid_time, value in zip(valid_times, df[value_key].tolist())
    ]

# Convert to TimeDB format: one (tenant, valid_time, entity, key, value)
# tuple per DataFrame row and value column
value_rows = dataframe_multi_keys_to_timedb_rows(
    df_forecast,
    tenant_id=tenant_id,
    entity_id=entity_id,
    value_columns=['mean', 'quantile:0.1', 'quantile:0.9']
)

print(f"Converted {len(value_rows)} rows to TimeDB format")

# Insert into TimeDB as a new run.
# forecast_time is used both as the run start and as the known_time of the values.
run_id_1 = uuid.uuid4()
insert.insert_run_with_values(
    conninfo,
    run_id=run_id_1,
    tenant_id=tenant_id,
    workflow_id="forecast-workflow",
    run_start_time=forecast_time,
    known_time=forecast_time,
    value_rows=value_rows,
)

print(f"✓ Inserted run {run_id_1}")
print(f"✓ Inserted {len(value_rows)} values")


## Step 3: Read Data Back

Let's read the data back and verify it matches what we inserted:


In [None]:
# Helper function to read TimeDB data as wide DataFrame
def read_timedb_to_wide_dataframe(
    conninfo: str,
    tenant_id: uuid.UUID,
    start_valid: datetime = None,
    end_valid: datetime = None,
    value_keys: list = None,
) -> pd.DataFrame:
    """Read values from TimeDB and reshape them into a wide DataFrame.

    Values are fetched with ``read.read_values_between`` in ``"flat"`` mode,
    optionally restricted to ``value_keys``, and pivoted so that the result
    is indexed by ``valid_time`` with one column per ``value_key``.

    Args:
        conninfo: Database connection string.
        tenant_id: Tenant whose values are read.
        start_valid: Start of the valid-time window, passed through unchanged.
        end_valid: End of the valid-time window, passed through unchanged.
        value_keys: Keys to keep; a falsy value (``None`` or empty) keeps all.

    Returns:
        DataFrame indexed by ``valid_time`` with one column per value key.

    NOTE(review): assumes the flat result exposes 'valid_time', 'value_key'
    and 'value' after reset_index(), and that each (valid_time, value_key)
    pair is unique (pivot raises otherwise) — confirm against timedb.
    """
    long_df = read.read_values_between(
        conninfo,
        tenant_id=tenant_id,
        start_valid=start_valid,
        end_valid=end_valid,
        mode="flat",
    ).reset_index()

    # Optional key filter; skipped entirely when no keys were requested
    if value_keys:
        long_df = long_df.loc[long_df['value_key'].isin(value_keys)]

    return long_df.pivot(index='valid_time', columns='value_key', values='value')

# Read the data back for the same window and value keys that were inserted
expected_keys = ['mean', 'quantile:0.1', 'quantile:0.9']
df_read = read_timedb_to_wide_dataframe(
    conninfo,
    tenant_id=tenant_id,
    start_valid=base_time,
    end_valid=base_time + timedelta(hours=48),
    value_keys=expected_keys,
)

print("Data read from TimeDB:")
print(df_read.head(10))
print(f"\nShape: {df_read.shape}")
print(f"\nColumns: {df_read.columns.tolist()}")

print("\n✓ Data successfully read from TimeDB")

# Verify the data matches what was inserted (previously this was only claimed,
# never checked). The comparison is positional: df_forecast was built in
# ascending valid_time order and the pivoted read-back is sorted by its index.
# Shape and columns are checked first so a mismatch is reported, not raised.
df_expected = df_forecast.set_index('valid_time')[expected_keys]
missing_keys = [key for key in expected_keys if key not in df_read.columns]
if missing_keys or len(df_read) != len(df_expected):
    print(
        "✗ Read-back does not line up with the inserted forecast "
        f"(missing keys: {missing_keys}, rows: {len(df_read)} vs {len(df_expected)})"
    )
elif np.allclose(
    df_read[expected_keys].to_numpy(dtype=float),
    df_expected.to_numpy(dtype=float),
):
    print("✓ Values match the inserted forecast")
else:
    # Can legitimately happen if this cell is re-run after a later run
    # has been inserted for the same tenant.
    print("✗ Values differ from the inserted forecast")


## Step 4: Visualize the Forecast

Let's create a visualization of our forecast:


In [None]:
import matplotlib.pyplot as plt

# Single wide figure for the 48-hour forecast read back from TimeDB
fig, ax = plt.subplots(figsize=(14, 6))

# Plot mean with confidence interval.
# The shaded band spans the 0.1 and 0.9 quantile columns, i.e. the central 80%.
ax.plot(df_read.index, df_read['mean'], label='Mean Forecast', linewidth=2, color='blue')
ax.fill_between(
    df_read.index,
    df_read['quantile:0.1'],
    df_read['quantile:0.9'],
    alpha=0.3,
    color='blue',
    label='80% Confidence Interval'
)

# Titles, axis labels, legend and a light grid
ax.set_title('48-Hour Forecast with Confidence Intervals', fontsize=14, fontweight='bold')
ax.set_xlabel('Valid Time', fontsize=12)
ax.set_ylabel('Value', fontsize=12)
ax.legend()
ax.grid(True, alpha=0.3)
# plt.xticks acts on the current axes, which is `ax` here (the only axes created)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

print("Forecast visualization created")


## Step 5: Insert Revised Forecast

Now let's simulate a revised forecast (e.g., after new data becomes available):


In [None]:
# Revised forecast (made 3 hours later).
# The revision time is simulated by shifting forecast_time; no waiting is involved.
revised_time = forecast_time + timedelta(hours=3)

# Generate revised values (slightly different).
# The revision is the original mean plus a small perturbation, so both
# forecasts share the same underlying trend and seasonality.
# NOTE: the order of the np.random.normal calls determines the values drawn.
np.random.seed(123)  # Different seed for different values
revised_mean = mean_values + np.random.normal(0, 1.5, 48)  # Slight adjustments (std 1.5)
revised_quantile_01 = revised_mean - 5 - np.abs(np.random.normal(0, 1, 48))
revised_quantile_09 = revised_mean + 5 + np.abs(np.random.normal(0, 1, 48))

# Same valid times and value keys as the original forecast
df_forecast_revised = pd.DataFrame({
    'valid_time': timestamps,
    'mean': revised_mean,
    'quantile:0.1': revised_quantile_01,
    'quantile:0.9': revised_quantile_09,
})

print("Revised Forecast DataFrame (first 10 rows):")
print(df_forecast_revised.head(10))

# Convert and insert
value_rows_revised = dataframe_multi_keys_to_timedb_rows(
    df_forecast_revised,
    tenant_id=tenant_id,
    entity_id=entity_id,
    value_columns=['mean', 'quantile:0.1', 'quantile:0.9']
)

# A second run with its own run_id; same tenant and workflow as the first run,
# but with the later revised_time as run start and known_time.
run_id_2 = uuid.uuid4()
insert.insert_run_with_values(
    conninfo,
    run_id=run_id_2,
    tenant_id=tenant_id,
    workflow_id="forecast-workflow",
    run_start_time=revised_time,
    known_time=revised_time,
    value_rows=value_rows_revised,
)

print(f"\n✓ Inserted revised run {run_id_2}")
print(f"✓ Inserted {len(value_rows_revised)} values")


## Step 6: Compare Forecast Revisions

Let's compare the original and revised forecasts:


In [None]:
# Query the same valid-time window in "overlapping" mode so that both
# forecast runs are visible side by side
df_overlapping = read.read_values_between(
    conninfo,
    tenant_id=tenant_id,
    start_valid=base_time,
    end_valid=base_time + timedelta(hours=48),
    mode="overlapping",
)

# Turn the index levels into ordinary columns, then keep the 'mean' key only
df_overlapping_reset = df_overlapping.reset_index()
df_overlapping_mean = df_overlapping_reset.loc[
    df_overlapping_reset['value_key'].eq('mean')
]

# One row per valid_time, one column per known_time (i.e. per forecast version)
df_comparison = df_overlapping_mean.pivot(
    index='valid_time', columns='known_time', values='value'
)

print("Forecast Comparison (mean values):")
print(df_comparison.head(10))
print(f"\nShape: {df_comparison.shape}")
print(f"Forecast times: {df_comparison.columns.tolist()}")


In [None]:
# Visualize the comparison
fig, ax = plt.subplots(figsize=(14, 6))

# Plot both forecasts: each column of df_comparison is one known_time,
# i.e. one forecast version; the column label is formatted for the legend
for col in df_comparison.columns:
    ax.plot(df_comparison.index, df_comparison[col], 
            label=f'Forecast at {col.strftime("%Y-%m-%d %H:%M")}', 
            linewidth=2, alpha=0.7)

ax.set_title('Forecast Revisions Comparison', fontsize=14, fontweight='bold')
ax.set_xlabel('Valid Time', fontsize=12)
ax.set_ylabel('Mean Forecast Value', fontsize=12)
ax.legend()
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Calculate differences.
# Only done when exactly two forecast versions are present; the first column
# is treated as the original and the second as the revision.
# NOTE(review): this relies on the columns being ordered by known_time — confirm.
if len(df_comparison.columns) == 2:
    forecast_1 = df_comparison.iloc[:, 0]
    forecast_2 = df_comparison.iloc[:, 1]
    differences = forecast_2 - forecast_1
    
    print(f"\nForecast differences (revised - original):")
    print(f"Mean difference: {differences.mean():.2f}")
    print(f"Max difference: {differences.max():.2f}")
    print(f"Min difference: {differences.min():.2f}")
    print(f"Std deviation: {differences.std():.2f}")


## Step 7: Forecast Accuracy Analysis

Let's simulate actual values and compare them with our forecasts:


In [None]:
# Simulate "actual" values (in reality, these would come from observations).
# The actuals are the ORIGINAL forecast mean plus noise with standard deviation 3.
np.random.seed(456)
actual_values = mean_values + np.random.normal(0, 3, 48)  # Actual with more noise

df_actual = pd.DataFrame({
    'valid_time': timestamps,
    'actual': actual_values
})

# Get latest forecast
# NOTE(review): presumably "flat" mode returns the most recent value per
# valid_time now that two runs exist — confirm against the timedb read API.
df_latest = read_timedb_to_wide_dataframe(
    conninfo,
    tenant_id=tenant_id,
    start_valid=base_time,
    end_valid=base_time + timedelta(hours=48),
    value_keys=['mean']
)

# Compare latest forecast with actuals.
# Assigning the Series aligns it to df_latest_mean on the valid_time index.
df_latest_mean = df_latest[['mean']].copy()
df_latest_mean['actual'] = df_actual.set_index('valid_time')['actual']
# Sign convention: positive error means the actual was above the forecast
df_latest_mean['error'] = df_latest_mean['actual'] - df_latest_mean['mean']
df_latest_mean['abs_error'] = df_latest_mean['error'].abs()

print("Forecast vs Actual Comparison:")
print(df_latest_mean.head(10))

# MAE = mean(|error|), RMSE = sqrt(mean(error^2)), bias = mean(error)
print(f"\nForecast Accuracy Metrics:")
print(f"Mean Absolute Error (MAE): {df_latest_mean['abs_error'].mean():.2f}")
print(f"Root Mean Squared Error (RMSE): {np.sqrt((df_latest_mean['error']**2).mean()):.2f}")
print(f"Mean Error (Bias): {df_latest_mean['error'].mean():.2f}")


In [None]:
# Visualize forecast vs actual: two stacked panels sharing the time axis
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 10), sharex=True)

# Top plot: Forecast vs Actual
ax1.plot(df_latest_mean.index, df_latest_mean['mean'], 
         label='Latest Forecast', linewidth=2, color='blue')
ax1.plot(df_latest_mean.index, df_latest_mean['actual'], 
         label='Actual', linewidth=2, color='red', linestyle='--')
ax1.set_title('Forecast vs Actual Values', fontsize=14, fontweight='bold')
ax1.set_ylabel('Value', fontsize=12)
ax1.legend()
ax1.grid(True, alpha=0.3)

# Bottom plot: Error over time (actual - forecast), with a zero reference line
ax2.plot(df_latest_mean.index, df_latest_mean['error'], 
         linewidth=2, color='green', alpha=0.7)
ax2.axhline(y=0, color='black', linestyle='-', linewidth=1)
ax2.set_title('Forecast Error Over Time', fontsize=14, fontweight='bold')
ax2.set_xlabel('Valid Time', fontsize=12)
ax2.set_ylabel('Error (Actual - Forecast)', fontsize=12)
ax2.grid(True, alpha=0.3)

# plt.xticks acts on the current axes (the last one created, i.e. the bottom panel)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


## Summary

This complete workflow demonstrated:

1. ✅ **Data Generation**: Created forecast DataFrames with multiple value keys
2. ✅ **Data Writing**: Converted DataFrames to TimeDB format and inserted them
3. ✅ **Data Reading**: Retrieved data from TimeDB as DataFrames
4. ✅ **Data Visualization**: Created plots to visualize forecasts
5. ✅ **Forecast Revisions**: Inserted revised forecasts and compared them
6. ✅ **Analysis**: Compared forecasts with simulated actuals and calculated accuracy metrics

**Key Takeaways:**
- TimeDB seamlessly integrates with pandas DataFrames
- The workflow supports multiple forecast revisions over time
- You can easily compare different forecast versions
- The system maintains full history of all forecast revisions
- All operations are atomic (no partial writes)

**Next Steps:**
- Explore the other example notebooks for more specific use cases
- Check out the API documentation for REST API usage
- Review the workflow examples in `timedb/workflows/` for real-world scenarios
