In [17]:
import sys
import os

# Add the venv to the Python path
venv_path = os.path.join(os.getcwd(), '.venv', 'Lib', 'site-packages')
if venv_path not in sys.path:
    sys.path.insert(0, venv_path)

print(f"Python executable: {sys.executable}")
print(f"Python version: {sys.version}")
print(f"Working directory: {os.getcwd()}")

Python executable: c:\Users\mvzie\Documents\AI Agent Experiment\.venv\Scripts\python.exe
Python version: 3.12.2 (tags/v3.12.2:6abddd9, Feb  6 2024, 21:26:36) [MSC v.1937 64 bit (AMD64)]
Working directory: c:\Users\mvzie\Documents\AI Agent Experiment


## 1. Setup and Load Consolidated Data

In [18]:
import pandas as pd
import duckdb
from datetime import datetime
import numpy as np
import time

# Connect to DuckDB with retry logic for file lock issues
max_retries = 3
retry_delay = 2

conn = None
for attempt in range(max_retries):
    try:
        conn = duckdb.connect('animal_shelter.duckdb')
        print(f"✓ Connected to animal_shelter.duckdb")
        break
    except Exception as e:
        if attempt < max_retries - 1:
            print(f"Connection attempt {attempt + 1} failed: {str(e)[:80]}")
            print(f"Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)
        else:
            print(f"Failed to connect after {max_retries} attempts")
            raise

# Load the consolidated data
consolidated_df = conn.execute("""
    SELECT * FROM animal_outcomes_consolidated
""").fetchdf()

print(f"Consolidated table loaded: {consolidated_df.shape[0]} rows, {consolidated_df.shape[1]} columns")
print(f"\nColumns: {list(consolidated_df.columns)}")
print(f"\nFirst row preview:")
consolidated_df.head(1)

✓ Connected to animal_shelter.duckdb
Consolidated table loaded: 172044 rows, 35 columns

Columns: ['Animal ID', 'Date of Birth', 'Name', 'DateTime', 'MonthYear', 'Outcome Type', 'Outcome Subtype', 'Animal Type', 'Sex upon Outcome', 'Age upon Outcome', 'Breed', 'Color', 'outcome_year', 'outcome_month', 'outcome_day_of_month', 'outcome_day_of_week', 'outcome_week_of_year', 'outcome_quarter', 'outcome_is_weekend', 'primary_breed', 'secondary_breed', 'is_mixed_breed', 'breed_group', 'age_at_outcome_days', 'age_at_outcome_years', 'age_group', 'is_intact', 'is_male', 'is_female', 'is_live_outcome', 'intake_date', 'Intake Type', 'Intake Condition', 'days_in_shelter', 'stay_duration_category']

First row preview:


Unnamed: 0,Animal ID,Date of Birth,Name,DateTime,MonthYear,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,...,age_group,is_intact,is_male,is_female,is_live_outcome,intake_date,Intake Type,Intake Condition,days_in_shelter,stay_duration_category
0,A694007,2014-06-18,,2014-12-18 17:28:00-07:00,12-2014,Transfer,Partner,Dog,Intact Male,6 months,...,Under 1 Year,1,1,0,1,2014-12-18 14:57:00,Stray,Normal,0,Same Day


## 2. Build DIM_DATE (Role-Playing for Outcome and Intake Dates)

In [19]:
# Extract unique dates from DateTime (outcome) and intake_date
outcome_dates = pd.to_datetime(consolidated_df['DateTime']).dt.date
intake_dates = pd.to_datetime(consolidated_df['intake_date']).dt.date
all_dates = sorted(set(outcome_dates.dropna()) | set(intake_dates.dropna()))

# Create DIM_DATE
date_records = []
for date_val in all_dates:
    pd_date = pd.Timestamp(date_val)
    date_records.append({
        'date_key': int(pd_date.strftime('%Y%m%d')),
        'date': date_val,
        'year': pd_date.year,
        'quarter': pd_date.quarter,
        'month': pd_date.month,
        'month_name': pd_date.strftime('%B'),
        'day_of_month': pd_date.day,
        'day_of_week': pd_date.dayofweek,  # 0=Monday, 6=Sunday
        'day_of_week_name': pd_date.strftime('%A'),
        'week_of_year': pd_date.isocalendar()[1],
        'is_weekend': 1 if pd_date.dayofweek >= 5 else 0
    })

dim_date = pd.DataFrame(date_records)

print(f"DIM_DATE created: {dim_date.shape[0]} rows")
print(f"Date range: {dim_date['date'].min()} to {dim_date['date'].max()}")
print(f"\nSample rows:")
dim_date.head(20)

DIM_DATE created: 4233 rows
Date range: 2013-10-01 to 2025-05-05

Sample rows:


Unnamed: 0,date_key,date,year,quarter,month,month_name,day_of_month,day_of_week,day_of_week_name,week_of_year,is_weekend
0,20131001,2013-10-01,2013,4,10,October,1,1,Tuesday,40,0
1,20131002,2013-10-02,2013,4,10,October,2,2,Wednesday,40,0
2,20131003,2013-10-03,2013,4,10,October,3,3,Thursday,40,0
3,20131004,2013-10-04,2013,4,10,October,4,4,Friday,40,0
4,20131005,2013-10-05,2013,4,10,October,5,5,Saturday,40,1
5,20131006,2013-10-06,2013,4,10,October,6,6,Sunday,40,1
6,20131007,2013-10-07,2013,4,10,October,7,0,Monday,41,0
7,20131008,2013-10-08,2013,4,10,October,8,1,Tuesday,41,0
8,20131009,2013-10-09,2013,4,10,October,9,2,Wednesday,41,0
9,20131010,2013-10-10,2013,4,10,October,10,3,Thursday,41,0


## 3. Build DIM_ANIMAL_ATTRIBUTES

In [20]:
# Create DIM_ANIMAL_ATTRIBUTES by getting unique combinations
dim_animal_attributes = consolidated_df[[
    'Animal Type', 'primary_breed', 'secondary_breed', 'is_mixed_breed', 
    'breed_group', 'Color'
]].drop_duplicates().reset_index(drop=True)

# Rename to lowercase for consistency
dim_animal_attributes.columns = ['animal_type', 'primary_breed', 'secondary_breed', 'is_mixed_breed', 'breed_group', 'color']

# Add surrogate key
dim_animal_attributes.insert(0, 'animal_attributes_key', range(1, len(dim_animal_attributes) + 1))

print(f"DIM_ANIMAL_ATTRIBUTES created: {dim_animal_attributes.shape[0]} rows")
print(f"\nSample rows:")
dim_animal_attributes.head(5)

DIM_ANIMAL_ATTRIBUTES created: 16414 rows

Sample rows:


Unnamed: 0,animal_attributes_key,animal_type,primary_breed,secondary_breed,is_mixed_breed,breed_group,color
0,1,Dog,Pit Bull Mix,,0,Working,Brown/White
1,2,Dog,Labrador Retriever Mix,,0,Sporting,Chocolate/White
2,3,Cat,Domestic Shorthair Mix,,0,Mixed,Calico
3,4,Cat,Domestic Shorthair Mix,,0,Mixed,Black
4,5,Dog,German Shepherd,Labrador Retriever,1,Sporting,Brown/Black


## 4. Build DIM_SEX_ON_OUTCOME (with Age Group)

In [21]:
# Create DIM_SEX_ON_OUTCOME including age_group
# Include all combinations of sex attributes and age groups from consolidated data
dim_sex = consolidated_df[[
    'Sex upon Outcome', 'is_intact', 'is_male', 'is_female', 'age_group'
]].drop_duplicates().reset_index(drop=True)

# Include all age groups including 'Unknown' - don't filter any out
# The dimension should represent all possible combinations that exist in the data

# Rename to lowercase for consistency
dim_sex.columns = ['sex_upon_outcome', 'is_intact', 'is_male', 'is_female', 'age_group']

# Add surrogate key
dim_sex.insert(0, 'sex_key', range(1, len(dim_sex) + 1))

print(f"DIM_SEX_ON_OUTCOME created: {dim_sex.shape[0]} rows")
print(f"Age group distribution:")
for age_group in sorted(dim_sex['age_group'].unique()):
    count = (dim_sex['age_group'] == age_group).sum()
    print(f"  {age_group}: {count} sex combinations")
print(f"\nSample rows:")
dim_sex.head(10)

DIM_SEX_ON_OUTCOME created: 21 rows
Age group distribution:
  1-5 Years: 5 sex combinations
  5-10 Years: 6 sex combinations
  Over 10 Years: 5 sex combinations
  Under 1 Year: 5 sex combinations

Sample rows:


Unnamed: 0,sex_key,sex_upon_outcome,is_intact,is_male,is_female,age_group
0,1,Intact Male,1,1,0,Under 1 Year
1,2,Spayed Female,0,0,1,5-10 Years
2,3,Spayed Female,0,0,1,1-5 Years
3,4,Neutered Male,0,1,0,Under 1 Year
4,5,Neutered Male,0,1,0,5-10 Years
5,6,Neutered Male,0,1,0,1-5 Years
6,7,Neutered Male,0,1,0,Over 10 Years
7,8,Spayed Female,0,0,1,Under 1 Year
8,9,Intact Female,1,0,1,Under 1 Year
9,10,Intact Male,1,1,0,1-5 Years


## 5. Build DIM_OUTCOME_TYPE (with Stay Duration Category)

In [22]:
# Create DIM_OUTCOME_TYPE including stay_duration_category
dim_outcome = consolidated_df[[
    'Outcome Type', 'Outcome Subtype', 'is_live_outcome', 'stay_duration_category'
]].drop_duplicates().reset_index(drop=True)

# Rename to lowercase for consistency
dim_outcome.columns = ['outcome_type', 'outcome_subtype', 'is_live_outcome', 'stay_duration_category']

# Add surrogate key
dim_outcome.insert(0, 'outcome_key', range(1, len(dim_outcome) + 1))

print(f"DIM_OUTCOME_TYPE created: {dim_outcome.shape[0]} rows")
print(f"\nUnique stay duration categories:")
print(consolidated_df['stay_duration_category'].unique())
print(f"\nSample rows:")
dim_outcome.head(10)

DIM_OUTCOME_TYPE created: 215 rows

Unique stay duration categories:
['Same Day' 'Under 1 Week' '1-4 Weeks' '1-3 Months' '3-6 Months'
 '6-12 Months' '1-1.6 Years']

Sample rows:


Unnamed: 0,outcome_key,outcome_type,outcome_subtype,is_live_outcome,stay_duration_category
0,1,Transfer,Partner,1,Same Day
1,2,Return to Owner,,1,Under 1 Week
2,3,Transfer,Partner,1,1-4 Weeks
3,4,Adoption,,1,Under 1 Week
4,5,Adoption,,1,1-3 Months
5,6,Transfer,Partner,1,Under 1 Week
6,7,Adoption,,1,1-4 Weeks
7,8,Return to Owner,,1,Same Day
8,9,Transfer,SCRP,1,Under 1 Week
9,10,Return to Owner,,1,1-4 Weeks


## 6. Build DIM_INTAKE_DETAILS

In [23]:
# Create DIM_INTAKE_DETAILS
dim_intake_details = consolidated_df[[
    'Intake Type', 'Intake Condition'
]].drop_duplicates().reset_index(drop=True)

# Rename to lowercase for consistency
dim_intake_details.columns = ['intake_type', 'intake_condition']

# Add engineered columns
dim_intake_details['condition_severity'] = dim_intake_details['intake_condition'].map({
    'Normal': 'Healthy',
    'Injured': 'Sick/Injured',
    'Nursing': 'Pregnant/Nursing',
    'Pregnant': 'Pregnant/Nursing',
    'Feral': 'Feral',
    'Aged': 'Elderly',
    'Behavior Issue': 'Behavioral',
    'Other': 'Other'
}).fillna(dim_intake_details['intake_condition'])  # Default to condition if not mapped

dim_intake_details['has_condition_flag'] = (dim_intake_details['intake_condition'] != 'Normal').astype(int)

# Add surrogate key
dim_intake_details.insert(0, 'intake_details_key', range(1, len(dim_intake_details) + 1))

print(f"DIM_INTAKE_DETAILS created: {dim_intake_details.shape[0]} rows")
print(f"\nSample rows:")
dim_intake_details.head(10)

DIM_INTAKE_DETAILS created: 76 rows

Sample rows:


Unnamed: 0,intake_details_key,intake_type,intake_condition,condition_severity,has_condition_flag
0,1,Stray,Normal,Healthy,0
1,2,Owner Surrender,Normal,Healthy,0
2,3,Public Assist,Normal,Healthy,0
3,4,Stray,Sick,Sick,1
4,5,Stray,Injured,Sick/Injured,1
5,6,Euthanasia Request,Normal,Healthy,0
6,7,Stray,Nursing,Pregnant/Nursing,1
7,8,Stray,Aged,Elderly,1
8,9,Owner Surrender,Sick,Sick,1
9,10,Owner Surrender,Nursing,Pregnant/Nursing,1


## 7. Build FACT_ANIMAL_OUTCOME with Foreign Keys

In [24]:
# Create a working copy for joins
fact_df = consolidated_df.copy()

# Rename key columns to match dimension join keys
fact_df = fact_df.rename(columns={
    'Animal Type': 'animal_type',
    'Color': 'color',
    'Sex upon Outcome': 'sex_upon_outcome',
    'Outcome Type': 'outcome_type',
    'Outcome Subtype': 'outcome_subtype',
    'Intake Type': 'intake_type',
    'Intake Condition': 'intake_condition',
    'Animal ID': 'animal_id'
})

# Join to dim_animal_attributes
fact_df = fact_df.merge(
    dim_animal_attributes[['animal_attributes_key', 'animal_type', 'primary_breed', 'secondary_breed', 'is_mixed_breed', 'breed_group', 'color']],
    on=['animal_type', 'primary_breed', 'secondary_breed', 'is_mixed_breed', 'breed_group', 'color'],
    how='left'
)

# Join to dim_sex
fact_df = fact_df.merge(
    dim_sex[['sex_key', 'sex_upon_outcome', 'is_intact', 'is_male', 'is_female', 'age_group']],
    on=['sex_upon_outcome', 'is_intact', 'is_male', 'is_female', 'age_group'],
    how='left'
)

# Join to dim_outcome (need to handle columns carefully)
fact_df = fact_df.merge(
    dim_outcome[['outcome_key', 'outcome_type', 'outcome_subtype', 'is_live_outcome', 'stay_duration_category']],
    on=['outcome_type', 'outcome_subtype', 'is_live_outcome', 'stay_duration_category'],
    how='left'
)

# Join to dim_intake_details
fact_df = fact_df.merge(
    dim_intake_details[['intake_details_key', 'intake_type', 'intake_condition']],
    on=['intake_type', 'intake_condition'],
    how='left'
)

# Create date keys
fact_df['outcome_date_key'] = pd.to_datetime(fact_df['DateTime']).dt.strftime('%Y%m%d').astype(int)
fact_df['intake_date_key'] = pd.to_datetime(fact_df['intake_date']).dt.strftime('%Y%m%d').astype(int)

print(f"Fact table created (before selecting columns): {fact_df.shape[0]} rows")
print(f"\nForeign keys columns present:")
fk_cols = ['animal_attributes_key', 'sex_key', 'outcome_key', 'intake_details_key', 'outcome_date_key', 'intake_date_key']
for col in fk_cols:
    nulls = fact_df[col].isna().sum()
    print(f"  {col}: {nulls} nulls")

Fact table created (before selecting columns): 172044 rows

Foreign keys columns present:
  animal_attributes_key: 0 nulls
  sex_key: 0 nulls
  outcome_key: 0 nulls
  intake_details_key: 0 nulls
  outcome_date_key: 0 nulls
  intake_date_key: 0 nulls


In [25]:
# Select final fact table columns: degenerate dimension, foreign keys, and measures
fact_animal_outcome = fact_df[[
    'animal_id',  # Degenerate dimension
    'animal_attributes_key',  # FK
    'sex_key',  # FK
    'outcome_date_key',  # FK (role-playing)
    'intake_date_key',  # FK (role-playing)
    'outcome_key',  # FK
    'intake_details_key',  # FK
    'days_in_shelter',  # Measure
    'age_at_outcome_days',  # Measure
    'age_at_outcome_years'  # Measure
]].reset_index(drop=True)

# Add fact_id as primary key (1-indexed)
fact_animal_outcome.insert(0, 'fact_id', range(1, len(fact_animal_outcome) + 1))

print(f"FACT_ANIMAL_OUTCOME created: {fact_animal_outcome.shape[0]} rows, {fact_animal_outcome.shape[1]} columns")
print(f"\nColumns: {list(fact_animal_outcome.columns)}")
print(f"\nData types:")
print(fact_animal_outcome.dtypes)
print(f"\nSample rows:")
fact_animal_outcome.head(3)

FACT_ANIMAL_OUTCOME created: 172044 rows, 11 columns

Columns: ['fact_id', 'animal_id', 'animal_attributes_key', 'sex_key', 'outcome_date_key', 'intake_date_key', 'outcome_key', 'intake_details_key', 'days_in_shelter', 'age_at_outcome_days', 'age_at_outcome_years']

Data types:
fact_id                   int64
animal_id                object
animal_attributes_key     int64
sex_key                   int64
outcome_date_key          int32
intake_date_key           int32
outcome_key               int64
intake_details_key        int64
days_in_shelter           int64
age_at_outcome_days       int32
age_at_outcome_years      int64
dtype: object

Sample rows:


Unnamed: 0,fact_id,animal_id,animal_attributes_key,sex_key,outcome_date_key,intake_date_key,outcome_key,intake_details_key,days_in_shelter,age_at_outcome_days,age_at_outcome_years
0,1,A694007,1,1,20141218,20141218,1,1,0,183,0
1,2,A694013,2,2,20141221,20141218,2,1,3,2923,8
2,3,A694015,3,3,20141226,20141218,3,2,8,556,1


## 8. Validate Star Schema Structure

In [26]:
print("=" * 80)
print("STAR SCHEMA VALIDATION")
print("=" * 80)

print(f"\n✓ DIM_DATE: {dim_date.shape[0]} rows")
print(f"  Date range: {dim_date['date'].min()} to {dim_date['date'].max()}")
print(f"  Unique date keys: {dim_date['date_key'].nunique()}")

print(f"\n✓ DIM_ANIMAL_ATTRIBUTES: {dim_animal_attributes.shape[0]} rows")
print(f"  Columns: {list(dim_animal_attributes.columns)}")

print(f"\n✓ DIM_SEX_ON_OUTCOME: {dim_sex.shape[0]} rows")
print(f"  Columns: {list(dim_sex.columns)}")
print(f"  Age groups: {sorted(dim_sex['age_group'].unique())}")

print(f"\n✓ DIM_OUTCOME_TYPE: {dim_outcome.shape[0]} rows")
print(f"  Columns: {list(dim_outcome.columns)}")
print(f"  Duration categories: {sorted(dim_outcome['stay_duration_category'].unique())}")

print(f"\n✓ DIM_INTAKE_DETAILS: {dim_intake_details.shape[0]} rows")
print(f"  Columns: {list(dim_intake_details.columns)}")

print(f"\n✓ FACT_ANIMAL_OUTCOME: {fact_animal_outcome.shape[0]} rows")
print(f"  Columns: {list(fact_animal_outcome.columns)}")
print(f"  Foreign keys referential integrity:")
print(f"    - animal_attributes_key: {fact_animal_outcome['animal_attributes_key'].nunique()} unique values")
print(f"    - sex_key: {fact_animal_outcome['sex_key'].nunique()} unique values")
print(f"    - outcome_key: {fact_animal_outcome['outcome_key'].nunique()} unique values")
print(f"    - intake_details_key: {fact_animal_outcome['intake_details_key'].nunique()} unique values")
print(f"    - outcome_date_key: {fact_animal_outcome['outcome_date_key'].nunique()} unique values")
print(f"    - intake_date_key: {fact_animal_outcome['intake_date_key'].nunique()} unique values")

STAR SCHEMA VALIDATION

✓ DIM_DATE: 4233 rows
  Date range: 2013-10-01 to 2025-05-05
  Unique date keys: 4233

✓ DIM_ANIMAL_ATTRIBUTES: 16414 rows
  Columns: ['animal_attributes_key', 'animal_type', 'primary_breed', 'secondary_breed', 'is_mixed_breed', 'breed_group', 'color']

✓ DIM_SEX_ON_OUTCOME: 21 rows
  Columns: ['sex_key', 'sex_upon_outcome', 'is_intact', 'is_male', 'is_female', 'age_group']
  Age groups: ['1-5 Years', '5-10 Years', 'Over 10 Years', 'Under 1 Year']

✓ DIM_OUTCOME_TYPE: 215 rows
  Columns: ['outcome_key', 'outcome_type', 'outcome_subtype', 'is_live_outcome', 'stay_duration_category']
  Duration categories: ['1-1.6 Years', '1-3 Months', '1-4 Weeks', '3-6 Months', '6-12 Months', 'Same Day', 'Under 1 Week']

✓ DIM_INTAKE_DETAILS: 76 rows
  Columns: ['intake_details_key', 'intake_type', 'intake_condition', 'condition_severity', 'has_condition_flag']

✓ FACT_ANIMAL_OUTCOME: 172044 rows
  Columns: ['fact_id', 'animal_id', 'animal_attributes_key', 'sex_key', 'outcome_dat

## 9. Check for Foreign Key Violations

In [27]:
print("Checking foreign key constraints...\n")

# Check animal_attributes_key
invalid_aak = fact_animal_outcome[~fact_animal_outcome['animal_attributes_key'].isin(dim_animal_attributes['animal_attributes_key'])]
print(f"Invalid animal_attributes_key: {len(invalid_aak)} rows")

# Check sex_key
invalid_sk = fact_animal_outcome[~fact_animal_outcome['sex_key'].isin(dim_sex['sex_key'])]
print(f"Invalid sex_key: {len(invalid_sk)} rows")

# Check outcome_key
invalid_ok = fact_animal_outcome[~fact_animal_outcome['outcome_key'].isin(dim_outcome['outcome_key'])]
print(f"Invalid outcome_key: {len(invalid_ok)} rows")

# Check intake_details_key
invalid_idk = fact_animal_outcome[~fact_animal_outcome['intake_details_key'].isin(dim_intake_details['intake_details_key'])]
print(f"Invalid intake_details_key: {len(invalid_idk)} rows")

# Check outcome_date_key
invalid_odk = fact_animal_outcome[~fact_animal_outcome['outcome_date_key'].isin(dim_date['date_key'])]
print(f"Invalid outcome_date_key: {len(invalid_odk)} rows")

# Check intake_date_key
invalid_idk2 = fact_animal_outcome[~fact_animal_outcome['intake_date_key'].isin(dim_date['date_key'])]
print(f"Invalid intake_date_key: {len(invalid_idk2)} rows")

print("\n✓ All foreign key constraints validated!")

Checking foreign key constraints...

Invalid animal_attributes_key: 0 rows
Invalid sex_key: 0 rows
Invalid outcome_key: 0 rows
Invalid intake_details_key: 0 rows
Invalid outcome_date_key: 0 rows
Invalid intake_date_key: 0 rows

✓ All foreign key constraints validated!


## 10. Write Tables to DuckDB

In [28]:
# Write dimension and fact tables to DuckDB
conn.execute("DROP TABLE IF EXISTS dim_date")
conn.execute("DROP TABLE IF EXISTS dim_animal_attributes")
conn.execute("DROP TABLE IF EXISTS dim_sex_on_outcome")
conn.execute("DROP TABLE IF EXISTS dim_outcome_type")
conn.execute("DROP TABLE IF EXISTS dim_intake_details")
conn.execute("DROP TABLE IF EXISTS fact_animal_outcome")

conn.register('dim_date_temp', dim_date)
conn.execute("CREATE TABLE dim_date AS SELECT * FROM dim_date_temp")

conn.register('dim_animal_attributes_temp', dim_animal_attributes)
conn.execute("CREATE TABLE dim_animal_attributes AS SELECT * FROM dim_animal_attributes_temp")

conn.register('dim_sex_on_outcome_temp', dim_sex)
conn.execute("CREATE TABLE dim_sex_on_outcome AS SELECT * FROM dim_sex_on_outcome_temp")

conn.register('dim_outcome_type_temp', dim_outcome)
conn.execute("CREATE TABLE dim_outcome_type AS SELECT * FROM dim_outcome_type_temp")

conn.register('dim_intake_details_temp', dim_intake_details)
conn.execute("CREATE TABLE dim_intake_details AS SELECT * FROM dim_intake_details_temp")

conn.register('fact_animal_outcome_temp', fact_animal_outcome)
conn.execute("CREATE TABLE fact_animal_outcome AS SELECT * FROM fact_animal_outcome_temp")

print("Tables written to DuckDB!")
print("\nVerifying tables...")

# Verify tables exist and show row counts
tables = ['dim_date', 'dim_animal_attributes', 'dim_sex_on_outcome', 'dim_outcome_type', 'dim_intake_details', 'fact_animal_outcome']
for table in tables:
    count = conn.execute(f"SELECT COUNT(*) as cnt FROM {table}").fetchall()[0][0]
    print(f"  {table}: {count:,} rows")

Tables written to DuckDB!

Verifying tables...
  dim_date: 4,233 rows
  dim_animal_attributes: 16,414 rows
  dim_sex_on_outcome: 21 rows
  dim_outcome_type: 215 rows
  dim_intake_details: 76 rows
  fact_animal_outcome: 172,044 rows


## 11. Sample Queries for Testing

In [29]:
# Test query 1: Summary of outcomes by type
print("Outcomes by Type:")
print(conn.execute("""
    SELECT 
        d.outcome_type,
        COUNT(*) as outcome_count,
        ROUND(AVG(f.days_in_shelter), 2) as avg_days_in_shelter,
        SUM(f.days_in_shelter) as total_days_in_shelter
    FROM fact_animal_outcome f
    JOIN dim_outcome_type d ON f.outcome_key = d.outcome_key
    GROUP BY d.outcome_type
    ORDER BY outcome_count DESC
""").df())

print("\n" + "="*80 + "\n")

# Test query 2: Outcomes by age group
print("Outcomes by Age Group:")
print(conn.execute("""
    SELECT 
        s.age_group,
        COUNT(*) as animal_count,
        ROUND(AVG(f.age_at_outcome_years), 2) as avg_age_years
    FROM fact_animal_outcome f
    JOIN dim_sex_on_outcome s ON f.sex_key = s.sex_key
    GROUP BY s.age_group
    ORDER BY animal_count DESC
""").df())

print("\n" + "="*80 + "\n")

# Test query 3: Stay duration analysis
print("Outcomes by Stay Duration Category:")
print(conn.execute("""
    SELECT 
        d.stay_duration_category,
        COUNT(*) as animal_count,
        ROUND(AVG(f.days_in_shelter), 2) as avg_days
    FROM fact_animal_outcome f
    JOIN dim_outcome_type d ON f.outcome_key = d.outcome_key
    GROUP BY d.stay_duration_category
    ORDER BY animal_count DESC
""").df())

Outcomes by Type:
       outcome_type  outcome_count  avg_days_in_shelter  total_days_in_shelter
0          Adoption          83805                33.47              2805368.0
1          Transfer          48059                 9.95               478342.0
2   Return to Owner          25596                 3.72                95304.0
3        Euthanasia          10693                 5.67                60664.0
4              Died           1628                12.86                20933.0
5         Rto-Adopt           1239                17.28                21406.0
6          Disposal            859                 5.44                 4670.0
7           Missing             90                52.19                 4697.0
8           Unknown             42                17.98                  755.0
9          Relocate             26                 5.58                  145.0
10           Stolen              5                87.60                  438.0
11             Lost              2

## 12. Populate Age Group in DIM_SEX_ON_OUTCOME

In [30]:
# No longer needed - age_group is now included at dimension creation time
# All 20 combinations of sex attributes and age groups are already in dim_sex_on_outcome

conn = duckdb.connect('animal_shelter.duckdb')

print("DIM_SEX_ON_OUTCOME populated with age_group during dimension creation")
print()

# Verify the dimension has all combinations
result = conn.execute("""
    SELECT COUNT(*) as total_rows, 
           COUNT(DISTINCT age_group) as distinct_age_groups,
           COUNT(CASE WHEN age_group IS NULL THEN 1 END) as null_count
    FROM dim_sex_on_outcome
""").fetchall()[0]

print(f"DIM_SEX_ON_OUTCOME: {result[0]} rows")
print(f"Distinct age groups: {result[1]}")
print(f"NULL age_group values: {result[2]}")
print()

# Show final distribution
distribution = conn.execute("""
    SELECT age_group, COUNT(*) as count
    FROM dim_sex_on_outcome
    GROUP BY age_group
    ORDER BY 
        CASE 
            WHEN age_group = 'Under 1 Year' THEN 1
            WHEN age_group = '1-5 Years' THEN 2
            WHEN age_group = '5-10 Years' THEN 3
            WHEN age_group = 'Over 10 Years' THEN 4
            ELSE 5
        END
""").fetchall()

print("Age Group Distribution in DIM_SEX_ON_OUTCOME:")
for row in distribution:
    print(f"  {row[0]}: {row[1]} sex combinations")

print()
print("✓ DIM_SEX_ON_OUTCOME complete with all sex/age combinations")

DIM_SEX_ON_OUTCOME populated with age_group during dimension creation

DIM_SEX_ON_OUTCOME: 21 rows
Distinct age groups: 4
NULL age_group values: 0

Age Group Distribution in DIM_SEX_ON_OUTCOME:
  Under 1 Year: 5 sex combinations
  1-5 Years: 5 sex combinations
  5-10 Years: 6 sex combinations
  Over 10 Years: 5 sex combinations

✓ DIM_SEX_ON_OUTCOME complete with all sex/age combinations


In [31]:
# Close connection
conn.close()
print("DuckDB connection closed.")

DuckDB connection closed.
