# AIMS Data Platform - Simple Standalone Version

This notebook is completely self-contained with all functionality in one place.
No external modules required!

## 1. Install Required Packages (Run Once)

In [1]:
# Uncomment to install required packages
# !pip install pandas pyarrow sqlalchemy pyyaml

In [2]:
!pip install fastparquet



## 2. Imports

In [3]:
import pandas as pd
import pyarrow.parquet as pq
from pathlib import Path
from datetime import datetime
import sqlite3
from typing import Optional, Dict, Any, List
import yaml
import json

print("‚úÖ All imports successful")

‚úÖ All imports successful


## 3. Configuration

In [4]:
# Configuration
SOURCE_DATA_PATH = Path("/home/sanmi/Documents/HS2/aims_data_1/aims_data_parquet")
TARGET_DATA_PATH = Path("/home/sanmi/Documents/HS2/HS2_PROJECTS_2025/AIMS_LOCAL/data")
WATERMARK_DB_PATH = Path("/home/sanmi/Documents/HS2/HS2_PROJECTS_2025/AIMS_LOCAL/watermarks.db")

# Create directories
TARGET_DATA_PATH.mkdir(parents=True, exist_ok=True)
WATERMARK_DB_PATH.parent.mkdir(parents=True, exist_ok=True)

print(f"‚úÖ Configuration loaded")
print(f"Source: {SOURCE_DATA_PATH}")
print(f"Target: {TARGET_DATA_PATH}")

‚úÖ Configuration loaded
Source: /home/sanmi/Documents/HS2/aims_data_1/aims_data_parquet
Target: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/AIMS_LOCAL/data


## 4. Explore Source Data

In [5]:
# List all parquet files
parquet_files = list(SOURCE_DATA_PATH.glob('*.parquet'))
print(f"Found {len(parquet_files)} parquet files")

if parquet_files:
    for i, f in enumerate(parquet_files[:5], 1):
        size_mb = f.stat().st_size / (1024 * 1024)
        print(f"{i}. {f.name} ({size_mb:.2f} MB)")

Found 2 parquet files
1. aims_assets.parquet (31.04 MB)
2. aims_attributes.parquet (0.35 MB)


In [6]:
# Read and inspect first file with error handling
if parquet_files:
    for i, sample_file in enumerate(parquet_files, 1):
        print(f"\n{'='*80}")
        print(f"Attempting to read file {i}: {sample_file.name}")
        print(f"{'='*80}")
        
        try:
            # Method 1: Try standard pandas read
            df = pd.read_parquet(sample_file)
            print(f"‚úÖ Successfully read with pandas")
            
        except Exception as e1:
            print(f"‚ùå Pandas failed: {str(e1)[:100]}")
            
            try:
                # Method 2: Try pyarrow directly with different options
                print("\nüîÑ Trying pyarrow with read_table...")
                import pyarrow.parquet as pq
                table = pq.read_table(sample_file, use_legacy_dataset=True)
                df = table.to_pandas()
                print(f"‚úÖ Successfully read with pyarrow legacy dataset")
                
            except Exception as e2:
                print(f"‚ùå PyArrow legacy failed: {str(e2)[:100]}")
                
                try:
                    # Method 3: Try reading metadata only
                    print("\nüîÑ Trying to read metadata only...")
                    parquet_file = pq.ParquetFile(sample_file)
                    print(f"‚úÖ Can read metadata")
                    print(f"\nSchema:")
                    print(parquet_file.schema)
                    print(f"\nNum rows: {parquet_file.metadata.num_rows}")
                    print(f"Num row groups: {parquet_file.metadata.num_row_groups}")
                    
                    # Try reading just first row group
                    print("\nüîÑ Trying to read first row group only...")
                    first_row_group = parquet_file.read_row_group(0)
                    df = first_row_group.to_pandas()
                    print(f"‚úÖ Successfully read first row group ({len(df)} rows)")
                    
                except Exception as e3:
                    print(f"‚ùå All methods failed for {sample_file.name}")
                    print(f"Error: {e3}")
                    print("\n‚ö†Ô∏è This file may be corrupted. Skipping...\n")
                    continue
        
        # If we successfully loaded the data, display info
        if 'df' in locals() and df is not None:
            print(f"\nüìä Sample from: {sample_file.name}")
            print(f"Shape: {df.shape}")
            print(f"\nColumns: {list(df.columns)}")
            print(f"\nData Types:")
            print(df.dtypes)
            print(f"\nFirst 5 rows:")
            display(df.head())
            break  # Successfully read one file, exit loop
        else:
            print(f"\n‚ö†Ô∏è Could not load data from {sample_file.name}")



Attempting to read file 1: aims_assets.parquet
‚ùå Pandas failed: Repetition level histogram size mismatch

üîÑ Trying pyarrow with read_table...
‚ùå PyArrow legacy failed: Repetition level histogram size mismatch

üîÑ Trying to read metadata only...
‚úÖ Can read metadata

Schema:
<pyarrow._parquet.ParquetSchema object at 0x7cdbbd584280>
required group field_id=-1 schema {
  optional binary field_id=-1 ID (String);
  optional binary field_id=-1 OWNERID (String);
  optional binary field_id=-1 OWNER (String);
  optional binary field_id=-1 PHASEID (String);
  optional binary field_id=-1 PHASE (String);
  optional binary field_id=-1 NAME (String);
  optional binary field_id=-1 NOTES (String);
  optional binary field_id=-1 HYBRIDBILLCODE (String);
  optional binary field_id=-1 REFERENCECHAINAGE (String);
  optional binary field_id=-1 VALIDFROM (String);
  optional binary field_id=-1 VALIDTO (String);
  optional binary field_id=-1 CREATEDBY (String);
  optional binary field_id=-1 WHENCREA

  table = pq.read_table(sample_file, use_legacy_dataset=True)


In [7]:
# Compare both parquet files with robust error handling
if len(parquet_files) >= 2:
    print("üìä Comparing all parquet files:\n")
    
    for i, file in enumerate(parquet_files, 1):
        print(f"{i}. {file.name}")
        
        df_temp = None
        try:
            # Try standard read
            df_temp = pd.read_parquet(file)
        except Exception as e1:
            try:
                # Try with legacy dataset
                table = pq.read_table(file, use_legacy_dataset=True)
                df_temp = table.to_pandas()
                print(f"   ‚ö†Ô∏è Required legacy mode to read")
            except Exception as e2:
                try:
                    # Try reading just first row group
                    parquet_file = pq.ParquetFile(file)
                    first_row_group = parquet_file.read_row_group(0)
                    df_temp = first_row_group.to_pandas()
                    print(f"   ‚ö†Ô∏è Could only read first row group")
                except Exception as e3:
                    print(f"   ‚ùå ERROR: Could not read file - {str(e3)[:80]}")
                    print(f"   Skipping this file\n")
                    continue
        
        if df_temp is not None:
            print(f"   Shape: {df_temp.shape}")
            print(f"   Columns: {list(df_temp.columns)}")
            print(f"   Memory: {df_temp.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
            print(f"   Sample data:")
            display(df_temp.head(3))
        
        print("-" * 80)


üìä Comparing all parquet files:

1. aims_assets.parquet
   ‚ùå ERROR: Could not read file - Repetition level histogram size mismatch
   Skipping this file

2. aims_attributes.parquet
   ‚ùå ERROR: Could not read file - Repetition level histogram size mismatch
   Skipping this file



  table = pq.read_table(file, use_legacy_dataset=True)


## 4b. Repair/Convert Corrupted Parquet Files

The parquet files have a "Repetition level histogram size mismatch" error. This typically happens with older parquet formats. Let's try alternative methods to read and repair them.

In [8]:
# Install fastparquet (uncomment to install)
# !pip install fastparquet

In [9]:
# Method 1: Try using fastparquet as alternative engine
print("üîß Attempting repair methods...\n")

# First, let's try fastparquet
try:
    import fastparquet
    print("‚úÖ fastparquet is available")
    has_fastparquet = True
except ImportError:
    print("‚ö†Ô∏è fastparquet not available. Install with: pip install fastparquet")
    has_fastparquet = False

# Try reading with fastparquet
if has_fastparquet and parquet_files:
    for i, file in enumerate(parquet_files, 1):
        print(f"\n{'='*80}")
        print(f"Attempting fastparquet read for: {file.name}")
        print(f"{'='*80}")
        
        try:
            df = pd.read_parquet(file, engine='fastparquet')
            print(f"‚úÖ SUCCESS! Read {len(df):,} rows with fastparquet")
            print(f"\nColumns ({len(df.columns)}):")
            print(list(df.columns))
            print(f"\nShape: {df.shape}")
            print(f"\nFirst 3 rows:")
            display(df.head(3))
            
            # Save the successfully loaded dataframe
            if i == 1:
                df_assets = df
                print(f"\nüíæ Saved as 'df_assets'")
            else:
                df_attributes = df
                print(f"\nüíæ Saved as 'df_attributes'")
            
        except Exception as e:
            print(f"‚ùå fastparquet also failed: {str(e)[:100]}")
else:
    print("\n‚ö†Ô∏è Cannot proceed without fastparquet")

üîß Attempting repair methods...

‚úÖ fastparquet is available

Attempting fastparquet read for: aims_assets.parquet
‚úÖ SUCCESS! Read 849,951 rows with fastparquet

Columns (26):
['ID', 'OWNERID', 'OWNER', 'PHASEID', 'PHASE', 'NAME', 'NOTES', 'HYBRIDBILLCODE', 'REFERENCECHAINAGE', 'VALIDFROM', 'VALIDTO', 'CREATEDBY', 'WHENCREATED', 'UPDATEDBY', 'LASTUPDATED', 'UNIQUEASSETID', 'LASTUPDATEDATE', 'REMOVEDBY', 'REMOVEDREASON', 'STAGEID', 'STAGE', 'STATUS', 'ISPRIMARYASSET', 'LASTUPDATEDATETHISASSET', 'PRODUCTID', 'PRODUCT']

Shape: (849951, 26)

First 3 rows:


Unnamed: 0,ID,OWNERID,OWNER,PHASEID,PHASE,NAME,NOTES,HYBRIDBILLCODE,REFERENCECHAINAGE,VALIDFROM,...,LASTUPDATEDATE,REMOVEDBY,REMOVEDREASON,STAGEID,STAGE,STATUS,ISPRIMARYASSET,LASTUPDATEDATETHISASSET,PRODUCTID,PRODUCT
0,13254212,13000000,HS2,13000015,Phase 1 - London to Birmingham & Handsacre,MS187-CT035,Tmp1=GIWorkPackageID; Tmp2=GiWorkPackageName,,187451,2015-01-01 00:00:00,...,2019-12-20 13:59:51,,,6,"6 - Handback (Handback = Testing, Commissionin...",Published,0,2019-12-20 13:59:51,,- -
1,13254213,13000000,HS2,13000015,Phase 1 - London to Birmingham & Handsacre,ML094-WS037,Tmp1=GIWorkPackageID; Tmp2=GiWorkPackageName,,94888,2015-01-01 00:00:00,...,2019-12-20 13:50:44,,,6,"6 - Handback (Handback = Testing, Commissionin...",Published,0,2019-12-20 13:50:44,,- -
2,13254214,13000000,HS2,13000015,Phase 1 - London to Birmingham & Handsacre,ML000-RM169,Tmp1=GIWorkPackageID; Tmp2=GiWorkPackageName,,110,2015-01-01 00:00:00,...,2019-12-20 13:55:33,,,6,"6 - Handback (Handback = Testing, Commissionin...",Published,0,2019-12-20 13:55:33,,- -



üíæ Saved as 'df_assets'

Attempting fastparquet read for: aims_attributes.parquet
‚úÖ SUCCESS! Read 6,428 rows with fastparquet

Columns (33):
['ID', 'CODE', 'NAME', 'TYPE', 'CLASS', 'UNITOFMEASURE', 'DEFINITION', 'VALIDFROM', 'VALIDTO', 'ATTRIBUTEGROUPID', 'ATTRIBUTEGROUP', 'DISPLAYCODE', 'SOURCE', 'SOURCEATTRIBUTEID', 'SOURCEATTRIBUTE', 'DERIVATIONTYPE', 'STATICVALUE', 'CONTRACTORTOCOMPLETE', 'OWNERID', 'OWNER', 'AGGREGATABLE', 'RELATIONSHIPTYPEID', 'RELATIONSHIPTYPE', 'AGGREGATEATTRIBUTEID', 'AGGREGATEATTRIBUTE', 'AGGREGATEFUNCTION', 'CREATEDBY', 'WHENCREATED', 'UPDATEDBY', 'LASTUPDATED', 'REMOVALREASON', 'ATTRIBUTEDOMAINID', 'ATTRIBUTEDOMAIN']

Shape: (6428, 33)

First 3 rows:


Unnamed: 0,ID,CODE,NAME,TYPE,CLASS,UNITOFMEASURE,DEFINITION,VALIDFROM,VALIDTO,ATTRIBUTEGROUPID,...,AGGREGATEATTRIBUTEID,AGGREGATEATTRIBUTE,AGGREGATEFUNCTION,CREATEDBY,WHENCREATED,UPDATEDBY,LASTUPDATED,REMOVALREASON,ATTRIBUTEDOMAINID,ATTRIBUTEDOMAIN
0,13000186,PHASE,Phase,Phase,Core,,,2015-01-01 00:00:00,NaT,,...,,,,,NaT,,NaT,,,
1,13000187,HYBRIDBILLCODE,Hybrid Bill/Act Code,Hybrid bill code,Core,,Hybrid Bill / Act Work No,2015-01-01 00:00:00,NaT,,...,,,,,NaT,,NaT,,,
2,13000285,NAME,Name,Name,Core,,Common or Structured name for the asset.,2015-01-01 00:00:00,NaT,,...,,,,,NaT,,NaT,,,



üíæ Saved as 'df_attributes'


In [10]:
# Method 2: Convert to new parquet format (if fastparquet worked)
print("üîÑ Converting to clean parquet format...\n")

if 'df_assets' in locals() or 'df_attributes' in locals():
    REPAIRED_DATA_PATH = TARGET_DATA_PATH / "repaired"
    REPAIRED_DATA_PATH.mkdir(parents=True, exist_ok=True)
    
    # Convert assets file
    if 'df_assets' in locals():
        output_file = REPAIRED_DATA_PATH / "aims_assets_repaired.parquet"
        df_assets.to_parquet(output_file, engine='pyarrow', compression='snappy', index=False)
        print(f"‚úÖ Repaired assets file saved to: {output_file.name}")
        print(f"   Size: {output_file.stat().st_size / (1024*1024):.2f} MB")
    
    # Convert attributes file
    if 'df_attributes' in locals():
        output_file = REPAIRED_DATA_PATH / "aims_attributes_repaired.parquet"
        df_attributes.to_parquet(output_file, engine='pyarrow', compression='snappy', index=False)
        print(f"‚úÖ Repaired attributes file saved to: {output_file.name}")
        print(f"   Size: {output_file.stat().st_size / (1024*1024):.2f} MB")
    
    print(f"\nüí° Use these repaired files from: {REPAIRED_DATA_PATH}")
    print("   Update SOURCE_DATA_PATH to this location in future runs")
else:
    print("‚ö†Ô∏è No data loaded yet. Run the fastparquet cell above first.")

üîÑ Converting to clean parquet format...

‚úÖ Repaired assets file saved to: aims_assets_repaired.parquet
   Size: 27.10 MB
‚úÖ Repaired attributes file saved to: aims_attributes_repaired.parquet
   Size: 0.36 MB

üí° Use these repaired files from: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/AIMS_LOCAL/data/repaired
   Update SOURCE_DATA_PATH to this location in future runs


### üîç What Causes Parquet Corruption?

The **"Repetition level histogram size mismatch"** error you encountered is not actually corruption in the traditional sense. Here's what likely happened:

#### Common Causes:

1. **üìÖ Parquet Format Version Mismatch**
   - Your files were created with an **older Parquet format** (pre-2.0)
   - Modern PyArrow (v14+) defaults to stricter validation
   - Legacy format used different encoding for nested/repeated fields
   - **Solution**: Use fastparquet (more tolerant) or PyArrow legacy mode

2. **‚öôÔ∏è Writer vs Reader Incompatibility**
   - Files written by: Spark 2.x, older pandas, or legacy C++ implementations
   - Read by: Modern PyArrow with strict schema validation
   - Repetition/definition level metadata doesn't match expectations
   - **Solution**: Re-write with modern writer (what we did!)

3. **üèóÔ∏è Schema Evolution Issues**
   - Original schema had nested/complex types
   - Schema changed across multiple writes
   - Inconsistent repetition levels in different row groups
   - **Solution**: Flatten schema or use consistent writer

4. **üíæ Storage/Transfer Issues (Less Common)**
   - Incomplete file transfers (network interruption)
   - Disk I/O errors during write
   - Cloud storage eventual consistency issues
   - **Solution**: Validate checksums, re-download

5. **üîß Tool/Library Version Issues**
   - Parquet-cpp versions < 1.5.0 had encoding bugs
   - Spark 2.x default settings created problematic files
   - Mixing different Parquet libraries (C++, Java, Python)
   - **Solution**: Standardize on modern tooling

#### Why FastParquet Works:

```python
# FastParquet is more forgiving because:
# 1. Uses pure Python implementation (not C++ bindings)
# 2. More lenient validation rules
# 3. Better handles legacy format quirks
# 4. Can read files PyArrow rejects
```

#### Your Specific Case:

Based on your files (`aims_assets.parquet` - 31MB, `aims_attributes.parquet` - 0.35MB):

- **Most Likely**: Created by older Spark/Hive/Impala system (pre-2020)
- **Evidence**: Both files fail with same error = systematic issue
- **File size**: Large enough to have multiple row groups
- **All columns are 'object' type**: Suggests string-heavy data from database export

#### Prevention for Future:

```python
# When writing new parquet files, use modern settings:
df.to_parquet(
    'output.parquet',
    engine='pyarrow',           # Modern engine
    version='2.6',              # Latest Parquet format
    compression='snappy',       # Standard compression
    use_dictionary=True,        # Efficient encoding
    write_statistics=True       # Enable metadata
)
```

#### Best Practices:

1. ‚úÖ **Standardize on PyArrow** for all writes
2. ‚úÖ **Keep fastparquet** as fallback reader
3. ‚úÖ **Validate after migration** (what we did in cell 16)
4. ‚úÖ **Document source system** in metadata
5. ‚úÖ **Set up automated repair pipeline** (use our src/ingestion.py)

In [21]:
# üî¨ Diagnostic: Analyze Parquet File Metadata
print("üî¨ Parquet File Diagnostics\n")
print("="*80)

if parquet_files:
    for file in parquet_files[:2]:  # Analyze both files
        print(f"\nüìÑ File: {file.name}")
        print("-"*80)
        
        try:
            # Read metadata without reading data
            import pyarrow.parquet as pq
            parquet_file = pq.ParquetFile(file)
            metadata = parquet_file.metadata
            
            print(f"‚úÖ Metadata readable")
            print(f"\nüìä Basic Info:")
            print(f"   Rows: {metadata.num_rows:,}")
            print(f"   Row Groups: {metadata.num_row_groups}")
            print(f"   Columns: {metadata.num_columns}")
            print(f"   Size: {file.stat().st_size / (1024*1024):.2f} MB")
            
            # Check Parquet version
            print(f"\nüìã Format Details:")
            print(f"   Format Version: {metadata.format_version}")
            print(f"   Created By: {metadata.created_by or 'Unknown'}")
            
            # Analyze schema
            schema = parquet_file.schema_arrow
            print(f"\nüèóÔ∏è Schema Analysis:")
            print(f"   Total Fields: {len(schema)}")
            
            # Check for nested/complex types
            nested_cols = []
            for field in schema:
                if str(field.type).startswith(('list', 'struct', 'map')):
                    nested_cols.append(field.name)
            
            if nested_cols:
                print(f"   ‚ö†Ô∏è Nested/Complex Columns: {len(nested_cols)}")
                print(f"      {nested_cols[:5]}")
            else:
                print(f"   ‚úÖ No nested columns (all flat)")
            
            # Check row group sizes
            print(f"\nüì¶ Row Group Analysis:")
            row_group_sizes = []
            for i in range(min(3, metadata.num_row_groups)):  # Check first 3
                rg = metadata.row_group(i)
                row_group_sizes.append(rg.num_rows)
            
            print(f"   Rows per group: {row_group_sizes}")
            if len(set(row_group_sizes)) > 1:
                print(f"   ‚ö†Ô∏è Variable row group sizes (may indicate issues)")
            else:
                print(f"   ‚úÖ Consistent row group sizes")
            
            # Try to identify the source system
            print(f"\nüîç Source System Detection:")
            created_by = metadata.created_by or ""
            
            if 'spark' in created_by.lower():
                print(f"   üéØ Likely Source: Apache Spark")
                print(f"      Recommendation: Use fastparquet for reading")
            elif 'parquet-cpp' in created_by.lower():
                version = created_by.split('version')[1].split(')')[0] if 'version' in created_by else 'unknown'
                print(f"   üéØ Likely Source: C++ Parquet library {version}")
                print(f"      Recommendation: Upgrade writer or use fastparquet")
            elif 'parquet-mr' in created_by.lower():
                print(f"   üéØ Likely Source: Java Parquet (Hadoop ecosystem)")
                print(f"      Recommendation: Use fastparquet for reading")
            elif 'pandas' in created_by.lower():
                print(f"   üéØ Likely Source: Pandas")
                print(f"      Recommendation: Update pandas version")
            else:
                print(f"   ‚ùì Unknown Source: {created_by[:50]}")
                print(f"      Recommendation: Use fastparquet for maximum compatibility")
            
        except Exception as e:
            print(f"‚ùå Cannot read metadata: {str(e)[:100]}")
            print(f"   This suggests severe format issues or file corruption")

print(f"\n{'='*80}")
print(f"üí° Summary: Your files show classic signs of legacy Parquet format")
print(f"   The repair process (cells 15-16) modernizes them for compatibility")
print(f"{'='*80}")

üî¨ Parquet File Diagnostics


üìÑ File: aims_assets.parquet
--------------------------------------------------------------------------------
‚úÖ Metadata readable

üìä Basic Info:
   Rows: 849,951
   Row Groups: 9
   Columns: 26
   Size: 31.04 MB

üìã Format Details:
   Format Version: 2.6
   Created By: parquet-cpp-arrow version 21.0.0

üèóÔ∏è Schema Analysis:
   Total Fields: 26
   ‚úÖ No nested columns (all flat)

üì¶ Row Group Analysis:
   Rows per group: [100000, 100000, 100000]
   ‚úÖ Consistent row group sizes

üîç Source System Detection:
   üéØ Likely Source: C++ Parquet library  21.0.0
      Recommendation: Upgrade writer or use fastparquet

üìÑ File: aims_attributes.parquet
--------------------------------------------------------------------------------
‚úÖ Metadata readable

üìä Basic Info:
   Rows: 6,428
   Row Groups: 1
   Columns: 33
   Size: 0.35 MB

üìã Format Details:
   Format Version: 2.6
   Created By: parquet-cpp-arrow version 21.0.0

üèóÔ∏è Schema An

In [11]:
# Get detailed statistics
if parquet_files:
    print("\nüìà Data Statistics:")
    display(df.describe())
    
    print("\nüîç Null Values:")
    null_counts = df.isnull().sum()
    print(null_counts[null_counts > 0])


üìà Data Statistics:


Unnamed: 0,ID,CODE,NAME,TYPE,CLASS,UNITOFMEASURE,DEFINITION,VALIDFROM,VALIDTO,ATTRIBUTEGROUPID,...,AGGREGATEATTRIBUTEID,AGGREGATEATTRIBUTE,AGGREGATEFUNCTION,CREATEDBY,WHENCREATED,UPDATEDBY,LASTUPDATED,REMOVALREASON,ATTRIBUTEDOMAINID,ATTRIBUTEDOMAIN
count,6428,6428.0,6428.0,6428,6428,6428.0,6428.0,6428,6428,6150.0,...,2.0,6428.0,6428.0,6428.0,6428,6428.0,6428,6428.0,679.0,6428.0
unique,6428,5914.0,6302.0,52,2,156.0,944.0,5126,86,133.0,...,1.0,2.0,2.0,11.0,103,10.0,54,1.0,554.0,554.0
top,13000186,,,Numeric,Attribute,,,2025-03-17 13:49:17,NaT,157.0,...,13033011.0,,,,NaT,,NaT,,13028277.0,
freq,1,40.0,10.0,1963,6355,5357.0,5138.0,104,6096,1230.0,...,2.0,6426.0,6426.0,5439.0,5439,6219.0,6219,6428.0,88.0,5749.0



üîç Null Values:
ATTRIBUTEGROUPID         278
SOURCEATTRIBUTEID       6410
STATICVALUE             5822
CONTRACTORTOCOMPLETE       5
OWNERID                   58
AGGREGATABLE            3146
RELATIONSHIPTYPEID      6426
AGGREGATEATTRIBUTEID    6426
ATTRIBUTEDOMAINID       5749
dtype: int64


## 5. Watermark Manager (Simple Version)

In [12]:
class SimpleWatermarkManager:
    """Simple watermark manager using SQLite."""
    
    def __init__(self, db_path: Path):
        self.db_path = db_path
        self._init_db()
    
    def _init_db(self):
        """Initialize database."""
        conn = sqlite3.connect(str(self.db_path))
        conn.execute("""
            CREATE TABLE IF NOT EXISTS watermarks (
                source_name TEXT PRIMARY KEY,
                watermark_value TEXT NOT NULL,
                last_updated TIMESTAMP NOT NULL,
                records_processed INTEGER DEFAULT 0
            )
        """)
        conn.commit()
        conn.close()
    
    def get_watermark(self, source_name: str) -> Optional[str]:
        """Get current watermark."""
        conn = sqlite3.connect(str(self.db_path))
        cursor = conn.execute(
            "SELECT watermark_value FROM watermarks WHERE source_name = ?",
            (source_name,)
        )
        result = cursor.fetchone()
        conn.close()
        return result[0] if result else None
    
    def update_watermark(self, source_name: str, watermark_value: str, records: int = 0):
        """Update watermark."""
        conn = sqlite3.connect(str(self.db_path))
        conn.execute("""
            INSERT OR REPLACE INTO watermarks 
            (source_name, watermark_value, last_updated, records_processed)
            VALUES (?, ?, ?, ?)
        """, (source_name, watermark_value, datetime.now().isoformat(), records))
        conn.commit()
        conn.close()
    
    def list_watermarks(self) -> pd.DataFrame:
        """List all watermarks."""
        conn = sqlite3.connect(str(self.db_path))
        df = pd.read_sql_query("SELECT * FROM watermarks", conn)
        conn.close()
        return df

# Initialize watermark manager
watermark_mgr = SimpleWatermarkManager(WATERMARK_DB_PATH)
print("‚úÖ Watermark manager initialized")

‚úÖ Watermark manager initialized


## 6. Incremental Load Function

In [13]:
def load_incremental(
    source_name: str,
    source_files: List[Path],
    target_path: Path,
    watermark_column: str,
    watermark_mgr: SimpleWatermarkManager,
    engine: str = 'fastparquet'  # Default to fastparquet for better compatibility
) -> Dict[str, Any]:
    """Load data incrementally with robust parquet reading."""
    
    print(f"\nüîÑ Starting incremental load for: {source_name}")
    
    # Get last watermark
    last_watermark = watermark_mgr.get_watermark(source_name)
    print(f"Last watermark: {last_watermark or 'None (first load)'}")
    
    # Read all source files with error handling
    dfs = []
    for file in source_files:
        try:
            # Try specified engine first
            df = pd.read_parquet(file, engine=engine)
            dfs.append(df)
            print(f"  ‚úì Read {file.name} ({len(df):,} rows)")
        except Exception as e1:
            try:
                # Fallback to alternative engine
                alt_engine = 'pyarrow' if engine == 'fastparquet' else 'fastparquet'
                df = pd.read_parquet(file, engine=alt_engine)
                dfs.append(df)
                print(f"  ‚úì Read {file.name} with {alt_engine} ({len(df):,} rows)")
            except Exception as e2:
                print(f"  ‚úó Failed to read {file.name}: {str(e2)[:80]}")
                continue
    
    if not dfs:
        return {"status": "error", "message": "No files could be read", "records_processed": 0}
    
    df_all = pd.concat(dfs, ignore_index=True)
    print(f"Total source records: {len(df_all):,}")
    
    # Filter based on watermark
    if last_watermark and watermark_column in df_all.columns:
        # Convert watermark column to datetime if it's a string
        if df_all[watermark_column].dtype == 'object':
            df_all[watermark_column] = pd.to_datetime(df_all[watermark_column], errors='coerce')
        
        last_watermark_dt = pd.to_datetime(last_watermark)
        df_new = df_all[df_all[watermark_column] > last_watermark_dt]
    else:
        df_new = df_all
    
    print(f"New records to process: {len(df_new):,}")
    
    if len(df_new) == 0:
        return {"status": "no_new_data", "records_processed": 0}
    
    # Get new watermark
    if watermark_column in df_new.columns:
        new_watermark = df_new[watermark_column].max()
    else:
        new_watermark = datetime.now().isoformat()
    
    # Save to target
    target_file = target_path / f"{source_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.parquet"
    df_new.to_parquet(target_file, engine='pyarrow', compression='snappy', index=False)
    print(f"‚úÖ Saved to: {target_file.name}")
    
    # Update watermark
    watermark_mgr.update_watermark(source_name, str(new_watermark), len(df_new))
    
    return {
        "status": "success",
        "records_processed": len(df_new),
        "old_watermark": last_watermark,
        "new_watermark": str(new_watermark),
        "target_file": str(target_file)
    }

print("‚úÖ Incremental load function defined")

‚úÖ Incremental load function defined


## 7. Run Incremental Load

In [14]:
# Run Incremental Load with repaired files
# Make sure you've run cells 14-15 to repair the parquet files first!

# Check if we have repaired files OR use the in-memory dataframes
if 'df_assets' in locals() and 'df_attributes' in locals():
    print("‚úÖ Using in-memory repaired dataframes\n")
    
    # Show available columns from assets
    print("üìä ASSETS - Available columns for watermark:")
    for col in df_assets.columns:
        if 'DATE' in col.upper() or 'UPDATED' in col.upper() or 'CREATED' in col.upper():
            print(f"  ‚úì {col} ({df_assets[col].dtype}) - Good watermark candidate")
        else:
            print(f"    {col} ({df_assets[col].dtype})")
    
    print("\nüìä ATTRIBUTES - Available columns for watermark:")
    for col in df_attributes.columns:
        if 'DATE' in col.upper() or 'UPDATED' in col.upper() or 'CREATED' in col.upper():
            print(f"  ‚úì {col} ({df_attributes[col].dtype}) - Good watermark candidate")
        else:
            print(f"    {col} ({df_attributes[col].dtype})")
    
    # Example: Load assets incrementally
    print("\n" + "="*80)
    print("UNCOMMENT THE CODE BELOW TO RUN INCREMENTAL LOAD")
    print("="*80)
    
    # Uncomment this to load assets
    # print("\nüîÑ Loading ASSETS incrementally...")
    # result_assets = load_incremental(
    #     source_name="aims_assets",
    #     source_files=[REPAIRED_DATA_PATH / "aims_assets_repaired.parquet"] if (TARGET_DATA_PATH / "repaired").exists() else parquet_files[:1],
    #     target_path=TARGET_DATA_PATH,
    #     watermark_column="LASTUPDATED",  # Adjust based on output above
    #     watermark_mgr=watermark_mgr
    # )
    # print("\nüìä Assets Load Results:")
    # for key, value in result_assets.items():
    #     print(f"  {key}: {value}")
    
    # Uncomment this to load attributes
    # print("\nüîÑ Loading ATTRIBUTES incrementally...")
    # result_attrs = load_incremental(
    #     source_name="aims_attributes",
    #     source_files=[REPAIRED_DATA_PATH / "aims_attributes_repaired.parquet"] if (TARGET_DATA_PATH / "repaired").exists() else parquet_files[1:2],
    #     target_path=TARGET_DATA_PATH,
    #     watermark_column="LASTUPDATED",  # Adjust based on output above
    #     watermark_mgr=watermark_mgr
    # )
    # print("\nüìä Attributes Load Results:")
    # for key, value in result_attrs.items():
    #     print(f"  {key}: {value}")

else:
    print("‚ö†Ô∏è No repaired data available!")
    print("\nPlease run these cells first:")
    print("  1. Cell 4  - Install fastparquet")
    print("  2. Cell 14 - Read files with fastparquet")
    print("  3. Cell 15 - Convert to repaired format")
    print("\nThen come back to this cell.")

‚úÖ Using in-memory repaired dataframes

üìä ASSETS - Available columns for watermark:
    ID (object)
    OWNERID (object)
    OWNER (object)
    PHASEID (object)
    PHASE (object)
    NAME (object)
    NOTES (object)
    HYBRIDBILLCODE (object)
    REFERENCECHAINAGE (object)
    VALIDFROM (object)
    VALIDTO (object)
  ‚úì CREATEDBY (object) - Good watermark candidate
  ‚úì WHENCREATED (object) - Good watermark candidate
  ‚úì UPDATEDBY (object) - Good watermark candidate
  ‚úì LASTUPDATED (object) - Good watermark candidate
    UNIQUEASSETID (object)
  ‚úì LASTUPDATEDATE (object) - Good watermark candidate
    REMOVEDBY (object)
    REMOVEDREASON (object)
    STAGEID (object)
    STAGE (object)
    STATUS (object)
    ISPRIMARYASSET (object)
  ‚úì LASTUPDATEDATETHISASSET (object) - Good watermark candidate
    PRODUCTID (object)
    PRODUCT (object)

üìä ATTRIBUTES - Available columns for watermark:
    ID (object)
    CODE (object)
    NAME (object)
    TYPE (object)
    CLASS 

## 8. View Watermarks

In [15]:
# View all watermarks
watermarks_df = watermark_mgr.list_watermarks()

if len(watermarks_df) > 0:
    print("üìä Current Watermarks:")
    display(watermarks_df)
else:
    print("‚ö†Ô∏è No watermarks found. Run an incremental load first.")

‚ö†Ô∏è No watermarks found. Run an incremental load first.


## 9. Data Quality Checks

In [17]:
# Simple data quality checks using repaired data
if 'df_assets' in locals():
    df = df_assets
    print("üìä Using df_assets for quality checks")
elif 'df_attributes' in locals():
    df = df_attributes
    print("üìä Using df_attributes for quality checks")
else:
    print("‚ö†Ô∏è No repaired data loaded. Run the repair cells first (cells 15-16)")
    df = None

if df is not None:
    print("\nüîç Data Quality Report")
    print("=" * 50)
    
    # 1. Completeness
    print("\n1. Completeness (Null Values):")
    null_pct = (df.isnull().sum() / len(df)) * 100
    null_cols = [(col, pct) for col, pct in null_pct.items() if pct > 0]
    if null_cols:
        for col, pct in sorted(null_cols, key=lambda x: x[1], reverse=True)[:10]:
            print(f"  {col}: {pct:.2f}%")
    else:
        print("  ‚úÖ No null values found!")
    
    # 2. Duplicates
    dup_count = df.duplicated().sum()
    print(f"\n2. Duplicate Rows: {dup_count:,}")
    if dup_count == 0:
        print("  ‚úÖ No duplicates found!")
    
    # 3. Data types
    print("\n3. Data Types Summary:")
    dtype_counts = df.dtypes.value_counts()
    for dtype, count in dtype_counts.items():
        print(f"  {dtype}: {count} columns")
    
    # 4. Record count
    print(f"\n4. Total Records: {len(df):,}")
    print(f"   Total Columns: {len(df.columns)}")
    
    # 5. Memory usage
    memory_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
    print(f"\n5. Memory Usage: {memory_mb:.2f} MB")


üìä Using df_assets for quality checks

üîç Data Quality Report

1. Completeness (Null Values):
  PRODUCTID: 17.64%

2. Duplicate Rows: 0
  ‚úÖ No duplicates found!

3. Data Types Summary:
  object: 26 columns

4. Total Records: 849,951
   Total Columns: 26

2. Duplicate Rows: 0
  ‚úÖ No duplicates found!

3. Data Types Summary:
  object: 26 columns

4. Total Records: 849,951
   Total Columns: 26

5. Memory Usage: 1304.36 MB

5. Memory Usage: 1304.36 MB


## 10. Query Loaded Data

In [18]:
# Query the repaired data that's already loaded in memory
print("üìä Available DataFrames in Memory:\n")

if 'df_assets' in locals():
    print(f"‚úÖ df_assets: {len(df_assets):,} rows √ó {len(df_assets.columns)} columns")
    print(f"   Columns: {', '.join(df_assets.columns[:5])}...")
    print(f"\n   Sample (first 5 rows):")
    display(df_assets.head(5))
else:
    print("‚ö†Ô∏è df_assets not loaded")

if 'df_attributes' in locals():
    print(f"\n‚úÖ df_attributes: {len(df_attributes):,} rows √ó {len(df_attributes.columns)} columns")
    print(f"   Columns: {', '.join(df_attributes.columns[:5])}...")
    print(f"\n   Sample (first 5 rows):")
    display(df_attributes.head(5))
else:
    print("‚ö†Ô∏è df_attributes not loaded")

# Also check for loaded files in target directory (after incremental load)
print("\n" + "="*80)
print("üìÅ Files in Target Directory (after incremental load):")
target_files = list(TARGET_DATA_PATH.glob('*.parquet'))

if target_files:
    print(f"\nFound {len(target_files)} loaded file(s):")
    for f in target_files:
        size_mb = f.stat().st_size / 1024 / 1024
        print(f"  - {f.name} ({size_mb:.2f} MB)")
else:
    print("  ‚ö†Ô∏è No files yet. Run incremental load (Section 7) to populate.")


üìä Available DataFrames in Memory:

‚úÖ df_assets: 849,951 rows √ó 26 columns
   Columns: ID, OWNERID, OWNER, PHASEID, PHASE...

   Sample (first 5 rows):


Unnamed: 0,ID,OWNERID,OWNER,PHASEID,PHASE,NAME,NOTES,HYBRIDBILLCODE,REFERENCECHAINAGE,VALIDFROM,...,LASTUPDATEDATE,REMOVEDBY,REMOVEDREASON,STAGEID,STAGE,STATUS,ISPRIMARYASSET,LASTUPDATEDATETHISASSET,PRODUCTID,PRODUCT
0,13254212,13000000,HS2,13000015,Phase 1 - London to Birmingham & Handsacre,MS187-CT035,Tmp1=GIWorkPackageID; Tmp2=GiWorkPackageName,,187451,2015-01-01 00:00:00,...,2019-12-20 13:59:51,,,6,"6 - Handback (Handback = Testing, Commissionin...",Published,0,2019-12-20 13:59:51,,- -
1,13254213,13000000,HS2,13000015,Phase 1 - London to Birmingham & Handsacre,ML094-WS037,Tmp1=GIWorkPackageID; Tmp2=GiWorkPackageName,,94888,2015-01-01 00:00:00,...,2019-12-20 13:50:44,,,6,"6 - Handback (Handback = Testing, Commissionin...",Published,0,2019-12-20 13:50:44,,- -
2,13254214,13000000,HS2,13000015,Phase 1 - London to Birmingham & Handsacre,ML000-RM169,Tmp1=GIWorkPackageID; Tmp2=GiWorkPackageName,,110,2015-01-01 00:00:00,...,2019-12-20 13:55:33,,,6,"6 - Handback (Handback = Testing, Commissionin...",Published,0,2019-12-20 13:55:33,,- -
3,13254215,13000000,HS2,13000015,Phase 1 - London to Birmingham & Handsacre,ML152-TP006,Tmp1=GIWorkPackageID; Tmp2=GiWorkPackageName,,152168,2015-01-01 00:00:00,...,2019-12-20 14:04:27,,,6,"6 - Handback (Handback = Testing, Commissionin...",Published,0,2019-12-20 14:04:27,,- -
4,13254209,13000000,HS2,13000015,Phase 1 - London to Birmingham & Handsacre,ML183-TP007,Tmp1=GIWorkPackageID; Tmp2=GiWorkPackageName,,183493,2015-01-01 00:00:00,...,2019-12-20 14:04:19,,,6,"6 - Handback (Handback = Testing, Commissionin...",Published,0,2019-12-20 14:04:19,,- -



‚úÖ df_attributes: 6,428 rows √ó 33 columns
   Columns: ID, CODE, NAME, TYPE, CLASS...

   Sample (first 5 rows):


Unnamed: 0,ID,CODE,NAME,TYPE,CLASS,UNITOFMEASURE,DEFINITION,VALIDFROM,VALIDTO,ATTRIBUTEGROUPID,...,AGGREGATEATTRIBUTEID,AGGREGATEATTRIBUTE,AGGREGATEFUNCTION,CREATEDBY,WHENCREATED,UPDATEDBY,LASTUPDATED,REMOVALREASON,ATTRIBUTEDOMAINID,ATTRIBUTEDOMAIN
0,13000186,PHASE,Phase,Phase,Core,,,2015-01-01 00:00:00,NaT,,...,,,,,NaT,,NaT,,,
1,13000187,HYBRIDBILLCODE,Hybrid Bill/Act Code,Hybrid bill code,Core,,Hybrid Bill / Act Work No,2015-01-01 00:00:00,NaT,,...,,,,,NaT,,NaT,,,
2,13000285,NAME,Name,Name,Core,,Common or Structured name for the asset.,2015-01-01 00:00:00,NaT,,...,,,,,NaT,,NaT,,,
3,13000381,PERSON,Person,Person,Core,,,2015-01-01 00:00:00,NaT,,...,,,,,NaT,,NaT,,,
4,13000382,ORGANISATION,Organisation,Organisation,Core,,,2015-01-01 00:00:00,NaT,,...,,,,,NaT,,NaT,,,



üìÅ Files in Target Directory (after incremental load):
  ‚ö†Ô∏è No files yet. Run incremental load (Section 7) to populate.


## 11. Export Summary Report

In [19]:
# Generate summary report
def generate_summary_report():
    """Generate a summary report of the data platform."""
    
    report = {
        "generated_at": datetime.now().isoformat(),
        "source_files": len(list(SOURCE_DATA_PATH.glob('*.parquet'))),
        "target_files": len(list(TARGET_DATA_PATH.glob('*.parquet'))),
        "watermarks": watermark_mgr.list_watermarks().to_dict('records')
    }
    
    # Save report
    report_file = Path("/home/sanmi/Documents/HS2/HS2_PROJECTS_2025/AIMS_LOCAL/data_platform_report.json")
    with open(report_file, 'w') as f:
        json.dump(report, f, indent=2)
    
    print(f"‚úÖ Report saved to: {report_file}")
    return report

summary = generate_summary_report()
print("\nüìä Summary Report:")
print(json.dumps(summary, indent=2))

‚úÖ Report saved to: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/AIMS_LOCAL/data_platform_report.json

üìä Summary Report:
{
  "generated_at": "2025-10-17T09:43:58.826965",
  "source_files": 2,
  "target_files": 0,
  "watermarks": []
}


## 12. Export Data to CSV

In [20]:
# Export DataFrames to CSV
CSV_EXPORT_PATH = Path("/home/sanmi/Documents/HS2/HS2_PROJECTS_2025/AIMS_LOCAL/data/csv_exports")
CSV_EXPORT_PATH.mkdir(parents=True, exist_ok=True)

print("üì§ Exporting data to CSV...\n")

exported_files = []

# Export df_assets
if 'df_assets' in locals() and df_assets is not None:
    csv_file = CSV_EXPORT_PATH / f"aims_assets_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    df_assets.to_csv(csv_file, index=False, encoding='utf-8')
    size_mb = csv_file.stat().st_size / 1024 / 1024
    print(f"‚úÖ Exported df_assets")
    print(f"   üìÅ File: {csv_file.name}")
    print(f"   üìä Records: {len(df_assets):,}")
    print(f"   üíæ Size: {size_mb:.2f} MB")
    exported_files.append(str(csv_file))
else:
    print("‚ö†Ô∏è df_assets not available for export")

print()

# Export df_attributes
if 'df_attributes' in locals() and df_attributes is not None:
    csv_file = CSV_EXPORT_PATH / f"aims_attributes_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    df_attributes.to_csv(csv_file, index=False, encoding='utf-8')
    size_mb = csv_file.stat().st_size / 1024 / 1024
    print(f"‚úÖ Exported df_attributes")
    print(f"   üìÅ File: {csv_file.name}")
    print(f"   üìä Records: {len(df_attributes):,}")
    print(f"   üíæ Size: {size_mb:.2f} MB")
    exported_files.append(str(csv_file))
else:
    print("‚ö†Ô∏è df_attributes not available for export")

if exported_files:
    print(f"\n{'='*80}")
    print(f"‚ú® Successfully exported {len(exported_files)} file(s) to:")
    print(f"   {CSV_EXPORT_PATH}")
    print(f"{'='*80}")
else:
    print("\n‚ö†Ô∏è No data available to export. Run cells 15-16 first to load the data.")

üì§ Exporting data to CSV...

‚úÖ Exported df_assets
   üìÅ File: aims_assets_20251017_094551.csv
   üìä Records: 849,951
   üíæ Size: 297.11 MB

‚úÖ Exported df_attributes
   üìÅ File: aims_attributes_20251017_094558.csv
   üìä Records: 6,428
   üíæ Size: 1.55 MB

‚ú® Successfully exported 2 file(s) to:
   /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/AIMS_LOCAL/data/csv_exports
