# Data Discovery and Analysis

This notebook performs comprehensive data discovery and analysis on the S3 data lake.

In [15]:
# Import required libraries
import sys
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from pathlib import Path

# Add project root to path
project_root = Path().absolute()
sys.path.insert(0, str(project_root))

from utils.database import DuckDBManager
from utils.data_processing import DataQualityChecker
from config.settings import config

# Initialize components
config.setup_logging()
db_manager = DuckDBManager()
quality_checker = DataQualityChecker()

print("📊 Data Discovery Notebook Initialized")
print(f"🔗 S3 Ingestion Path: {config.INGESTION_PATH}")

📊 Data Discovery Notebook Initialized
🔗 S3 Ingestion Path: s3://vendor-data-s3/LSEG/TRTH/LSE/ingestion


## 1. Data Inventory and File Analysis

In [16]:
# Discover and catalog all data files
print("🔍 Discovering data files...")

inventory_query = f"""
WITH file_inventory AS (
    SELECT 
        filename,
        COUNT(*) as estimated_rows,
        REGEXP_EXTRACT(filename, '([^/]+)\\.csv\\.gz$', 1) as file_basename,
        REGEXP_EXTRACT(filename, '(\\\\d{{4}}-\\\\d{{2}}-\\\\d{{2}})', 1) as date_from_filename
    FROM read_csv('{config.INGESTION_PATH}/*/*.csv.gz', 
                 AUTO_DETECT=true, 
                 FILENAME=true,
                 SAMPLE_SIZE=10000) 
    GROUP BY filename
)
SELECT 
    filename,
    estimated_rows,
    file_basename,
    date_from_filename,
    estimated_rows * 50 as total_estimated_rows
FROM file_inventory
ORDER BY estimated_rows DESC
"""

try:
    inventory = db_manager.execute_query(inventory_query)
    
    print(f"✅ Found {len(inventory)} data files")
    print(f"📊 Estimated total rows: {inventory['total_estimated_rows'].sum():,}")
    
    # Display top files by size
    print("\n📋 Largest files:")
    print(inventory.head(10))
    
except Exception as e:
    print(f"❌ File discovery failed: {e}")

2025-06-18 17:11:41,086 - utils.database - INFO - S3 credentials configured
2025-06-18 17:11:41,087 - utils.database - INFO - DuckDB connection configured successfully


🔍 Discovering data files...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

2025-06-18 17:16:40,841 - utils.database - INFO - Query executed successfully. Returned 1 rows


✅ Found 1 data files
📊 Estimated total rows: 5,041,105,550

📋 Largest files:
                                            filename  estimated_rows  \
0  s3://vendor-data-s3/LSEG/TRTH/LSE/ingestion/20...       100822111   

                             file_basename date_from_filename  \
0  LSE-2023-09-01-NORMALIZEDMP-Data-1-of-1                      

   total_estimated_rows  
0            5041105550  


## 2. Schema Analysis

In [17]:
# Analyze schema and data structure
print("🔍 Analyzing data schema...")

sample_query = f"""
SELECT * 
FROM read_csv('{config.INGESTION_PATH}/*/*.csv.gz', 
             AUTO_DETECT=true, 
             FILENAME=true,
             SAMPLE_SIZE=5000)
LIMIT 1000
"""

try:
    sample_data = db_manager.execute_query(sample_query)
    
    print(f"✅ Schema analysis completed")
    print(f"📊 Dataset shape: {sample_data.shape}")
    
    # Display sample data
    print(f"\n🔍 Sample Data (first 5 rows):")
    print(sample_data.head())
    
    # Display column info
    print(f"\n📋 Columns ({len(sample_data.columns)}):")
    for i, col in enumerate(sample_data.columns[:10]):
        print(f"  {i+1}. {col} ({sample_data[col].dtype})")
    if len(sample_data.columns) > 10:
        print(f"  ... and {len(sample_data.columns) - 10} more columns")
    
except Exception as e:
    print(f"❌ Schema analysis failed: {e}")

🔍 Analyzing data schema...


2025-06-18 17:16:41,887 - utils.database - INFO - Query executed successfully. Returned 1000 rows


✅ Schema analysis completed
📊 Dataset shape: (1000, 114)

🔍 Sample Data (first 5 rows):
        #RIC        Domain                        Date-Time GMT Offset   Type  \
0  .TRX50GBP  Market Price 2023-09-01 02:59:58.447680-04:00         +1  Trade   
1  .TRX50GBP  Market Price 2023-09-01 03:00:58.438312-04:00         +1  Trade   
2  .TRX50GBP  Market Price 2023-09-01 03:01:58.408014-04:00         +1  Trade   
3  .TRX50GBP  Market Price 2023-09-01 03:02:58.438451-04:00         +1  Trade   
4  .TRX50GBP  Market Price 2023-09-01 03:03:58.432486-04:00         +1  Trade   

  Ex/Cntrb.ID   LOC   Price Volume Market VWAP  ... Imbalance Activity Type  \
0        None  None  113.90   None        None  ...                    None   
1        None  None  114.23   None        None  ...                    None   
2        None  None  114.20   None        None  ...                    None   
3        None  None  114.17   None        None  ...                    None   
4        None  None  114.15   

## 3. Data Quality Assessment

In [18]:
# Basic data quality assessment
print("🔎 Performing data quality assessment...")

if 'sample_data' in locals() and not sample_data.empty:
    print("\n📊 Data Quality Report:")
    print(f"Total Rows: {len(sample_data):,}")
    print(f"Total Columns: {len(sample_data.columns)}")
    
    # Check for null values
    null_counts = sample_data.isnull().sum()
    null_cols = null_counts[null_counts > 0]
    
    if len(null_cols) > 0:
        print(f"\n❌ Columns with null values ({len(null_cols)}):")
        for col, count in null_cols.head(10).items():
            pct = (count / len(sample_data)) * 100
            print(f"  {col}: {count} nulls ({pct:.1f}%)")
    else:
        print("\n✅ No null values found in sample")
    
    # Check for duplicates
    duplicate_count = sample_data.duplicated().sum()
    print(f"\n🔄 Duplicate rows: {duplicate_count} ({(duplicate_count/len(sample_data)*100):.1f}%)")
    
else:
    print("❌ No sample data available for quality assessment")

🔎 Performing data quality assessment...

📊 Data Quality Report:
Total Rows: 1,000
Total Columns: 114

❌ Columns with null values (103):
  Ex/Cntrb.ID: 1000 nulls (100.0%)
  LOC: 1000 nulls (100.0%)
  Volume: 1000 nulls (100.0%)
  Market VWAP: 1000 nulls (100.0%)
  Buyer ID: 1000 nulls (100.0%)
  Bid Price: 1000 nulls (100.0%)
  Bid Size: 1000 nulls (100.0%)
  No. Buyers: 1000 nulls (100.0%)
  Seller ID: 1000 nulls (100.0%)
  Ask Price: 1000 nulls (100.0%)

🔄 Duplicate rows: 0 (0.0%)


## 4. Generate Summary Report

In [19]:
# Generate final summary and recommendations
print("📋 Data Discovery Summary")
print("=" * 50)

if 'inventory' in locals() and not inventory.empty:
    print(f"📁 Total Files: {len(inventory)}")
    print(f"📊 Estimated Total Rows: {inventory['total_estimated_rows'].sum():,}")
    if 'date_from_filename' in inventory.columns:
        valid_dates = inventory['date_from_filename'].dropna()
        if len(valid_dates) > 0:
            print(f"📅 Date Range: {valid_dates.min()} to {valid_dates.max()}")

if 'sample_data' in locals() and not sample_data.empty:
    print(f"🏗️  Schema: {sample_data.shape[1]} columns")
    print(f"📋 Sample Size: {len(sample_data)} rows")
    
    # Data type summary
    dtype_counts = sample_data.dtypes.value_counts()
    print(f"📊 Data Types: {', '.join([f'{count} {dtype}' for dtype, count in dtype_counts.items()])}")

print("\n💡 Recommendations:")
print("1. Proceed to bronze layer creation (03_bronze_layer.ipynb)")
print("2. Implement data quality monitoring")
print("3. Consider partitioning by date for better performance")
print("4. Convert to Parquet format for optimization")

print("\n✅ Data discovery completed successfully!")

📋 Data Discovery Summary
📁 Total Files: 1
📊 Estimated Total Rows: 5,041,105,550
📅 Date Range:  to 
🏗️  Schema: 114 columns
📋 Sample Size: 1000 rows
📊 Data Types: 109 object, 4 float64, 1 datetime64[us, America/New_York]

💡 Recommendations:
1. Proceed to bronze layer creation (03_bronze_layer.ipynb)
2. Implement data quality monitoring
3. Consider partitioning by date for better performance
4. Convert to Parquet format for optimization

✅ Data discovery completed successfully!
