### Manual Test and Replace Scripts, Analytics, Ad-hoc test analysis.

In [None]:
"""
# Cell 1: Configuration
INCR_PATH = "s3://sentence-data-ingestion/DATA_MERGE_ASSETS/INCREMENTAL_DATA_SDK/finrag_sec_incremental_stg_data.parquet"
# ↑ Swap this line to test different incremental sources

# Cell 2: Load Config & Setup
config = ETLConfig()  # Gets hist_path, bucket, credentials
s3 = config.get_s3_client()
storage_options = config.get_storage_options()

# Cell 3: Pre-flight Validation
# Check HIST exists, Check INCR exists, Verify S3 access

# Cell 4: Duplicate Analysis  
# Load both files, count internal dupes, check overlap
# Show final comparison DataFrame

# Cell 5: Schema Inspector
# Compare columns, show mapping applied
# Display schema comparison DataFrame
"""

In [6]:
"""
Quick-swap incremental path for testing
Provide full S3 URI - we'll extract the key automatically
"""

# === SWAP THIS PATH TO TEST DIFFERENT INCREMENTAL SOURCES ===
INCR_PATH_URI = "s3://sentence-data-ingestion/DATA_MERGE_ASSETS/INCREMENTAL_DATA_SDK/finrag_sec_incremental_stg_data.parquet"

# Alternative paths (uncomment to test):
# INCR_PATH_URI = "s3://sentence-data-ingestion/DATA_MERGE_ASSETS/INCREMENTAL_DATA/finrag_sec_incremental_stg_data.parquet"

# Extract S3 key from URI (remove s3://bucket-name/ prefix)
INCR_PATH = INCR_PATH_URI.replace(f"s3://{config.bucket}/", "") if "s3://" in INCR_PATH_URI else INCR_PATH_URI

print(f"Testing with incremental path:")
print(f"  Full URI: {INCR_PATH_URI}")
print(f"  S3 Key: {INCR_PATH}")

Testing with incremental path:
  Full URI: s3://sentence-data-ingestion/DATA_MERGE_ASSETS/INCREMENTAL_DATA_SDK/finrag_sec_incremental_stg_data.parquet
  S3 Key: DATA_MERGE_ASSETS/INCREMENTAL_DATA_SDK/finrag_sec_incremental_stg_data.parquet


In [7]:
"""
Load ETL config and AWS clients
Historical path comes from config, incremental path from Cell 1
"""

import sys
from pathlib import Path
import polars as pl
import boto3

# Add project root
project_root = Path.cwd().parent.parent if 'notebooks' in str(Path.cwd()) else Path.cwd().parent
sys.path.append(str(project_root))

from src_aws_etl.etl.config_loader import ETLConfig

# Load config (gets historical path, bucket, credentials)
config = ETLConfig()
s3 = config.get_s3_client()
storage_options = config.get_storage_options()

# Paths
HIST_PATH = config.hist_path  # From config (always same)
BUCKET = config.bucket

print("Configuration loaded:")
print(f"  Bucket: {BUCKET}")
print(f"  Historical: {HIST_PATH}")
print(f"  Incremental: {INCR_PATH}")

[DEBUG] Modular file exists but empty/invalid
[DEBUG] Trying root fallback: d:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 Project\FinSights\DataPipeline\.env
[DEBUG] ✓ Credentials loaded from root .env
Configuration loaded:
  Bucket: sentence-data-ingestion
  Historical: DATA_MERGE_ASSETS/HISTORICAL_DATA/finrag_sec_fact_historical.parquet
  Incremental: DATA_MERGE_ASSETS/INCREMENTAL_DATA_SDK/finrag_sec_incremental_stg_data.parquet


In [8]:
"""
Validate both files exist and are accessible
"""

def check_s3_file(s3_client, bucket, key):
    """Check if S3 file exists and return size in MB"""
    response = s3_client.head_object(Bucket=bucket, Key=key)
    size_mb = response['ContentLength'] / (1024 * 1024)
    return size_mb

print("Pre-flight Validation\n")

# Check Historical
hist_size = check_s3_file(s3, BUCKET, HIST_PATH)
print(f"✓ Historical: {hist_size:.2f} MB")

# Check Incremental
incr_size = check_s3_file(s3, BUCKET, INCR_PATH)
print(f"✓ Incremental: {incr_size:.2f} MB")

# Verify S3 list permissions
s3.list_objects_v2(Bucket=BUCKET, Prefix="DATA_MERGE_ASSETS/", MaxKeys=1)
print(f"✓ S3 permissions: OK")

print(f"\nReady to proceed with analysis")

Pre-flight Validation

✓ Historical: 13.41 MB
✓ Incremental: 8.88 MB
✓ S3 permissions: OK

Ready to proceed with analysis


In [10]:
"""
Quick-swap paths for testing
Set all 4 file paths here
"""

# === FILE PATHS - CHANGE THESE TO TEST DIFFERENT SOURCES ===
HIST_PATH_URI = "https://sentence-data-ingestion.s3.us-east-1.amazonaws.com/DATA_MERGE_ASSETS/HISTORICAL_DATA/finrag_sec_fact_historical.parquet"
INCR_SDK_URI = "https://sentence-data-ingestion.s3.us-east-1.amazonaws.com/DATA_MERGE_ASSETS/INCREMENTAL_DATA_SDK/finrag_sec_incremental_stg_data.parquet"
INCR_CRAWL_URI = "https://sentence-data-ingestion.s3.us-east-1.amazonaws.com/DATA_MERGE_ASSETS/INCREMENTAL_DATA/finrag_sec_incremental_stg_data.parquet"
FINAL_PATH_URI = "https://sentence-data-ingestion.s3.us-east-1.amazonaws.com/DATA_MERGE_ASSETS/FINRAG_FACT_SENTENCES/finrag_fact_sentences.parquet"

print("Testing with file paths:")
print(f"  Historical: {HIST_PATH_URI}")
print(f"  Incremental (SDK): {INCR_SDK_URI}")
print(f"  Incremental (Crawled): {INCR_CRAWL_URI}")
print(f"  Final: {FINAL_PATH_URI}")


"""
Analyze duplicates across all 4 files with size display
No cross-file overlap - just internal duplicate counts
"""

import polars as pl

# Convert HTTPS URLs to S3 URIs for Polars
def https_to_s3(url):
    """Convert S3 HTTPS URL to s3:// URI"""
    return url.replace("https://sentence-data-ingestion.s3.us-east-1.amazonaws.com/", "s3://sentence-data-ingestion/")

def get_file_size_mb(s3_client, bucket, url):
    """Get file size from S3 in MB"""
    # Extract key from URL
    key = url.replace(f"https://{bucket}.s3.us-east-1.amazonaws.com/", "")
    response = s3_client.head_object(Bucket=bucket, Key=key)
    return response['ContentLength'] / (1024 * 1024)

# File definitions
files = {
    'Historical': HIST_PATH_URI,
    'Incremental (SDK)': INCR_SDK_URI,
    'Incremental (Crawled)': INCR_CRAWL_URI,
    'Final': FINAL_PATH_URI
}

print("Loading files and analyzing...\n")

# Collect analysis results
results = []

for file_name, url in files.items():
    # Get file size
    file_size_mb = get_file_size_mb(s3, BUCKET, url)
    
    # Load data
    s3_uri = https_to_s3(url)
    df = pl.read_parquet(s3_uri, storage_options=storage_options)
    
    # Analyze duplicates
    total_rows = len(df)
    unique_ids = df['sentenceID'].n_unique()
    internal_dupes = total_rows - unique_ids
    
    results.append({
        'File': file_name,
        'File Size': f"{file_size_mb:.1f} MB",
        'Total Rows': f"{total_rows:,}",
        'Unique IDs': f"{unique_ids:,}",
        'Internal Dupes': f"{internal_dupes:,}"
    })
    
    print(f"✓ {file_name}: {total_rows:,} rows ({file_size_mb:.1f} MB)")

print("\nDuplicate Analysis Summary\n")
pl.DataFrame(results)



Testing with file paths:
  Historical: https://sentence-data-ingestion.s3.us-east-1.amazonaws.com/DATA_MERGE_ASSETS/HISTORICAL_DATA/finrag_sec_fact_historical.parquet
  Incremental (SDK): https://sentence-data-ingestion.s3.us-east-1.amazonaws.com/DATA_MERGE_ASSETS/INCREMENTAL_DATA_SDK/finrag_sec_incremental_stg_data.parquet
  Incremental (Crawled): https://sentence-data-ingestion.s3.us-east-1.amazonaws.com/DATA_MERGE_ASSETS/INCREMENTAL_DATA/finrag_sec_incremental_stg_data.parquet
  Final: https://sentence-data-ingestion.s3.us-east-1.amazonaws.com/DATA_MERGE_ASSETS/FINRAG_FACT_SENTENCES/finrag_fact_sentences.parquet
Loading files and analyzing...

✓ Historical: 287,066 rows (13.4 MB)
✓ Incremental (SDK): 90,072 rows (8.9 MB)
✓ Incremental (Crawled): 36,999 rows (3.2 MB)
✓ Final: 469,252 rows (23.1 MB)

Duplicate Analysis Summary



File,File Size,Total Rows,Unique IDs,Internal Dupes
str,str,str,str,str
"""Historical""","""13.4 MB""","""287,066""","""287,066""","""0"""
"""Incremental (SDK)""","""8.9 MB""","""90,072""","""86,654""","""3,418"""
"""Incremental (Crawled)""","""3.2 MB""","""36,999""","""36,999""","""0"""
"""Final""","""23.1 MB""","""469,252""","""469,252""","""0"""


In [14]:
"""
Analyze duplicates in SDK file by report_year and company name
Shows where the 3,418 duplicates are concentrated
"""

# Load SDK file (the one with duplicates)
s3_uri_sdk = https_to_s3(INCR_SDK_URI)
df_sdk = pl.read_parquet(s3_uri_sdk, storage_options=storage_options)

print(f"Analyzing {len(df_sdk):,} rows from Incremental (SDK)")
print(f"Found {df_sdk['sentenceID'].n_unique():,} unique IDs")
print(f"Total duplicates: {len(df_sdk) - df_sdk['sentenceID'].n_unique():,}\n")

# Identify duplicate sentenceIDs
duplicate_ids = (
    df_sdk
    .group_by('sentenceID')
    .agg(pl.len().alias('count'))
    .filter(pl.col('count') > 1)
    .select('sentenceID')
)

# Filter to only duplicate rows
duplicates_df = df_sdk.join(duplicate_ids, on='sentenceID', how='inner')

print(f"Analyzing dup rows: {len(duplicates_df):,} :/ \n")

# Breakdown by Year and Company
breakdown = (
    duplicates_df
    .group_by(['report_year', 'name'])
    .agg([
        pl.len().alias('Duplicate Rows'),
        pl.col('sentenceID').n_unique().alias('Unique IDs Affected')
    ])
    .sort(['report_year', 'Duplicate Rows'], descending=[False, True])
)

print("Duplicate Breakdown by Year and Company\n")
breakdown

Analyzing 90,072 rows from Incremental (SDK)
Found 86,654 unique IDs
Total duplicates: 3,418

Analyzing dup rows: 6,836 :/ 

Duplicate Breakdown by Year and Company



report_year,name,Duplicate Rows,Unique IDs Affected
i64,str,u32,u32
2021,"""Alphabet Inc.""",1842,921
2022,"""Alphabet Inc.""",1678,839
2023,"""Alphabet Inc.""",1566,783
2024,"""Alphabet Inc.""",1750,875


In [15]:
"""
Quick summary of duplicate distribution
"""

# Year-level summary
year_summary = (
    duplicates_df
    .group_by('report_year')
    .agg([
        pl.len().alias('Total Duplicates'),
        pl.col('name').n_unique().alias('Companies Affected')
    ])
    .sort('report_year')
)

print("Year-Level Summary\n")
print(year_summary)

print("\n" + "="*50)

# Company-level summary (top 10)
company_summary = (
    duplicates_df
    .group_by('name')
    .agg([
        pl.len().alias('Total Duplicates'),
        pl.col('report_year').n_unique().alias('Years Affected')
    ])
    .sort('Total Duplicates', descending=True)
    .head(10)
)

print("\nTop 10 Companies with Most Duplicates\n")
company_summary

Year-Level Summary

shape: (4, 3)
┌─────────────┬──────────────────┬────────────────────┐
│ report_year ┆ Total Duplicates ┆ Companies Affected │
│ ---         ┆ ---              ┆ ---                │
│ i64         ┆ u32              ┆ u32                │
╞═════════════╪══════════════════╪════════════════════╡
│ 2021        ┆ 1842             ┆ 1                  │
│ 2022        ┆ 1678             ┆ 1                  │
│ 2023        ┆ 1566             ┆ 1                  │
│ 2024        ┆ 1750             ┆ 1                  │
└─────────────┴──────────────────┴────────────────────┘


Top 10 Companies with Most Duplicates



name,Total Duplicates,Years Affected
str,u32,u32
"""Alphabet Inc.""",6836,4


In [None]:
"""
Compare schemas between historical and incremental
"""

# Column mapping rules (if incremental uses different names)
COLUMN_MAPPINGS = {
    'SIC': 'sic',
    'section_item': 'section_name',
}

# Derived columns (computed during merge, OK to differ)
DERIVED_COLUMNS = {
    'cik_int', 'has_comparison', 'has_numbers', 'likely_kpi',
    'row_hash', 'tickers', 'sentence_index',
}

# Read schemas (just 1 row for speed)
print("Reading schemas...\n")
hist_schema_df = pl.read_parquet(hist_uri, n_rows=1, storage_options=storage_options)
incr_schema_df = pl.read_parquet(incr_uri, n_rows=1, storage_options=storage_options)

hist_schema = hist_schema_df.schema
incr_schema = incr_schema_df.schema

print(f"Historical: {len(hist_schema)} columns")
print(f"Incremental: {len(incr_schema)} columns")

In [18]:
"""
Schema Inspector - Historical vs Incremental (SDK)
Smart comparison with column mapping rules and derived column handling
"""

# ============================================================
# POLARS DISPLAY CONFIGURATION - Show Full DataFrame
# ============================================================
pl.Config.set_tbl_rows(-1)  # Show all rows (no truncation)
pl.Config.set_tbl_cols(-1)  # Show all columns
pl.Config.set_fmt_str_lengths(100)  # Allow longer string display
pl.Config.set_tbl_width_chars(1000)  # Wider table display

# Column mapping rules (if incremental uses different names)
COLUMN_MAPPINGS = {
    'SIC': 'sic',
    'section_item': 'section_name',
}

# Derived columns (can be computed during merge, OK to differ)
DERIVED_COLUMNS = {
    'cik_int', 'has_comparison', 'has_numbers', 'likely_kpi',
    'row_hash', 'tickers', 'sentence_index',
}

# Load schemas
hist_uri = f"s3://{BUCKET}/{HIST_PATH}"
incr_uri = https_to_s3(INCR_SDK_URI)

print("Reading schemas...\n")
hist_df = pl.read_parquet(hist_uri, n_rows=1, storage_options=storage_options)
incr_df = pl.read_parquet(incr_uri, n_rows=1, storage_options=storage_options)

hist_schema = hist_df.schema
incr_schema = incr_df.schema

print(f"Historical: {len(hist_schema)} columns")
print(f"Incremental (SDK): {len(incr_schema)} columns\n")

# Apply column mappings to incremental
incr_mapped = {}
for col, dtype in incr_schema.items():
    mapped_col = COLUMN_MAPPINGS.get(col, col)
    incr_mapped[mapped_col] = (col, dtype)

# Show mapping rules applied
print("Column Mapping Rules Applied:")
for incr_col, hist_col in COLUMN_MAPPINGS.items():
    if incr_col in incr_schema:
        print(f"  {incr_col:20s} → {hist_col:20s}")
print()

# Get all unique columns (after mapping)
all_cols = sorted(set(hist_schema.keys()) | set(incr_mapped.keys()))

# Build comparison
comparison_rows = []
matches = []
hist_only = []
incr_only = []
type_diffs = []

for col in all_cols:
    hist_type = str(hist_schema.get(col, "MISSING"))
    
    # Check if incremental has this column (after mapping)
    incr_orig_col, incr_type_val = incr_mapped.get(col, (None, None))
    incr_type = str(incr_type_val) if incr_type_val else "MISSING"
    
    # Normalize datetime types for comparison
    hist_norm = hist_type.replace("time_unit='us'", "TU").replace("time_unit='ns'", "TU")
    incr_norm = incr_type.replace("time_unit='us'", "TU").replace("time_unit='ns'", "TU")
    hist_norm = hist_norm.replace("time_zone='UTC'", "TZ").replace("time_zone=None", "TZ")
    incr_norm = incr_norm.replace("time_zone='UTC'", "TZ").replace("time_zone=None", "TZ")
    
    # Determine status
    if col not in hist_schema:
        status = "✓ Derived" if col in DERIVED_COLUMNS else "⚠ Incr Only"
        incr_only.append(col)
    elif incr_orig_col is None:
        status = "✓ Derived" if col in DERIVED_COLUMNS else "⚠ Hist Only"
        hist_only.append(col)
    elif hist_norm == incr_norm:
        status = "✓ Match"
        matches.append(col)
    else:
        if 'Datetime' in hist_type and 'Datetime' in incr_type:
            status = "✓ Match (datetime)"
            matches.append(col)
        else:
            status = "❌ Type Diff"
            type_diffs.append(col)
    
    # Show mapped name if different
    display_col = f"{col} ({incr_orig_col}→)" if incr_orig_col and incr_orig_col != col else col
    
    comparison_rows.append({
        'Column': display_col,
        'Historical Type': hist_type,
        'Incremental Type': incr_type,
        'Status': status
    })

schema_comparison = pl.DataFrame(comparison_rows)

print("Schema Comparison: Historical vs Incremental (SDK)\n")
schema_comparison

Reading schemas...

Historical: 24 columns
Incremental (SDK): 20 columns

Column Mapping Rules Applied:
  SIC                  → sic                 
  section_item         → section_name        

Schema Comparison: Historical vs Incremental (SDK)



Column,Historical Type,Incremental Type,Status
str,str,str,str
"""cik""","""String""","""String""","""✓ Match"""
"""cik_int""","""Int32""","""MISSING""","""✓ Derived"""
"""docID""","""String""","""String""","""✓ Match"""
"""filingDate""","""String""","""String""","""✓ Match"""
"""form""","""String""","""String""","""✓ Match"""
"""has_comparison""","""Boolean""","""MISSING""","""✓ Derived"""
"""has_numbers""","""Boolean""","""MISSING""","""✓ Derived"""
"""last_modified_date""","""Datetime(time_unit='us', time_zone='UTC')""","""Datetime(time_unit='ns', time_zone=None)""","""✓ Match"""
"""likely_kpi""","""Boolean""","""MISSING""","""✓ Derived"""
"""load_method""","""String""","""Null""","""❌ Type Diff"""


In [21]:
"""
Analyze null values in columns showing Type Diff
Load FULL datasets for accurate null analysis
"""

# Columns with Type Diff (String vs Null)
problem_columns = [
    'load_method',
    'reportDate', 
    'sample_version',
    'source_file_path',
    'temporal_bin'
]

# Load FULL datasets (not just 1 row)
print("Loading full datasets for null analysis...\n")

hist_uri = f"s3://{BUCKET}/{HIST_PATH}"
incr_uri = https_to_s3(INCR_SDK_URI)

hist_full = pl.read_parquet(hist_uri, storage_options=storage_options)
incr_full = pl.read_parquet(incr_uri, storage_options=storage_options)

print(f"Historical file: {len(hist_full):,} total rows")
print(f"Incremental (SDK) file: {len(incr_full):,} total rows\n")
print("="*70)

# Analyze each column
results = []

for col in problem_columns:
    # Historical stats
    hist_exists = col in hist_full.columns
    if hist_exists:
        hist_total = len(hist_full)
        hist_null = hist_full[col].is_null().sum()
        hist_not_null = hist_total - hist_null
        hist_null_pct = (hist_null / hist_total) * 100
    else:
        hist_total = hist_null = hist_not_null = hist_null_pct = "N/A"
    
    # Incremental stats
    incr_exists = col in incr_full.columns
    if incr_exists:
        incr_total = len(incr_full)
        incr_null = incr_full[col].is_null().sum()
        incr_not_null = incr_total - incr_null
        incr_null_pct = (incr_null / incr_total) * 100
    else:
        incr_total = incr_null = incr_not_null = incr_null_pct = "N/A"
    
    results.append({
        'Column': col,
        'Hist: Total': f"{hist_total:,}" if isinstance(hist_total, int) else hist_total,
        'Hist: Not Null': f"{hist_not_null:,}" if isinstance(hist_not_null, int) else hist_not_null,
        'Hist: Null': f"{hist_null:,}" if isinstance(hist_null, int) else hist_null,
        'Hist: Null %': f"{hist_null_pct:.1f}%" if isinstance(hist_null_pct, float) else hist_null_pct,
        'Incr: Total': f"{incr_total:,}" if isinstance(incr_total, int) else incr_total,
        'Incr: Not Null': f"{incr_not_null:,}" if isinstance(incr_not_null, int) else incr_not_null,
        'Incr: Null': f"{incr_null:,}" if isinstance(incr_null, int) else incr_null,
        'Incr: Null %': f"{incr_null_pct:.1f}%" if isinstance(incr_null_pct, float) else incr_null_pct,
    })

null_analysis = pl.DataFrame(results)

print("\nNull Value Analysis (Full Data)\n")
null_analysis

Loading full datasets for null analysis...

Historical file: 287,066 total rows
Incremental (SDK) file: 90,072 total rows


Null Value Analysis (Full Data)



Column,Hist: Total,Hist: Not Null,Hist: Null,Hist: Null %,Incr: Total,Incr: Not Null,Incr: Null,Incr: Null %
str,str,str,str,str,str,str,str,str
"""load_method""","""287,066""","""287,066""","""0""","""0.0%""","""90,072""","""0""","""90,072""","""100.0%"""
"""reportDate""","""287,066""","""287,066""","""0""","""0.0%""","""90,072""","""0""","""90,072""","""100.0%"""
"""sample_version""","""287,066""","""287,066""","""0""","""0.0%""","""90,072""","""0""","""90,072""","""100.0%"""
"""source_file_path""","""287,066""","""287,066""","""0""","""0.0%""","""90,072""","""0""","""90,072""","""100.0%"""
"""temporal_bin""","""287,066""","""287,066""","""0""","""0.0%""","""90,072""","""0""","""90,072""","""100.0%"""


In [22]:
"""
Show sample values and unique value counts from full dataset
"""

print("Sample Values and Unique Counts from Non-Null Rows\n")
print("="*70)

for col in problem_columns:
    print(f"\n{col}:")
    print("-" * 70)
    
    # Historical samples
    if col in hist_full.columns:
        hist_not_null = hist_full.filter(pl.col(col).is_not_null())
        hist_unique = hist_not_null[col].n_unique() if len(hist_not_null) > 0 else 0
        
        print(f"  Historical:")
        print(f"    - Non-null rows: {len(hist_not_null):,}")
        print(f"    - Unique values: {hist_unique:,}")
        
        if len(hist_not_null) > 0:
            hist_samples = hist_not_null[col].head(5)
            print(f"    - Sample values:")
            for val in hist_samples:
                print(f"        {val}")
        else:
            print(f"    - ALL NULL")
    else:
        print(f"  Historical: COLUMN MISSING")
    
    # Incremental samples
    if col in incr_full.columns:
        incr_not_null = incr_full.filter(pl.col(col).is_not_null())
        incr_unique = incr_not_null[col].n_unique() if len(incr_not_null) > 0 else 0
        
        print(f"  Incremental (SDK):")
        print(f"    - Non-null rows: {len(incr_not_null):,}")
        print(f"    - Unique values: {incr_unique:,}")
        
        if len(incr_not_null) > 0:
            incr_samples = incr_not_null[col].head(5)
            print(f"    - Sample values:")
            for val in incr_samples:
                print(f"        {val}")
        else:
            print(f"    - ALL NULL")
    else:
        print(f"  Incremental: COLUMN MISSING")

Sample Values and Unique Counts from Non-Null Rows


load_method:
----------------------------------------------------------------------
  Historical:
    - Non-null rows: 287,066
    - Unique values: 2
    - Sample values:
        stratified_sampling
        stratified_sampling
        stratified_sampling
        stratified_sampling
        stratified_sampling
  Incremental (SDK):
    - Non-null rows: 0
    - Unique values: 0
    - ALL NULL

reportDate:
----------------------------------------------------------------------
  Historical:
    - Non-null rows: 287,066
    - Unique values: 124
    - Sample values:
        2020-12-31
        2020-12-31
        2020-12-31
        2020-12-31
        2020-12-31
  Incremental (SDK):
    - Non-null rows: 0
    - Unique values: 0
    - ALL NULL

sample_version:
----------------------------------------------------------------------
  Historical:
    - Non-null rows: 287,066
    - Unique values: 2
    - Sample values:
        v1.0_75companies_1M
 

In [23]:
"""
Extract sentence_pos from sentenceID and validate contiguity
Tests if sentence positions are sequential within each document-section group
"""

def extract_sentence_position(df: pl.DataFrame, sentenceid_col: str = 'sentenceID') -> pl.DataFrame:
    """
    Extract sentence position from sentenceID string.
    sentenceID format: {docID}_{section}_{sequence}
    Example: "0001045810_10-K_2020_section_1A_45" → pos=45
    """
    return df.with_columns([
        pl.col(sentenceid_col)
          .str.split('_')
          .list.last()
          .cast(pl.Int16, strict=False)  # NULL on cast failure
          .fill_null(-1)                  # -1 = malformed ID
          .alias('sentence_pos')
    ])

# Extract sentence positions for incremental SDK data
print("Extracting sentence positions from sentenceID...\n")
incr_with_pos = extract_sentence_position(incr_full)

# Check for malformed IDs
malformed_count = incr_with_pos.filter(pl.col('sentence_pos') == -1).shape[0]
print(f"Malformed sentenceIDs: {malformed_count:,}")
print(f"Valid positions: {len(incr_with_pos) - malformed_count:,}")

# Group key: docID + section_name (sentences should be contiguous within these groups)
print("\nTesting contiguity within document-section groups...\n")

contiguity_test = (
    incr_with_pos
    .filter(pl.col('sentence_pos') != -1)  # Exclude malformed
    .group_by(['docID', 'section_name'])
    .agg([
        pl.len().alias('total_sentences'),
        pl.col('sentence_pos').min().alias('min_pos'),
        pl.col('sentence_pos').max().alias('max_pos'),
        pl.col('sentence_pos').n_unique().alias('unique_positions')
    ])
    .with_columns([
        # Expected count if contiguous: max - min + 1
        (pl.col('max_pos') - pl.col('min_pos') + 1).alias('expected_if_contiguous'),
        # Are they contiguous?
        (pl.col('unique_positions') == (pl.col('max_pos') - pl.col('min_pos') + 1)).alias('is_contiguous')
    ])
    .sort(['docID', 'section_name'])
)

# Summary statistics
total_groups = len(contiguity_test)
contiguous_groups = contiguity_test.filter(pl.col('is_contiguous')).shape[0]
broken_groups = total_groups - contiguous_groups

print(f"Document-Section Groups Analysis:")
print(f"  Total groups: {total_groups:,}")
print(f"  Contiguous groups: {contiguous_groups:,} ({contiguous_groups/total_groups*100:.1f}%)")
print(f"  Broken groups (gaps): {broken_groups:,} ({broken_groups/total_groups*100:.1f}%)")

# Show sample of contiguous groups
print("\nSample Contiguous Groups:\n")
contiguous_sample = (
    contiguity_test
    .filter(pl.col('is_contiguous'))
    .head(10)
)
contiguous_sample

Extracting sentence positions from sentenceID...

Malformed sentenceIDs: 0
Valid positions: 90,072

Testing contiguity within document-section groups...

Document-Section Groups Analysis:
  Total groups: 334
  Contiguous groups: 334 (100.0%)
  Broken groups (gaps): 0 (0.0%)

Sample Contiguous Groups:



docID,section_name,total_sentences,min_pos,max_pos,unique_positions,expected_if_contiguous,is_contiguous
str,str,u32,i16,i16,u32,i16,bool
"""1018724_10-K_2021""","""Business""",60,0,59,60,60,True
"""1018724_10-K_2021""","""Management & Discussion and Analysis (MD&A)""",220,0,219,220,220,True
"""1018724_10-K_2021""","""Quantitative and Qualitative Disclosures About Market Risk""",26,0,25,26,26,True
"""1018724_10-K_2021""","""Risk Factors""",186,0,185,186,186,True
"""1018724_10-K_2022""","""Business""",62,0,61,62,62,True
"""1018724_10-K_2022""","""Management & Discussion and Analysis (MD&A)""",205,0,204,205,205,True
"""1018724_10-K_2022""","""Quantitative and Qualitative Disclosures About Market Risk""",27,0,26,27,27,True
"""1018724_10-K_2022""","""Risk Factors""",193,0,192,193,193,True
"""1018724_10-K_2023""","""Business""",65,0,64,65,65,True
"""1018724_10-K_2023""","""Management & Discussion and Analysis (MD&A)""",206,0,205,206,206,True


In [24]:
"""
Analyze groups with gaps in sentence positions
Shows which document-sections have non-contiguous sequences
"""

# Filter to groups with gaps
broken_analysis = (
    contiguity_test
    .filter(~pl.col('is_contiguous'))
    .with_columns([
        (pl.col('expected_if_contiguous') - pl.col('unique_positions')).alias('missing_positions')
    ])
    .sort('missing_positions', descending=True)
)

if len(broken_analysis) > 0:
    print(f"Groups with Gaps ({len(broken_analysis):,} total):\n")
    print("Top 10 groups with most missing positions:\n")
    broken_analysis.head(10)
else:
    print("✓ NO GAPS FOUND - All groups are perfectly contiguous!")

✓ NO GAPS FOUND - All groups are perfectly contiguous!


In [25]:
"""
Show actual sentence positions for a broken group to visualize the gap
"""

if len(broken_analysis) > 0:
    # Pick the worst offender
    worst_group = broken_analysis.head(1)
    worst_docid = worst_group['docID'][0]
    worst_section = worst_group['section_name'][0]
    
    print(f"Detailed Gap Analysis for Worst Case:")
    print(f"  docID: {worst_docid}")
    print(f"  section: {worst_section}\n")
    
    # Get all positions for this group
    gap_detail = (
        incr_with_pos
        .filter(
            (pl.col('docID') == worst_docid) & 
            (pl.col('section_name') == worst_section) &
            (pl.col('sentence_pos') != -1)
        )
        .select(['sentenceID', 'sentence_pos', 'sentence'])
        .sort('sentence_pos')
    )
    
    positions = gap_detail['sentence_pos'].to_list()
    min_pos = min(positions)
    max_pos = max(positions)
    expected_range = set(range(min_pos, max_pos + 1))
    actual_positions = set(positions)
    missing = sorted(expected_range - actual_positions)
    
    print(f"Position range: {min_pos} to {max_pos}")
    print(f"Expected positions: {len(expected_range)}")
    print(f"Actual positions: {len(actual_positions)}")
    print(f"Missing positions: {missing[:20]}")  # Show first 20 gaps
    
    print(f"\nSample sentences from this group:\n")
    gap_detail.head(10)
else:
    print("No gaps to analyze - all sequences are contiguous!")


No gaps to analyze - all sequences are contiguous!


In [27]:
"""
Simple inspection of reportDate column to understand the null issue
Check both Historical and Incremental (SDK) data
"""

print("Inspecting reportDate column...\n")
print("="*70)

# Historical - Sample rows with reportDate
print("\nHISTORICAL DATA:")
print("-"*70)
hist_sample = hist_full.select([
    'cik',
    'name', 
    'report_year',
    'reportDate',
    'filingDate',
    'docID',
    'sentenceID'
]).head(10)

print(f"Sample of {len(hist_sample)} rows:\n")
hist_sample


Inspecting reportDate column...


HISTORICAL DATA:
----------------------------------------------------------------------
Sample of 10 rows:



cik,name,report_year,reportDate,filingDate,docID,sentenceID
str,str,i64,str,str,str,str
"""0000034088""","""EXXON MOBIL CORP""",2020,"""2020-12-31""","""2021-02-24""","""0000034088_10-K_2020""","""0000034088_10-K_2020_section_1_0"""
"""0000034088""","""EXXON MOBIL CORP""",2020,"""2020-12-31""","""2021-02-24""","""0000034088_10-K_2020""","""0000034088_10-K_2020_section_1_1"""
"""0000034088""","""EXXON MOBIL CORP""",2020,"""2020-12-31""","""2021-02-24""","""0000034088_10-K_2020""","""0000034088_10-K_2020_section_1_10"""
"""0000034088""","""EXXON MOBIL CORP""",2020,"""2020-12-31""","""2021-02-24""","""0000034088_10-K_2020""","""0000034088_10-K_2020_section_1_11"""
"""0000034088""","""EXXON MOBIL CORP""",2020,"""2020-12-31""","""2021-02-24""","""0000034088_10-K_2020""","""0000034088_10-K_2020_section_1_12"""
"""0000034088""","""EXXON MOBIL CORP""",2020,"""2020-12-31""","""2021-02-24""","""0000034088_10-K_2020""","""0000034088_10-K_2020_section_1_13"""
"""0000034088""","""EXXON MOBIL CORP""",2020,"""2020-12-31""","""2021-02-24""","""0000034088_10-K_2020""","""0000034088_10-K_2020_section_1_14"""
"""0000034088""","""EXXON MOBIL CORP""",2020,"""2020-12-31""","""2021-02-24""","""0000034088_10-K_2020""","""0000034088_10-K_2020_section_1_15"""
"""0000034088""","""EXXON MOBIL CORP""",2020,"""2020-12-31""","""2021-02-24""","""0000034088_10-K_2020""","""0000034088_10-K_2020_section_1_16"""
"""0000034088""","""EXXON MOBIL CORP""",2020,"""2020-12-31""","""2021-02-24""","""0000034088_10-K_2020""","""0000034088_10-K_2020_section_1_17"""


In [None]:
print("\n" + "="*70)

# Incremental SDK - Sample rows with reportDate
print("\nINCREMENTAL (SDK) DATA:")
print("-"*70)
incr_sample = incr_full.select([
    'cik',
    'name',
    'report_year', 
    'reportDate',
    'filingDate',
    'docID',
    'sentenceID'
]).head(10)

print(f"Sample of {len(incr_sample)} rows:\n")
incr_sample



INCREMENTAL (SDK) DATA:
----------------------------------------------------------------------
Sample of 10 rows:



cik,name,report_year,reportDate,filingDate,docID,sentenceID
str,str,i64,null,str,str,str
"""320193""","""Apple Inc.""",2024,,"""2024-11-01""","""320193_10-K_2024""","""320193_10-K_2024_section_0_0"""
"""320193""","""Apple Inc.""",2024,,"""2024-11-01""","""320193_10-K_2024""","""320193_10-K_2024_section_0_1"""
"""320193""","""Apple Inc.""",2024,,"""2024-11-01""","""320193_10-K_2024""","""320193_10-K_2024_section_0_2"""
"""320193""","""Apple Inc.""",2024,,"""2024-11-01""","""320193_10-K_2024""","""320193_10-K_2024_section_0_3"""
"""320193""","""Apple Inc.""",2024,,"""2024-11-01""","""320193_10-K_2024""","""320193_10-K_2024_section_0_4"""
"""320193""","""Apple Inc.""",2024,,"""2024-11-01""","""320193_10-K_2024""","""320193_10-K_2024_section_0_5"""
"""320193""","""Apple Inc.""",2024,,"""2024-11-01""","""320193_10-K_2024""","""320193_10-K_2024_section_0_6"""
"""320193""","""Apple Inc.""",2024,,"""2024-11-01""","""320193_10-K_2024""","""320193_10-K_2024_section_0_7"""
"""320193""","""Apple Inc.""",2024,,"""2024-11-01""","""320193_10-K_2024""","""320193_10-K_2024_section_0_8"""
"""320193""","""Apple Inc.""",2024,,"""2024-11-01""","""320193_10-K_2024""","""320193_10-K_2024_section_0_9"""
