In [74]:
import json
import os
import re
from pprint import pprint
from collections import Counter

In [91]:
# _quality_state
# Load and pretty-print the data-quality state file for one symbol (ENAUSDT BookDepth).
file_path_data_quality_check = os.path.expanduser('~/data/_quality_state/raw.binance-usdt-futures.BookDepth/ENAUSDT/state.json')

# Pin the encoding instead of relying on the platform default
# (assumes the file was written as UTF-8, the standard JSON encoding).
with open(file_path_data_quality_check, 'r', encoding='utf-8') as f:
    content = f.read()  # read the whole file as a string

record = json.loads(content)  # now parse the full JSON object
print(json.dumps(record, indent=2))  # pretty print
# Zvezda: open question - I don't quite understand what `max_intra_batch_gap_ms` means.

{
  "last_processed_date": "2025-07-16",
  "last_updated_utc": "2025-07-17T08:49:50.441662+00:00",
  "partitions_checked_in_last_run": [
    "2024-04-02",
    "2024-04-03",
    "2024-04-04",
    "2024-04-05",
    "2024-04-06",
    "2024-04-07",
    "2024-04-08",
    "2024-04-09",
    "2024-04-10",
    "2024-04-11",
    "2024-04-12",
    "2024-04-13",
    "2024-04-14",
    "2024-04-15",
    "2024-04-16",
    "2024-04-17",
    "2024-04-18",
    "2024-04-19",
    "2024-04-20",
    "2024-04-21",
    "2024-04-22",
    "2024-04-23",
    "2024-04-24",
    "2024-04-25",
    "2024-04-26",
    "2024-04-27",
    "2024-04-28",
    "2024-04-29",
    "2024-04-30",
    "2024-05-01",
    "2024-05-02",
    "2024-05-03",
    "2024-05-04",
    "2024-05-05",
    "2024-05-06",
    "2024-05-07",
    "2024-05-08",
    "2024-05-09",
    "2024-05-10",
    "2024-05-11",
    "2024-05-12",
    "2024-05-13",
    "2024-05-14",
    "2024-05-15",
    "2024-05-16",
    "2024-05-17",
    "2024-05-18",
    "2024-05-19",

In [122]:
def normalize_error_msg(msg: str) -> str:
    """Reduce an error message to a canonical form so similar messages group together.

    Non-string input is converted with ``str()`` and returned without any
    further processing. For strings, the steps are, in order:

    1. trim surrounding whitespace;
    2. remove one pair of outer quotes when the same quote character
       (single or double) both opens and closes the string;
    3. turn newlines and every run of whitespace into a single space;
    4. replace ``YYYY-MM-DD`` dates with the placeholder ``<DATE>``;
    5. trim surrounding whitespace again.

    Args:
        msg: Raw error message; anything that is not a ``str`` is stringified.

    Returns:
        The normalized message.
    """
    if not isinstance(msg, str):
        return str(msg)

    text = msg.strip()

    # Only peel quotes that wrap the entire string (same character at both ends).
    first, last = text[:1], text[-1:]
    if first == last and first in ("'", '"'):
        text = text[1:-1]

    # Flatten line breaks, then squeeze any whitespace run down to one space.
    text = re.sub(r'\s+', ' ', text.replace('\n', ' '))

    # Mask dates: the bracketed, double-quoted form first, then any bare date.
    date_rules = (
        (r'\["\d{4}-\d{2}-\d{2}"\]', '["<DATE>"]'),
        (r'\b\d{4}-\d{2}-\d{2}\b', '<DATE>'),
    )
    for pattern, placeholder in date_rules:
        text = re.sub(pattern, placeholder, text)

    return text.strip()

def summarize_nested_errors_grouped(data):
    """Print a grouped summary of a two-level mapping of error reports.

    ``data`` is walked as ``{outer_key: {inner_key: report}}``; the keys are
    not used, only the reports. Each report is read with ``.get`` for:

    * ``reason``      - tallied when truthy;
    * ``error_msg``   - tallied after ``normalize_error_msg`` when truthy;
    * ``retry_count`` - collected when not ``None`` (min / max / average shown);
    * ``permanent``   - collected when not ``None`` (summed to count failures).

    Args:
        data: Nested mapping of reports as described above.

    Returns:
        None. The summary is written to stdout.
    """
    # Flatten the nesting once; every statistic below works on this list.
    reports = [
        report
        for date_errors in data.values()
        for report in date_errors.values()
    ]
    total_records = len(reports)

    reasons = [
        value for value in (report.get('reason') for report in reports) if value
    ]
    error_msgs = [
        normalize_error_msg(raw_msg)
        for raw_msg in (report.get('error_msg') for report in reports)
        if raw_msg
    ]
    retry_counts = [
        rc for rc in (report.get('retry_count') for report in reports)
        if rc is not None
    ]
    permanent_flags = [
        pf for pf in (report.get('permanent') for report in reports)
        if pf is not None
    ]

    print(f"📁 Total error records: {total_records}")

    print("\n📌 Unique Reasons:")
    reason_tally = Counter(reasons)
    for reason, count in reason_tally.most_common():
        print(f"  {reason}: {count} occurrences")

    print("\n📌 Grouped Error Messages:")
    message_tally = Counter(error_msgs)
    for msg, count in message_tally.most_common():
        print(f"  '{msg}': {count} occurrences")

    if not retry_counts:
        print("\n📌 Retry Count Stats: No data")
    else:
        print("\n📌 Retry Count Stats:")
        print(f"  Min: {min(retry_counts)}, Max: {max(retry_counts)}, Avg: {sum(retry_counts)/len(retry_counts):.2f}")

    if not permanent_flags:
        print("\n📌 Permanent Failures: No data")
    else:
        # Truthy flags count as 1 when summed.
        permanent_count = sum(permanent_flags)
        print("\n📌 Permanent Failures:")
        print(f"  {permanent_count} out of {len(permanent_flags)} were marked permanent")

In [123]:
# tardis_failed_downloads.json
# Load the Tardis failed-download log, show its top-level shape, then print
# per-data-type error statistics and a short raw sample.
tardis_failed_download = os.path.expanduser('~/data/tardis_failed_downloads.json')

# Pin the encoding instead of relying on the platform default
# (assumes the file was written as UTF-8, the standard JSON encoding).
with open(tardis_failed_download, 'r', encoding='utf-8') as f:
    content = f.read()  # read the whole file as a string

record = json.loads(content)  # now parse the full JSON object
pprint(record, depth=2)  # deeper levels are abbreviated as {...}
for key in list(record['tardis'].keys()):  # each key
    print(f"\n=== {key} ===")
    # Call the summary helper this notebook actually defines, so the cell
    # runs on a fresh kernel (Restart & Run All).
    summarize_nested_errors_grouped(record['tardis'][key])  # print out error statistics
    print(json.dumps(record['tardis'][key], indent=2)[:500])  # first 500 characters

# Open questions:
#   - What does "Downloaded data is too small" mean?
#   - What does the `permanent` flag mean?
#   - What are the derivative ticker, trades, and book snapshot data types?

{'tardis': {'book_snapshot_5': {...},
            'derivative_ticker': {...},
            'trades': {...}}}

=== trades ===
📁 Total error records: 1859

📌 Unique Reasons:
  empty_data: 1857 occurrences
  processing_error: 2 occurrences

📌 Unique Error Messages:
  'Downloaded data is too small': 1857 occurrences

📌 Retry Count Stats:
  Min: 1, Max: 1, Avg: 1.00

📌 Permanent Failures:
  1857 out of 1859 were marked permanent
{
  "EOSUSDT": {
    "2025-05-22": {
      "reason": "empty_data",
      "timestamp": "2025-07-16T03:14:06.298060+00:00",
      "retry_count": 1,
      "permanent": true,
      "error_msg": "Downloaded data is too small"
    }
  },
  "OMGUSDT": {
    "2025-02-01": {
      "reason": "empty_data",
      "timestamp": "2025-07-16T03:14:50.550708+00:00",
      "retry_count": 1,
      "permanent": true,
      "error_msg": "Downloaded data is too small"
    }
  },
  "MATICUSDT": {
    "2024-09-05": {
 

=== book_snapshot_5 ===
📁 Total error records: 1859

📌 Unique Reasons:


In [126]:
# binance_failed_downloads.json
# Load the Binance failed-download log, show its top-level shape, then print
# a short raw sample for each data type.
binance_failed_download = os.path.expanduser('~/data/binance_failed_downloads.json')

# Pin the encoding instead of relying on the platform default
# (assumes the file was written as UTF-8, the standard JSON encoding).
with open(binance_failed_download, 'r', encoding='utf-8') as f:
    content = f.read()  # read the whole file as a string

record = json.loads(content)  # now parse the full JSON object
pprint(record, depth=2)  # deeper levels are abbreviated as {...}
for key in list(record['binance'].keys()):  # each key
    print(f"\n=== {key} ===")
    print(json.dumps(record['binance'][key], indent=2)[:1000])  # first 1000 characters

{'binance': {'metrics': {...}}}

=== metrics ===
{
  "BTCUSDT": {
    "2025-07-15": {
      "reason": "no_data",
      "timestamp": "2025-07-16T03:21:03.447082+00:00",
      "retry_count": 1,
      "permanent": true,
      "error_msg": "No data available (404)"
    },
    "2025-07-16": {
      "reason": "no_data",
      "timestamp": "2025-07-17T02:52:47.655582+00:00",
      "retry_count": 1,
      "permanent": true,
      "error_msg": "No data available (404)"
    }
  },
  "BTCDOMUSDT": {
    "2025-07-15": {
      "reason": "no_data",
      "timestamp": "2025-07-16T03:22:08.873557+00:00",
      "retry_count": 1,
      "permanent": true,
      "error_msg": "No data available (404)"
    },
    "2025-07-16": {
      "reason": "no_data",
      "timestamp": "2025-07-17T02:52:47.436509+00:00",
      "retry_count": 1,
      "permanent": true,
      "error_msg": "No data available (404)"
    }
  },
  "ETHUSDT": {
    "2025-07-15": {
      "reason": "no_data",
      "timestamp": "2025-07-16T03: