In [85]:
import pandas as pd
import json
import sys
from pathlib import Path

In [86]:
# Project root is taken as the parent of the current working directory
# (presumably the notebook lives one level below the root — confirm).
BASE_DIR = Path.cwd().resolve().parent
DATA_DIR = BASE_DIR.joinpath("data")

# One location per pipeline stage.
RAW_DIR = DATA_DIR.joinpath("raw_data")
# NOTE: despite the name, INGESTED_DIR is the ingested CSV *file*, not a folder.
INGESTED_DIR = DATA_DIR.joinpath("ingested_data", "books data.csv")
CLEAN_CSV = DATA_DIR.joinpath("clean_data", "clean_books.csv")
ENRICHED_JSON = DATA_DIR.joinpath("enriched_data", "enriched_books.json")



📥 1. RAW DATA STATISTICS (Before Pipeline)

In [87]:
# Collect every CSV in the raw-data folder. glob() yields entries in an
# arbitrary, filesystem-dependent order, so sort them to make the
# concatenation order (and therefore raw_df's row order) reproducible.
raw_files = sorted(RAW_DIR.glob("*.csv"))
if not raw_files:
    # Fail with an actionable message instead of pd.concat's generic
    # "No objects to concatenate" ValueError.
    raise FileNotFoundError(f"No raw CSV files found in {RAW_DIR}")

# latin1 decoding accepts any byte sequence; low_memory=False makes pandas
# infer each column's dtype from the whole file rather than chunk by chunk.
raw_dfs = [pd.read_csv(f, encoding="latin1", low_memory=False) for f in raw_files]

# Stack all raw files into a single frame with a fresh 0..n-1 index.
raw_df = pd.concat(raw_dfs, ignore_index=True)


In [88]:
# Headline counts for the raw data; raw files use capitalised column
# names ("Title", "ISBN").
raw_stats = {
    "total_raw_rows": raw_df.shape[0],
    "unique_titles": raw_df["Title"].nunique(),  # NaN excluded by default
    "missing_titles": raw_df["Title"].isnull().sum(),
    # The ISBN column is treated as optional: report None when it is absent.
    "missing_isbn": (
        raw_df["ISBN"].isnull().sum() if "ISBN" in raw_df.columns else None
    ),
}
raw_stats


{'total_raw_rows': 36364,
 'unique_titles': 30906,
 'missing_titles': np.int64(0),
 'missing_isbn': np.int64(412)}

📥 2. INGESTED DATA STATISTICS

In [89]:
# Load the ingested data set; INGESTED_DIR is the path of the CSV file itself.
ingested_df = pd.read_csv(
    filepath_or_buffer=INGESTED_DIR,
    low_memory=False,
    encoding="latin1",
)

In [90]:
# Headline counts after ingestion; columns are lower-case at this stage
# ("title", "isbn", "year").
ingested_stats = {
    "total_ingested_rows": ingested_df.shape[0],
    "unique_titles": ingested_df["title"].nunique(dropna=True),
    "unique_isbn": ingested_df["isbn"].nunique(),  # NaN excluded by default
    "missing_isbn": ingested_df["isbn"].isnull().sum(),
    "missing_year": ingested_df["year"].isnull().sum(),
}
ingested_stats


{'total_ingested_rows': 36364,
 'unique_titles': 30906,
 'unique_isbn': 31546,
 'missing_isbn': np.int64(412),
 'missing_year': np.int64(170)}

🧹 3. CLEANED DATA STATISTICS

In [91]:
# Load the cleaned books CSV (pandas' default encoding is used here).
clean_df = pd.read_csv(filepath_or_buffer=CLEAN_CSV)

In [92]:
# Headline counts for the cleaned data.
clean_stats = {
    "total_clean_rows": clean_df.shape[0],
    "unique_record_id": clean_df["record_id"].nunique(),
    "unique_isbn": clean_df["isbn"].nunique(),  # NaN excluded by default
    "missing_isbn": clean_df["isbn"].isnull().sum(),
    # Difference in row count between the ingested and the cleaned frames.
    "duplicate_removed": ingested_df.shape[0] - clean_df.shape[0],
}
clean_stats


{'total_clean_rows': 31946,
 'unique_record_id': 31946,
 'unique_isbn': 26871,
 'missing_isbn': np.int64(5075),
 'duplicate_removed': 4418}

In [93]:
# Split the cleaned rows by whether the "isbn" value is present or missing.
{
    "isbn_based_books": clean_df["isbn"].notnull().sum(),
    "non_isbn_books": clean_df["isbn"].isnull().sum(),
}


{'isbn_based_books': np.int64(26871), 'non_isbn_books': np.int64(5075)}

In [94]:
# Parse the enriched-books JSON file (UTF-8) and tabulate it as a DataFrame.
with ENRICHED_JSON.open(mode="r", encoding="utf-8") as json_file:
    enriched_data = json.load(json_file)

enriched_df = pd.DataFrame(data=enriched_data)

In [95]:
# Boolean mask of rows whose status is "FOUND"; used for both the count
# and the rate below.
found_mask = enriched_df["status"].eq("FOUND")

enrichment_stats = {
    "total_processed": enriched_df.shape[0],
    "found_books": found_mask.sum(),
    "missing_books": enriched_df["status"].eq("MISSING").sum(),
    # Share of FOUND rows as a percentage, rounded to two decimals.
    "success_rate_%": round(100 * found_mask.mean(), 2),
}
enrichment_stats

{'total_processed': 31946,
 'found_books': np.int64(9221),
 'missing_books': np.int64(22725),
 'success_rate_%': np.float64(28.86)}

In [96]:
# Count the non-null values of each enrichment field.
{
    label: enriched_df[column].notna().sum()
    for label, column in (
        ("authors_available", "authors"),
        ("subjects_available", "subjects"),
        ("summary_available", "summary"),
        ("publisher_available", "publisher"),
    )
}

{'authors_available': np.int64(8348),
 'subjects_available': np.int64(8497),
 'summary_available': np.int64(7313),
 'publisher_available': np.int64(6708)}

In [97]:
# Distinct-value counts (NaN excluded) for the identifying columns of the
# enriched data.
final_stats = {
    label: enriched_df[column].nunique()
    for label, column in (
        ("final_books_count", "book_key"),
        ("unique_titles", "title"),
        ("unique_isbn", "isbn"),
    )
}
final_stats

{'final_books_count': 31895, 'unique_titles': 30246, 'unique_isbn': 26026}

In [98]:
# Row count of each loaded frame, listed in pipeline order.
summary = pd.DataFrame(
    {
        "stage": ["Raw", "Ingested", "Cleaned", "Enriched"],
        "rows": [len(raw_df), len(ingested_df), len(clean_df), len(enriched_df)],
    }
)

summary


Unnamed: 0,stage,rows
0,Raw,36364
1,Ingested,36364
2,Cleaned,31946
3,Enriched,31946
