In [1]:
import pandas as pd
import json
import sys
from pathlib import Path

In [2]:
# Project root is the parent of the directory the notebook is run from.
BASE_DIR = Path().resolve().parent
DATA_DIR = BASE_DIR.joinpath("data")

# One location per pipeline stage. Note that INGESTED_DIR, despite its name,
# is the path of a single CSV file rather than a directory.
RAW_DIR = DATA_DIR.joinpath("raw_data")
INGESTED_DIR = DATA_DIR.joinpath("ingested_data", "books data.csv")
CLEAN_CSV = DATA_DIR.joinpath("clean_data", "clean_books.csv")
ENRICHED_JSON = DATA_DIR.joinpath("enriched_data", "enriched_books.json")



📥 1. RAW DATA STATISTICS (Before Pipeline)

In [3]:
# Collect every raw CSV export. Path.glob yields entries in no guaranteed
# order, so sort them: the concatenation order (and therefore the row order of
# raw_df) is then the same on every machine and every run.
raw_files = sorted(RAW_DIR.glob("*.csv"))
if not raw_files:
    # Without this guard pd.concat([]) fails with an opaque
    # "No objects to concatenate" error that hides the real cause.
    raise FileNotFoundError(f"No raw CSV files found in {RAW_DIR}")

# Each file is decoded as latin1; low_memory=False parses a file in one pass
# so column dtypes are inferred from the whole file rather than per chunk.
raw_dfs = [pd.read_csv(f, encoding="latin1", low_memory=False) for f in raw_files]

# Stack all exports into one frame with a fresh 0..n-1 index.
raw_df = pd.concat(raw_dfs, ignore_index=True)


In [4]:
# Headline counts for the concatenated raw exports, before any pipeline stage.
raw_stats = {
    "total_raw_rows": raw_df.shape[0],
    # nunique() skips NaN by default, so missing titles are not counted here.
    "unique_titles": raw_df["Title"].nunique(),
    "missing_titles": raw_df["Title"].isnull().sum(),
    # The ISBN column is treated as optional: report None when it is absent.
    "missing_isbn": (
        raw_df["ISBN"].isnull().sum() if "ISBN" in raw_df.columns else None
    ),
}
raw_stats


{'total_raw_rows': 36364,
 'unique_titles': 30906,
 'missing_titles': np.int64(0),
 'missing_isbn': np.int64(412)}

📥 2. INGESTED DATA STATISTICS

In [5]:
# Load the ingested-stage CSV, decoded as latin1 and parsed in a single pass.
ingested_df = pd.read_csv(
    INGESTED_DIR,
    low_memory=False,
    encoding="latin1",
)

In [6]:
# Row, uniqueness and missing-value counts for the ingested stage.
ingested_stats = {
    "total_ingested_rows": ingested_df.shape[0],
    # NaN values are excluded from both distinct counts.
    "unique_titles": ingested_df["title"].nunique(dropna=True),
    "unique_isbn": ingested_df["isbn"].nunique(),
    "missing_isbn": ingested_df["isbn"].isnull().sum(),
    "missing_year": ingested_df["year"].isnull().sum(),
}
ingested_stats


{'total_ingested_rows': 36364,
 'unique_titles': 30906,
 'unique_isbn': 31546,
 'missing_isbn': np.int64(412),
 'missing_year': np.int64(170)}

🧹 3. CLEANED DATA STATISTICS

In [7]:
# Load the cleaned-stage CSV with pandas' default parsing options.
clean_df = pd.read_csv(filepath_or_buffer=CLEAN_CSV)

In [8]:
# Row, uniqueness and missing-value counts for the cleaned stage.
clean_stats = {
    "total_clean_rows": clean_df.shape[0],
    "unique_record_id": clean_df["record_id"].nunique(dropna=True),
    "unique_isbn": clean_df["isbn"].nunique(),
    "missing_isbn": clean_df["isbn"].isnull().sum(),
    # Net row-count difference between the ingested and cleaned frames.
    # NOTE(review): this equals "duplicates removed" only if cleaning drops
    # rows for no other reason - confirm against the cleaning step.
    "duplicate_removed": ingested_df.shape[0] - clean_df.shape[0],
}
clean_stats


{'total_clean_rows': 31999,
 'unique_record_id': 31999,
 'unique_isbn': 26871,
 'missing_isbn': np.int64(5075),
 'duplicate_removed': 4365}

In [9]:
# Cleaned rows whose title already appeared in an earlier row; the first
# occurrence of each title is not flagged, only the repeats.
clean_df.loc[clean_df["title"].duplicated(keep="first")]

Unnamed: 0,date,accession_no,title,author_editor,edition_volume,place_publisher,isbn,year,pages,source,class_no_book_no,record_id
70,09-08-2001,121,computer graphics,"harrington, steven",2nd ed.,"new york: mcgraw-hill,",9780071004725,1987.0,466 p.;,,006.6 har,fdc6e18dedb7dba6242c9fe8b7fa7fad
96,15-09-2001,157,high-performance communication networks,"walrand, jean",,"singapore: harcourt,",1558605746,2000.0,693 p.;,,621.382 wal,6412802223dad6a2d714ead829e4eba6
139,15-09-2001,285,microprocessors,"rafiquzzaman, mohamed",revised ed.,"new delhi: prentice hall of india,",8120308484,2000.0,468 p.;,,004.16 raf,a91946d1920476262ea4b91ac9903668
170,22-09-2001,369,programming languages,"sethi, ravi",2nd ed.,"delhi: addison-wesley,",8178081040,2000.0,640 p.;,,005.1 set,22958e628b75fdf8ce4282f1fdc6c61c
205,29-09-2001,462,"science, hegemony and violence : a requiem for...","nandy, ashis",,"new delhi: oxford university press,",9780195625807,1999.0,301 p.;,,809.4 nan,c9c845775fb6e576dfbe54c841fe0cf9
...,...,...,...,...,...,...,...,...,...,...,...,...
31817,22-12-2010,25398,bhagavad gita,"easwaran, eknath",,"canada: nilgiri press,",,2007.0,294 p.;,,294.592404521 eas,861542c8d7e66493c194b2e67a7ff82f
31831,01-02-2011,25500,discrete mathematics for computing,"grossman, peter",3rd ed.,"new york: palgrave macmillan,",,2009.0,"xii, 316 p.;",,004.0151 gro,b577b70b85595f9d13e36dd5b98cda79
31874,16-06-2011,26039,inside servlets : server-side programming for ...,"callaway, dustin r.",2nd ed.,"boston: addison-wesley,",,2001.0,"xxv, 881 p.;",,005.2762 cal,b756a96c9ab56d66ca3aee0f304b6053
31875,16-06-2011,26040,java 2 network security,"pistoia, marco",2nd ed.,"upper saddle river: prentice hall,",,1999.0,"xxi, 713 p.;",,005.8 pis,784e65991b0233fbd279e59e5e3c34ec


In [10]:
# Split of cleaned rows by whether an ISBN value is present.
{
    "isbn_based_books": clean_df["isbn"].notnull().sum(),
    "non_isbn_books": clean_df["isbn"].isnull().sum(),
}


{'isbn_based_books': np.int64(26924), 'non_isbn_books': np.int64(5075)}

In [11]:
# Parse the enrichment output (UTF-8 JSON), then tabulate the parsed records.
with ENRICHED_JSON.open(mode="r", encoding="utf-8") as f:
    enriched_data = json.load(f)

enriched_df = pd.DataFrame(data=enriched_data)

In [12]:
# Outcome counts for the enrichment stage, keyed on the "status" column.
enrichment_stats = {
    "total_processed": enriched_df.shape[0],
    "found_books": enriched_df["status"].eq("FOUND").sum(),
    "missing_books": enriched_df["status"].eq("MISSING").sum(),
    # Share of rows with status FOUND, as a percentage rounded to 2 decimals.
    "success_rate_%": round(enriched_df["status"].eq("FOUND").mean() * 100, 2),
}
enrichment_stats

{'total_processed': 31999,
 'found_books': np.int64(5146),
 'missing_books': np.int64(26853),
 'success_rate_%': np.float64(16.08)}

In [13]:
# Number of enriched rows with a non-null value, per enrichment field.
{
    f"{field}_available": enriched_df[field].notnull().sum()
    for field in ("authors", "subjects", "summary", "publisher")
}

{'authors_available': np.int64(5020),
 'subjects_available': np.int64(4656),
 'summary_available': np.int64(4301),
 'publisher_available': np.int64(3902)}

In [14]:
# Distinct-value counts over the enriched output; NaN is excluded throughout.
final_stats = {
    "final_books_count": enriched_df["book_key"].nunique(dropna=True),
    "unique_titles": enriched_df["title"].nunique(dropna=True),
    "unique_isbn": enriched_df["isbn"].nunique(),
}
final_stats

{'final_books_count': 31939, 'unique_titles': 30895, 'unique_isbn': 26251}

In [15]:
# Row count at each pipeline stage, one row per stage in pipeline order.
summary = pd.DataFrame(
    {
        "stage": ["Raw", "Ingested", "Cleaned", "Enriched"],
        "rows": [len(raw_df), len(ingested_df), len(clean_df), len(enriched_df)],
    }
)

summary


Unnamed: 0,stage,rows
0,Raw,36364
1,Ingested,36364
2,Cleaned,31999
3,Enriched,31999
