<a href="https://colab.research.google.com/github/BakhturinaPolina/goodreads-romance-research/blob/main/Goodreads_Datasets_Romance_Data_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cell 1: Imports and Mount Drive

In [1]:
# Colab-ready script: copy into cells in order.
# Cell 1: imports and mount drive
import json, gzip, os, re, random, math
from collections import Counter, defaultdict
from datetime import datetime
from typing import List, Dict, Any, Iterable, Optional
import pandas as pd
from tqdm import tqdm
from IPython.display import display

# Mount Google Drive (run this cell in Colab).
# The mount must happen before any of the /content/drive paths below are used.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Paths (adjust if necessary).
# Three gzip-compressed dumps, named for books, interactions and reviews respectively.
goodreads_book_romance = "/content/drive/MyDrive/Goodreads_Metadata_Reviews_2017/goodreads_books_romance.json.gz"
goodreads_interactions_romance = "/content/drive/MyDrive/Goodreads_Metadata_Reviews_2017/goodreads_interactions_romance.json.gz"
goodreads_reviews_romance = "/content/drive/MyDrive/Goodreads_Metadata_Reviews_2017/goodreads_reviews_romance.json.gz"

# Output directory on Drive; created here if it does not already exist.
OUTPUT_DIR = "/content/drive/MyDrive/Goodreads_Metadata_Reviews_2017/outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

Mounted at /content/drive


# Cell 2: Utility readers for json / jsonlines gz

In [2]:
# Cell 2: Utility readers for json / jsonlines gz (defensive)
def iter_json_gz(path: str, max_items: Optional[int]=None):
    """
    Yield JSON objects from a file that is either newline-delimited JSON
    (jsonlines) or a single JSON array.

    Files whose name ends in ".gz" are opened with gzip; anything else is
    opened as plain UTF-8 text. Both kinds go through the same parsing path.

    Args:
        path: Path to the input file.
        max_items: Maximum number of objects to yield. ``None`` (or any other
            falsy value such as 0) means "no limit".

    Yields:
        One decoded JSON value per record.

    Notes:
        - The array form is loaded into memory in one go; only use it for
          small files.
        - In jsonlines mode, blank lines and lines that are not valid JSON are
          skipped silently (best-effort reading).
        - ``max_items`` counts objects actually yielded, not physical lines.
    """
    def _records(stream):
        # Peek at the first non-whitespace character to detect the layout,
        # then rewind so the real read starts from the beginning.
        start = stream.tell()
        first = stream.read(1)
        while first and first.isspace():
            first = stream.read(1)
        stream.seek(start)

        if first == '[':
            # single JSON array: must be parsed as a whole
            yield from json.loads(stream.read())
            return

        # otherwise assume one JSON object per line
        for line in stream:
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                # malformed line (e.g. Python-style repr): skip it
                continue
            yield record

    opener = gzip.open if path.endswith(".gz") else open
    with opener(path, "rt", encoding="utf-8") as f:
        for n_yielded, record in enumerate(_records(f), start=1):
            yield record
            if max_items and n_yielded >= max_items:
                break

# Cell 3: Quick file inspection (first N examples + aggregated key stats)

In [3]:
import gzip
import json
import pandas as pd
from collections import Counter
from pprint import pprint

# === CONFIG ===
JSON_PATH = "/content/drive/MyDrive/Goodreads_Metadata_Reviews_2017/goodreads_books_romance.json.gz"
SAMPLE_SIZE = 1000  # number of objects to sample for initial inspection
SAMPLE_PRINT_COUNT = 4  # number of full records to print

print(f"[INFO] Inspecting file: {JSON_PATH}")

# === LOAD SAMPLE ===
# Read the file line by line and keep the first SAMPLE_SIZE records that parse.
sample_objects = []
key_counter = Counter()

with gzip.open(JSON_PATH, 'rt', encoding='utf-8') as f:
    for idx, line in enumerate(f):
        if not line.strip():
            # a blank line is not a record; do not report it as a parse error
            continue
        try:
            obj = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"[ERROR] JSON parsing error at line {idx}: {e}")
            continue
        sample_objects.append(obj)
        key_counter.update(obj.keys())
        if len(sample_objects) >= SAMPLE_SIZE:
            break

print(f"[INFO] Number of sample objects collected: {len(sample_objects)}")

# === LOG: Top keys by frequency ===
print(f"\n[INFO] Top keys (by frequency in first {SAMPLE_SIZE} objects):")
for key, count in key_counter.most_common():
    print(f"  {key:25} : {count}")

# === LOG: Missing value percentage per column ===
def _is_missing_value(value):
    """Return True for None, NaN, or an empty string."""
    if value is None:
        return True
    if isinstance(value, float):
        return value != value  # NaN is the only float not equal to itself
    return isinstance(value, str) and value == ""

df_sample = pd.DataFrame(sample_objects)
# isna() alone only sees None/NaN. Other cells in this notebook treat "" as
# "no value" (see to_int_safe), so empty strings are counted as missing too.
missing_pct = df_sample.apply(lambda col: col.map(_is_missing_value)).mean().sort_values(ascending=False) * 100
print("\n[INFO] Missing value percentage (sample):")
print(missing_pct.to_string())

# === LOG: Columns to drop (KEEPING book_id and work_id for joins) ===
# This list is only printed here; nothing is dropped in this cell.
columns_to_drop = [
    "isbn", "isbn13", "asin", "kindle_asin",
    "url", "link", "image_url",
    "edition_information", "country_code", "publisher"
]
print("\n[INFO] Candidate columns for drop (book_id/work_id KEPT):")
print(columns_to_drop)

# === LOG: Metadata fields that may hint at subgenres ===
subgenre_hint_fields = ["popular_shelves", "series", "description", "publisher", "format", "language_code"]
print("\n[INFO] Metadata fields with potential subgenre clues:")
print(subgenre_hint_fields)

# === LOG: Pretty print sample records ===
print(f"\n[DEBUG] Pretty print of first {SAMPLE_PRINT_COUNT} sample objects:")
for i in range(min(SAMPLE_PRINT_COUNT, len(sample_objects))):
    print(f"\n--- SAMPLE BOOK {i+1} ---")
    pprint(sample_objects[i], indent=2)

[INFO] Inspecting file: /content/drive/MyDrive/Goodreads_Metadata_Reviews_2017/goodreads_books_romance.json.gz
[INFO] Number of sample objects collected: 1000

[INFO] Top keys (by frequency in first 1000 objects):
  isbn                      : 1000
  text_reviews_count        : 1000
  series                    : 1000
  country_code              : 1000
  language_code             : 1000
  popular_shelves           : 1000
  asin                      : 1000
  is_ebook                  : 1000
  average_rating            : 1000
  kindle_asin               : 1000
  similar_books             : 1000
  description               : 1000
  format                    : 1000
  link                      : 1000
  authors                   : 1000
  publisher                 : 1000
  num_pages                 : 1000
  publication_day           : 1000
  isbn13                    : 1000
  publication_month         : 1000
  edition_information       : 1000
  publication_year          : 1000
  url           

# Cell 4: Investigate edition vs. work counts and cross-check reviews/interactions

In [4]:
# === Cell: investigate edition vs work counts & cross-check reviews/interactions ===
import gzip, json, os, csv
from collections import defaultdict, Counter
from pprint import pprint
from tqdm import tqdm
import pandas as pd

# Paths (adjust if needed)
# All three files are opened with gzip and read line by line further down in this cell.
BOOKS_PATH = "/content/drive/MyDrive/Goodreads_Metadata_Reviews_2017/goodreads_books_romance.json.gz"
REVIEWS_PATH = "/content/drive/MyDrive/Goodreads_Metadata_Reviews_2017/goodreads_reviews_romance.json.gz"  # may be big
INTERACTIONS_PATH = "/content/drive/MyDrive/Goodreads_Metadata_Reviews_2017/goodreads_interactions_romance.json.gz"  # optional

# Config: book to inspect (defaults from your sample)
TARGET_BOOK_ID = "3209316"  # string type to match dump format
TARGET_TITLE = "Emma"  # descriptive only: not referenced by the code in this cell
SAMPLE_EDITIONS_TO_PRINT = 25  # how many editions sharing the same work_id to print

# Helpers
def to_int_safe(x):
    """
    Best-effort conversion of a raw field value to int.

    Thousands separators (",") are removed before parsing. Values that look
    like floats ("3.7", 5.9, "1,234.5") are truncated towards zero.

    Returns None when x is None, an empty string, or cannot be interpreted as
    a finite number (e.g. "abc", "nan", "inf", a list).
    """
    if x is None or x == "":
        return None
    text = str(x).replace(",", "")
    try:
        return int(text)
    except (TypeError, ValueError):
        pass
    # Fall back to float parsing: first the raw value (keeps non-string inputs
    # such as bools working), then the comma-stripped text.
    for candidate in (x, text):
        try:
            return int(float(candidate))
        except (TypeError, ValueError, OverflowError):
            # OverflowError: int(inf); ValueError: int(nan) or unparseable text
            continue
    return None

def pretty_print_book(obj):
    """
    Print a fixed selection of fields from a book record, one "name: value"
    line per field (name left-aligned in a 20-character column), followed by
    an 80-dash separator. Fields absent from obj are printed as empty.
    """
    field_names = (
        "book_id", "title", "title_without_series", "work_id", "authors",
        "publication_year", "ratings_count", "text_reviews_count", "num_pages",
        "format", "is_ebook", "language_code", "series", "popular_shelves",
    )
    rendered = [f"{name:20}: {obj.get(name, '')}" for name in field_names]
    rendered.append("-" * 80)
    print(*rendered, sep="\n")

# Step 1: one pass over the books dump -- index every edition and capture the target record
print("[INFO] Scanning books file for target book and related editions (this reads the file once).")
target_record = None
work_id_of_target = None
editions_for_work = []  # filled in Step 2 with the index entries sharing the target's work_id
all_books_index = {}  # book_id -> selected fields for quick lookup

with gzip.open(BOOKS_PATH, "rt", encoding="utf-8") as f:
    for line in tqdm(f, desc="scanning books"):
        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            # unparseable lines are skipped without a message
            continue
        bid = str(obj.get("book_id", ""))
        # a missing or null work_id becomes the empty string
        raw_wid = obj.get("work_id")
        wid = "" if raw_wid is None else str(raw_wid)
        # keep only a small set of fields per edition
        index_entry = {"book_id": bid, "title": obj.get("title"), "work_id": wid}
        for count_field in ("ratings_count", "text_reviews_count"):
            index_entry[count_field] = to_int_safe(obj.get(count_field))
        for passthrough_field in ("publication_year", "authors", "popular_shelves"):
            index_entry[passthrough_field] = obj.get(passthrough_field)
        all_books_index[bid] = index_entry
        if bid == str(TARGET_BOOK_ID):
            target_record = obj
            work_id_of_target = wid

# Report whether the target edition was found
if target_record is not None:
    print(f"[INFO] Found target book_id {TARGET_BOOK_ID}. Title: {target_record.get('title')!r}, work_id: {work_id_of_target}")
    print("\n[INFO] Target edition fields (concise):")
    pretty_print_book(target_record)
else:
    print(f"[WARN] Target book_id {TARGET_BOOK_ID} not found in books dump.")

# Step 2: gather every indexed edition whose work_id equals the target's
if work_id_of_target:
    print(f"\n[INFO] Collecting all editions that share work_id = {work_id_of_target}")
    editions_for_work = [
        info for info in all_books_index.values()
        if info.get("work_id") == work_id_of_target
    ]
    print(f"[INFO] Number of editions in this dump that share the work_id: {len(editions_for_work)}")
    # most-rated editions first; a missing ratings_count sorts as 0
    editions_for_work_sorted = list(editions_for_work)
    editions_for_work_sorted.sort(key=lambda x: x.get("ratings_count") or 0, reverse=True)
    print(f"\n[INFO] Top {min(SAMPLE_EDITIONS_TO_PRINT, len(editions_for_work_sorted))} editions for this work (dump values):")
    for e in editions_for_work_sorted[:SAMPLE_EDITIONS_TO_PRINT]:
        # title is cut to 70 characters and padded to a 70-character column
        title_cell = f"{str(e.get('title'))[:70]:70}"
        print(f"  book_id={e['book_id']}, title={title_cell} | ratings={e.get('ratings_count')} | text_reviews={e.get('text_reviews_count')}")

    # totals over all editions of the work that are present in the dump
    total_ratings_in_dump = 0
    total_text_reviews_in_dump = 0
    for e in editions_for_work_sorted:
        total_ratings_in_dump += e.get("ratings_count") or 0
        total_text_reviews_in_dump += e.get("text_reviews_count") or 0
    print(f"\n[INFO] Sum across editions present in dump -> ratings_count: {total_ratings_in_dump}, text_reviews_count: {total_text_reviews_in_dump}")

# Step 3: Cross-check detailed reviews file for counts (streams file; can be slow)
def stream_count_reviews(reviews_path, book_ids_set=None, work_id=None, max_lines=None):
    """
    Stream a gzip jsonlines reviews dump and count records that match either
    one of the given book ids or the given work id.

    A record is counted under its book id when that id is in book_ids_set;
    otherwise, if its work id equals work_id, it is counted under the
    aggregate key "_work_level_". Lines that fail to parse are skipped.

    Args:
        reviews_path: Path to the gzip file. If it does not exist, a warning
            is printed and an empty result is returned.
        book_ids_set: Collection of book-id strings to match (optional).
        work_id: Work-id string to match (optional).
        max_lines: Stop after this many lines; falsy means no limit.

    Returns:
        (Counter of matches per key, total number of matched records).
    """
    counts = Counter()
    if not os.path.exists(reviews_path):
        print(f"[WARN] Reviews path {reviews_path} not found on disk.")
        return counts, 0

    n_matched = 0
    with gzip.open(reviews_path, "rt", encoding="utf-8") as stream:
        for line_no, raw in enumerate(tqdm(stream, desc="streaming reviews")):
            if max_lines and line_no >= max_lines:
                break
            try:
                review = json.loads(raw)
            except json.JSONDecodeError:
                continue
            # defensive: accept both snake_case and camelCase key spellings
            book_key = str(review.get("book_id") or review.get("bookId") or "")
            work_key = str(review.get("work_id") or review.get("workId") or "")
            if book_ids_set and book_key in book_ids_set:
                bucket = book_key
            elif work_id and work_key == work_id:
                bucket = "_work_level_"
            else:
                continue
            counts[bucket] += 1
            n_matched += 1
    return counts, n_matched

# Build the set of book ids to look for: the target edition plus every edition found for its work
book_ids_to_check = set()
if target_record:
    book_ids_to_check.add(str(target_record.get("book_id")))
if editions_for_work:
    book_ids_to_check.update(edition["book_id"] for edition in editions_for_work)

print("\n[INFO] Streaming reviews file to count detailed reviews for these book_ids/work_id (this can take a few minutes).")
per_book_review_counts, matched = stream_count_reviews(
    REVIEWS_PATH,
    book_ids_set=book_ids_to_check,
    work_id=work_id_of_target,
    max_lines=None,
)
print(f"[INFO] Matched {matched} review records in dump referencing our target book_ids/work_id.")
print("[INFO] Review counts found in the reviews dump (per book_id):")
# show only the 30 most frequent keys
pprint(per_book_review_counts.most_common(30))

# Step 4: (Optional) Stream interactions file similarly if you want to compare user-shelf interactions
def stream_count_interactions(inter_path, book_ids_set=None, max_lines=None):
    """
    Stream a gzip jsonlines interactions dump and count records per book id
    for the ids in book_ids_set. Lines that fail to parse are skipped.

    Args:
        inter_path: Path to the gzip file. If it does not exist, a warning is
            printed and an empty result is returned.
        book_ids_set: Collection of book-id strings to match (optional).
        max_lines: Stop after this many lines; falsy means no limit.

    Returns:
        (Counter of matches per book id, total number of matches).
    """
    counts = Counter()
    if not os.path.exists(inter_path):
        print(f"[WARN] Interactions path {inter_path} not found on disk.")
        return counts, 0

    with gzip.open(inter_path, "rt", encoding="utf-8") as stream:
        for line_no, raw in enumerate(tqdm(stream, desc="streaming interactions")):
            if max_lines and line_no >= max_lines:
                break
            try:
                record = json.loads(raw)
            except json.JSONDecodeError:
                continue
            # accept both snake_case and camelCase key spellings
            book_key = str(record.get("book_id") or record.get("bookId") or "")
            if not (book_ids_set and book_key in book_ids_set):
                continue
            counts[book_key] += 1
    return counts, sum(counts.values())

# Only stream the interactions dump when the file is actually present
if not os.path.exists(INTERACTIONS_PATH):
    print("\n[INFO] Interactions file not found at INTERACTIONS_PATH; skipping interactions streaming.")
else:
    print("\n[INFO] Streaming interactions file for the same book_ids (this can also take several minutes).")
    per_book_inter_counts, total_inter_matches = stream_count_interactions(
        INTERACTIONS_PATH,
        book_ids_set=book_ids_to_check,
        max_lines=None,
    )
    print(f"[INFO] Found {total_inter_matches} interaction records referencing these book_ids in the interactions dump.")
    # show only the 30 most frequent book ids
    pprint(per_book_inter_counts.most_common(30))

# Step 5: write a one-row CSV report for traceability (no rows if the target was not found)
report_rows = []
if target_record:
    target_id_str = str(target_record.get("book_id"))
    # reviews counted for every checked edition other than the target one
    other_editions_review_total = sum(
        per_book_review_counts.get(bid, 0)
        for bid in book_ids_to_check
        if bid != target_id_str
    )
    report_row = {
        "query_book_id": TARGET_BOOK_ID,
        "found_book_id": target_record.get("book_id"),
        "title": target_record.get("title"),
        "work_id": work_id_of_target,
        "edition_ratings_count": to_int_safe(target_record.get("ratings_count")),
        "edition_text_reviews_count": to_int_safe(target_record.get("text_reviews_count")),
        # the sums only exist when Step 2 ran, i.e. when a work_id was found
        "sum_ratings_across_editions": total_ratings_in_dump if work_id_of_target else None,
        "sum_text_reviews_across_editions": total_text_reviews_in_dump if work_id_of_target else None,
        "reviews_in_reviews_dump_for_edition": per_book_review_counts.get(target_id_str, 0),
        "reviews_in_reviews_dump_for_other_editions_total": other_editions_review_total,
        "reviews_in_reviews_dump_for_worklevel": per_book_review_counts.get("_work_level_", 0),
    }
    report_rows.append(report_row)

# Save
OUT_DIR = "/content/drive/MyDrive/Goodreads_Metadata_Reviews_2017/inspection_reports"
os.makedirs(OUT_DIR, exist_ok=True)
out_csv = os.path.join(OUT_DIR, f"book_inspection_{TARGET_BOOK_ID}.csv")
with open(out_csv, "w", newline='', encoding='utf-8') as cf:
    # with no rows the file is still created, but left empty
    if report_rows:
        writer = csv.DictWriter(cf, fieldnames=list(report_rows[0]))
        writer.writeheader()
        writer.writerows(report_rows)
print(f"\n[INFO] Saved inspection report to: {out_csv}")

# Closing summary: possible reasons why dump counts differ from the live site
summary_lines = (
    "\n[SUMMARY]",
    " - If dump's edition-level counts are much smaller than the live Goodreads page numbers, likely causes:",
    "    * The dump stores edition (book_id) counts while the live page shows work-level aggregates (across editions).",
    "    * The dataset is a 2017 snapshot; live site counts change over time.",
    "    * The dump's reviews/interactions are a subset (not all Goodreads data), so counts in the dump can be much smaller.",
)
for summary_line in summary_lines:
    print(summary_line)

[INFO] Scanning books file for target book and related editions (this reads the file once).


scanning books: 335449it [00:28, 11938.13it/s]


[INFO] Found target book_id 3209316. Title: 'Emma', work_id: 3360164

[INFO] Target edition fields (concise):
book_id             : 3209316
title               : Emma
title_without_series: Emma
work_id             : 3360164
authors             : [{'author_id': '1265', 'role': ''}]
publication_year    : 2005
ratings_count       : 42
text_reviews_count  : 8
num_pages           : 544
format              : Audio CD
is_ebook            : false
language_code       : eng
series              : []
popular_shelves     : [{'count': '16215', 'name': 'classics'}, {'count': '7070', 'name': 'to-read'}, {'count': '4564', 'name': 'fiction'}, {'count': '4279', 'name': 'favorites'}, {'count': '3125', 'name': 'romance'}, {'count': '2750', 'name': 'classic'}, {'count': '1942', 'name': 'books-i-own'}, {'count': '1503', 'name': 'owned'}, {'count': '1277', 'name': 'jane-austen'}, {'count': '996', 'name': 'historical-fiction'}, {'count': '993', 'name': 'clàssics'}, {'count': '904', 'name': 'literature'}, {'cou

streaming reviews: 3565378it [00:44, 79767.94it/s]


[INFO] Matched 2508 review records in dump referencing our target book_ids/work_id.
[INFO] Review counts found in the reviews dump (per book_id):
[('6969', 1658),
 ('76691', 51),
 ('7938542', 36),
 ('18626814', 27),
 ('18300260', 25),
 ('7181805', 25),
 ('157421', 24),
 ('1063179', 23),
 ('10931947', 19),
 ('437131', 18),
 ('14926', 17),
 ('15777407', 17),
 ('6282632', 16),
 ('2726761', 15),
 ('563200', 13),
 ('894017', 12),
 ('111025', 12),
 ('15875763', 12),
 ('10249902', 11),
 ('894011', 11),
 ('274404', 11),
 ('586497', 11),
 ('7576798', 10),
 ('2003182', 10),
 ('894048', 9),
 ('468001', 9),
 ('2381674', 9),
 ('22738927', 9),
 ('8618588', 8),
 ('6550323', 8)]

[INFO] Streaming interactions file for the same book_ids (this can also take several minutes).


streaming interactions: 42792856it [03:13, 220752.94it/s]


[INFO] Found 83630 interaction records referencing these book_ids in the interactions dump.
[('6969', 72315),
 ('76691', 1438),
 ('7938542', 825),
 ('18626814', 749),
 ('7181805', 531),
 ('18300260', 448),
 ('157421', 336),
 ('1063179', 294),
 ('24611702', 287),
 ('15777407', 278),
 ('10931947', 257),
 ('586497', 208),
 ('6282632', 196),
 ('437131', 178),
 ('894017', 176),
 ('22676096', 174),
 ('894048', 165),
 ('14926', 158),
 ('8618588', 151),
 ('13584589', 136),
 ('2003182', 134),
 ('563200', 130),
 ('274404', 127),
 ('1575363', 113),
 ('9666483', 109),
 ('111025', 108),
 ('2726761', 105),
 ('643084', 102),
 ('13506315', 97),
 ('468001', 96)]

[INFO] Saved inspection report to: /content/drive/MyDrive/Goodreads_Metadata_Reviews_2017/inspection_reports/book_inspection_3209316.csv

[SUMMARY]
 - If dump's edition-level counts are much smaller than the live Goodreads page numbers, likely causes:
    * The dump stores edition (book_id) counts while the live page shows work-level aggregate

In [5]:
# === Cell: build work->editions mapping, infer original publication year, deduplicate by work_id, flag same-title diff-authors ===
# Paste into a single Colab cell and run.
import gzip
import json
import os
import logging
from collections import defaultdict, Counter
from statistics import mean, median
from typing import Dict, Any, List, Optional, Tuple
from tqdm import tqdm
import pandas as pd
import math
import datetime

# ---------- CONFIG ----------
# Input dump and the directory that receives every CSV/JSON/log written by this cell.
# NOTE: this rebinds OUTPUT_DIR, which the first cell set to a different folder.
BOOKS_PATH = "/content/drive/MyDrive/Goodreads_Metadata_Reviews_2017/goodreads_books_romance.json.gz"
OUTPUT_DIR = "/content/drive/MyDrive/Goodreads_Metadata_Reviews_2017/processing_outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Quick-test toggle. As written it is ENABLED: Phase 2 onwards processes only the
# first QUICK_TEST_WORK_COUNT work_ids encountered in the file.
# For a full run set QUICK_TEST_ONLY = False (or comment out both lines below).
QUICK_TEST_ONLY = True
QUICK_TEST_WORK_COUNT = 10

# One log file per run, named with the run's start timestamp.
LOGFILE = os.path.join(OUTPUT_DIR, f"dedup_workid_run_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.log")

# ---------- SETUP LOGGING ----------
logger = logging.getLogger("goodreads_dedup")
logger.setLevel(logging.DEBUG)
# Keep records away from the root logger. If the root logger has its own
# handler, every message would otherwise be printed a second time in the
# root format ("INFO:goodreads_dedup:...").
logger.propagate = False
# Re-running the cell returns the same logger object: detach and close the
# handlers of the previous run so messages are not duplicated and old log
# files are not left open.
for _stale_handler in list(logger.handlers):
    logger.removeHandler(_stale_handler)
    _stale_handler.close()
# console handler: INFO and above, short format
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(logging.Formatter('[%(levelname)s] %(message)s'))
logger.addHandler(ch)
# file handler: everything from DEBUG up, with timestamps
fh = logging.FileHandler(LOGFILE, encoding='utf-8')
fh.setLevel(logging.DEBUG)
fh.setFormatter(logging.Formatter('%(asctime)s [%(levelname)s] %(message)s'))
logger.addHandler(fh)

logger.info("Starting work_id deduplication & edition analysis.")
logger.info(f"Books path: {BOOKS_PATH}")
logger.info(f"Output dir: {OUTPUT_DIR}")
logger.info(f"Log file: {LOGFILE}")

# ---------- HELPERS ----------
def to_int_safe(x) -> Optional[int]:
    """
    Best-effort conversion of a raw field value to int.

    Thousands separators (",") are removed before parsing. Values that look
    like floats ("3.7", 5.9, "1,234.5") are truncated towards zero.

    Returns None when x is None, an empty string, or cannot be interpreted as
    a finite number (e.g. "abc", "nan", "inf", a list).
    """
    if x is None or x == "":
        return None
    text = str(x).replace(",", "")
    try:
        return int(text)
    except (TypeError, ValueError):
        pass
    # Fall back to float parsing: first the raw value (keeps non-string inputs
    # such as bools working), then the comma-stripped text.
    for candidate in (x, text):
        try:
            return int(float(candidate))
        except (TypeError, ValueError, OverflowError):
            # OverflowError: int(inf); ValueError: int(nan) or unparseable text
            continue
    return None

def normalize_title(t: Optional[str]) -> str:
    """
    Return a comparison key for a title: lower-cased, with every character
    that is neither a word character nor whitespace removed, and runs of
    whitespace collapsed to a single space. Falsy input gives "".
    """
    if not t:
        return ""
    import re
    text = str(t).strip().lower()
    # first strip punctuation-like characters, then squeeze whitespace
    for pattern, replacement in ((r"[^\w\s]", ""), (r"\s+", " ")):
        text = re.sub(pattern, replacement, text)
    return text.strip()

def normalize_author_list(authors_field) -> List[str]:
    """
    Reduce an authors field to a flat list of identifier strings.

    Accepted shapes: a list (of dicts or plain values), a single dict, or any
    other scalar. For a dict, the "author_id"/"id" value is used (stripped);
    if neither is set, the "name"/"author_name" value is used (stripped and
    lower-cased); a dict with none of these contributes nothing. Non-dict
    values are stringified, stripped and lower-cased. Falsy input gives [].
    """
    def _identifier_from_dict(entry):
        # prefer a numeric-style id, fall back to a lower-cased name
        ident = entry.get("author_id") or entry.get("id")
        if ident:
            return str(ident).strip()
        label = entry.get("name") or entry.get("author_name")
        if label:
            return str(label).strip().lower()
        return None

    if not authors_field:
        return []

    if isinstance(authors_field, list):
        entries = authors_field
    elif isinstance(authors_field, dict):
        entries = [authors_field]
    else:
        return [str(authors_field).strip().lower()]

    identifiers = []
    for entry in entries:
        if not isinstance(entry, dict):
            identifiers.append(str(entry).strip().lower())
            continue
        ident = _identifier_from_dict(entry)
        if ident is not None:
            identifiers.append(ident)
    return identifiers

# ---------- PHASE 1: scan file once and build work_id -> list(editions) mapping ----------
logger.info("Phase 1: scanning books file and building mapping work_id -> editions (single pass).")

work_to_editions: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
seen_work_ids = []  # one entry per edition read, in file order (may contain repeats)
seen_book_ids = set()
num_lines = 0

with gzip.open(BOOKS_PATH, "rt", encoding="utf-8") as f:
    for line in tqdm(f, desc="reading books file"):
        num_lines += 1
        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            logger.debug(f"JSON decode error at line {num_lines}, skipping.")
            continue
        if not isinstance(obj, dict):
            # a valid JSON value that is not an object cannot be an edition record
            logger.debug(f"Non-object JSON value at line {num_lines}, skipping.")
            continue
        # A JSON null must count as "missing", not become the literal string "None"
        # (which would group all such editions under a fake work called "None").
        raw_book_id = obj.get("book_id")
        book_id = "" if raw_book_id is None else str(raw_book_id).strip()
        raw_work_id = obj.get("work_id")
        work_id = "" if raw_work_id is None else str(raw_work_id).strip()
        if not work_id:
            # place into special bucket for missing work_id
            work_id = "__NO_WORK_ID__"
        # keep a handful of normalised fields plus the raw record for later retrieval
        edition_info = {
            "book_id": book_id,
            "work_id": work_id,
            "title": obj.get("title"),
            "title_without_series": obj.get("title_without_series"),
            "authors_raw": obj.get("authors"),
            "authors_ids_or_names": normalize_author_list(obj.get("authors")),
            "publication_year": to_int_safe(obj.get("publication_year")),
            "ratings_count": to_int_safe(obj.get("ratings_count")),
            "text_reviews_count": to_int_safe(obj.get("text_reviews_count")),
            "num_pages": to_int_safe(obj.get("num_pages")),
            "language_code": obj.get("language_code"),
            "format": obj.get("format"),
            "is_ebook": obj.get("is_ebook"),
            "popular_shelves": obj.get("popular_shelves"),
            "_raw": obj
        }
        work_to_editions[work_id].append(edition_info)
        seen_work_ids.append(work_id)
        seen_book_ids.add(book_id)

logger.info(f"Completed scan. Lines read: {num_lines}. Distinct works found (in mapping): {len(work_to_editions)}. Distinct book_ids seen: {len(seen_book_ids)}")

# ---------- QUICK-TEST: optionally shrink to first N works (for debugging) ----------
# Start from every work_id in the mapping; when QUICK_TEST_ONLY is defined and truthy,
# keep only the first QUICK_TEST_WORK_COUNT of them.
selected_work_ids = list(work_to_editions.keys())
if globals().get('QUICK_TEST_ONLY'):
    selected_work_ids = selected_work_ids[:QUICK_TEST_WORK_COUNT]

logger.info(f"Number of work_ids to process in Phase 2: {len(selected_work_ids)} (use QUICK_TEST_* to reduce)")

# ---------- PHASE 2: compute edition-count statistics and infer original publication year ----------
logger.info("Phase 2: computing edition-count stats & inferring earliest publication year per work.")

edition_counts = []
work_inferred_year = {}  # work_id -> earliest publication_year among its editions (None if no edition has one)
work_selected_canonical_edition = {}  # work_id -> summary dict of the edition chosen to represent the work

for wid in tqdm(selected_work_ids, desc="processing works"):
    editions = work_to_editions[wid]
    n_editions = len(editions)
    edition_counts.append(n_editions)

    # Candidate pool for the canonical edition:
    #   - if any edition has a year: the editions published in the earliest year
    #   - otherwise: all editions of the work
    dated_editions = [e for e in editions if e.get("publication_year") is not None]
    if dated_editions:
        inferred_year = min(e["publication_year"] for e in dated_editions)
        pool = [e for e in dated_editions if e["publication_year"] == inferred_year]
    else:
        inferred_year = None
        pool = editions
    work_inferred_year[wid] = inferred_year

    # Within the pool take the highest ratings_count (missing counts as 0);
    # on a tie the edition that appears first in file order wins.
    chosen = max(pool, key=lambda r: (r.get("ratings_count") or 0)) if pool else None

    # Store a slim summary of the chosen edition (not the full record)
    if chosen:
        canonical = {
            "book_id": chosen["book_id"],
            "title": chosen["title"],
            "publication_year_chosen": chosen.get("publication_year"),
            "ratings_count_chosen": chosen.get("ratings_count"),
            "text_reviews_count_chosen": chosen.get("text_reviews_count"),
            "authors_ids_or_names": chosen.get("authors_ids_or_names"),
        }
    else:
        canonical = {
            "book_id": None,
            "title": None,
            "publication_year_chosen": None,
            "ratings_count_chosen": None,
            "text_reviews_count_chosen": None,
            "authors_ids_or_names": [],
        }
    canonical["num_editions_for_work"] = n_editions
    work_selected_canonical_edition[wid] = canonical

# compute edition-count distribution statistics
if edition_counts:
    _sorted_counts = sorted(edition_counts)  # sort once, reuse for both percentiles
    _n_counts = len(_sorted_counts)

    def _nearest_rank_percentile(p):
        """Nearest-rank percentile: the value at 1-based rank ceil(p * n)."""
        # ceil (not floor) so that e.g. the 75th percentile of two values is
        # the larger one rather than the smaller one
        rank = max(1, math.ceil(p * _n_counts))
        return _sorted_counts[rank - 1]

    stats = {
        "n_works_processed": _n_counts,
        "min_editions_per_work": _sorted_counts[0],
        "max_editions_per_work": _sorted_counts[-1],
        "mean_editions_per_work": mean(edition_counts),
        "median_editions_per_work": median(edition_counts),
        "pct_25": _nearest_rank_percentile(0.25),
        "pct_75": _nearest_rank_percentile(0.75)
    }
else:
    # nothing was processed: leave the stats empty
    stats = {}

logger.info("Edition-count distribution stats computed:")
for k,v in stats.items():
    logger.info(f"  {k}: {v}")

# Write one row per processed work: inferred first year plus the chosen canonical edition
_work_ids_in_order = list(work_inferred_year.keys())

def _canonical_column(field):
    """Values of one canonical-edition field, aligned with _work_ids_in_order."""
    return [work_selected_canonical_edition[w][field] for w in _work_ids_in_order]

work_level_df = pd.DataFrame({
    "work_id": _work_ids_in_order,
    "inferred_first_publication_year": list(work_inferred_year.values()),
    "canonical_book_id": _canonical_column("book_id"),
    "canonical_title": _canonical_column("title"),
    "canonical_pub_year": _canonical_column("publication_year_chosen"),
    "canonical_ratings_count": _canonical_column("ratings_count_chosen"),
    "num_editions_for_work": _canonical_column("num_editions_for_work"),
})
work_level_csv_path = os.path.join(OUTPUT_DIR, "work_level_inferred_years_and_canonical.csv")
work_level_df.to_csv(work_level_csv_path, index=False, encoding="utf-8")
logger.info("Saved work-level inferred years and canonical edition summary CSV.")

# ---------- PHASE 3: remove duplicates by work_id (i.e., build deduplicated list) ----------
logger.info("Phase 3: building deduplicated dataset (one canonical edition per work_id).")

# One output row per work; the author identifiers are flattened to a "|"-joined string
dedup_rows = [
    {
        "work_id": wid,
        "canonical_book_id": can["book_id"],
        "title": can["title"],
        "publication_year_inferred": work_inferred_year.get(wid),
        "publication_year_chosen": can.get("publication_year_chosen"),
        "ratings_count_chosen": can.get("ratings_count_chosen"),
        "text_reviews_count_chosen": can.get("text_reviews_count_chosen"),
        "num_editions_for_work": can.get("num_editions_for_work"),
        "authors_ids_or_names": "|".join(can.get("authors_ids_or_names") or []),
    }
    for wid, can in work_selected_canonical_edition.items()
]
dedup_df = pd.DataFrame(dedup_rows)
dedup_out_path = os.path.join(OUTPUT_DIR, "deduplicated_by_workid.csv")
dedup_df.to_csv(dedup_out_path, index=False, encoding="utf-8")
logger.info(f"Saved deduplicated-by-work CSV: {dedup_out_path} (rows = {len(dedup_df)})")

# ---------- PHASE 4: flag same title with different authors (potential ambiguous titles) ----------
logger.info("Phase 4: flagging same title but different authors across deduplicated works.")

# normalized title -> set of author identifiers, and -> list of (work_id, canonical_book_id, authors)
title_to_authors = defaultdict(set)
title_to_workids = defaultdict(list)
for _, r in dedup_df.iterrows():
    norm_title = normalize_title(r["title"])
    # split the "|"-joined identifiers back apart; a work without any gets a placeholder
    auths = [a for a in (r["authors_ids_or_names"] or "").split("|") if a] or ["UNKNOWN_AUTHOR"]
    title_to_authors[norm_title].update(auths)
    title_to_workids[norm_title].append((r["work_id"], r["canonical_book_id"], auths))

# a title is ambiguous when more than one distinct author identifier maps to it
ambiguous_titles = {
    t: title_to_workids[t]
    for t, author_set in title_to_authors.items()
    if len(author_set) > 1
}
logger.info(f"Found {len(ambiguous_titles)} normalized titles that map to multiple distinct authors (flagged for manual review).")

# Save ambiguous titles and examples
ambig_rows = [
    {
        "normalized_title": t,
        "n_distinct_authors": len({a for _, _, entry_auths in entries for a in entry_auths}),
        "work_examples": ";".join(f"{wid}:{bid}" for wid, bid, _ in entries),
        "authors_examples": ";".join(",".join(entry_auths) for _, _, entry_auths in entries),
    }
    for t, entries in ambiguous_titles.items()
]
ambig_csv_path = os.path.join(OUTPUT_DIR, "ambiguous_same_title_diff_authors.csv")
pd.DataFrame(ambig_rows).to_csv(ambig_csv_path, index=False, encoding="utf-8")
logger.info("Saved ambiguous title report: ambiguous_same_title_diff_authors.csv")

# ---------- PHASE 5: summary & sample prints ----------
logger.info("Phase 5: printing short sample summaries for manual inspection (4 works).")

# take the first four processed work_ids as the sample
sample_wids = selected_work_ids[:4]
sample_report = []
for wid in sample_wids:
    can = work_selected_canonical_edition.get(wid, {})
    editions = work_to_editions[wid]
    # earliest year first (editions without a year last, via 9999), then most-rated first
    ranked_editions = sorted(
        editions,
        key=lambda x: (
            x.get('publication_year') if x.get('publication_year') is not None else 9999,
            -(x.get('ratings_count') or 0),
        ),
    )
    top_editions_text = "; ".join(
        f"{e['book_id']}|{e.get('publication_year')}|{e.get('ratings_count')}"
        for e in ranked_editions[:6]
    )
    sample_report.append({
        "work_id": wid,
        "canonical_book_id": can.get("book_id"),
        "canonical_title": can.get("title"),
        "inferred_first_pub_year": work_inferred_year.get(wid),
        "num_editions": len(editions),
        "top_editions (book_id | year | ratings_count)": top_editions_text,
    })
logger.info("Sample deduplication report (first 4 works):")
for r in sample_report:
    logger.info(json.dumps(r, ensure_ascii=False))

sample_csv_path = os.path.join(OUTPUT_DIR, "sample_dedup_report_4works.csv")
pd.DataFrame(sample_report).to_csv(sample_csv_path, index=False, encoding="utf-8")
logger.info("Saved sample dedup report CSV.")

# ---------- PHASE 6: final small summary JSON ----------
# Headline numbers of this run, written next to the CSV outputs.
summary = {
    "timestamp": datetime.datetime.now().isoformat(),
    "books_file": BOOKS_PATH,
    "n_lines_read": num_lines,
    "n_works_in_mapping": len(work_to_editions),
    "n_works_processed": len(selected_work_ids),
    "edition_count_stats": stats,
    "n_ambiguous_titles_multi_author": len(ambiguous_titles),
    "deduplicated_rows_saved": len(dedup_df),
    "output_dir": OUTPUT_DIR
}
import json as _json
# Use a dedicated name for the file object: `fh` is the logging FileHandler
# created in the logging setup and must not be rebound to a closed text file.
with open(os.path.join(OUTPUT_DIR, "dedup_summary.json"), "w", encoding="utf-8") as summary_file:
    _json.dump(summary, summary_file, indent=2)
logger.info("Saved summary JSON to output directory.")

# List everything now present in the output directory (sorted for a stable order)
logger.info("Done. Files produced:")
for fn in sorted(os.listdir(OUTPUT_DIR)):
    logger.info(f"  - {fn}")

# Report the mode that was actually used, instead of a fixed note that assumes
# the quick-test variables are commented out.
if globals().get("QUICK_TEST_ONLY"):
    logger.info(f"NOTE: QUICK_TEST_ONLY is enabled, so only the first {len(selected_work_ids)} work_ids were processed. Set QUICK_TEST_ONLY = False (or comment out the QUICK_TEST_* variables) for a full run.")
else:
    logger.info("NOTE: Full run completed. Set QUICK_TEST_ONLY = True at the top to run on a small subset for testing.")

# End of cell

[INFO] Starting work_id deduplication & edition analysis.
INFO:goodreads_dedup:Starting work_id deduplication & edition analysis.
[INFO] Books path: /content/drive/MyDrive/Goodreads_Metadata_Reviews_2017/goodreads_books_romance.json.gz
INFO:goodreads_dedup:Books path: /content/drive/MyDrive/Goodreads_Metadata_Reviews_2017/goodreads_books_romance.json.gz
[INFO] Output dir: /content/drive/MyDrive/Goodreads_Metadata_Reviews_2017/processing_outputs
INFO:goodreads_dedup:Output dir: /content/drive/MyDrive/Goodreads_Metadata_Reviews_2017/processing_outputs
[INFO] Log file: /content/drive/MyDrive/Goodreads_Metadata_Reviews_2017/processing_outputs/dedup_workid_run_20250808_232403.log
INFO:goodreads_dedup:Log file: /content/drive/MyDrive/Goodreads_Metadata_Reviews_2017/processing_outputs/dedup_workid_run_20250808_232403.log
[INFO] Phase 1: scanning books file and building mapping work_id -> editions (single pass).
INFO:goodreads_dedup:Phase 1: scanning books file and building mapping work_id -> 