# Goodreads Shelf Normalization and Clustering

This notebook focuses on cleaning and normalizing reader-created shelf tags from Goodreads data.
We'll group similar/synonymous shelf names and assign umbrella terms for each cluster.

## Objectives
1. Load and explore shelf data structure
2. Analyze shelf frequency and patterns
3. Implement clustering/normalization logic
4. Validate and export normalized shelf mappings


In [2]:
import pandas as pd
import numpy as np
import logging
import os

# Set up logging: configure the root logger at INFO level and create the
# module-level logger used for the progress messages emitted further down.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

### 🔄 Dataset Loading and Column Management

# Load dataset - using absolute path from project root.
# The relative hop is resolved against the current working directory, so the
# notebook is expected to run from its own folder.
project_root = os.path.abspath("../../../")  # Go up 3 levels from src/data_audit/notebooks/
main_final_path = os.path.join(project_root, "data", "processed", "romance_books_main_final.csv")

# Verify file exists before loading so the failure names the resolved path
if not os.path.exists(main_final_path):
    raise FileNotFoundError(f"Dataset not found at: {main_final_path}")

main_final = pd.read_csv(main_final_path)
logger.info(f"Loaded main final dataset: {len(main_final)} books")

# Drop specified columns
columns_to_drop = ['series_works_count', 'popular_shelves', 'genres', 'decade',
                   'book_length_category', 'rating_category', 'popularity_category',
                   'has_collection_indicators']
# Record which of the requested columns are present BEFORE dropping them;
# checking afterwards can only ever produce an empty list.
dropped_columns = [col for col in columns_to_drop if col in main_final.columns]
main_final = main_final.drop(columns=columns_to_drop, errors='ignore')
logger.info(f"Dropped columns: {dropped_columns}")

# Clean series_works_count_numeric: replace NaN with 'stand_alone'
# (the column therefore mixes numeric counts with a string sentinel)
main_final['series_works_count_numeric'] = main_final['series_works_count_numeric'].fillna('stand_alone')
logger.info("Replaced NaN values in series_works_count_numeric with 'stand_alone'")

### 📋 Basic Dataset Information

# Summarize the frame left after the clean-up: shape, column names, dtypes
remaining_columns = list(main_final.columns)
print(f"Dataset shape after dropping columns: {main_final.shape}")
print("\nRemaining column names:")
print(remaining_columns)
print("\nData types:")
print(main_final.dtypes)

### 🔍 Detailed Column Investigation

# Define ID columns to exclude from numerical analysis
id_columns = ['work_id', 'book_id_list_en', 'author_id', 'series_id']

# Loop-invariant denominator for every percentage printed below
total_rows = len(main_final)

for col in main_final.columns:
    column = main_final[col]
    present = column.dropna()  # non-null values, reused by several checks
    is_id_column = col in id_columns

    print(f"\n{'='*60}")
    print(f"COLUMN: {col}")
    print(f"{'='*60}")

    # Basic info
    non_null_count = column.count()
    null_count = column.isnull().sum()
    print(f"Data type: {column.dtype}")
    print(f"Non-null count: {non_null_count} / {total_rows} ({non_null_count/total_rows*100:.1f}%)")
    print(f"Null count: {null_count} ({null_count/total_rows*100:.1f}%)")

    # Mark ID columns
    if is_id_column:
        print("üîë ID COLUMN - Excluded from numerical analysis")

    # Type-specific analysis. The dtype predicates from pandas.api.types are
    # used instead of comparing against literal names such as 'int64', so
    # int32/uint8, nullable Int64/Float64/boolean and pandas string dtypes are
    # analysed too instead of being skipped silently.
    if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
        print(f"Unique values: {column.nunique()}")
        print(f"Sample values:")
        sample_values = present.head(10).tolist()
        for i, val in enumerate(sample_values):
            val_str = str(val)
            if len(val_str) > 100:
                val_str = val_str[:100] + "..."
            print(f"  [{i+1}] {val_str}")

        # Check for list-like strings; only the first 100 non-null values are
        # inspected, so slice before converting rather than after.
        if present.head(100).astype(str).str.startswith('[').any():
            print("  ‚ö†Ô∏è  Contains list-like strings - may need parsing")

        # Value length distribution for string columns
        lengths = present.astype(str).str.len()
        print(f"String length stats: min={lengths.min()}, max={lengths.max()}, mean={lengths.mean():.1f}")

    elif pd.api.types.is_bool_dtype(column):
        # Tested before the numeric branch because pandas treats bool as numeric
        print(f"Boolean distribution:")
        vc = column.value_counts()
        for val, count in vc.items():
            print(f"  {val}: {count} ({count/total_rows*100:.1f}%)")

    elif pd.api.types.is_numeric_dtype(column):
        if is_id_column:
            # Identifiers are numbers only by encoding; summary stats are meaningless
            print(f"üîë ID COLUMN - Basic stats skipped")
            unique_count = column.nunique()
            print(f"Unique values: {unique_count}")
        else:
            print(f"üìä NUMERICAL COLUMN - Valid for analysis")
            print(f"Basic stats:")
            stats = column.describe()
            for stat_name, stat_val in stats.items():
                print(f"  {stat_name}: {stat_val}")

            # Check for potential categorical numeric columns
            unique_count = column.nunique()
            if unique_count <= 20:
                print(f"Value counts (low cardinality - {unique_count} unique values):")
                vc = column.value_counts().head(10)
                for val, count in vc.items():
                    print(f"  {val}: {count} ({count/total_rows*100:.1f}%)")

### 📊 Sample Data Preview

# Show the first rows of the cleaned frame (head() defaults to five rows)
main_final.head()

INFO:__main__:Loaded main final dataset: 53349 books
INFO:__main__:Dropped columns: []
INFO:__main__:Replaced NaN values in series_works_count_numeric with 'stand_alone'


Dataset shape after dropping columns: (53349, 19)

Remaining column names:
['work_id', 'book_id_list_en', 'title', 'publication_year', 'num_pages_median', 'description', 'language_codes_en', 'author_id', 'author_name', 'author_average_rating', 'author_ratings_count', 'series_id', 'series_title', 'ratings_count_sum', 'text_reviews_count_sum', 'average_rating_weighted_mean', 'genres_str', 'shelves_str', 'series_works_count_numeric']

Data types:
work_id                           int64
book_id_list_en                  object
title                            object
publication_year                  int64
num_pages_median                float64
description                      object
language_codes_en                object
author_id                         int64
author_name                      object
author_average_rating           float64
author_ratings_count              int64
series_id                        object
series_title                     object
ratings_count_sum               

Unnamed: 0,work_id,book_id_list_en,title,publication_year,num_pages_median,description,language_codes_en,author_id,author_name,author_average_rating,author_ratings_count,series_id,series_title,ratings_count_sum,text_reviews_count_sum,average_rating_weighted_mean,genres_str,shelves_str,series_works_count_numeric
0,3237433,"['9416', '227650', '9423', '6088685', '1982627...",Confessions of a Shopaholic,2000,320.0,Unabridged audible download; approximately 11 ...,eng,6160,Sophie Kinsella,3.74,2169284,165735.0,Shopaholic,555675,10488,3.62,"fiction,romance,young adult","3-stars,5-stars,abandoned,adult-fiction,audio,...",12.0
1,1268663,"['3462', '6338758', '289110', '6386960', '1778...",The Rescue,2000,372.0,When confronted by raging fires or deadly acci...,eng,2345,Nicholas Sparks,4.06,4600277,stand_alone,stand_alone,148062,3150,4.1,"fiction,mystery,romance,young adult","2000,2001,2012-reads,adult,adult-fiction,alrea...",stand_alone
2,846763,"['110391', '6077588', '25322247', '1859059', '...",The Duke and I,2000,371.0,Can there be any greater challenge to London's...,eng,63898,Julia Quinn,3.98,567004,153045.0,Bridgertons,61848,2444,4.11,"biography,fiction,historical fiction,history,r...","19th-century,1st-in-series,2012-reads,2016-rea...",19.0
3,3363,"['861326', '6077587', '25322244', '353066', '9...",The Viscount Who Loved Me,2000,381.0,Alternate cover for ISBN: 0380815575/978038081...,eng,63898,Julia Quinn,3.98,567004,144491.0,Bridgertons,38086,1404,4.19,"biography,fiction,historical fiction,history,r...","1,19th-century,2016-reads,3-stars,4-stars,5-st...",19.0
4,2363,"['22649', '22655', '31107', '6560878', '257668...",Bookends,2000,368.0,On the heels of her national bestsellers Jemim...,eng,12915,Jane Green,3.58,502125,stand_alone,stand_alone,34139,842,3.7,"fiction,romance","2002,2003,2004,2005,2006,5-stars,abandoned,adu...",stand_alone


In [None]:
! pip install rapidfuzz
! pip install Unidecode

In [None]:
# =============================================================================
# CELL: UNIVERSAL STRING CANONICALIZATION (v1.1)
# =============================================================================
import re, time, json, unicodedata
from collections import defaultdict
from pathlib import Path
import pandas as pd
from rapidfuzz import fuzz  # pip install rapidfuzz
from unidecode import unidecode  # pip install Unidecode

# Timestamped banner so this cell's output is easy to locate in the notebook
print(f"\n[{time.strftime('%H:%M:%S')}] üîß CELL 2: UNIVERSAL STRING CANONICALIZATION (v1.1)")
print("=" * 70)

# Feature switches and length bounds for tag canonicalization.
# min_token_length / max_token_length bound the character length of the whole
# canonical tag (see the length check in canonicalize_tag), not of its words.
CANONICAL_CONFIG = {
    'normalize_case': True,
    'remove_extra_whitespace': True,
    'standardize_separators': True,
    'fold_accents': True,
    'map_variants': True,        # unify common aliases
    'min_token_length': 1,
    'max_token_length': 100
}
print("üìã CANONICALIZATION CONFIG:", CANONICAL_CONFIG)

# House-style replacements (extend as needed)
# Maps a variant spelling (written in lower case) to the preferred form;
# consumed by _normalize when 'map_variants' is enabled.
ALIASES = {
    "&": " and ",
    "ya": "young adult",
    "sci fi": "scifi",
    "sci-fi": "scifi",
    "favourite": "favorite",
    "rom com": "romcom",
}
# Star ratings such as "5 stars", "4-star" or "3stars"; group 1 captures the digit 1-5.
RATING_RE = re.compile(r"\b([1-5])\s*[- ]?\s*star(s)?\b")
# Four-digit years beginning with 19 or 20, optionally followed by "-reads".
# NOTE(review): not referenced by the code visible in this cell - confirm it is used later.
YEARLIST_RE = re.compile(r"\b(19|20)\d{2}(-reads)?\b")
# Shelf names that look like reading status / media format rather than content.
# NOTE(review): neither set is referenced by the code visible in this cell - confirm later use.
STATUS_SET = {"to read","to-read","currently reading","currently-reading","did not finish","dnf","abandoned"}
FORMAT_SET = {"audiobook","audio book","audio","audible","ebook","e-book","kindle","paperback","hardcover"}

def _normalize(s: str) -> str:
    """Return a normalized form of a raw tag string.

    Steps, each gated by its ``CANONICAL_CONFIG`` flag: ASCII-fold accents,
    lower-case, turn runs of ``-``, ``_`` and ``/`` into a single space, apply
    the ``ALIASES`` replacements, and rewrite star ratings as ``rating-N``.

    Aliases are matched as whole tokens rather than raw substrings, so the
    ``"ya"`` alias no longer rewrites the middle of words such as "royalty".
    A trailing plural "s" is still accepted ("favourites" -> "favorites").

    Args:
        s: Raw tag text. Any non-string value yields an empty string.

    Returns:
        The normalized tag with surrounding whitespace removed; may be empty.
    """
    if not isinstance(s, str):
        return ""
    x = s.strip()
    if CANONICAL_CONFIG['fold_accents']:
        x = unidecode(x)  # ASCII fold (café -> cafe)
    if CANONICAL_CONFIG['normalize_case']:
        x = x.lower()
    if CANONICAL_CONFIG['standardize_separators']:
        x = re.sub(r"[-_/]+", " ", x)
        x = re.sub(r"\s+", " ", x)
    if CANONICAL_CONFIG['map_variants']:
        # light-weight replacements
        for alias, replacement in ALIASES.items():
            if not alias:
                continue  # an empty key would match between every character
            # Anchor only the edges that are alphanumeric: symbol aliases such
            # as "&" must keep matching inside "r&b", while word aliases must
            # not fire in the middle of a longer word.
            left = r"(?<![A-Za-z0-9])" if alias[0].isalnum() else ""
            # The right edge tolerates one plural "s" before the token ends.
            right = r"(?=s?(?![A-Za-z0-9]))" if alias[-1].isalnum() else ""
            # A callable replacement keeps backslashes in the alias value literal.
            x = re.sub(left + re.escape(alias) + right, lambda _m, _r=replacement: _r, x)
        x = RATING_RE.sub(r"rating-\1", x)  # 5 stars -> rating-5
    if CANONICAL_CONFIG.get('remove_extra_whitespace', True):
        # Aliases like "&" -> " and " can introduce double spaces; collapse them
        # so "a & b" and "a and b" end up as the same canonical string.
        x = re.sub(r"\s+", " ", x)
    return x.strip()

def canonicalize_tag(tag: str) -> str:
    """Return the canonical form of a tag, or "" if it should be discarded.

    The tag is normalized with ``_normalize``, the plural "romances" is
    singularized, and the result is rejected when its character length falls
    outside ``min_token_length``..``max_token_length`` from ``CANONICAL_CONFIG``.

    Args:
        tag: Raw genre or shelf name.

    Returns:
        The canonical tag, or an empty string when the input normalizes to
        nothing or violates the length bounds.
    """
    x = _normalize(tag)
    if not x:
        return ""
    # Singularize "romances". This word-level rule already covers multi-word
    # tags such as "historical romances", so no separate phrase rule is needed
    # (one placed after this line could never match).
    x = re.sub(r"\bromances\b", "romance", x)
    # keep within length constraints
    min_len = CANONICAL_CONFIG['min_token_length']
    max_len = CANONICAL_CONFIG['max_token_length']
    if not (min_len <= len(x) <= max_len):
        return ""
    return x

# --- Vectorized extraction of shelves/genres ---
def _split_comma_series(series: pd.Series) -> pd.Series:
    return series.fillna("").astype(str).str.split(",").explode().str.strip().replace("", pd.NA).dropna()

# One raw tag per row for each of the two comma-joined columns
genres_series = _split_comma_series(main_final['genres_str'])
shelves_series = _split_comma_series(main_final['shelves_str'])

# Distinct raw tags; canonicalization runs once per distinct value
unique_genres = pd.Index(genres_series.unique())
unique_shelves = pd.Index(shelves_series.unique())

# Raw tag (index) -> canonical tag (value)
canon_genres = pd.Series(dict(zip(unique_genres, map(canonicalize_tag, unique_genres))))
canon_shelves = pd.Series(dict(zip(unique_shelves, map(canonicalize_tag, unique_shelves))))

# Drop empties and compute compression
canon_genres = canon_genres[canon_genres != ""]
canon_shelves = canon_shelves[canon_shelves != ""]
unique_canonical_genres = pd.Index(canon_genres.unique())
unique_canonical_shelves = pd.Index(canon_shelves.unique())

# Ratio of canonical to raw vocabulary size; 1.0 when there were no raw tags
if len(unique_genres):
    genre_compression_ratio = len(unique_canonical_genres) / len(unique_genres)
else:
    genre_compression_ratio = 1.0
if len(unique_shelves):
    shelf_compression_ratio = len(unique_canonical_shelves) / len(unique_shelves)
else:
    shelf_compression_ratio = 1.0

print("\nüìä CANONICALIZATION RESULTS")
print("-"*40)
print(f"Genres:   original={len(unique_genres):,}  canonical={len(unique_canonical_genres):,}  compression={genre_compression_ratio:.3f}")
print(f"Shelves:  original={len(unique_shelves):,} canonical={len(unique_canonical_shelves):,} compression={shelf_compression_ratio:.3f}")

# Persist mappings
# NOTE(review): this is a relative path, so it is created under whatever the
# current working directory is when the cell runs - confirm that is intended.
outputs_dir = Path("romance-novel-nlp-research/src/eda_analysis/outputs")
outputs_dir.mkdir(parents=True, exist_ok=True)

# Two-column CSVs: raw tag alongside its canonical form
genre_mapping_frame = pd.DataFrame({'original': canon_genres.index, 'canonical': canon_genres.values})
genre_mapping_frame.to_csv(outputs_dir / "genre_canonical_mappings.csv", index=False)
shelf_mapping_frame = pd.DataFrame({'original': canon_shelves.index, 'canonical': canon_shelves.values})
shelf_mapping_frame.to_csv(outputs_dir / "shelf_canonical_mappings.csv", index=False)

# Per-vocabulary statistics, cast to built-in types so they serialize as JSON
genre_stats = {
    'original_count': int(len(unique_genres)),
    'canonical_count': int(len(unique_canonical_genres)),
    'compression_ratio': float(genre_compression_ratio),
    'duplicates_eliminated': int(len(unique_genres) - len(unique_canonical_genres)),
}
shelf_stats = {
    'original_count': int(len(unique_shelves)),
    'canonical_count': int(len(unique_canonical_shelves)),
    'compression_ratio': float(shelf_compression_ratio),
    'duplicates_eliminated': int(len(unique_shelves) - len(unique_canonical_shelves)),
}

# Run metadata: when it ran, with which settings, and what it achieved
meta = {
    'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
    'config': CANONICAL_CONFIG,
    'stats': {'genres': genre_stats, 'shelves': shelf_stats},
}
metadata_path = outputs_dir / "canonicalization_metadata.json"
with open(metadata_path, "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2, ensure_ascii=False)

print(f"‚úÖ Saved mappings + metadata in {outputs_dir}")