## 📊 Data Loading and Initial Setup

This section handles the loading of the main dataset and performs initial data exploration to understand the structure and content of our romance books data.

### What this section does:
- Loads the main final dataset from CSV file
- Drops unnecessary columns to focus on core variables
- Performs detailed column-by-column analysis
- Identifies data types, missing values, and unique value counts
- Provides sample data for initial inspection

---

In [6]:
import pandas as pd
import numpy as np
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

### Dataset Loading and Column Management

# Load the processed dataset; the path is relative to the working directory
# the notebook is run from.
main_final_path = "../../data/processed/romance_books_main_final.csv"
main_final = pd.read_csv(main_final_path)
logger.info(f"Loaded main final dataset: {len(main_final)} books")

# Drop specified columns
columns_to_drop = ['series_works_count', 'popular_shelves', 'genres', 'decade', 
                   'book_length_category', 'rating_category', 'popularity_category', 
                   'has_collection_indicators']
# Work out which requested columns are actually present BEFORE dropping.
# Checking membership after the drop always yields an empty list, so the log
# line could never report what was removed.
dropped_columns = [col for col in columns_to_drop if col in main_final.columns]
# errors='ignore' keeps the cell re-runnable when some columns are already gone.
main_final = main_final.drop(columns=columns_to_drop, errors='ignore')
logger.info(f"Dropped columns: {dropped_columns}")

# Clean series_works_count_numeric: replace NaN with 'stand_alone'
# NOTE(review): this stores a string sentinel in a column whose name suggests
# numeric content - confirm downstream code expects mixed values.
main_final['series_works_count_numeric'] = main_final['series_works_count_numeric'].fillna('stand_alone')
logger.info(f"Replaced NaN values in series_works_count_numeric with 'stand_alone'")

### Basic Dataset Information

# Report the frame's dimensions, the surviving column names and the dtype of
# every column.  Each heading and its payload go out on consecutive lines.
print(f"Dataset shape after dropping columns: {main_final.shape}")
print(f"\nRemaining column names:", main_final.columns.tolist(), sep="\n")
print(f"\nData types:", main_final.dtypes, sep="\n")

### Detailed Column Investigation

# Identifier columns: they are reported, but never summarised as measurements.
id_columns = ['work_id', 'book_id_list_en', 'author_id', 'series_id']

for col in main_final.columns:
    series = main_final[col]
    n_rows = len(main_final)
    n_present = series.count()
    n_missing = series.isnull().sum()

    print(f"\n{'='*60}")
    print(f"COLUMN: {col}")
    print(f"{'='*60}")

    # Completeness summary
    print(f"Data type: {series.dtype}")
    print(f"Non-null count: {n_present} / {n_rows} ({n_present/n_rows*100:.1f}%)")
    print(f"Null count: {n_missing} ({n_missing/n_rows*100:.1f}%)")

    # Flag identifier columns up front, whatever their dtype
    if col in id_columns:
        print("üîë ID COLUMN - Excluded from numerical analysis")

    # Dispatch on dtype: text, numeric (measurement vs. identifier), boolean
    if series.dtype in ['object']:
        present_values = series.dropna()
        print(f"Unique values: {series.nunique()}")
        print(f"Sample values:")
        for position, raw in enumerate(present_values.head(10).tolist(), start=1):
            shown = str(raw)
            if len(shown) > 100:
                shown = shown[:100] + "..."
            print(f"  [{position}] {shown}")

        as_text = present_values.astype(str)

        # A leading '[' among the first 100 values hints at serialized lists
        if as_text.str.startswith('[').head(100).any():
            print("  ‚ö†Ô∏è  Contains list-like strings - may need parsing")

        # Distribution of string lengths
        lengths = as_text.str.len()
        print(f"String length stats: min={lengths.min()}, max={lengths.max()}, mean={lengths.mean():.1f}")

    elif series.dtype in ['int64', 'float64']:
        if col not in id_columns:
            print(f"üìä NUMERICAL COLUMN - Valid for analysis")
            print(f"Basic stats:")
            for stat_name, stat_val in series.describe().items():
                print(f"  {stat_name}: {stat_val}")

            # Few distinct values: probably a categorical code, show the counts
            unique_count = series.nunique()
            if unique_count <= 20:
                print(f"Value counts (low cardinality - {unique_count} unique values):")
                for val, count in series.value_counts().head(10).items():
                    print(f"  {val}: {count} ({count/len(main_final)*100:.1f}%)")
        else:
            print(f"üîë ID COLUMN - Basic stats skipped")
            unique_count = series.nunique()
            print(f"Unique values: {unique_count}")

    elif series.dtype in ['bool']:
        print(f"Boolean distribution:")
        for val, count in series.value_counts().items():
            print(f"  {val}: {count} ({count/len(main_final)*100:.1f}%)")

### Sample Data Preview

main_final.head()


INFO:__main__:Loaded main final dataset: 53349 books
INFO:__main__:Dropped columns: []
INFO:__main__:Replaced NaN values in series_works_count_numeric with 'stand_alone'


Dataset shape after dropping columns: (53349, 19)

Remaining column names:
['work_id', 'book_id_list_en', 'title', 'publication_year', 'num_pages_median', 'description', 'language_codes_en', 'author_id', 'author_name', 'author_average_rating', 'author_ratings_count', 'series_id', 'series_title', 'ratings_count_sum', 'text_reviews_count_sum', 'average_rating_weighted_mean', 'genres_str', 'shelves_str', 'series_works_count_numeric']

Data types:
work_id                           int64
book_id_list_en                  object
title                            object
publication_year                  int64
num_pages_median                float64
description                      object
language_codes_en                object
author_id                         int64
author_name                      object
author_average_rating           float64
author_ratings_count              int64
series_id                        object
series_title                     object
ratings_count_sum               

Unnamed: 0,work_id,book_id_list_en,title,publication_year,num_pages_median,description,language_codes_en,author_id,author_name,author_average_rating,author_ratings_count,series_id,series_title,ratings_count_sum,text_reviews_count_sum,average_rating_weighted_mean,genres_str,shelves_str,series_works_count_numeric
0,3237433,"['9416', '227650', '9423', '6088685', '1982627...",Confessions of a Shopaholic,2000,320.0,Unabridged audible download; approximately 11 ...,eng,6160,Sophie Kinsella,3.74,2169284,165735.0,Shopaholic,555675,10488,3.62,"fiction,romance,young adult","3-stars,5-stars,abandoned,adult-fiction,audio,...",12.0
1,1268663,"['3462', '6338758', '289110', '6386960', '1778...",The Rescue,2000,372.0,When confronted by raging fires or deadly acci...,eng,2345,Nicholas Sparks,4.06,4600277,stand_alone,stand_alone,148062,3150,4.1,"fiction,mystery,romance,young adult","2000,2001,2012-reads,adult,adult-fiction,alrea...",stand_alone
2,846763,"['110391', '6077588', '25322247', '1859059', '...",The Duke and I,2000,371.0,Can there be any greater challenge to London's...,eng,63898,Julia Quinn,3.98,567004,153045.0,Bridgertons,61848,2444,4.11,"biography,fiction,historical fiction,history,r...","19th-century,1st-in-series,2012-reads,2016-rea...",19.0
3,3363,"['861326', '6077587', '25322244', '353066', '9...",The Viscount Who Loved Me,2000,381.0,Alternate cover for ISBN: 0380815575/978038081...,eng,63898,Julia Quinn,3.98,567004,144491.0,Bridgertons,38086,1404,4.19,"biography,fiction,historical fiction,history,r...","1,19th-century,2016-reads,3-stars,4-stars,5-st...",19.0
4,2363,"['22649', '22655', '31107', '6560878', '257668...",Bookends,2000,368.0,On the heels of her national bestsellers Jemim...,eng,12915,Jane Green,3.58,502125,stand_alone,stand_alone,34139,842,3.7,"fiction,romance","2002,2003,2004,2005,2006,5-stars,abandoned,adu...",stand_alone


## 🔧 Universal String Canonicalization

This section performs comprehensive canonicalization of genre and shelf strings to create standardized, normalized versions for consistent analysis and comparison.

### What this section does:
- Applies consistent normalization rules to all genre and shelf strings
- Creates canonical mappings between original and normalized forms
- Handles case normalization, whitespace cleaning, and separator standardization
- Generates comprehensive statistics on transformation patterns
- Prepares clean, standardized data for downstream similarity analysis

In [7]:
# =============================================================================
# CELL 2: UNIVERSAL STRING CANONICALIZATION (v0)
# =============================================================================

import re
import time
from collections import defaultdict, Counter
import json
import os
from pathlib import Path

print(f"\n[{time.strftime('%H:%M:%S')}] üîß CELL 2: UNIVERSAL STRING CANONICALIZATION (v0)")
print("=" * 70)

### Configuration Setup

# Switches and length limits read by the canonicalization functions below.
CANONICAL_CONFIG = dict(
    normalize_case=True,
    remove_extra_whitespace=True,
    remove_special_chars=False,  # Keep for genre/shelf analysis
    standardize_separators=True,
    min_token_length=1,
    max_token_length=100,
)

print(f"üìã CANONICALIZATION CONFIG:")
for option in CANONICAL_CONFIG:
    print(f"  {option}: {CANONICAL_CONFIG[option]}")

### Genre Canonicalization

print(f"\nüìö GENRE CANONICALIZATION")
print("-" * 40)

def canonicalize_genre(genre_str):
    """
    Return the normalized form of one genre label.

    The steps applied are selected by the module-level CANONICAL_CONFIG
    switches: lower-casing, collapsing whitespace runs, and replacing runs of
    hyphens/underscores with a single space.

    Args:
        genre_str (str): Raw genre string

    Returns:
        str: Canonicalized genre string, or "" when the input is not a
        non-blank string or the normalized text falls outside the configured
        length bounds.
    """
    # Only non-blank strings have a canonical form.
    if not isinstance(genre_str, str):
        return ""
    if not genre_str.strip():
        return ""

    cfg = CANONICAL_CONFIG
    text = genre_str

    if cfg['normalize_case']:
        text = text.lower()

    if cfg['remove_extra_whitespace']:
        text = ' '.join(text.split())

    if cfg['standardize_separators']:
        # Hyphens/underscores become spaces; re-collapse any doubled spaces.
        text = re.sub(r'[-_]+', ' ', text)
        text = ' '.join(text.split())

    within_bounds = cfg['min_token_length'] <= len(text) <= cfg['max_token_length']
    return text.strip() if within_bounds else ""

# Apply canonicalization to unique genres
print(f"\nüîß PROCESSING GENRES:")
print(f"Canonicalizing genres from main_final dataset...")

# Extract unique genres from the dataset.
# Only the 'genres_str' column is needed, so walk that column directly rather
# than building a Series for every row with iterrows().  A missing column
# yields no genres, and cells that are not strings (NaN or anything else) are
# skipped instead of raising on .strip().
unique_genres = set()
genres_column = main_final['genres_str'] if 'genres_str' in main_final.columns else ()
for genres_cell in genres_column:
    if isinstance(genres_cell, str) and genres_cell.strip():
        unique_genres.update(g.strip() for g in genres_cell.split(',') if g.strip())

print(f"Found {len(unique_genres):,} unique genres")

# original label -> canonical label
canonical_genres = {}
# 'changed' holds (original, canonical) tuples, 'unchanged' holds originals
genre_mapping_stats = defaultdict(list)

for original_genre in unique_genres:
    canonical = canonicalize_genre(original_genre)
    canonical_genres[original_genre] = canonical
    
    # Track mapping for analysis: a label counts as changed only when
    # canonicalization did more than lower-casing.
    if canonical != original_genre.lower():
        genre_mapping_stats['changed'].append((original_genre, canonical))
    else:
        genre_mapping_stats['unchanged'].append(original_genre)

print(f"  ‚úÖ Processed {len(canonical_genres):,} genres")

### Shelf Canonicalization

print(f"\nüìö SHELF CANONICALIZATION")
print("-" * 40)

def canonicalize_shelf(shelf_str):
    """
    Return the normalized form of one shelf name.

    Depending on the CANONICAL_CONFIG switches the name is lower-cased, has
    its whitespace runs collapsed, and has runs of hyphens/underscores
    replaced by single spaces.

    Args:
        shelf_str (str): Raw shelf string

    Returns:
        str: Canonicalized shelf string; "" for non-string or blank input and
        for results shorter or longer than the configured token lengths.
    """
    is_usable = isinstance(shelf_str, str) and bool(shelf_str.strip())
    if not is_usable:
        return ""

    normalized = shelf_str

    # Case folding
    if CANONICAL_CONFIG['normalize_case']:
        normalized = normalized.lower()

    # Whitespace collapsing
    if CANONICAL_CONFIG['remove_extra_whitespace']:
        normalized = ' '.join(normalized.split())

    # Separator unification, followed by a second whitespace collapse
    if CANONICAL_CONFIG['standardize_separators']:
        normalized = ' '.join(re.sub(r'[-_]+', ' ', normalized).split())

    # Reject results outside the allowed length window
    length = len(normalized)
    if length < CANONICAL_CONFIG['min_token_length'] or length > CANONICAL_CONFIG['max_token_length']:
        return ""

    return normalized.strip()

# Apply canonicalization to unique shelves
print(f"\nüîß PROCESSING SHELVES:")
print(f"Canonicalizing shelves from main_final dataset...")

# Extract unique shelves from the dataset.
# Only the 'shelves_str' column is needed, so walk that column directly rather
# than building a Series for every row with iterrows().  A missing column
# yields no shelves, and cells that are not strings (NaN or anything else) are
# skipped instead of raising on .strip().
unique_shelves = set()
shelves_column = main_final['shelves_str'] if 'shelves_str' in main_final.columns else ()
for shelves_cell in shelves_column:
    if isinstance(shelves_cell, str) and shelves_cell.strip():
        unique_shelves.update(s.strip() for s in shelves_cell.split(',') if s.strip())

print(f"Found {len(unique_shelves):,} unique shelves")

# original shelf name -> canonical shelf name
canonical_shelves = {}
# 'changed' holds (original, canonical) tuples, 'unchanged' holds originals
shelf_mapping_stats = defaultdict(list)

for original_shelf in unique_shelves:
    canonical = canonicalize_shelf(original_shelf)
    canonical_shelves[original_shelf] = canonical
    
    # Track mapping for analysis: a name counts as changed only when
    # canonicalization did more than lower-casing.
    if canonical != original_shelf.lower():
        shelf_mapping_stats['changed'].append((original_shelf, canonical))
    else:
        shelf_mapping_stats['unchanged'].append(original_shelf)

print(f"  ‚úÖ Processed {len(canonical_shelves):,} shelves")

### Canonicalization Results

print(f"\nüìä CANONICALIZATION RESULTS:")
print("-" * 40)

# Distinct canonical forms produced for each vocabulary
unique_canonical_genres = set(canonical_genres.values())
unique_canonical_shelves = set(canonical_shelves.values())

# Compression ratio = canonical count / original count (0 for an empty vocabulary)
if unique_genres:
    genre_compression_ratio = len(unique_canonical_genres) / len(unique_genres)
else:
    genre_compression_ratio = 0

if unique_shelves:
    shelf_compression_ratio = len(unique_canonical_shelves) / len(unique_shelves)
else:
    shelf_compression_ratio = 0

# Same five-line report for both vocabularies
for heading, originals, canonicals, ratio, mapping_stats in (
    (f"üìö GENRES:", unique_genres, unique_canonical_genres,
     genre_compression_ratio, genre_mapping_stats),
    (f"\nüìö SHELVES:", unique_shelves, unique_canonical_shelves,
     shelf_compression_ratio, shelf_mapping_stats),
):
    print(heading)
    print(f"  Original count: {len(originals):,}")
    print(f"  Canonical count: {len(canonicals):,}")
    print(f"  Compression ratio: {ratio:.3f}")
    print(f"  Changes made: {len(mapping_stats['changed']):,}")
    print(f"  Unchanged: {len(mapping_stats['unchanged']):,}")

### Save Canonical Mappings

print(f"\nüíæ SAVING CANONICAL MAPPINGS:")
print("-" * 40)

# Make sure the output folder exists (path is relative to the working directory)
outputs_dir = Path("romance-novel-nlp-research/src/eda_analysis/outputs")
outputs_dir.mkdir(parents=True, exist_ok=True)

# Genre mappings: one row per original label
genre_rows = []
for orig, canon in canonical_genres.items():
    genre_rows.append({'original': orig, 'canonical': canon})
genre_mappings_df = pd.DataFrame(genre_rows)
genre_mappings_path = outputs_dir / "genre_canonical_mappings.csv"
genre_mappings_df.to_csv(genre_mappings_path, index=False)
print(f"  ‚úÖ Saved genre mappings to: {genre_mappings_path}")

# Shelf mappings: one row per original shelf name
shelf_rows = []
for orig, canon in canonical_shelves.items():
    shelf_rows.append({'original': orig, 'canonical': canon})
shelf_mappings_df = pd.DataFrame(shelf_rows)
shelf_mappings_path = outputs_dir / "shelf_canonical_mappings.csv"
shelf_mappings_df.to_csv(shelf_mappings_path, index=False)
print(f"  ‚úÖ Saved shelf mappings to: {shelf_mappings_path}")

# Run metadata: when it ran, with which settings, and the headline counts
genre_stats = {
    'original_count': len(unique_genres),
    'canonical_count': len(unique_canonical_genres),
    'compression_ratio': genre_compression_ratio,
    'changes_count': len(genre_mapping_stats['changed']),
    'duplicates_eliminated': len(unique_genres) - len(unique_canonical_genres)
}
shelf_stats = {
    'original_count': len(unique_shelves),
    'canonical_count': len(unique_canonical_shelves),
    'compression_ratio': shelf_compression_ratio,
    'changes_count': len(shelf_mapping_stats['changed']),
    'duplicates_eliminated': len(unique_shelves) - len(unique_canonical_shelves)
}
canonical_meta = {
    'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
    'config': CANONICAL_CONFIG,
    'stats': {'genres': genre_stats, 'shelves': shelf_stats}
}

metadata_path = outputs_dir / "canonicalization_metadata.json"
with metadata_path.open('w', encoding='utf-8') as f:
    json.dump(canonical_meta, f, indent=2, ensure_ascii=False)
print(f"  ‚úÖ Saved metadata to: {metadata_path}")

print(f"\n[{time.strftime('%H:%M:%S')}] ‚úÖ Cell 2: Universal String Canonicalization completed successfully!")



[21:02:34] üîß CELL 2: UNIVERSAL STRING CANONICALIZATION (v0)
üìã CANONICALIZATION CONFIG:
  normalize_case: True
  remove_extra_whitespace: True
  remove_special_chars: False
  standardize_separators: True
  min_token_length: 1
  max_token_length: 100

üìö GENRE CANONICALIZATION
----------------------------------------

üîß PROCESSING GENRES:
Canonicalizing genres from main_final dataset...
Found 13 unique genres
  ‚úÖ Processed 13 genres

üìö SHELF CANONICALIZATION
----------------------------------------

üîß PROCESSING SHELVES:
Canonicalizing shelves from main_final dataset...
Found 255,664 unique shelves
  ‚úÖ Processed 255,664 shelves

üìä CANONICALIZATION RESULTS:
----------------------------------------
üìö GENRES:
  Original count: 13
  Canonical count: 13
  Compression ratio: 1.000
  Changes made: 0
  Unchanged: 13

üìö SHELVES:
  Original count: 255,664
  Canonical count: 254,778
  Compression ratio: 0.997
  Changes made: 230,934
  Unchanged: 24,730

üíæ SAVING CA

## 🔍 Character Similarity Index & Neighbor Retrieval

This section builds a comprehensive character-based similarity index using TF-IDF vectorization and approximate nearest neighbor (ANN) search to identify potential duplicate shelf names.

### What this section does:
- Creates TF-IDF vectors from canonical shelf tokens using character n-grams
- Builds an approximate nearest neighbor index for efficient similarity search
- Retrieves candidate similar pairs based on cosine similarity thresholds
- Generates comprehensive statistics on similarity patterns and coverage
- Exports sample data for manual validation and quality assessment

In [None]:
# =============================================================================
# CELL 3: CHARACTER SIMILARITY INDEX & NEIGHBOR RETRIEVAL
# =============================================================================

import pandas as pd
import numpy as np
import json
import time
import os
import pickle
import hashlib
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

print(f"\n[{time.strftime('%H:%M:%S')}] üîç Starting Cell 3: Character Similarity Index & Neighbor Retrieval")
print("=" * 80)

### Configuration Setup

# TF-IDF / neighbour-search settings
NGRAM_RANGE = (2, 4)          # character n-gram sizes, inclusive
MIN_DF = 2                    # an n-gram must occur in at least this many tokens
MAX_FEATURES = 10_000         # cap on the n-gram vocabulary size
SIMILARITY_THRESHOLD = 0.3    # lowest cosine similarity kept as a candidate pair
TOP_K_NEIGHBORS = 50          # neighbours requested per token

# Echo the settings in a single write, one setting per line
print("\n".join([
    f"üìã TF-IDF CONFIGURATION:",
    f"  N-gram range: {NGRAM_RANGE}",
    f"  Min document frequency: {MIN_DF}",
    f"  Max features: {MAX_FEATURES}",
    f"  Similarity threshold: {SIMILARITY_THRESHOLD}",
    f"  Top-K neighbors: {TOP_K_NEIGHBORS}",
]))

### Token Vectorization

print(f"\nüî§ TOKEN VECTORIZATION:")
print("-" * 30)

# Get canonical tokens from previous cell.
# * sorted(): iterating a set of strings gives an order that can change from
#   one interpreter run to the next (string hashing is randomized), which
#   would make the matrix rows, the saved pairs and the seeded sample differ
#   between runs.  Sorting pins the row order.
# * `if token`: the canonicalizers return "" for rejected input; an empty
#   string contains no character n-grams, so it is not a usable token.
canonical_tokens = sorted(token for token in unique_canonical_shelves if token)
print(f"  üìä Total canonical tokens: {len(canonical_tokens):,}")

# Initialize TF-IDF vectorizer over character n-grams
vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=NGRAM_RANGE,
    min_df=MIN_DF,
    max_features=MAX_FEATURES,
    lowercase=False,  # Already canonicalized
    token_pattern=None  # Use character n-grams
)

# Fit and transform, timing the step for the run metadata
vectorize_start = time.time()
tfidf_matrix = vectorizer.fit_transform(canonical_tokens)
vectorize_time = time.time() - vectorize_start

print(f"  ‚úÖ TF-IDF fitted: {tfidf_matrix.shape[0]:,} tokens √ó {tfidf_matrix.shape[1]:,} features")
print(f"  ‚è±Ô∏è  Vectorization time: {vectorize_time:.2f} seconds")

### Build ANN Index & Retrieve Neighbors

print(f"\nüîç BUILDING ANN INDEX & RETRIEVING NEIGHBORS:")
print("-" * 55)

ann_start = time.time()

# Request one neighbour more than TOP_K_NEIGHBORS because every token is
# queried against an index that contains the token itself; never ask for more
# neighbours than there are tokens.
# NOTE(review): scikit-learn's NearestNeighbors is documented as an exact
# search; the "ANN" wording is kept only to match the printed messages.
neighbor_count = min(TOP_K_NEIGHBORS + 1, len(canonical_tokens))
nbrs = NearestNeighbors(
    n_neighbors=neighbor_count,
    algorithm='auto',
    metric='cosine'
).fit(tfidf_matrix)

# Cosine distances and row indices of each token's nearest neighbours
distances, indices = nbrs.kneighbors(tfidf_matrix)

ann_time = time.time() - ann_start
print(f"  ‚úÖ ANN index built and neighbors retrieved")
print(f"  ‚è±Ô∏è  ANN time: {ann_time:.2f} seconds")

### Generate Candidate Pairs

print(f"\nüìä GENERATING CANDIDATE PAIRS:")
print("-" * 40)

# One dict per (token, neighbour) pair whose similarity reaches the threshold
candidate_pairs = []
total_pairs_checked = 0

for i, token in enumerate(canonical_tokens):
    # Remove the token itself by row index.  Assuming it is always the first
    # hit is unsafe: tokens whose vectors are identical (or all zero) tie on
    # distance, and the order among tied results is not guaranteed, so the
    # positional skip can discard a real neighbour and keep a self-pair.
    # The slice keeps at most (returned - 1) neighbours, i.e. the same number
    # the positional skip produced.
    keep = len(distances[i]) - 1
    neighbors = [
        (dist, neighbor_idx)
        for dist, neighbor_idx in zip(distances[i], indices[i])
        if neighbor_idx != i
    ][:keep]

    for rank, (dist, neighbor_idx) in enumerate(neighbors, start=1):
        total_pairs_checked += 1

        # Convert distance to similarity (cosine distance = 1 - cosine similarity)
        similarity = 1 - dist

        if similarity >= SIMILARITY_THRESHOLD:
            candidate_pairs.append({
                'token_a': token,
                'token_b': canonical_tokens[neighbor_idx],
                'cosine_sim': similarity,
                'rank': rank
            })

# Guard the percentage: with a single token no pairs are checked at all
hit_rate = len(candidate_pairs) / total_pairs_checked * 100 if total_pairs_checked else 0.0

print(f"  üìä Total pairs checked: {total_pairs_checked:,}")
print(f"  üìä Candidate pairs found: {len(candidate_pairs):,}")
print(f"  üìä Hit rate: {hit_rate:.2f}%")

### Calculate Additional Metrics

print(f"\nüìà CALCULATING ADDITIONAL METRICS:")
print("-" * 40)

# Calculate edit distances and other metrics
from difflib import SequenceMatcher

for pair in candidate_pairs:
    token_a = pair['token_a']
    token_b = pair['token_b']

    # Edit distance (Levenshtein distance approximation): characters that are
    # not part of any matching block must be deleted from one token or
    # inserted into the other.  Every matching block has to be counted -
    # using only the first block ignores all later matches and inflates the
    # distance.  (The final block returned is a zero-size terminator, so
    # including it in the sum is harmless.)
    matched_chars = sum(
        block.size
        for block in SequenceMatcher(None, token_a, token_b).get_matching_blocks()
    )
    pair['edit_distance'] = len(token_a) + len(token_b) - 2 * matched_chars
    
    # Length difference
    pair['len_a'] = len(token_a)
    pair['len_b'] = len(token_b)
    pair['len_diff'] = abs(pair['len_a'] - pair['len_b'])
    
    # Document frequency ratio (if available)
    # This would require the original document frequencies from the canonicalization step
    pair['df_ratio'] = 1.0  # Placeholder

print(f"  ‚úÖ Additional metrics calculated for {len(candidate_pairs):,} pairs")

### Save Results

print(f"\nüíæ SAVING RESULTS:")
print("-" * 30)

# Create outputs directory (path is relative to the working directory)
outputs_dir = Path("romance-novel-nlp-research/src/eda_analysis/outputs")
outputs_dir.mkdir(parents=True, exist_ok=True)

# Save candidate pairs
candidate_pairs_df = pd.DataFrame(candidate_pairs)
candidate_pairs_path = outputs_dir / "candidate_similarity_pairs.csv"
candidate_pairs_df.to_csv(candidate_pairs_path, index=False)
print(f"  ‚úÖ Saved {len(candidate_pairs):,} candidate pairs to: {candidate_pairs_path}")

# Save a sample for inspection (fixed seed so the same rows are drawn each run)
sample_size = min(5000, len(candidate_pairs))
sample_pairs = candidate_pairs_df.sample(n=sample_size, random_state=42)
sample_path = outputs_dir / "similarity_sample_inspection.csv"
sample_pairs.to_csv(sample_path, index=False)
print(f"  ‚úÖ Saved {sample_size:,} sample pairs to: {sample_path}")

# Save vectorizer
vectorizer_path = outputs_dir / "tfidf_vectorizer.pkl"
with open(vectorizer_path, 'wb') as f:
    pickle.dump(vectorizer, f)
print(f"  ‚úÖ Saved vectorizer to: {vectorizer_path}")

# Percentage of checked pairs that became candidates.  Guarded so that a run
# in which no pairs were checked records 0 instead of raising
# ZeroDivisionError while the metadata is being assembled.
hit_rate = len(candidate_pairs) / total_pairs_checked * 100 if total_pairs_checked else 0.0

# Save metadata
cell3_meta = {
    'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
    'config': {
        'ngram_range': NGRAM_RANGE,
        'min_df': MIN_DF,
        'max_features': MAX_FEATURES,
        'similarity_threshold': SIMILARITY_THRESHOLD,
        'top_k_neighbors': TOP_K_NEIGHBORS
    },
    'stats': {
        'total_tokens': len(canonical_tokens),
        'total_pairs_checked': total_pairs_checked,
        'candidate_pairs_found': len(candidate_pairs),
        'hit_rate': hit_rate,
        'vectorization_time': vectorize_time,
        'ann_time': ann_time
    },
    'outputs': {
        'candidate_pairs_file': str(candidate_pairs_path),
        'sample_file': str(sample_path),
        'vectorizer_file': str(vectorizer_path)
    }
}

metadata_path = outputs_dir / "cell3_similarity_metadata.json"
with open(metadata_path, 'w', encoding='utf-8') as f:
    json.dump(cell3_meta, f, indent=2, ensure_ascii=False)
print(f"  ‚úÖ Saved metadata to: {metadata_path}")

### Summary Statistics

print(f"\nüìä SUMMARY STATISTICS:")
print("-" * 30)

# Nothing to summarise when no candidate pair was produced
if candidate_pairs:
    similarities = []
    edit_distances = []
    for pair in candidate_pairs:
        similarities.append(pair['cosine_sim'])
        edit_distances.append(pair['edit_distance'])

    lowest_sim, highest_sim = min(similarities), max(similarities)
    lowest_edit, highest_edit = min(edit_distances), max(edit_distances)

    print(f"  Cosine similarity range: {lowest_sim:.3f} - {highest_sim:.3f}")
    print(f"  Cosine similarity mean: {np.mean(similarities):.3f}")
    print(f"  Edit distance range: {lowest_edit} - {highest_edit}")
    print(f"  Edit distance mean: {np.mean(edit_distances):.1f}")

    # Coverage: share of tokens that appear on either side of some pair
    unique_tokens_in_pairs = {
        pair[side]
        for pair in candidate_pairs
        for side in ('token_a', 'token_b')
    }

    coverage = len(unique_tokens_in_pairs) / len(canonical_tokens) * 100
    print(f"  Token coverage: {len(unique_tokens_in_pairs):,} / {len(canonical_tokens):,} ({coverage:.1f}%)")

print(f"\n[{time.strftime('%H:%M:%S')}] ‚úÖ Cell 3: Character Similarity Index & Neighbor Retrieval completed successfully!")



[21:03:38] üîç Starting Cell 3: Character Similarity Index & Neighbor Retrieval
üìã TF-IDF CONFIGURATION:
  N-gram range: (2, 4)
  Min document frequency: 2
  Max features: 10000
  Similarity threshold: 0.3
  Top-K neighbors: 50

üî§ TOKEN VECTORIZATION:
------------------------------
  üìä Total canonical tokens: 254,778
  ‚úÖ TF-IDF fitted: 254,778 tokens √ó 10,000 features
  ‚è±Ô∏è  Vectorization time: 24.93 seconds

üîç BUILDING ANN INDEX & RETRIEVING NEIGHBORS:
-------------------------------------------------------


## 🔍 Cell 4: Neighbor Similarity Sample Inspection

This section loads and thoroughly inspects the neighbor similarity sample CSV file to validate the quality of our character similarity index and identify potential false positives in the candidate edges.

### What this section does:
- Loads the 5,000-row sample CSV file from Cell 3
- Validates the schema and data structure
- Performs comprehensive quality checks on similarity pairs
- Identifies edge cases and potential false positives
- Provides detailed logging for manual verification

In [None]:
# =============================================================================
# CELL 4: NEIGHBOR SIMILARITY SAMPLE INSPECTION
# =============================================================================

import pandas as pd
import numpy as np
import json
import time
from pathlib import Path
import logging

# Configure logging for this cell: INFO level, "time - level - message" records
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Announce the start of the cell with a wall-clock stamp, then a separator rule
cell4_start_stamp = time.strftime('%H:%M:%S')
print(f"\n[{cell4_start_stamp}] üîç Starting Cell 4: Neighbor Similarity Sample Inspection")
print("=" * 80)

### 📂 Load Sample Data

print(f"\nüìÇ LOADING SAMPLE DATA:")
print("-" * 30)

# Locate the pair sample that Cell 3 is expected to have written to the outputs directory
outputs_dir = Path("romance-novel-nlp-research/src/eda_analysis/outputs")
sample_path = outputs_dir.joinpath("similarity_sample_inspection.csv")

if sample_path.exists():
    # Read the sample and report its size and column names
    sample_df = pd.read_csv(sample_path)
    print(f"  ‚úÖ Loaded sample data: {len(sample_df):,} rows")
    print(f"  üìä Columns: {list(sample_df.columns)}")
else:
    # sample_df is not assigned on this path; the later sections of this cell
    # re-check sample_path.exists() before touching it.
    print(f"  ‚ùå Sample file not found: {sample_path}")
    print(f"  üìù Please run Cell 3 first to generate the sample data")

### 🔍 Data Quality Inspection

print(f"\nüîç DATA QUALITY INSPECTION:")
print("-" * 40)

if sample_path.exists():
    # Row count followed by a per-column tally of missing values
    inspected_row_count = len(sample_df)
    print(f"üìä BASIC VALIDATION:")
    print(f"  Total rows: {inspected_row_count:,}")
    print(f"  Null values per column:")
    for column_name in sample_df.columns:
        missing = sample_df[column_name].isna().sum()
        print(f"    {column_name}: {missing} ({missing/inspected_row_count*100:.1f}%)")

    # Column dtypes as inferred by pandas when the CSV was read
    print(f"\nüìä DATA TYPES:")
    print(sample_df.dtypes)

    # First ten rows for a quick visual sanity check
    print(f"\nüìä SAMPLE DATA PREVIEW:")
    preview_rows = sample_df.head(10)
    print(preview_rows)

### 📈 Quality Assessment

print(f"\nüìà QUALITY ASSESSMENT:")
print("-" * 30)

if sample_path.exists():
    # All percentages in this section are relative to the full sample size
    assessed_row_count = len(sample_df)

    # --- Cosine similarity: summary statistics and three coarse buckets ---
    print(f"üìä SIMILARITY DISTRIBUTION:")
    if 'cosine_sim' in sample_df.columns:
        similarities = sample_df['cosine_sim']
        print(f"  Cosine similarity range: {similarities.min():.3f} - {similarities.max():.3f}")
        print(f"  Cosine similarity mean: {similarities.mean():.3f}")
        print(f"  Cosine similarity std: {similarities.std():.3f}")

        # Buckets split at 0.5 and 0.8; NaN values satisfy none of the masks
        sim_at_least_half = similarities >= 0.5
        sim_at_least_high = similarities >= 0.8
        high_sim = sim_at_least_high.sum()
        med_sim = (sim_at_least_half & ~sim_at_least_high).sum()
        low_sim = (similarities < 0.5).sum()

        print(f"  High similarity (‚â•0.8): {high_sim:,} ({high_sim/assessed_row_count*100:.1f}%)")
        print(f"  Medium similarity (0.5-0.8): {med_sim:,} ({med_sim/assessed_row_count*100:.1f}%)")
        print(f"  Low similarity (<0.5): {low_sim:,} ({low_sim/assessed_row_count*100:.1f}%)")

    # --- Edit distance: summary statistics and three coarse buckets ---
    print(f"\nüìä EDIT DISTANCE DISTRIBUTION:")
    if 'edit_distance' in sample_df.columns:
        edit_distances = sample_df['edit_distance']
        print(f"  Edit distance range: {edit_distances.min()} - {edit_distances.max()}")
        print(f"  Edit distance mean: {edit_distances.mean():.1f}")
        print(f"  Edit distance std: {edit_distances.std():.1f}")

        # Buckets split at 2 and 5; NaN values satisfy none of the masks
        edit_within_two = edit_distances <= 2
        edit_within_five = edit_distances <= 5
        low_edit = edit_within_two.sum()
        med_edit = (edit_within_five & ~edit_within_two).sum()
        high_edit = (edit_distances > 5).sum()

        print(f"  Low edit distance (‚â§2): {low_edit:,} ({low_edit/assessed_row_count*100:.1f}%)")
        print(f"  Medium edit distance (3-5): {med_edit:,} ({med_edit/assessed_row_count*100:.1f}%)")
        print(f"  High edit distance (>5): {high_edit:,} ({high_edit/assessed_row_count*100:.1f}%)")

### 🔍 Manual Quality Review

print(f"\nüîç MANUAL QUALITY REVIEW:")
print("-" * 35)

if sample_path.exists():
    # Print the first ten pairs one field per line, each followed by a rough verdict
    print(f"üìã SAMPLE PAIRS FOR MANUAL REVIEW:")
    print(f"Showing first 10 pairs for quality assessment:")

    for review_no, (row_label, review_row) in enumerate(sample_df.head(10).iterrows(), start=1):
        review_cosine = review_row['cosine_sim']
        review_edit = review_row['edit_distance']

        print(f"\n[{review_no}] Row {row_label}:")
        print(f"  Token A: '{review_row['token_a']}' (len: {review_row['len_a']})")
        print(f"  Token B: '{review_row['token_b']}' (len: {review_row['len_b']})")
        print(f"  Cosine similarity: {review_cosine:.4f}")
        print(f"  Edit distance: {review_edit}")

        # Heuristic triage: close spelling plus decent similarity passes,
        # distant spelling plus weak similarity is flagged, the rest is left
        # for a human to decide.
        if review_edit <= 3 and review_cosine > 0.5:
            review_verdict = f"  ‚úÖ Looks like valid shelf variants"
        elif review_edit > 5 and review_cosine < 0.3:
            review_verdict = f"  ‚ö†Ô∏è  Potential false positive - low similarity"
        else:
            review_verdict = f"  üîç Manual review needed"
        print(review_verdict)

### ⚠️ Edge Case Analysis

print(f"\n‚ö†Ô∏è  EDGE CASE ANALYSIS:")
print("-" * 30)


def _print_edge_case_pairs(pairs, limit=5):
    """Print up to `limit` leading rows of `pairs` as numbered one-line pair summaries."""
    for rank, (_, pair) in enumerate(pairs.head(limit).iterrows(), start=1):
        print(f"  [{rank}] '{pair['token_a']}' ‚Üî '{pair['token_b']}' (cosine: {pair['cosine_sim']:.4f}, edit: {pair['edit_distance']})")


if sample_path.exists():
    # Edge case 1: pairs whose cosine similarity is below 0.2
    print(f"\nüîç EDGE CASE 1: Low Cosine Similarity (< 0.2)")
    print("-" * 50)
    low_cosine = sample_df[sample_df['cosine_sim'] < 0.2]
    print(f"üìä Found {len(low_cosine)} rows with cosine_sim < 0.2")

    if not low_cosine.empty:
        print(f"üîç Sample of low cosine similarity pairs:")
        _print_edge_case_pairs(low_cosine)

    # Edge case 2: edit distance below 2 combined with cosine similarity below 0.3
    print(f"\nüîç EDGE CASE 2: Low Edit Distance but Low Cosine Similarity")
    print("-" * 60)
    is_near_spelling = sample_df['edit_distance'] < 2
    is_weak_cosine = sample_df['cosine_sim'] < 0.3
    low_edit_low_cosine = sample_df[is_near_spelling & is_weak_cosine]
    print(f"üìä Found {len(low_edit_low_cosine)} rows with edit_distance < 2 AND cosine_sim < 0.3")

    if not low_edit_low_cosine.empty:
        print(f"üîç Sample of low edit distance but low cosine similarity pairs:")
        _print_edge_case_pairs(low_edit_low_cosine)

### 📊 Quality Metrics Summary

print(f"\nüìä QUALITY METRICS SUMMARY:")
print("-" * 35)

if sample_path.exists():
    # NOTE: every percentage below divides by total_rows, so an empty sample
    # (total_rows == 0) raises ZeroDivisionError.
    total_rows = len(sample_df)

    # Headline metrics. total_rows, high_quality_pairs and
    # potential_false_positives are read again by the recommendation and
    # report sections later in this cell, so these names must not change.
    # "High quality": cosine similarity of at least 0.5 AND edit distance of at most 5.
    high_quality_pairs = len(sample_df[
        (sample_df['cosine_sim'] >= 0.5) &
        (sample_df['edit_distance'] <= 5)
    ])

    # "Potential false positive": cosine similarity below 0.3 OR edit distance above 8.
    potential_false_positives = len(sample_df[
        (sample_df['cosine_sim'] < 0.3) |
        (sample_df['edit_distance'] > 8)
    ])

    print(f"üìã QUALITY ASSESSMENT:")
    print(f"  Total pairs analyzed: {total_rows:,}")
    print(f"  High quality pairs: {high_quality_pairs:,} ({high_quality_pairs/total_rows*100:.1f}%)")
    print(f"  Potential false positives: {potential_false_positives:,} ({potential_false_positives/total_rows*100:.1f}%)")

    # Distribution analysis
    print(f"\nüìä DISTRIBUTION ANALYSIS:")

    # Cosine similarity distribution: (lower bound, upper bound, label)
    cosine_ranges = [
        (0.0, 0.2, "Very Low"),
        (0.2, 0.4, "Low"),
        (0.4, 0.6, "Medium"),
        (0.6, 0.8, "High"),
        (0.8, 1.0, "Very High")
    ]

    print(f"  Cosine Similarity Distribution:")
    top_cosine_bin = len(cosine_ranges) - 1
    for bin_no, (min_val, max_val, label) in enumerate(cosine_ranges):
        # Bins are half-open [min, max) except the top one, which has no upper
        # bound: with a strict "< 1.0" test a pair scoring exactly 1.0 (or
        # marginally above it through float rounding) matched no bin at all
        # and silently vanished from the distribution.
        in_bin = sample_df['cosine_sim'] >= min_val
        if bin_no != top_cosine_bin:
            in_bin = in_bin & (sample_df['cosine_sim'] < max_val)
        count = int(in_bin.sum())
        print(f"    {label} ({min_val}-{max_val}): {count:,} ({count/total_rows*100:.1f}%)")

    # Edit distance distribution: (lower bound, upper bound, label), inclusive
    # on both ends. NOTE: distances above 100 match no bin.
    edit_ranges = [
        (0, 1, "Very Close"),
        (2, 3, "Close"),
        (4, 5, "Medium"),
        (6, 10, "Far"),
        (11, 100, "Very Far")
    ]

    print(f"  Edit Distance Distribution:")
    for min_val, max_val, label in edit_ranges:
        count = len(sample_df[(sample_df['edit_distance'] >= min_val) & (sample_df['edit_distance'] <= max_val)])
        print(f"    {label} ({min_val}-{max_val}): {count:,} ({count/total_rows*100:.1f}%)")

### 📋 Recommendations

print(f"\nüìã RECOMMENDATIONS BASED ON ANALYSIS:")
print("-" * 45)

if sample_path.exists():
    # Flag the run when more than 10% of the pairs look like false positives
    false_positive_share = potential_false_positives / total_rows
    print(
        f"  ‚ö†Ô∏è  High false positive rate detected - consider adjusting similarity thresholds"
        if false_positive_share > 0.1
        else f"  ‚úÖ False positive rate appears acceptable"
    )

    # Treat the index as healthy when more than 70% of the pairs are high quality
    high_quality_share = high_quality_pairs / total_rows
    print(
        f"  ‚úÖ High proportion of quality pairs - similarity index working well"
        if high_quality_share > 0.7
        else f"  ‚ö†Ô∏è  Consider tuning TF-IDF parameters or similarity thresholds"
    )

    # Fixed follow-up suggestions, printed regardless of the metrics above
    print(f"\nüìã ADDITIONAL RECOMMENDATIONS:")
    for recommendation_line in (
        f"  üîß Consider implementing length-based filtering",
        f"  üîß Add domain-specific stopword filtering",
        f"  üîß Implement confidence scoring based on multiple metrics",
        f"  üîß Consider clustering similar pairs for batch processing",
    ):
        print(recommendation_line)

### 💾 Save Inspection Report

print(f"\nüíæ SAVING INSPECTION REPORT:")
print("-" * 35)

if sample_path.exists():
    # Stamp the report first, then gather the per-column summary statistics
    report_timestamp = time.strftime('%Y-%m-%d %H:%M:%S')

    # Cast everything to built-in int/float so the values are JSON-serialisable
    cosine_column = sample_df['cosine_sim']
    cosine_stats = {
        stat: float(getattr(cosine_column, stat)())
        for stat in ('min', 'max', 'mean', 'std')
    }

    edit_column = sample_df['edit_distance']
    edit_stats = {stat: int(getattr(edit_column, stat)()) for stat in ('min', 'max')}
    edit_stats.update(
        {stat: float(getattr(edit_column, stat)()) for stat in ('mean', 'std')}
    )

    # Assemble the report from the metrics computed in the summary section of this cell
    inspection_report = {
        'timestamp': report_timestamp,
        'sample_file': str(sample_path),
        'total_pairs_analyzed': len(sample_df),
        'quality_metrics': {
            'high_quality_pairs': int(high_quality_pairs),
            'high_quality_percentage': float(high_quality_pairs/total_rows*100),
            'potential_false_positives': int(potential_false_positives),
            'false_positive_percentage': float(potential_false_positives/total_rows*100),
        },
        'distributions': {
            'cosine_similarity': cosine_stats,
            'edit_distance': edit_stats,
        },
        'recommendations': [
            "Consider implementing length-based filtering",
            "Add domain-specific stopword filtering",
            "Implement confidence scoring based on multiple metrics",
            "Consider clustering similar pairs for batch processing",
        ],
    }

    # Serialise as indented UTF-8 JSON next to the sample file's outputs
    report_path = outputs_dir / "similarity_inspection_report.json"
    report_text = json.dumps(inspection_report, indent=2, ensure_ascii=False)
    report_path.write_text(report_text, encoding='utf-8')
    print(f"  ‚úÖ Saved inspection report to: {report_path}")

print(f"\n[{time.strftime('%H:%M:%S')}] ‚úÖ Cell 4: Neighbor Similarity Sample Inspection completed successfully!")
