In [None]:
import os

def _split_records(content):
    """Split raw dataset text into ``(header, records)``.

    Records are delimited by lines beginning with ``Id:``; each returned
    record has that leading marker removed. Any text ahead of the first
    record is returned as the header (empty string when there is none).
    """
    chunks = content.split('\nId:')
    first = chunks[0]
    if first.startswith('Id:'):
        # The file opens directly with a record: strip its marker so every
        # record is stored the same way and ``Id:`` is never written twice.
        return '', [first[len('Id:'):]] + chunks[1:]
    return first, chunks[1:]


def _is_book(record):
    """Return True when the record has a line starting with ``  group: Book``."""
    return any(line.startswith('  group: Book') for line in record.split('\n'))


def _extract_asin(record):
    """Return the value of the record's first ``ASIN:`` line, or None."""
    for line in record.split('\n'):
        if line.startswith(('ASIN:', '  ASIN:')):
            return line.split('ASIN:')[1].strip()
    return None


def _filter_similar(record, book_asins):
    """Rewrite the record's ``similar:`` lines to reference only book ASINs.

    Returns ``(new_record_text, kept_count)`` where ``kept_count`` is the
    number of ASINs that survived the filter.
    """
    kept_count = 0
    result_lines = []
    for line in record.split('\n'):
        parts = line.split()
        if parts and parts[0] == 'similar:':
            # parts[1] is the original count; the ASINs follow it.
            asins = [a for a in parts[2:] if a in book_asins]
            kept_count += len(asins)
            result_lines.append(f"  similar: {len(asins)}  {'  '.join(asins)}")
        else:
            # Anything else (including a malformed ``similar:`` line with no
            # separate count token) is passed through untouched.
            result_lines.append(line)
    return '\n'.join(result_lines), kept_count


def filter_books(input_file, output_file):
    """Filter dataset to keep only books and book-to-book edges.

    The input is read as text made of records that each start with an
    ``Id:`` line. Records containing a ``  group: Book`` line are kept; in
    those records every ``similar:`` line is rewritten so it lists only
    ASINs that belong to kept book records, with the count updated.

    Text preceding the first record is written back verbatim as a header
    (any totals it mentions are not recomputed).

    Args:
        input_file: Path of the dataset to read (decoded as UTF-8,
            undecodable bytes are ignored).
        output_file: Path of the filtered dataset to write (UTF-8).

    Prints the number of books kept and the number of book-to-book edges,
    i.e. the total count of ASINs retained across all ``similar:`` lines.
    """
    with open(input_file, 'r', encoding='utf-8', errors='ignore') as f:
        content = f.read()

    header, records = _split_records(content)

    # First pass: pick out book records and collect their ASINs.
    books = [record for record in records if record.strip() and _is_book(record)]
    book_asins = {
        asin for asin in map(_extract_asin, books) if asin is not None
    }

    # Second pass: drop non-book ASINs from each book's similar list and
    # count the edges that remain.
    filtered = []
    total_edges = 0
    for record in books:
        text, kept = _filter_similar(record, book_asins)
        filtered.append(text)
        total_edges += kept

    with open(output_file, 'w', encoding='utf-8') as f:
        if header:
            f.write(header)
        for i, product in enumerate(filtered):
            # The split consumed the newline in front of every ``Id:``
            # except one at the very start of the file.
            prefix = 'Id:' if i == 0 and not header else '\nId:'
            f.write(prefix + product)

    print(f"Filtered {len(filtered)} books with {total_edges} book-to-book edges")
    print(f"Output: {output_file}")


if __name__ == "__main__":
    # Source dataset and destination for the book-only subset; both paths
    # are resolved relative to the current working directory.
    input_file, output_file = (
        r"../data/amazon-meta.txt",
        r"../data/amazon-books.txt",
    )

    # Create the destination folder up front; a bare filename has no
    # directory component, in which case there is nothing to create.
    output_dir = os.path.dirname(output_file)
    if output_dir != "":
        os.makedirs(output_dir, exist_ok=True)

    filter_books(input_file, output_file)

Filtered 393560 books with 393560 book-to-book edges
Output: ../data/raw/amazon-books.txt
