In [2]:
import json

file_path = 'COMBINED-FINAL-DEDUPED.json'
output_file = 'COMBINED-FINAL-DEDUPED-CLEAN.json'


def dedupe_by_prod_link(products, key='prodLink'):
    """Drop entries whose ``key`` value repeats an earlier entry's value.

    The first occurrence of each value is kept and input order is preserved.
    Entries whose ``key`` is missing or empty cannot be identified as
    duplicates of anything, so they are always kept rather than being
    collapsed into a single shared "no link" bucket.

    Args:
        products: Iterable of dict-like product records.
        key: Field used to decide whether two records are the same product.

    Returns:
        A ``(unique_products, duplicate_count)`` tuple, where
        ``unique_products`` is a list and ``duplicate_count`` is the number
        of records that were dropped.
    """
    seen_links = set()
    unique = []
    duplicates = 0

    for product in products:
        link = product.get(key)

        # No usable identifier: keep the record instead of treating every
        # link-less record as a duplicate of the first one.
        if not link:
            unique.append(product)
            continue

        if link in seen_links:
            duplicates += 1
            continue

        seen_links.add(link)
        unique.append(product)

    return unique, duplicates


try:
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print(f"Successfully loaded dataset with {len(data)} entries")
    # The preview is purely informational; an empty dataset or a first entry
    # without a 'name' must not abort the deduplication run.
    if data:
        print(f"Sample of the first entry: {data[0].get('name')}")

    unique_data, duplicate_count = dedupe_by_prod_link(data)

    print(f"Original dataset: {len(data)} entries")
    print(f"After removing duplicates: {len(unique_data)} entries")
    print(f"Removed {duplicate_count} duplicate entries")

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(unique_data, f, indent=4, ensure_ascii=False)

    print(f"Deduplicated data written to {output_file}")

except FileNotFoundError:
    print(f"Error: The file {file_path} was not found.")
except json.JSONDecodeError:
    print(f"Error: The file {file_path} is not valid JSON.")
except Exception as e:
    # Top-level boundary of a notebook cell: report anything unexpected
    # instead of dumping a traceback.
    print(f"An error occurred: {str(e)}")

Successfully loaded dataset with 1333 entries
Sample of the first entry: y2k western american floral cowbow 6597
Original dataset: 1333 entries
After removing duplicates: 1249 entries
Removed 84 duplicate entries
Deduplicated data written to COMBINED-FINAL-DEDUPED-CLEAN.json
