In [None]:
import json
import re
import os

# Input and output file paths
input_file = r"/content/poems.json"  # Path to input JSON
output_file = "cleansed.txt"  # Cleansed data will be saved here

# Counter to track duplicates
num_duplicates = 0

# Matches a newline followed by one or more blank (whitespace-only) lines.
# Compiled once here so the per-entry loop does not repeat the pattern lookup.
_BLANK_LINES = re.compile(r'\n\s*\n+')


def normalize_field(value, default=""):
    """Return *value* as a stripped string, or *default* when it is None.

    JSON ``null`` (or a missing key) yields *default*; non-string values such
    as numbers are converted with ``str()`` instead of raising
    ``AttributeError`` on ``.strip()``.
    """
    if value is None:
        return default
    return str(value).strip()


def format_entry(entry):
    """Build the ``Title: ...\\nBody:\\n...`` text for one poem dictionary.

    The title is only stripped; the body additionally has every run of blank
    lines collapsed into a single newline.
    """
    title = normalize_field(entry.get("title"), "Untitled")
    body = normalize_field(entry.get("body"), "")
    cleaned_body = _BLANK_LINES.sub('\n', body).strip()
    return f"Title: {title}\nBody:\n{cleaned_body}"


def cleanse_entries(entries):
    """Format and de-duplicate a list of poem dictionaries.

    Items that are not dictionaries are reported on stdout and skipped.

    Returns:
        A ``(cleansed, duplicates)`` tuple: the unique formatted entries in
        first-seen order, and the number of exact duplicates dropped.
    """
    seen = set()
    cleansed = []
    duplicates = 0
    for entry in entries:
        if not isinstance(entry, dict):
            print(f"Skipping malformed entry (not a dictionary): {entry}")
            continue
        combined_entry = format_entry(entry)
        if combined_entry in seen:
            duplicates += 1
            continue
        seen.add(combined_entry)
        cleansed.append(combined_entry)
    return cleansed, duplicates


if not os.path.isfile(input_file):
    print(f"Error: The file '{input_file}' does not exist. Please check the path.")
else:
    try:
        # Only the load can raise JSONDecodeError, so keep the try body minimal.
        with open(input_file, "r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError:
        print("Error: Failed to decode JSON. Check the file contents.")
    else:
        if not isinstance(data, list):
            # Report and fall through (like the missing-file case) rather than
            # calling exit(), which would shut down an interactive kernel.
            print("Error: The JSON file does not contain a list. Please check the file structure.")
        else:
            cleansed_data, num_duplicates = cleanse_entries(data)
            unique_entries = set(cleansed_data)

            # Save cleansed data; a blank line separates consecutive poems.
            with open(output_file, "w", encoding="utf-8") as f:
                f.writelines(item + "\n\n" for item in cleansed_data)

            # Print cleansing summary
            print(f"Original entries: {len(data)}")
            print(f"Cleaned entries: {len(cleansed_data)}")
            print(f"Number of duplicates skipped: {num_duplicates}")
            print(f"Cleansed data saved to: {output_file}")

Original entries: 15680
Cleaned entries: 15078
Number of duplicates skipped: 602
Cleansed data saved to: cleansed.txt
