In [8]:
import json
import random
from collections import Counter
from typing import Dict, List

import pandas as pd

In [9]:
# Read the JSON document collection from disk into `data`.
file_path = 'data/gov_pages_with_synthetic_content.json'

with open(file_path, mode='r', encoding='utf-8') as f:
    # Parse the whole file's text in one go.
    data = json.loads(f.read())

print(f"Total number of documents: {len(data)}")

Total number of documents: 157


In [10]:
# Tally how many pages carry each 'document_type' value.
doc_types = Counter()
for page in data:
    doc_types[page['document_type']] += 1

# Report the tallies, most frequent type first.
print("Document types distribution:")
for doc_type, count in doc_types.most_common():
    print(f"- {doc_type}: {count}")

Document types distribution:
- guide: 76
- answer: 45
- detailed_guide: 21
- step_by_step_nav: 8
- guidance: 6
- statutory_guidance: 1


In [11]:
# Count the pages whose 'synthetic_data' entry is present and truthy
# (each True contributes 1 to the sum).
synthetic_count = sum(bool(page.get('synthetic_data')) for page in data)

print(f"\nSynthetic data coverage:")
print(f"Documents with synthetic data: {synthetic_count}")
print(f"Coverage percentage: {(synthetic_count/len(data))*100:.1f}%")


Synthetic data coverage:
Documents with synthetic data: 157
Coverage percentage: 100.0%


In [12]:
# Snippet count for every page that has both synthetic data and an
# 'article_snippets' entry; pages lacking either are left out.
snippets_per_doc = [
    len(page['synthetic_data']['article_snippets']['snippets'])
    for page in data
    if page.get('synthetic_data') and page['synthetic_data'].get('article_snippets')
]
total_snippets = sum(snippets_per_doc)

print(f"\nSnippet statistics:")
print(f"Total snippets across all documents: {total_snippets}")
print(f"Average snippets per document: {sum(snippets_per_doc)/len(snippets_per_doc):.1f}")
print(f"Min snippets in a document: {min(snippets_per_doc)}")
print(f"Max snippets in a document: {max(snippets_per_doc)}")


Snippet statistics:
Total snippets across all documents: 1293
Average snippets per document: 8.2
Min snippets in a document: 1
Max snippets in a document: 24


In [14]:
def display_random_synthetic_example(pages=None):
    """Print a randomly chosen page that has synthetic data.

    Prints the page's title and document type, 200-character previews of the
    original body and of the poorly written rewrite, and then every
    well-written / badly-written snippet pair.  The printed "Document Index"
    is the page's position among the pages that have synthetic data, which is
    the same numbering ``display_specific_example`` accepts.

    Args:
        pages: Optional list of page dicts to sample from.  When omitted, the
            notebook-level ``data`` list is used.

    Returns:
        None.  Everything is written to stdout.
    """
    if pages is None:
        pages = data

    # Only pages with a truthy 'synthetic_data' entry are candidates.
    pages_with_synthetic = [p for p in pages if p.get('synthetic_data')]
    if not pages_with_synthetic:
        # random.randint(0, -1) would raise ValueError; report instead.
        print("No documents with synthetic data to display.")
        return

    page_index = random.randint(0, len(pages_with_synthetic) - 1)
    sample_page = pages_with_synthetic[page_index]
    synthetic = sample_page['synthetic_data']

    print(f"Document Index: {page_index} (save this number to revisit this example)")
    print(f"Title: {sample_page['title']}")
    print(f"Document Type: {sample_page['document_type']}")
    print("-" * 80)

    # Original content preview (first 200 chars)
    print("\nORIGINAL CONTENT PREVIEW:")
    print(sample_page['details']['body'][:200], "...\n")
    print("-" * 80)

    # Poorly written version preview (first 200 chars)
    print("\nPOORLY WRITTEN VERSION PREVIEW:")
    print(synthetic['poorly_written_article'][:200], "...\n")
    print("-" * 80)

    # The snippet-statistics cell tolerates a missing 'article_snippets'
    # entry, so do the same here and fall back to an empty list.
    snippets = (synthetic.get('article_snippets') or {}).get('snippets') or []
    print(f"\nNumber of snippets: {len(snippets)}")

    for i, snippet in enumerate(snippets, 1):
        print(f"\nSNIPPET PAIR {i}:")
        print("\nWell-written:")
        print(snippet['well_written_snippet'])
        print("\nBadly-written:")
        print(snippet['badly_written_snippet'])
        print("-" * 80)

# Show one randomly chosen example. To revisit a particular one instead, call
# display_specific_example(saved_index) (defined in a later cell), passing the
# "Document Index" value printed at the top of the output.
display_random_synthetic_example()


Document Index: 72 (save this number to revisit this example)
Title: The National Minimum Wage and Living Wage
Document Type: guide
--------------------------------------------------------------------------------

ORIGINAL CONTENT PREVIEW:
## Overview 

The [minimum wage a worker should get](/national-minimum-wage-rates) depends on their age and if they’re an apprentice.

The National Minimum Wage is the minimum pay per hour almost all  ...

--------------------------------------------------------------------------------

POORLY WRITTEN VERSION PREVIEW:
Okay, so let’s talk about minimum wage stuff really quick. So like, minimum wage kinda depends on your age and if you’re an apprentice, right? The National Minimum Wage is basically the minimum pay yo ...

--------------------------------------------------------------------------------

Number of snippets: 12

SNIPPET PAIR 1:

Well-written:
The minimum wage a worker should receive depends on their age and whether they are an apprentice.

In [17]:

def display_specific_example(index: int, pages=None) -> None:
    """Print the page at position ``index`` among pages with synthetic data.

    Prints the page's title and document type, 200-character previews of the
    original body and of the poorly written rewrite, and then every
    well-written / badly-written snippet pair.  If ``index`` is not a valid
    position, a message describing the valid range is printed instead.

    Args:
        index: Zero-based position within the list of pages whose
            'synthetic_data' entry is truthy (not within the full page list).
        pages: Optional list of page dicts to look in.  When omitted, the
            notebook-level ``data`` list is used.

    Returns:
        None.  Everything is written to stdout.
    """
    if pages is None:
        pages = data

    pages_with_synthetic = [p for p in pages if p.get('synthetic_data')]

    if not pages_with_synthetic:
        # Avoid the misleading "between 0 and -1" range message.
        print(f"Index {index} is out of range. There are no documents with synthetic data.")
        return

    if not (0 <= index < len(pages_with_synthetic)):
        print(f"Index {index} is out of range. Please use an index between 0 and {len(pages_with_synthetic)-1}")
        return

    sample_page = pages_with_synthetic[index]
    synthetic = sample_page['synthetic_data']

    print(f"Document Index: {index}")
    print(f"Title: {sample_page['title']}")
    print(f"Document Type: {sample_page['document_type']}")
    print("-" * 80)

    # Original content preview (first 200 chars)
    print("\nORIGINAL CONTENT PREVIEW:")
    print(sample_page['details']['body'][:200], "...\n")
    print("-" * 80)

    # Poorly written version preview (first 200 chars)
    print("\nPOORLY WRITTEN VERSION PREVIEW:")
    print(synthetic['poorly_written_article'][:200], "...\n")
    print("-" * 80)

    # The snippet-statistics cell tolerates a missing 'article_snippets'
    # entry, so do the same here and fall back to an empty list.
    snippets = (synthetic.get('article_snippets') or {}).get('snippets') or []
    print(f"\nNumber of snippets: {len(snippets)}")

    for i, snippet in enumerate(snippets, 1):
        print(f"\nSNIPPET PAIR {i}:")
        print("\nWell-written:")
        print(snippet['well_written_snippet'])
        print("\nBadly-written:")
        print(snippet['badly_written_snippet'])
        print("-" * 80)
        
# Example usage:
display_specific_example(72)  # Revisit the example whose printed Document Index was 72


Document Index: 72
Title: The National Minimum Wage and Living Wage
Document Type: guide
--------------------------------------------------------------------------------

ORIGINAL CONTENT PREVIEW:
## Overview 

The [minimum wage a worker should get](/national-minimum-wage-rates) depends on their age and if they’re an apprentice.

The National Minimum Wage is the minimum pay per hour almost all  ...

--------------------------------------------------------------------------------

POORLY WRITTEN VERSION PREVIEW:
Okay, so let’s talk about minimum wage stuff really quick. So like, minimum wage kinda depends on your age and if you’re an apprentice, right? The National Minimum Wage is basically the minimum pay yo ...

--------------------------------------------------------------------------------

Number of snippets: 12

SNIPPET PAIR 1:

Well-written:
The minimum wage a worker should receive depends on their age and whether they are an apprentice. The National Minimum Wage is the lowest pa