In [457]:
# Install required packages
%pip install reportlab

# Import required libraries
import json
from datetime import datetime
import os
import re
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter, A4
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer, PageBreak
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.lib.enums import TA_JUSTIFY, TA_LEFT, TA_CENTER
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont


Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [458]:
def parse_json_data(json_data):
    """Parse the JSON data and convert to a list of nodes"""
    
    # If it's already a set or dict-like object containing JSON strings
    if isinstance(json_data, (set, dict)):
        json_data = list(json_data)  # Convert to list
        json_data = [json.loads(item) for item in json_data]  # Parse each JSON string
    
    # If it's a string representation of a set
    elif isinstance(json_data, str):
        if json_data.startswith("{") and json_data.endswith("}"):
            # It's a string representation of a set containing JSON strings
            json_data = eval(json_data)  # Convert string representation to actual set
            json_data = list(json_data)  # Convert set to list
            json_data = [json.loads(item) for item in json_data]  # Parse each JSON string
    
    # If it's already a list, check if items need parsing
    elif isinstance(json_data, list):
        # Check if first item is a string that needs parsing
        if json_data and isinstance(json_data[0], str):
            json_data = [json.loads(item) for item in json_data]
    
    return json_data

def create_custom_styles():
    """Create custom paragraph styles for the PDF"""
    styles = getSampleStyleSheet()
    
    # Custom title style - compact
    styles.add(ParagraphStyle(
        name='CustomTitle',
        parent=styles['Heading1'],
        fontSize=20,
        textColor=colors.HexColor('#000000'),
        spaceAfter=10,
        spaceBefore=0,
        alignment=TA_LEFT,
        leading=24,
        fontName='Helvetica-Bold'
    ))
    
    # Custom heading styles - tighter spacing
    styles.add(ParagraphStyle(
        name='CustomHeading2',
        parent=styles['Heading2'],
        fontSize=16,
        textColor=colors.HexColor('#1a1a1a'),
        spaceBefore=12,
        spaceAfter=6,
        alignment=TA_LEFT,
        fontName='Helvetica-Bold'
    ))
    
    styles.add(ParagraphStyle(
        name='CustomHeading3',
        parent=styles['Heading3'],
        fontSize=13,
        textColor=colors.HexColor('#2c3e50'),
        spaceBefore=10,
        spaceAfter=4,
        alignment=TA_LEFT,
        fontName='Helvetica-Bold'
    ))
    
    # Custom body text - minimal spacing
    styles.add(ParagraphStyle(
        name='CustomBodyText',
        parent=styles['BodyText'],
        fontSize=10,
        alignment=TA_LEFT,
        spaceBefore=2,
        spaceAfter=6,
        leading=14,
        textColor=colors.HexColor('#333333')
    ))
    
    # List item style - compact
    styles.add(ParagraphStyle(
        name='ListItem',
        parent=styles['BodyText'],
        fontSize=10,
        leftIndent=20,
        spaceBefore=1,
        spaceAfter=1,
        alignment=TA_LEFT,
        leading=13,
        textColor=colors.HexColor('#333333')
    ))
    
    return styles


In [459]:
def process_content(content):
    """Process content array and return text"""
    if not content:
        return ""
    
    text_parts = []
    for item in content:
        if item.get("type") == "text":
            text_parts.append(item.get("text", ""))
    return "".join(text_parts)

def generate_pdf_with_reportlab(json_data, output_filename=None, organization_name=None, custom_date=None, policy_name=None, policy_description=None):
    """
    Generate PDF from JSON data using ReportLab
    
    Args:
        json_data: The JSON data (as string or parsed object)
        output_filename: Name of the output PDF file (optional - will use title from JSON if not provided)
        organization_name: Custom organization name (optional)
        custom_date: Custom date (optional)
        policy_name: Name of the policy to display at the top (optional)
        policy_description: Description of the policy to display (optional)
    """
    # Parse JSON data
    nodes = parse_json_data(json_data)
    
    # Extract title from first heading if no filename provided
    auto_generated_filename = False
    if output_filename is None:
        # Find the first heading in the document
        title = None
        for node in nodes:
            if node.get("type") == "heading":
                content = node.get("content", [])
                title = process_content(content)
                break
        
        if title:
            # Convert title to filename-friendly format
            # Remove special characters and replace spaces with underscores
            filename_base = re.sub(r'[^\w\s-]', '', title.lower())
            filename_base = re.sub(r'[-\s]+', '_', filename_base)
            output_filename = f"{filename_base}.pdf"
            auto_generated_filename = True
        else:
            output_filename = "output.pdf"
    
    # Set up organization and date
    org_name = organization_name or "Your Organization"
    date_str = custom_date or datetime.now().strftime("%Y-%m-%d")
    
    # Create pdfs directory if it doesn't exist
    pdf_dir = "pdfs"
    if not os.path.exists(pdf_dir):
        os.makedirs(pdf_dir)
        print(f"📁 Created directory: {pdf_dir}/")
    
    # Prepend the pdfs directory to the filename
    full_output_path = os.path.join(pdf_dir, output_filename)
    
    # Create the PDF document
    doc = SimpleDocTemplate(full_output_path, pagesize=letter,
                            rightMargin=72, leftMargin=72,
                            topMargin=72, bottomMargin=72)
    
    # Get custom styles
    styles = create_custom_styles()
    
    # Container for the 'Flowable' objects
    elements = []
    
    # Add policy name and description at the top if provided
    if policy_name:
        # Add policy name as a main title
        elements.append(Paragraph(policy_name, styles['CustomTitle']))
        elements.append(Spacer(1, 0.1*inch))
    
    if policy_description:
        # Add policy description
        desc_style = ParagraphStyle(
            name='PolicyDescription',
            parent=styles['BodyText'],
            fontSize=11,
            textColor=colors.HexColor('#555555'),
            alignment=TA_LEFT,
            spaceAfter=10,
            leading=14
        )
        elements.append(Paragraph(policy_description, desc_style))
        elements.append(Spacer(1, 0.15*inch))
    
    # Add a horizontal line divider
    from reportlab.platypus import HRFlowable
    elements.append(HRFlowable(width="100%", thickness=1, color=colors.HexColor('#cccccc')))
    elements.append(Spacer(1, 0.15*inch))
    
    # Process each node
    for node in nodes:
        node_type = node.get("type", "")
        attrs = node.get("attrs", {})
        content = node.get("content", [])
        
        if node_type == "heading":
            level = attrs.get("level", 1)
            text = process_content(content)
            # Replace template variables
            text = text.replace("{{organization}}", org_name)
            text = text.replace("{{date}}", date_str)
            
            if level == 1:
                elements.append(Paragraph(text, styles['CustomTitle']))
                elements.append(Spacer(1, 0.05*inch))
            elif level == 2:
                elements.append(Paragraph(text, styles['CustomHeading2']))
            elif level == 3:
                elements.append(Paragraph(text, styles['CustomHeading3']))
            else:
                elements.append(Paragraph(text, styles['Heading4']))
        
        elif node_type == "paragraph":
            text = process_content(content)
            # Replace template variables
            text = text.replace("{{organization}}", org_name)
            text = text.replace("{{date}}", date_str)
            
            elements.append(Paragraph(text, styles['CustomBodyText']))
        
        elif node_type == "orderedList":
            list_count = 1
            for item in content:
                if item.get("type") == "listItem":
                    item_content = item.get("content", [])
                    for sub_item in item_content:
                        if sub_item.get("type") == "paragraph":
                            text = process_content(sub_item.get("content", []))
                            # Replace template variables
                            text = text.replace("{{organization}}", org_name)
                            text = text.replace("{{date}}", date_str)
                            
                            # Check if text already starts with numbering (e.g., "1. ", "2. ", etc.)
                            import re
                            if re.match(r'^\d+\.\s', text):
                                # Text already has numbering, use as-is
                                elements.append(Paragraph(text, styles['ListItem']))
                            else:
                                # Add numbering
                                numbered_text = f"{list_count}. {text}"
                                elements.append(Paragraph(numbered_text, styles['ListItem']))
                                list_count += 1
            elements.append(Spacer(1, 0.05*inch))
        
        elif node_type == "bulletList":
            for item in content:
                if item.get("type") == "listItem":
                    item_content = item.get("content", [])
                    for sub_item in item_content:
                        if sub_item.get("type") == "paragraph":
                            text = process_content(sub_item.get("content", []))
                            # Replace template variables
                            text = text.replace("{{organization}}", org_name)
                            text = text.replace("{{date}}", date_str)
                            
                            # Check if text already starts with a bullet or numbering
                            import re
                            if re.match(r'^[\d•\-\*]\.\s', text) or text.startswith('• '):
                                # Text already has bullet/numbering, use as-is
                                elements.append(Paragraph(text, styles['ListItem']))
                            else:
                                # Add bullet
                                bullet_text = f"• {text}"
                                elements.append(Paragraph(bullet_text, styles['ListItem']))
            elements.append(Spacer(1, 0.05*inch))
        
        elif node_type == "table":
            table_data = []
            for row in content:
                if row.get("type") == "tableRow":
                    row_data = []
                    for cell in row.get("content", []):
                        if cell.get("type") == "tableCell":
                            text = process_content(cell.get("content", []))
                            # Replace template variables
                            text = text.replace("{{organization}}", org_name)
                            text = text.replace("{{date}}", date_str)
                            row_data.append(text)
                    table_data.append(row_data)
            
            if table_data:
                # Create the table
                t = Table(table_data, colWidths=None)
                
                # Add style to table
                table_style = TableStyle([
                    # Header row style
                    ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#3498db')),
                    ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
                    ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
                    ('FONTSIZE', (0, 0), (-1, 0), 12),
                    ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
                    
                    # Data rows
                    ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
                    ('TEXTCOLOR', (0, 1), (-1, -1), colors.black),
                    ('FONTNAME', (0, 1), (-1, -1), 'Helvetica'),
                    ('FONTSIZE', (0, 1), (-1, -1), 10),
                    ('GRID', (0, 0), (-1, -1), 1, colors.grey),
                    ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
                    ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
                    ('TOPPADDING', (0, 1), (-1, -1), 6),
                    ('BOTTOMPADDING', (0, 1), (-1, -1), 6),
                ])
                
                # Alternate row colors
                for i in range(2, len(table_data), 2):
                    table_style.add('BACKGROUND', (0, i), (-1, i), colors.HexColor('#f9f9f9'))
                
                t.setStyle(table_style)
                elements.append(t)
                elements.append(Spacer(1, 0.1*inch))
    
    # Build PDF
    if auto_generated_filename:
        print(f"📄 Auto-generated filename from document title: {output_filename}")
    print(f"Generating PDF: {full_output_path}")
    doc.build(elements)
    print(f"✅ PDF generated successfully: {full_output_path}")

# Create an alias for backward compatibility
generate_pdf = generate_pdf_with_reportlab


In [460]:
# Debug: Let's see what the raw JSON strings look like
if policies_data and len(policies_data) > 0:
    print("DEBUG: First policy's content (first 3 items):")
    for i, item in enumerate(policies_data[0]['content'][:3]):
        print(f"\nItem {i}:")
        print(f"Length: {len(item)}")
        print(f"First 100 chars: {item[:100]}")
        print(f"Raw: {repr(item[:100])}")


DEBUG: First policy's content (first 3 items):

Item 0:
Length: 79
First 100 chars: {"type": "heading", "content": [{"text": "Table of Contents", "type": "text"}]}
Raw: '{"type": "heading", "content": [{"text": "Table of Contents", "type": "text"}]}'

Item 1:
Length: 898
First 100 chars: {"type": "orderedList", "content": [{"type": "listItem", "content": [{"type": "paragraph", "content"
Raw: '{"type": "orderedList", "content": [{"type": "listItem", "content": [{"type": "paragraph", "content"'

Item 2:
Length: 79
First 100 chars: {"type": "heading", "content": [{"text": "Executive Summary", "type": "text"}]}
Raw: '{"type": "heading", "content": [{"text": "Executive Summary", "type": "text"}]}'


In [461]:
# Load data from source_data.txt file
import csv
import ast

def read_source_data(filepath='source_data.txt'):
    """Read and parse the source data file"""
    policies = []
    
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            # Read as CSV with comma delimiter
            reader = csv.reader(f, delimiter=',')
            
            # Skip header if present
            header = next(reader, None)
            
            for row in reader:
                if len(row) > 7:  # Ensure we have enough columns
                    policy_id = row[0]
                    policy_name = row[1]
                    policy_desc = row[2]
                    
                    # The content appears to be in a set format starting from column 7
                    # Parse the set containing JSON strings
                    content_set_str = row[7]
                    
                    if content_set_str and content_set_str.strip():
                        try:
                            # Parse the set string
                            # Remove the outer quotes if present
                            content_set_str = content_set_str.strip('"')
                            
                            # Convert the set string to a list
                            # The content is a set of JSON strings
                            content_items = []
                            
                            # The content is formatted as a set with escaped JSON strings
                            # Remove the curly braces and split by ","
                            content_str = content_set_str.strip('{}')
                            
                            # Split carefully to handle JSON objects that contain commas
                            import re
                            # Use regex to split by commas that are between JSON objects
                            json_pattern = r'"\s*,\s*"'
                            json_strings = re.split(json_pattern, content_str)
                            
                            for i, json_str in enumerate(json_strings):
                                # Clean up the JSON string
                                json_str = json_str.strip()
                                
                                # Remove leading/trailing quotes if present
                                if json_str.startswith('"'):
                                    json_str = json_str[1:]
                                if json_str.endswith('"'):
                                    json_str = json_str[:-1]
                                
                                # Replace escaped quotes with regular quotes
                                json_str = json_str.replace('\\""', '"')
                                json_str = json_str.replace('\\"', '"')
                                json_str = json_str.replace('""', '"')
                                
                                if json_str:
                                    content_items.append(json_str)
                            
                            if content_items:
                                policies.append({
                                    'id': policy_id,
                                    'name': policy_name,
                                    'description': policy_desc,
                                    'content': content_items  # List of JSON strings
                                })
                        except Exception as e:
                            print(f"Error parsing policy {policy_name}: {e}")
                            continue
    
    except Exception as e:
        print(f"Error reading source file: {e}")
        return []
    
    return policies

# Load all policies from the source file
print("Loading policies from source_data.txt...")
policies_data = read_source_data()

if policies_data:
    print(f"\n✅ Loaded {len(policies_data)} policies successfully!\n")
    print("Available policies:")
    print("-" * 60)
    for i, policy in enumerate(policies_data):
        print(f"{i+1:2d}. {policy['name']}")
    print("-" * 60)
    
    # Select a policy to convert (change this index to select a different policy)
    policy_index = 0  # First policy by default
    selected_policy = policies_data[policy_index]
    
    print(f"\n📋 Selected policy: {selected_policy['name']}")
    print(f"   Description: {selected_policy['description'][:100]}...")
    
    # Set the JSON input to the selected policy's content
    json_input = selected_policy['content']
    
else:
    print("❌ No policies found in source_data.txt")
    json_input = []


Loading policies from source_data.txt...

✅ Loaded 19 policies successfully!

Available policies:
------------------------------------------------------------
 1. Business Continuity Policy
 2. Information Security Program
 3. Capacity & Performance Management
 4. Change Management Policy 
 5. Encryption & Cryptographic Control Policy
 6. Logging Policy
 7. Physical Security Policy
 8. Vulnerability Management Policy
 9. Acceptable Use Policy
10. Access Control Policy
11. Asset Management Policy
12. Endpoint Protection Policy
13. Incident Response Policy
14. Information Protection Policy
15. Privacy Policy
16. Risk Management Policy 
17. Secure Development Policy
18. Security Awareness & Training Policy
19. Third-Party Risk Management Policy
------------------------------------------------------------

📋 Selected policy: Business Continuity Policy
   Description: This policy ensures the organization can quickly restore critical operations after a disruption by m...


In [462]:
# PASTE YOUR JSON DATA HERE
# IMPORTANT: Use square brackets [] for a list to preserve order, NOT curly braces {} for a set!
# Example with the provided data:
json_input = ["{\"type\": \"heading\", \"content\": [{\"text\": \"Executive Summary\", \"type\": \"text\"}]}","{\"type\": \"paragraph\", \"content\": [{\"text\": \"This Endpoint Protection Policy defines mandatory controls for Casper Studios’ cloud and home-office endpoints to meet SOC 2 requirements. It assigns roles, specifies measurable review cycles, and leverages AWS us-east-1 and Google Workspace integrations to protect employee data. All unresolved configuration details are marked for review.\", \"type\": \"text\"}]}","{\"type\": \"heading\", \"content\": [{\"text\": \"Table of Contents\", \"type\": \"text\"}]}","{\"type\": \"bulletList\", \"content\": [{\"type\": \"listItem\", \"content\": [{\"type\": \"paragraph\", \"content\": [{\"text\": \"1. Document Content Page\", \"type\": \"text\"}]}]}, {\"type\": \"listItem\", \"content\": [{\"type\": \"paragraph\", \"content\": [{\"text\": \"2. Applicability and Scope\", \"type\": \"text\"}]}]}, {\"type\": \"listItem\", \"content\": [{\"type\": \"paragraph\", \"content\": [{\"text\": \"3. Controls\", \"type\": \"text\"}]}]}, {\"type\": \"listItem\", \"content\": [{\"type\": \"paragraph\", \"content\": [{\"text\": \"4. Exceptions Process\", \"type\": \"text\"}]}]}, {\"type\": \"listItem\", \"content\": [{\"type\": \"paragraph\", \"content\": [{\"text\": \"5. Violations and Disciplinary Action\", \"type\": \"text\"}]}]}, {\"type\": \"listItem\", \"content\": [{\"type\": \"paragraph\", \"content\": [{\"text\": \"6. Auditor Evidence Artefacts\", \"type\": \"text\"}]}]}]}","{\"type\": \"heading\", \"content\": [{\"text\": \"2. Applicability and Scope\", \"type\": \"text\"}]}","{\"type\": \"paragraph\", \"content\": [{\"text\": \"This policy applies to all Casper Studios employees, contractors, and third parties configuring, using, or managing company-provided laptops, home-office desktops, and AWS EC2 instances in us-east-1 that access, store, or process employee data.\", \"type\": \"text\"}]}","{\"type\": \"heading\", \"content\": [{\"text\": \"3. Controls\", \"type\": \"text\"}]}","{\"type\": \"heading\", \"content\": [{\"text\": \"3.1 Malware Protection\", \"type\": \"text\"}]}","{\"type\": \"orderedList\", \"content\": [{\"type\": \"listItem\", \"content\": [{\"type\": \"paragraph\", \"content\": [{\"text\": \"All endpoints accessing AWS-hosted applications or storing employee data must have approved anti-malware software (e.g., Sophos Endpoint Protection agent version) installed, configured, and updated daily; the Security Administrator shall verify signature updates quarterly.\", \"type\": \"text\"}]}]}]}","{\"type\": \"heading\", \"content\": [{\"text\": \"3.2 Inventory and Encryption\", \"type\": \"text\"}]}","{\"type\": \"orderedList\", \"content\": [{\"type\": \"listItem\", \"content\": [{\"type\": \"paragraph\", \"content\": [{\"text\": \"The IT Manager must maintain an automated endpoint inventory via AWS Config and a CMDB, reviewing asset records quarterly for accuracy and completeness.\", \"type\": \"text\"}]}]}, {\"type\": \"listItem\", \"content\": [{\"type\": \"paragraph\", \"content\": [{\"text\": \"All company-provided laptops and EBS volumes shall be encrypted with AES-256 using AWS KMS keys managed in us-east-1; compliance scans shall run monthly.\", \"type\": \"text\"}]}]}]}","{\"type\": \"heading\", \"content\": [{\"text\": \"3.3 Endpoint Security Administration\", \"type\": \"text\"}]}","{\"type\": \"orderedList\", \"content\": [{\"type\": \"listItem\", \"content\": [{\"type\": \"paragraph\", \"content\": [{\"text\": \"The Security Administrator shall document and maintain endpoint configuration procedures in the internal Security Wiki, reviewing them annually.\", \"type\": \"text\"}]}]}, {\"type\": \"listItem\", \"content\": [{\"type\": \"paragraph\", \"content\": [{\"text\": \"Encryption keys and anti-malware policies must be centrally managed via AWS IAM and Google Workspace APIs, with access reviewed quarterly by Executive Management.\", \"type\": \"text\"}]}]}]}","{\"type\": \"heading\", \"content\": [{\"text\": \"4. Exceptions Process\", \"type\": \"text\"}]}","{\"type\": \"paragraph\", \"content\": [{\"text\": \"Employees must request endpoint exceptions through Linear tickets, including business justification, compensating controls, and duration capped at 30 days. The Information Security Officer and IT Manager shall jointly approve, document, and time-limit each exception; all exceptions shall be re-evaluated at expiration.\", \"type\": \"text\"}]}","{\"type\": \"heading\", \"content\": [{\"text\": \"5. Violations and Disciplinary Action\", \"type\": \"text\"}]}","{\"type\": \"paragraph\", \"content\": [{\"text\": \"Automated monitoring via AWS CloudWatch and intrusion detection systems shall detect non-compliance. Suspected violations must be reported to the Information Security Officer and HR within 24 hours. Confirmed violations will trigger HR disciplinary tiers (verbal warning through termination) and may include immediate access revocation or device quarantine.\", \"type\": \"text\"}]}","{\"type\": \"heading\", \"content\": [{\"text\": \"6. Auditor Evidence Artefacts\", \"type\": \"text\"}]}","{\"type\": \"bulletList\", \"content\": [{\"type\": \"listItem\", \"content\": [{\"type\": \"paragraph\", \"content\": [{\"text\": \"AWS Config export reports and KMS key configuration screenshots\", \"type\": \"text\"}]}]}, {\"type\": \"listItem\", \"content\": [{\"type\": \"paragraph\", \"content\": [{\"text\": \"Daily anti-malware update logs and quarterly verification records\", \"type\": \"text\"}]}]}, {\"type\": \"listItem\", \"content\": [{\"type\": \"paragraph\", \"content\": [{\"text\": \"CMDB asset inventory exports with quarterly review annotations\", \"type\": \"text\"}]}]}, {\"type\": \"listItem\", \"content\": [{\"type\": \"paragraph\", \"content\": [{\"text\": \"Linear exception request and approval tickets\", \"type\": \"text\"}]}]}, {\"type\": \"listItem\", \"content\": [{\"type\": \"paragraph\", \"content\": [{\"text\": \"Incident reports and HR disciplinary action logs\", \"type\": \"text\"}]}]}, {\"type\": \"listItem\", \"content\": [{\"type\": \"paragraph\", \"content\": [{\"text\": \"Minutes from quarterly security review meetings\", \"type\": \"text\"}]}]}]}"]




In [463]:
# Configuration options
output_filename = None  # Set to None to auto-generate from document title, or specify a custom filename
organization_name = "Casper Studios"  # Change this to your organization name
custom_date = None  # Set to a specific date string or leave as None for today's date

# Generate the PDF
# If output_filename is None, it will use the first heading as the filename
# If you selected a policy from the source file, it will include name and description
if 'selected_policy' in globals() and selected_policy:
    generate_pdf(
        json_data=json_input,
        output_filename=output_filename,
        organization_name=organization_name,
        custom_date=custom_date,
        policy_name=selected_policy.get('name'),
        policy_description=selected_policy.get('description')
    )
else:
    generate_pdf(
        json_data=json_input,
        output_filename=output_filename,
        organization_name=organization_name,
        custom_date=custom_date
    )


UnboundLocalError: local variable 're' referenced before assignment

In [446]:
# Alternative method: paste your entire JSON string here
json_string = '''
# PASTE YOUR JSON STRING HERE
# Just replace this comment with your JSON data
'''

# Uncomment the lines below to use this method
# generate_pdf(
#     json_data=json_string,
#     output_filename="output.pdf",
#     organization_name="Your Company",
#     custom_date=None
# )


In [447]:
# Generate PDFs for all policies in the source file
def generate_all_pdfs(organization_name="Casper Studios", custom_date=None):
    """Generate PDFs for all policies loaded from source_data.txt"""
    import re  # For filename sanitization
    
    if not policies_data:
        print("❌ No policies loaded. Please run the data loading cell first.")
        return
    
    print(f"🚀 Starting batch PDF generation for {len(policies_data)} policies...\n")
    
    successful = 0
    failed = 0
    
    for i, policy in enumerate(policies_data):
        try:
            print(f"Processing {i+1}/{len(policies_data)}: {policy['name']}")
            
            # Generate filename from policy name instead of document content
            policy_filename = re.sub(r'[^\w\s-]', '', policy['name'].lower())
            policy_filename = re.sub(r'[-\s]+', '_', policy_filename)
            policy_filename = f"{policy_filename}.pdf"
            
            # Generate PDF with policy name as filename
            generate_pdf(
                json_data=policy['content'],
                output_filename=policy_filename,
                organization_name=organization_name,
                custom_date=custom_date,
                policy_name=policy['name'],
                policy_description=policy['description']
            )
            
            successful += 1
            print("")  # Add blank line between policies
            
        except Exception as e:
            print(f"❌ Failed to generate PDF for {policy['name']}: {str(e)}\n")
            failed += 1
    
    print(f"\n{'='*60}")
    print(f"✅ Batch processing complete!")
    print(f"   Successful: {successful}")
    print(f"   Failed: {failed}")
    print(f"   Total: {len(policies_data)}")
    print(f"{'='*60}")

# ⚡ GENERATE ALL PDFs AT ONCE ⚡
# Uncomment and run ONE of the options below to generate PDFs for ALL policies:

# Option 1: Use default settings
# generate_all_pdfs()

# Option 2: Custom organization name
# generate_all_pdfs(organization_name="Your Company Name")

# Option 3: Custom organization and date
# generate_all_pdfs(organization_name="Casper Studios", custom_date="2024-01-01")


In [448]:
# Run this cell to generate ALL PDFs at once!
generate_all_pdfs()

🚀 Starting batch PDF generation for 19 policies...

Processing 1/19: Business Continuity Policy
Generating PDF: pdfs/business_continuity_policy.pdf
✅ PDF generated successfully: pdfs/business_continuity_policy.pdf

Processing 2/19: Information Security Program
Generating PDF: pdfs/information_security_program.pdf
✅ PDF generated successfully: pdfs/information_security_program.pdf

Processing 3/19: Capacity & Performance Management
Generating PDF: pdfs/capacity_performance_management.pdf
✅ PDF generated successfully: pdfs/capacity_performance_management.pdf

Processing 4/19: Change Management Policy 
Generating PDF: pdfs/change_management_policy_.pdf
✅ PDF generated successfully: pdfs/change_management_policy_.pdf

Processing 5/19: Encryption & Cryptographic Control Policy
Generating PDF: pdfs/encryption_cryptographic_control_policy.pdf
✅ PDF generated successfully: pdfs/encryption_cryptographic_control_policy.pdf

Processing 6/19: Logging Policy
Generating PDF: pdfs/logging_policy.pdf


In [449]:
# Let's debug the list parsing issue
# Check what the actual content looks like for lists

# Select Information Protection Policy (index 13)
policy_index = 13
selected_policy = policies_data[policy_index]
print(f"Analyzing: {selected_policy['name']}\n")

# Parse the JSON and look for lists
import json
for i, json_str in enumerate(selected_policy['content'][:10]):  # First 10 items
    try:
        node = json.loads(json_str)
        if node.get('type') in ['orderedList', 'bulletList']:
            print(f"\nFound {node['type']} at index {i}:")
            for j, item in enumerate(node.get('content', [])):
                if item.get('type') == 'listItem':
                    for para in item.get('content', []):
                        if para.get('type') == 'paragraph':
                            text = ''.join([t.get('text', '') for t in para.get('content', [])])
                            print(f"  Item {j}: {text[:80]}...")
    except:
        pass


Analyzing: Information Protection Policy


Found orderedList at index 1:
  Item 0: 1. Document Content Page...
  Item 1: 2. Executive Summary...
  Item 2: 3. Applicability and Scope...
  Item 3: 4. Controls...
  Item 4: 5. Exceptions Process...
  Item 5: 6. Violations and Disciplinary Action...
  Item 6: 7. Auditor Evidence Artefacts...

Found orderedList at index 9:
  Item 0: Retention periods must be defined per data classification (e.g., employee record...
  Item 1: Media decommissioning (e.g., AWS EBS volumes, local disk images) must employ cry...


In [450]:
# Let's look at a specific bulletList example
# Find the Table of Contents which seems to be the problematic list
import json

# Look for the Table of Contents in Information Protection Policy
policy_index = 13
selected_policy = policies_data[policy_index]

for i, json_str in enumerate(selected_policy['content'][:5]):
    try:
        node = json.loads(json_str)
        print(f"\nNode {i}: {node.get('type')}")
        if node.get('type') == 'heading':
            text = ''.join([t.get('text', '') for t in node.get('content', [])[0].get('content', [])])
            print(f"  Heading: {text}")
        elif node.get('type') == 'bulletList':
            print("  BulletList items:")
            for item in node.get('content', []):
                if item.get('type') == 'listItem':
                    for para in item.get('content', []):
                        if para.get('type') == 'paragraph':
                            text = ''.join([t.get('text', '') for t in para.get('content', [])])
                            print(f"    - '{text}'")
    except Exception as e:
        print(f"  Error: {e}")



Node 0: heading
  Heading: 

Node 1: orderedList

Node 2: heading
  Heading: 

Node 3: paragraph

Node 4: heading
  Heading: 


In [None]:
# Test the fixed list parsing with Information Protection Policy
policy_index = 13  # Information Protection Policy
selected_policy = policies_data[policy_index]

print(f"Regenerating PDF for: {selected_policy['name']}")

# Generate with the fixed code
generate_pdf(
    json_data=selected_policy['content'],
    output_filename="information_protection_policy_fixed.pdf",
    organization_name="Casper Studios",
    custom_date=None,
    policy_name=selected_policy['name'],
    policy_description=selected_policy['description']
)


In [None]:
# Regenerate all PDFs with the fixed list parsing
print("🔧 Regenerating all PDFs with fixed list parsing and improved spacing...\n")
generate_all_pdfs()


In [451]:
# 🔄 REGENERATE ALL PDFs with enhanced layout (includes policy name & description)
# Uncomment and run to regenerate all PDFs with the new format:
# generate_all_pdfs()


In [452]:
# List all generated PDFs
import os
pdf_dir = "pdfs"
if os.path.exists(pdf_dir):
    pdf_files = [f for f in os.listdir(pdf_dir) if f.endswith('.pdf')]
    print(f"📚 Generated PDFs in {pdf_dir}/ directory:")
    print("-" * 60)
    for pdf in sorted(pdf_files):
        print(f"  ✓ {pdf}")
    print("-" * 60)
    print(f"Total: {len(pdf_files)} PDFs")
else:
    print(f"❌ No {pdf_dir}/ directory found. Run generate_all_pdfs() first!")


📚 Generated PDFs in pdfs/ directory:
------------------------------------------------------------
  ✓ acceptable_use_policy.pdf
  ✓ access_control_policy.pdf
  ✓ asset_management_policy.pdf
  ✓ business_continuity_policy.pdf
  ✓ capacity_performance_management.pdf
  ✓ change_management_policy_.pdf
  ✓ encryption_cryptographic_control_policy.pdf
  ✓ endpoint_protection_policy.pdf
  ✓ executive_summary.pdf
  ✓ incident_response_policy.pdf
  ✓ information_protection_policy.pdf
  ✓ information_security_program.pdf
  ✓ logging_policy.pdf
  ✓ physical_security_policy.pdf
  ✓ privacy_policy.pdf
  ✓ risk_management_policy_.pdf
  ✓ secure_development_policy.pdf
  ✓ security_awareness_training_policy.pdf
  ✓ third_party_risk_management_policy.pdf
  ✓ vulnerability_management_policy.pdf
------------------------------------------------------------
Total: 20 PDFs


In [453]:
# Run this cell to generate ALL PDFs at once!
generate_all_pdfs()

🚀 Starting batch PDF generation for 19 policies...

Processing 1/19: Business Continuity Policy
Generating PDF: pdfs/business_continuity_policy.pdf
✅ PDF generated successfully: pdfs/business_continuity_policy.pdf

Processing 2/19: Information Security Program
Generating PDF: pdfs/information_security_program.pdf


✅ PDF generated successfully: pdfs/information_security_program.pdf

Processing 3/19: Capacity & Performance Management
Generating PDF: pdfs/capacity_performance_management.pdf
✅ PDF generated successfully: pdfs/capacity_performance_management.pdf

Processing 4/19: Change Management Policy 
Generating PDF: pdfs/change_management_policy_.pdf
✅ PDF generated successfully: pdfs/change_management_policy_.pdf

Processing 5/19: Encryption & Cryptographic Control Policy
Generating PDF: pdfs/encryption_cryptographic_control_policy.pdf
✅ PDF generated successfully: pdfs/encryption_cryptographic_control_policy.pdf

Processing 6/19: Logging Policy
Generating PDF: pdfs/logging_policy.pdf
✅ PDF generated successfully: pdfs/logging_policy.pdf

Processing 7/19: Physical Security Policy
Generating PDF: pdfs/physical_security_policy.pdf
✅ PDF generated successfully: pdfs/physical_security_policy.pdf

Processing 8/19: Vulnerability Management Policy
Generating PDF: pdfs/vulnerability_management_policy.p

In [454]:
# Run this cell to generate ALL PDFs at once!
generate_all_pdfs()

🚀 Starting batch PDF generation for 19 policies...

Processing 1/19: Business Continuity Policy
Generating PDF: pdfs/business_continuity_policy.pdf
✅ PDF generated successfully: pdfs/business_continuity_policy.pdf

Processing 2/19: Information Security Program
Generating PDF: pdfs/information_security_program.pdf
✅ PDF generated successfully: pdfs/information_security_program.pdf

Processing 3/19: Capacity & Performance Management
Generating PDF: pdfs/capacity_performance_management.pdf
✅ PDF generated successfully: pdfs/capacity_performance_management.pdf

Processing 4/19: Change Management Policy 
Generating PDF: pdfs/change_management_policy_.pdf
✅ PDF generated successfully: pdfs/change_management_policy_.pdf

Processing 5/19: Encryption & Cryptographic Control Policy
Generating PDF: pdfs/encryption_cryptographic_control_policy.pdf
✅ PDF generated successfully: pdfs/encryption_cryptographic_control_policy.pdf

Processing 6/19: Logging Policy
Generating PDF: pdfs/logging_policy.pdf


In [455]:
# Run this cell to generate ALL PDFs at once!
generate_all_pdfs()

🚀 Starting batch PDF generation for 19 policies...

Processing 1/19: Business Continuity Policy
Generating PDF: pdfs/business_continuity_policy.pdf
✅ PDF generated successfully: pdfs/business_continuity_policy.pdf

Processing 2/19: Information Security Program
Generating PDF: pdfs/information_security_program.pdf
✅ PDF generated successfully: pdfs/information_security_program.pdf

Processing 3/19: Capacity & Performance Management
Generating PDF: pdfs/capacity_performance_management.pdf
✅ PDF generated successfully: pdfs/capacity_performance_management.pdf

Processing 4/19: Change Management Policy 
Generating PDF: pdfs/change_management_policy_.pdf
✅ PDF generated successfully: pdfs/change_management_policy_.pdf

Processing 5/19: Encryption & Cryptographic Control Policy
Generating PDF: pdfs/encryption_cryptographic_control_policy.pdf
✅ PDF generated successfully: pdfs/encryption_cryptographic_control_policy.pdf

Processing 6/19: Logging Policy
Generating PDF: pdfs/logging_policy.pdf


In [None]:
generate_all_pdfs()

🚀 Starting batch PDF generation for 19 policies...

Processing 1/19: Business Continuity Policy
Generating PDF: pdfs/business_continuity_policy.pdf
✅ PDF generated successfully: pdfs/business_continuity_policy.pdf

Processing 2/19: Information Security Program
Generating PDF: pdfs/information_security_program.pdf
✅ PDF generated successfully: pdfs/information_security_program.pdf

Processing 3/19: Capacity & Performance Management
Generating PDF: pdfs/capacity_performance_management.pdf
✅ PDF generated successfully: pdfs/capacity_performance_management.pdf

Processing 4/19: Change Management Policy 
Generating PDF: pdfs/change_management_policy_.pdf
✅ PDF generated successfully: pdfs/change_management_policy_.pdf

Processing 5/19: Encryption & Cryptographic Control Policy
Generating PDF: pdfs/encryption_cryptographic_control_policy.pdf
✅ PDF generated successfully: pdfs/encryption_cryptographic_control_policy.pdf

Processing 6/19: Logging Policy
Generating PDF: pdfs/logging_policy.pdf
