In [1]:
import boto3
import hashlib
import string
import os
import json
import re
# Directory that is walked recursively for theme `*.json` files.
# Relative paths resolve against the current working directory.
root_dir = '../Themes'
# Directory where one `<sha256>.mp3` file per synthesized line is stored.
audio_dir = '../audio'

In [None]:
# Synthesize one MP3 per unique theme line with Amazon Polly.
#
# Every `*.json` file under `root_dir` is expected to hold a JSON array of
# objects with a 'line' string. Each line is normalized (ASCII punctuation and
# spaces removed, lower-cased) and hashed with SHA-256; the audio is stored as
# `<audio_dir>/<hash>.mp3`. Lines whose hash already has an MP3 are skipped, so
# re-running the cell only calls Polly for new lines.
polly = boto3.client('polly')

os.makedirs(audio_dir, exist_ok=True)

# Hashes that already have audio on disk (file name without the extension).
processed_lines = {
    os.path.splitext(filename)[0]
    for filename in os.listdir(audio_dir)
    if filename.endswith('.mp3')
}

# Deletion table for ASCII punctuation; built once instead of once per line.
punctuation_table = str.maketrans('', '', string.punctuation)

for subdir, _, files in os.walk(root_dir):
    for file in files:
        if not file.endswith('.json'):
            continue
        new_entries = 0
        file_path = os.path.join(subdir, file)
        theme = os.path.splitext(file)[0]
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            if not isinstance(data, list):
                raise ValueError(
                    f"expected a JSON array of entries, got {type(data).__name__}"
                )
            for entry in data:
                line = entry.get('line', '') if isinstance(entry, dict) else ''
                # Nothing to speak: do not spend a Polly request on an entry
                # with no usable text.
                if not isinstance(line, str) or not line:
                    continue
                # Normalize the line to create a unique key
                line_key = line.translate(punctuation_table).replace(' ', '').lower()
                line_hash = hashlib.sha256(line_key.encode('utf-8')).hexdigest()
                if line_hash in processed_lines:
                    continue
                audio_file_path = os.path.join(audio_dir, f"{line_hash}.mp3")
                # Write to a sibling temp file first. Its name does not end in
                # '.mp3', so an interrupted write is never mistaken for
                # finished audio by the directory scan above.
                temp_path = audio_file_path + '.tmp'
                try:
                    response = polly.synthesize_speech(
                        Text=line,
                        OutputFormat='mp3',
                        VoiceId='Salli'  # Standard voice, not neural
                    )
                    stream = response['AudioStream']
                    try:
                        with open(temp_path, 'wb') as audio_file:
                            audio_file.write(stream.read())
                    finally:
                        stream.close()
                    # Atomic rename: the final name only ever holds a complete file.
                    os.replace(temp_path, audio_file_path)
                except Exception as e:
                    # One bad line (e.g. text too long, throttling) must not
                    # block the remaining lines of this theme.
                    if os.path.exists(temp_path):
                        os.remove(temp_path)
                    print(f"Error synthesizing line in {file}: {e}")
                    continue
                processed_lines.add(line_hash)
                new_entries += 1  # count only audio that was actually saved
            if new_entries:
                print(f"Processed: {theme} with {new_entries} new entries")
        except Exception as e:
            print(f"Error processing file {file}: {e}")

In [None]:
# Audit pass: list every theme entry that has no audio.
#
# Re-walks the `*.json` files under `root_dir` and records an orphan for each
# entry that either has no usable 'line' text or whose line hash is absent
# from `processed_lines` (the set of hashes built by the synthesis cell).
orphaned_lines = []

# Deletion table for ASCII punctuation; built once instead of once per line.
punctuation_table = str.maketrans('', '', string.punctuation)

for subdir, _, files in os.walk(root_dir):
    for file in files:
        if not file.endswith('.json'):
            continue
        file_path = os.path.join(subdir, file)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            if not isinstance(data, list):
                raise ValueError(
                    f"expected a JSON array of entries, got {type(data).__name__}"
                )
            for entry in data:
                # A malformed entry (not an object, or a non-string 'line') is
                # reported as an orphan instead of raising, so it cannot hide
                # the remaining entries of this file from the report.
                line = entry.get('line', '') if isinstance(entry, dict) else ''
                if not isinstance(line, str) or not line:
                    orphaned_lines.append({'file': file, 'entry': entry, 'reason': 'Missing line'})
                    continue
                # Same normalization and hash as the synthesis cell.
                line_key = line.translate(punctuation_table).replace(' ', '').lower()
                line_hash = hashlib.sha256(line_key.encode('utf-8')).hexdigest()
                if line_hash not in processed_lines:
                    orphaned_lines.append({'file': file, 'line': line, 'reason': 'No audio associated'})
        except Exception as e:
            print(f"Error processing file {file}: {e}")

if orphaned_lines:
    print("The following entries are orphaned:")
    for orphan in orphaned_lines:
        print(f"File: {orphan['file']}, Line: {orphan.get('line', '')}, Reason: {orphan['reason']}")
else:
    print("No orphaned or abandoned lines found.")