In [6]:
import boto3
import hashlib
import string
import os
import json
import re
# Directory tree that is walked for theme source files (.txt) and their generated .json files.
root_dir = '../Themes'
# Directory where synthesized .mp3 files are stored, named by line hash.
audio_dir = '../audio'

In [None]:
# Convert legacy tab-separated theme files (*.txt) into JSON lists of audio entries.
# Each valid row has four columns: difficulty, line without the pet name,
# line with the pet name, and a flag that marks the row as dominant-specific ("dom").
print(os.listdir(root_dir))


def _audio_entry(text, theme_name, dom_name, subject_name, level):
    """Build the JSON record for one spoken line (key order matches the output files)."""
    return {
        "type": "audio",
        "line": text,
        "theme": theme_name,
        "dominant": dom_name,
        "subject": subject_name,
        "difficulty": level
    }


for folder, _, names in os.walk(root_dir):
    for name in names:
        if not name.endswith('.txt'):
            continue
        records = []
        seen_plain = set()   # lines already emitted without a subject
        seen_named = set()   # lines already emitted with the "Bambi" subject
        theme = os.path.splitext(name)[0]
        txt_path = os.path.join(folder, name)
        try:
            with open(txt_path, 'r', encoding='utf-8') as src:
                for row_no, raw in enumerate(src, 1):
                    row = raw.strip()
                    # Blank rows and the description header carry no entries.
                    if not row or row.startswith('Description:'):
                        continue
                    cols = row.split('\t')
                    if len(cols) != 4:
                        print(f"Warning: Line {row_no} in {name} does not have 4 columns.")
                        continue
                    level, plain_text, named_text, dom_flag = cols
                    dom_name = "Master" if dom_flag.lower() == "dom" else None
                    # Emit the subject-less variant first, then the "Bambi" variant,
                    # each de-duplicated within its own set.
                    variants = (
                        (plain_text, None, seen_plain),
                        (named_text, "Bambi", seen_named),
                    )
                    for text, subject_name, seen in variants:
                        if text in seen:
                            print(f"Warning: Duplicate entry found in {name}: {text}")
                            continue
                        records.append(_audio_entry(text, theme, dom_name, subject_name, level))
                        seen.add(text)
        except Exception as err:
            print(f"Error processing file {name}: {err}")
        # Whatever was collected (even after an error) is written next to the source file.
        if records:
            json_path = os.path.join(folder, f'{theme}.json')
            with open(json_path, 'w', encoding='utf-8') as sink:
                json.dump(records, sink, ensure_ascii=False, indent=4)
        else:
            print(f"No valid entries found in {name}")

In [19]:
# For every entry whose dominant is "Master", add a "Mistress" variant of the line,
# then rewrite each theme JSON file with exact-duplicate lines removed.
#
# NOTE: the pronoun swap is purely textual, so it would also rewrite pronouns that
# refer to the subject. Keeping subject pronouns neutral (they/them) avoids that.
for subdir, _, files in os.walk(root_dir):
    for file in files:
        if file.endswith('.json'):
            file_path = os.path.join(subdir, file)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                # Collect the variants separately so the list is not mutated
                # while it is being iterated.
                mistress_entries = []
                for entry in data:
                    # .get(): an entry without a 'dominant' key is simply not converted
                    # instead of aborting the whole file with a KeyError.
                    if entry.get('dominant') != "Master":
                        continue
                    line = entry.get('line', '')
                    new_entry = entry.copy()
                    new_line = re.sub(r'\bmaster\b', 'Mistress', line, flags=re.IGNORECASE)
                    new_line = re.sub(r'\bhis\b', 'her', new_line, flags=re.IGNORECASE)
                    new_line = re.sub(r'\bhe\b', 'she', new_line, flags=re.IGNORECASE)
                    new_line = re.sub(r'\bhim\b', 'her', new_line, flags=re.IGNORECASE)
                    new_entry['line'] = new_line
                    new_entry['dominant'] = "Mistress"
                    mistress_entries.append(new_entry)
                data.extend(mistress_entries)

                # Keep only the first entry for each distinct line. Because the originals
                # precede the generated variants, re-running this cell does not accumulate
                # repeated Mistress entries.
                print(f"Data is {len(data)} lines")
                seen = set()
                deduplicated_data = []
                for item in data:
                    item_line = item.get('line')
                    if item_line is None:
                        # No text to compare on; keep the entry untouched.
                        deduplicated_data.append(item)
                    elif item_line not in seen:
                        deduplicated_data.append(item)
                        seen.add(item_line)
                    else:
                        print("duplicate line: " + item_line)
                print(f"Deduplicate is {len(deduplicated_data)}")

                # Write the de-duplicated list (previously the un-deduplicated `data`
                # was written, so the reported duplicates were still saved).
                with open(file_path, 'w', encoding='utf-8') as f:
                    json.dump(deduplicated_data, f, ensure_ascii=False, indent=4)
            except Exception as e:
                print(f"Error processing file {file}: {e}")

Data is 974 lines
Deduplicate is 974
Data is 814 lines
Deduplicate is 814
Data is 820 lines
Deduplicate is 820
Data is 886 lines
Deduplicate is 886
Data is 900 lines
Deduplicate is 900
Data is 876 lines
Deduplicate is 876
Data is 898 lines
Deduplicate is 898
Data is 856 lines
Deduplicate is 856
Data is 1026 lines
Deduplicate is 1026
Data is 900 lines
Deduplicate is 900
Data is 748 lines
Deduplicate is 748
Data is 862 lines
duplicate line: Bambi's approval is her strongest motivation.
Deduplicate is 861
Data is 862 lines
Deduplicate is 862


In [None]:
# Synthesize one MP3 per unique line with Amazon Polly, skipping lines whose
# audio file already exists in audio_dir. Files are named <sha256>.mp3, where the
# hash is taken over the line with punctuation and spaces removed, lower-cased.
polly = boto3.client('polly')

os.makedirs(audio_dir, exist_ok=True)

# Hashes (file stems) of the audio already on disk.
processed_lines = set()
for filename in os.listdir(audio_dir):
    if filename.endswith('.mp3'):
        processed_lines.add(os.path.splitext(filename)[0])

# Built once: deletes ASCII punctuation when normalizing a line into its key.
punctuation_table = str.maketrans('', '', string.punctuation)

for subdir, _, files in os.walk(root_dir):
    for file in files:
        if file.endswith('.json'):
            new_entries = 0
            file_path = os.path.join(subdir, file)
            theme = os.path.splitext(file)[0]
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                for entry in data:
                    line = entry.get('line', '')
                    if not line:
                        # Nothing to speak; the orphan check reports these as "Missing line".
                        continue
                    # Normalize the line to create a unique key
                    line_key = line.translate(punctuation_table).replace(' ', '').lower()
                    line_hash = hashlib.sha256(line_key.encode('utf-8')).hexdigest()
                    if line_hash in processed_lines:
                        continue
                    response = polly.synthesize_speech(
                        Text=line,
                        OutputFormat='mp3',
                        VoiceId='Salli'  # Standard voice, not neural
                    )
                    # Read the whole stream, then release it even if reading fails.
                    audio_stream = response['AudioStream']
                    try:
                        audio_bytes = audio_stream.read()
                    finally:
                        audio_stream.close()
                    # Write to a temporary name and rename afterwards, so an interrupted
                    # write never leaves a truncated .mp3 that a later run would treat
                    # as already processed.
                    audio_file_path = os.path.join(audio_dir, f"{line_hash}.mp3")
                    partial_path = audio_file_path + '.part'
                    with open(partial_path, 'wb') as audio_file:
                        audio_file.write(audio_bytes)
                    os.replace(partial_path, audio_file_path)
                    processed_lines.add(line_hash)
                    # Count only files that were actually written.
                    new_entries += 1
                if new_entries:
                    print(f"Processed: {theme} with {new_entries} new entries")
            except Exception as e:
                print(f"Error processing file {file}: {e}")

In [None]:
# Report JSON entries that have no text, or whose text has no matching audio file.
# The set of available audio is read from audio_dir, so this cell works on a fresh
# kernel and always reflects what is currently on disk.
audio_hashes = set()
if os.path.isdir(audio_dir):
    for filename in os.listdir(audio_dir):
        if filename.endswith('.mp3'):
            audio_hashes.add(os.path.splitext(filename)[0])

# Built once: deletes ASCII punctuation when normalizing a line into its key.
punctuation_table = str.maketrans('', '', string.punctuation)

orphaned_lines = []

for subdir, _, files in os.walk(root_dir):
    for file in files:
        if file.endswith('.json'):
            file_path = os.path.join(subdir, file)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                for entry in data:
                    line = entry.get('line', '')
                    if not line:
                        orphaned_lines.append({'file': file, 'entry': entry, 'reason': 'Missing line'})
                        continue
                    # Same normalization and hash as used when naming the audio files:
                    # punctuation and spaces removed, lower-cased, SHA-256 hex digest.
                    line_key = line.translate(punctuation_table).replace(' ', '').lower()
                    line_hash = hashlib.sha256(line_key.encode('utf-8')).hexdigest()
                    if line_hash not in audio_hashes:
                        orphaned_lines.append({'file': file, 'line': line, 'reason': 'No audio associated'})
            except Exception as e:
                print(f"Error processing file {file}: {e}")

if orphaned_lines:
    print("The following entries are orphaned:")
    for orphan in orphaned_lines:
        # 'Missing line' records carry no 'line' key, hence the .get() default.
        print(f"File: {orphan['file']}, Line: {orphan.get('line', '')}, Reason: {orphan['reason']}")
else:
    print("No orphaned or abandoned lines found.")