In [6]:
import os
import json
import re
# Notebook configuration: both paths are relative to the kernel's working directory.
# root_dir is the tree that the conversion cells walk for theme files.
# audio_dir is not referenced by the cells visible here -- presumably used further down; verify.
root_dir, audio_dir = '../Themes', '../audio'

In [None]:
# Convert legacy tab-separated theme files (*.txt) under root_dir into JSON files
# written next to them as <theme>.json.
#
# Every usable row has exactly four tab-separated columns:
#   difficulty, line without pet name, line with pet name, dominant flag
# A flag equal to "dom" (case-insensitive) tags the row with dominant "Master";
# any other flag leaves dominant as None. Each row yields up to two entries,
# one per line variant; a variant already seen in the same file is reported and skipped.
print(os.listdir(root_dir))

for subdir, _, files in os.walk(root_dir):
    for file in files:
        if not file.endswith('.txt'):
            continue

        file_path = os.path.join(subdir, file)
        theme = os.path.splitext(file)[0]
        output = []
        unique_no_sub, unique_sub = set(), set()

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                for line_number, line in enumerate(f, 1):
                    line = line.strip()
                    # Blank rows and the "Description:" header row carry no entries.
                    if line == '' or line.startswith('Description:'):
                        continue

                    parts = line.split('\t')
                    if len(parts) != 4:
                        print(f"Warning: Line {line_number} in {file} does not have 4 columns.")
                        continue
                    difficulty, line_no_sub, line_with_sub, dom_flag = parts

                    dominant = None
                    if dom_flag.lower() == "dom":
                        dominant = "Master"

                    # (text, subject, per-file set of texts already emitted for that variant);
                    # the variant without a pet name is always handled first.
                    variants = (
                        (line_no_sub, None, unique_no_sub),
                        (line_with_sub, "Bambi", unique_sub),
                    )
                    for text, subject, already_seen in variants:
                        if text in already_seen:
                            print(f"Warning: Duplicate entry found in {file}: {text}")
                            continue
                        output.append({
                            "type": "audio",
                            "line": text,
                            "theme": theme,
                            "dominant": dominant,
                            "subject": subject,
                            "difficulty": difficulty,
                        })
                        already_seen.add(text)
        except Exception as e:
            # Report the failure; entries collected before it are still written below.
            print(f"Error processing file {file}: {e}")

        if not output:
            print(f"No valid entries found in {file}")
            continue

        with open(os.path.join(subdir, f'{theme}.json'), 'w', encoding='utf-8') as outfile:
            json.dump(output, outfile, ensure_ascii=False, indent=4)

In [28]:
# For every theme JSON file under root_dir: add a "Mistress" variant of each
# "Master" entry, normalise curly apostrophes to straight ones, drop entries whose
# (normalised) 'line' text was already seen, and rewrite the file in place.
# Re-running the cell is safe: variants that already exist are removed as duplicates.

def _match_case(replacement):
    """Return a re.sub callback that emits `replacement` in the matched word's case.

    "he" -> "she", "He" -> "She", "HE" -> "SHE". Without this, a case-insensitive
    substitution would lower-case sentence-initial pronouns.
    """
    def _substitute(match):
        word = match.group(0)
        if len(word) > 1 and word.isupper():
            return replacement.upper()
        if word[0].isupper():
            return replacement.capitalize()
        return replacement
    return _substitute


# Whole-word masculine pronouns and their feminine replacements.
_PRONOUN_SWAPS = (
    (re.compile(r'\bhis\b', re.IGNORECASE), 'her'),
    (re.compile(r'\bhe\b', re.IGNORECASE), 'she'),
    (re.compile(r'\bhim\b', re.IGNORECASE), 'her'),
)


def _to_mistress_line(line):
    """Return `line` with "Master" and masculine pronouns swapped for feminine ones.

    The title is always written as "Mistress" regardless of the matched case;
    pronouns keep the capitalisation of the word they replace.
    Note: this would probably break if the subject has gendered pronouns;
    staying neutral (they/them) for the subject is probably safer for consistency.
    """
    line = re.sub(r'\bmaster\b', 'Mistress', line, flags=re.IGNORECASE)
    for pattern, pronoun in _PRONOUN_SWAPS:
        line = pattern.sub(_match_case(pronoun), line)
    return line


for subdir, _, files in os.walk(root_dir):
    for file in files:
        if file.endswith('.json'):
            file_path = os.path.join(subdir, file)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                # Iterate over a snapshot: the loop appends to `data`, and the new
                # entries must never be revisited.
                for entry in list(data):
                    if entry['dominant'] == "Master":
                        new_entry = entry.copy()
                        new_entry['line'] = _to_mistress_line(entry.get('line', ''))
                        new_entry['dominant'] = "Mistress"
                        data.append(new_entry)

                # Simple de-duplication: keep the first entry for each distinct line.
                # The comparison uses the apostrophe-normalised text, so a curly-quote
                # variant is recognised as a duplicate no matter which form came first.
                # Files written by an earlier run may still hold lower-cased pronoun
                # variants; those are different strings and are not merged here.
                print(f"Data is {len(data)} lines")
                seen = set()
                deduplicated_data = []
                for item in data:
                    normalized = item['line'].replace('’', "'")
                    if normalized in seen:
                        print("duplicate line: " + item['line'])
                    else:
                        seen.add(normalized)
                        item['line'] = normalized
                        deduplicated_data.append(item)
                print(f"Deduplicate is {len(deduplicated_data)}")

                with open(file_path, 'w', encoding='utf-8') as f:
                    json.dump(deduplicated_data, f, ensure_ascii=False, indent=4)
            except Exception as e:
                # A malformed file (bad JSON, missing keys) is reported and left untouched.
                print(f"Error processing file {file}: {e}")

Data is 1372 lines
duplicate line: I trust Mistress's version of events completely.
duplicate line: Bambi trusts Mistress's version of events completely.
duplicate line: My memory is hazy; Mistress knows the truth.
duplicate line: Bambi's memory is hazy; Mistress knows the truth.
duplicate line: Mistress's words untangle my confusion.
duplicate line: Mistress's words untangle Bambi's confusion.
duplicate line: I feel secure when Mistress explains things to me.
duplicate line: Bambi feels secure when Mistress explains things to her.
duplicate line: Mistress helps me see the world as it really is.
duplicate line: Mistress helps Bambi see the world as it really is.
duplicate line: My thoughts twist and turn; Mistress's clarity is a blessing.
duplicate line: Bambi's thoughts twist and turn; Mistress's clarity is a blessing.
duplicate line: Mistress's truth is the only stable ground I have.
duplicate line: Mistress's truth is the only stable ground Bambi has.
duplicate line: My perceptions 