In [283]:
import json
import os
import re
import shutil
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

import pandas as pd
from IPython import get_ipython

In [284]:
def get_script_directory():
    """
    Return the directory that contains the running notebook/script, as a string.

    Resolution order:
    1. Inside a Jupyter kernel: the notebook path published by VS Code, if present.
    2. Inside a Jupyter kernel: the notebook path reported by a running classic
       Jupyter server whose session uses this kernel.
    3. Inside a Jupyter kernel: the current working directory (also used when any
       of the lookups above fails).
    4. Outside a kernel: the directory of this file, or the current working
       directory when ``__file__`` is not defined (e.g. an interactive session).

    Returns:
        str: The resolved directory path.
    """
    # If running in Jupyter
    if 'ipykernel' in sys.modules:
        try:
            shell = get_ipython()

            # 1. VS Code's notebook path. The original attribute check is kept;
            #    the user-namespace lookup is added on the assumption that the
            #    extension publishes the path as a kernel global — TODO confirm.
            vsc_file = getattr(shell, '__vsc_ipynb_file__', None)
            if vsc_file is None:
                vsc_file = getattr(shell, 'user_ns', {}).get('__vsc_ipynb_file__')
            if vsc_file:
                return str(Path(vsc_file).parent)

            # 2. Ask the running Jupyter servers which notebook owns this kernel.
            from notebook.notebookapp import list_running_servers
            servers = list_running_servers()
            if servers:
                import requests
                from urllib.parse import urljoin
                kernel_id = Path(shell.config['IPKernelApp']['connection_file']).stem.replace('kernel-', '')
                for server in servers:
                    # The timeout keeps an unresponsive server from hanging the cell;
                    # a timeout error falls through to the cwd fallback below.
                    sessions = requests.get(
                        urljoin(server['url'], 'api/sessions'),
                        params={'token': server.get('token', '')},
                        timeout=5,
                    ).json()
                    for session in sessions:
                        if session['kernel']['id'] == kernel_id:
                            return str(Path(server['notebook_dir']) / Path(session['notebook']['path']).parent)

            # 3. Fallback to current working directory
            return str(Path.cwd())
        except Exception:
            # Best effort: any lookup failure (missing package, network error,
            # unexpected payload) degrades to the working directory.
            # `Exception` rather than a bare except, so Ctrl-C still interrupts.
            return str(Path.cwd())

    # If running as a Python script
    try:
        return str(Path(__file__).parent.resolve())
    except NameError:
        # __file__ is undefined in interactive interpreters.
        return str(Path.cwd())

In [285]:
# Function to delete all files and folders in path except .obsidian
def delete_all_except_obsidian(path):
    """
    Delete every file and folder directly inside `path`, keeping only the
    `.obsidian` folder (and everything in it).

    Args:
        path (str | os.PathLike): Directory to clean.

    Returns:
        None

    Notes:
        - A `path` that does not exist (or is not a directory) is treated as
          "nothing to delete" instead of raising.
        - Symbolic links are removed as links; their targets are never entered
          or deleted.
    """
    root = Path(path)
    if not root.is_dir():
        return

    for child in root.iterdir():
        # Keep the Obsidian settings folder untouched.
        if child.name == '.obsidian' and child.is_dir():
            continue

        if child.is_symlink() or child.is_file():
            # unlink() removes the link itself, never what it points to.
            child.unlink()
        elif child.is_dir():
            shutil.rmtree(child)

In [286]:
def load_json(json_path):
    """
    Read a JSON file and return the parsed content.

    The file is opened as UTF-8 explicitly so that non-ASCII text is decoded
    the same way on every platform instead of depending on the locale default.

    Args:
        json_path (str | os.PathLike): Path of the JSON file.

    Returns:
        The deserialized JSON value (whatever `json.load` produces for the file).
    """
    with open(json_path, 'r', encoding='utf-8') as file:
        return json.load(file)

In [287]:
def construct_url_dict(gid_dict, url_template):
    """
    Build the CSV-export URL of every sheet in one spreadsheet.

    The edit-link placeholder in `url_template` is swapped for an export query
    that carries the sheet's gid. A template without the placeholder is
    returned unchanged for every sheet.

    Args:
        gid_dict (dict): Sheet name -> gid.
        url_template (str): Spreadsheet link containing the placeholder.

    Returns:
        dict: Sheet name -> export URL.
    """
    placeholder = "edit?gid=gid_value#gid=gid_value"
    return {
        sheet_name: url_template.replace(placeholder, f"export?format=csv&gid={sheet_gid}")
        for sheet_name, sheet_gid in gid_dict.items()
    }

In [288]:
def construct_master_url_dict(sheets_dict):
    """
    Build the export URLs for every sheet of every spreadsheet.

    Args:
        sheets_dict (dict): Spreadsheet name -> dict with the keys
            'sheets' (sheet name -> gid) and 'link_template' (link with placeholder).

    Returns:
        dict: Spreadsheet name -> {sheet name -> export URL}.
    """
    return {
        spreadsheet: construct_url_dict(definition['sheets'], definition['link_template'])
        for spreadsheet, definition in sheets_dict.items()
    }

In [289]:
def row_first_value(row):
    """
    Return the first cell of `row` as a stripped string, or None when it is missing.

    Args:
        row (pd.Series): A row whose first positional element is inspected.

    Returns:
        str | None: Stripped text of the first cell, or None for NaN/NA.
    """
    head = row.iloc[0]
    if pd.notna(head):
        return str(head).strip()
    return None

def first_column_indecies(df):
    """
    Return the first column of `df` without its first row.

    Args:
        df (pd.DataFrame): Source frame.

    Returns:
        pd.Series: First-column values from the second row onward.
    """
    first_column = df.iloc[:, 0]
    return first_column.iloc[1:]

In [290]:
def initial_content_dict_from_url(url):
    """
    Read a CSV sheet and turn it into ``{first-column value: {column: value}}``.

    - Float columns whose non-missing values are all whole numbers are converted
      to the nullable 'Int64' dtype.
    - Rows with a missing first cell are skipped; missing cells are omitted.
    - Rows that share the same first-column value are merged into one entry.
    - When a column name is already taken inside an entry (duplicate headers or
      merged rows), the value is stored under ``name_2``, ``name_3``, ...

    Args:
        url: Anything `pd.read_csv` accepts (URL, path, or file-like object).

    Returns:
        dict: Entry key -> dict of column name -> cell value.
    """
    df = pd.read_csv(url)

    # Convert float columns to Int64 where appropriate
    for col in df.select_dtypes(include=['float64']):
        if df[col].dropna().apply(float.is_integer).all():
            df[col] = df[col].astype('Int64')

    # pandas renames repeated headers to "X.1", "X.2", ... Undo only that suffix,
    # and only when "X" really is a header, so a dotted name such as
    # "Approx. cost" is no longer truncated at its first dot.
    known_headers = {str(column) for column in df.columns}
    base_names = []
    for column in df.columns:
        name = str(column)
        mangled = re.fullmatch(r'(.+)\.\d+', name)
        if mangled and mangled.group(1) in known_headers:
            name = mangled.group(1)
        base_names.append(name)

    content_dict = {}

    for _, row in df.iterrows():
        index_value = row.iloc[0]  # First column as key
        if pd.isna(index_value):
            continue

        entry = content_dict.setdefault(index_value, {})

        for base_col, val in zip(base_names[1:], row.iloc[1:]):
            if pd.isna(val):
                continue

            # Find next available column name
            col_name = base_col
            suffix = 1
            while col_name in entry:
                suffix += 1
                col_name = f"{base_col}_{suffix}"

            entry[col_name] = val

    return content_dict

In [291]:
def extract_filename(path: str) -> str:
    """
    Return the last non-empty '/'-separated segment of `path`.

    Repeated and trailing slashes are ignored; an empty string is returned
    when the path has no non-empty segment.
    """
    for segment in reversed(path.split('/')):
        if segment:
            return segment
    return ''

In [292]:
def extract_directory(path: str) -> str:
    """
    Return everything before the last '/' of `path`, ignoring trailing slashes.

    An empty string is returned when no slash is left after the trailing
    ones are removed.
    """
    # rpartition yields an empty separator when there is no '/' at all.
    head, separator, _ = path.rstrip('/').rpartition('/')
    return head if separator else ''


In [293]:
def remove_empty_lines(s: str) -> str:
    """
    Normalize a block of text line by line.

    Steps, in order:
      1. Delete every literal two-character sequence '/n' (slash + n).
      2. Split into lines and drop empty/whitespace-only lines at the END only;
         blank lines in the middle are kept.
      3. Strip leading whitespace from each remaining line.
      4. Join the lines with a newline character.

    Args:
        s (str): The input string.

    Returns:
        str: The cleaned string.
    """
    rows = s.replace("/n", "").splitlines()

    # Walk back from the end past every blank line.
    keep = len(rows)
    while keep > 0 and rows[keep - 1].strip() == "":
        keep -= 1

    return "\n".join(row.lstrip() for row in rows[:keep])

In [294]:
def clean_square_brackets(text):
    """
    Backslash-escape bracketed spans so Markdown does not read them as links.

    Every ``[...]`` span with at least one character inside, whose opening
    bracket is not already preceded by a backslash, becomes ``\\[...\\]``.

    Args:
        text (str): The input text containing square brackets.

    Returns:
        str: The text with escaped square brackets.
    """
    unescaped_span = re.compile(r'(?<!\\)\[([^\]]+)\]')

    def escape(match):
        # Re-wrap the captured inner text with escaped brackets.
        return '\\[' + match.group(1) + '\\]'

    return unescaped_span.sub(escape, text)


In [295]:
def clean_sprite_tags_and_brackets(text):
    """
    Strip ``<sprite name=...>`` tags, then escape square brackets.

    Args:
        text (str): The input text.

    Returns:
        str: Text without sprite tags and with bracketed spans escaped by
        `clean_square_brackets`.
    """
    without_sprites = re.sub(r'<sprite name=[^>]+>', '', text)
    return clean_square_brackets(without_sprites)

In [296]:
def clean_filename(filename: str) -> str:
    """
    Make a string usable as a file name on Windows.

    The characters < > : " / \\ | ? * are treated as invalid. Three
    substitutions run in order:
      1. Invalid characters (and whitespace around them) at the start are removed.
      2. Invalid characters (and whitespace around them) at the end are removed.
      3. Each remaining invalid character, together with the whitespace
         touching it, is replaced by one underscore.
    The result is finally passed through `remove_empty_lines`.

    Args:
        filename (str): The original filename string.

    Returns:
        str: The cleaned filename.
    """
    invalid = r'[<>:"/\\|?*]'

    # (pattern, replacement) pairs; the order matters because the edge
    # trimming must happen before interior characters become underscores.
    substitutions = (
        (rf'^\s*(?:{invalid}\s*)+', ''),
        (rf'(?:\s*{invalid})+\s*$', ''),
        (rf'\s*({invalid})\s*', '_'),
    )
    for pattern, replacement in substitutions:
        filename = re.sub(pattern, replacement, filename)

    return remove_empty_lines(filename)

In [297]:
def check_first_row_and_column_duplicates(df, detailed=False):
    """
    Report repeated values in the first column and repeated column headers.

    Args:
        df (pd.DataFrame): Input DataFrame
        detailed (bool): If True, return occurrence counts instead of flags.

    Returns:
        dict with the keys:
            'column_duplicates': bool, or (detailed) a Series counting each
                repeated first-column value.
            'header_duplicates': False when no header repeats; otherwise True,
                or (detailed) a Series counting each repeated header.
            'exact_header_duplicates': list of the repeated header names.
    """
    # First column (values)
    keys = df.iloc[:, 0]
    repeated_keys = keys[keys.duplicated(keep=False)]
    if detailed:
        column_report = repeated_keys.value_counts().sort_values(ascending=False)
    else:
        column_report = not repeated_keys.empty

    # Headers
    occurrences = pd.Series(df.columns).value_counts()
    repeated_headers = occurrences[occurrences > 1]

    header_report = False
    header_names = []
    if not repeated_headers.empty:
        header_names = repeated_headers.index.tolist()
        header_report = repeated_headers.sort_values(ascending=False) if detailed else True

    return {
        'column_duplicates': column_report,
        'header_duplicates': header_report,
        'exact_header_duplicates': header_names,
    }

In [298]:
def enhance_search_keys(search_keys):
    """
    Expand a list of search keys with simple variations.

    For every key:
    - A leading 'the' or 'a' (case-insensitive) is removed when it is directly
      followed by one or more '_' or ':' separators; the key is then stripped.
    - The key itself is kept, plus each non-empty part obtained by splitting
      it on '_' and ':'.
    - For a part ending in "'s" the part without "'s" is added; otherwise for
      a part ending in "s" the part without the final "s" is added.

    The result is de-duplicated and keeps first-seen order, so repeated runs
    produce the same list. Empty strings are never returned: an empty key is a
    substring of every text and would match everything downstream.

    Args:
        search_keys (list): List of original search keys.

    Returns:
        list: Enhanced list of search keys.
    """
    # dict keys give de-duplication with deterministic insertion order.
    ordered_keys = {}

    def remember(candidate):
        if candidate:
            ordered_keys.setdefault(candidate, None)

    for key in search_keys:
        # Remove leading 'the' or 'a' (case-insensitive) and the separator after it
        key = re.sub(r'^(the|a)[_:]+', '', key, flags=re.IGNORECASE).strip()
        remember(key)

        # Split the key by both underscores and colons
        for part in re.split(r'[_:]', key):
            if not part:
                continue
            remember(part)

            if part.endswith("'s"):
                remember(part[:-2])
            elif part.endswith("s"):
                remember(part[:-1])

    return list(ordered_keys)

In [299]:
def construct_game_content_dict(config_dict, master_url_dict):
    """
    Download and parse every sheet of every configured game, one thread per sheet.

    Games listed in `config_dict['games']` that are missing from
    `master_url_dict` are skipped. A sheet that fails to load or parse is
    reported with `print` and stored as an empty dict.

    Results are collected in submission order (config order, then sheet order),
    so the returned dict has the same key order on every run.

    Args:
        config_dict (dict): Configuration dictionary; its 'games' value is iterated.
        master_url_dict (dict): Game -> {sheet -> URL}.

    Returns:
        dict: game -> sheet -> entry name -> {'title', 'link', 'content',
        'search_keys', 'references'}.
    """

    def process_sheet(game, sheet, url):
        """
        Parse one sheet.

        Args:
            game (str): The game name.
            sheet (str): The sheet name.
            url (str): The URL of the sheet.

        Returns:
            tuple: (game, sheet, sheet_content_dict); the dict is empty on failure.
        """
        try:
            entry_dict = initial_content_dict_from_url(url)
            sheet_content_dict = {}

            for entry, content in entry_dict.items():
                # First-column values can be numbers; the string helpers below
                # would raise on them and the whole sheet would be discarded.
                entry_name = str(entry)
                clean_entry = clean_filename(entry_name)
                display_name = remove_empty_lines(entry_name)

                # Step 1: Clean the text values; non-string values pass through.
                cleaned_content = {
                    key: clean_sprite_tags_and_brackets(value) if isinstance(value, str) else value
                    for key, value in content.items()
                }

                # Step 2: Collect comma-separated aliases from the
                # 'AKA', 'aliases', 'alias' and 'search keys' columns.
                aliases = []
                for key in ('AKA', 'aliases', 'alias', 'search keys'):
                    if key in content:
                        aliases += [val.strip() for val in str(content[key]).split(',') if val.strip()]

                # Step 3: Expand the entry name and aliases into search keys.
                enhanced_keys = enhance_search_keys([display_name] + aliases)

                # Step 4: Store the entry.
                sheet_content_dict[display_name] = {
                    'title': display_name.strip(),
                    'link': f'{game}/{sheet}/{clean_entry}'.strip(),
                    'content': cleaned_content,
                    'search_keys': enhanced_keys,
                    'references': [],
                }

            return game, sheet, sheet_content_dict
        except Exception as e:
            # Best effort: one broken sheet must not abort the whole run.
            print(f"Error processing sheet '{sheet}' for game '{game}': {e}")
            return game, sheet, {}

    game_content_dict = {}

    # Use ThreadPoolExecutor to process sheets concurrently
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(process_sheet, game, sheet, url)
            for game in config_dict['games']
            if game in master_url_dict
            for sheet, url in master_url_dict[game].items()
        ]

        # Iterating in submission order (not completion order) keeps the
        # output ordering reproducible; the downloads still run in parallel.
        for future in futures:
            game, sheet, sheet_content_dict = future.result()
            game_content_dict.setdefault(game, {})[sheet] = sheet_content_dict

    return game_content_dict

In [300]:
def construct_initial_meta_dict(master_url_dict):
    """
    Build the meta-lore dictionary from the sheets under 'Meta Lore', one thread per sheet.

    Each sheet is read without a header row. Row 0 is treated as the header.
    If the first cell of row 1 equals 'AKA' (case-insensitive, after
    stripping), the remaining non-empty cells of that row become aliases and
    entries start at row 2; otherwise entries start at row 1. Entries are the
    first-column values; blank cells are skipped.

    A sheet that fails to load is reported with `print` and stored as an empty
    dict. A missing 'Meta Lore' key yields an empty result. Sheets appear in the
    result in the order they are listed in `master_url_dict['Meta Lore']`.

    Args:
        master_url_dict (dict): Must map 'Meta Lore' to {sheet -> URL} to produce output.

    Returns:
        dict: sheet -> {'title', 'search_keys', 'link', 'references',
        <entry>: {'title', 'link', 'references'}, ...}.
    """

    def process_meta_sheet(sheet, url):
        """
        Parse one meta sheet.

        Args:
            sheet (str): The sheet name.
            url (str): The URL of the sheet.

        Returns:
            tuple: (sheet, sheet_meta_dict); the dict is empty on failure.
        """
        try:
            # Read CSV with no header
            df = pd.read_csv(url, header=None)

            aliases = []
            entry_cells = []

            # At least a header row plus one more row is needed.
            if len(df) > 1:
                first_row = df.iloc[1]  # First row after the header

                # Positional access (.iloc) so the lookup does not depend on
                # how pandas labels the columns.
                if str(first_row.iloc[0]).strip().upper() == 'AKA':
                    aliases = [
                        str(x).strip() for x in first_row.iloc[1:] if pd.notna(x) and str(x).strip()
                    ]
                    entry_cells = df.iloc[2:, 0].tolist()  # Skip the 'AKA' row
                else:
                    entry_cells = df.iloc[1:, 0].tolist()

            # Blank first-column cells are read as NaN; skipping them avoids
            # creating a bogus 'nan' entry and link.
            entry_list = [cell for cell in entry_cells if pd.notna(cell) and str(cell).strip()]

            sheet_meta_dict = {
                'title': sheet,
                'search_keys': [sheet] + aliases,
                'link': f'Meta Lore/{sheet}',
                'references': [],
            }

            for entry in entry_list:
                sheet_meta_dict[entry] = {
                    'title': entry,
                    'link': f'Meta Lore/{sheet}/{entry}',
                    'references': [],
                }

            return sheet, sheet_meta_dict
        except Exception as e:
            # Best effort: one broken sheet must not abort the whole run.
            print(f"Error processing meta sheet '{sheet}': {e}")
            return sheet, {}

    meta_lore_dict = {}

    # Use ThreadPoolExecutor to process sheets concurrently
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(process_meta_sheet, sheet, url)
            for sheet, url in master_url_dict.get('Meta Lore', {}).items()
        ]

        # Submission order keeps the result ordering reproducible.
        for future in futures:
            sheet, sheet_meta_dict = future.result()
            meta_lore_dict[sheet] = sheet_meta_dict

    return meta_lore_dict

In [301]:
def search_keys_in_strings(search_keys, input_strings):
    """
    Tell whether any search key occurs (as a substring) in any input string.

    Note that an empty-string key is a substring of every string, so it makes
    the result True whenever `input_strings` is non-empty.

    Args:
        search_keys (iterable of str): Keys to look for.
        input_strings (iterable of str): Texts to search within.

    Returns:
        bool: True on the first match found, False when nothing matches.
    """
    return any(
        key in text
        for text in input_strings
        for key in search_keys
    )

In [302]:
def compare_and_update_references(dict_1, dict_2):
    """
    Link two entries to each other when they appear to be related.

    The entries are considered related when
    - they share at least one search key, or
    - a search key of one occurs as a substring in a content value of the other.

    When related, each entry's stripped 'link' is added to the other entry's
    'references' list. The list keeps its existing order, gains the new link
    at the end, and never contains duplicates, so repeated runs produce the
    same output.

    Search keys are converted to strings and stripped; keys that are empty
    after stripping are ignored because an empty key is a substring of every
    text.

    Args:
        dict_1 (dict): Entry with 'search_keys', 'content', 'link', 'references'.
        dict_2 (dict): Entry with 'search_keys', 'content', 'link', 'references'.

    Returns:
        None: Both dictionaries are updated in place.
    """
    # Step 1: Normalize the search keys of both entries.
    search_keys_1 = {
        key for key in (str(raw).strip() for raw in dict_1.get('search_keys', [])) if key
    }
    search_keys_2 = {
        key for key in (str(raw).strip() for raw in dict_2.get('search_keys', [])) if key
    }

    # Step 2: Shared keys settle the question without scanning any content.
    related = not search_keys_1.isdisjoint(search_keys_2)

    if not related:
        # Step 3: Look for each entry's keys inside the other entry's content.
        strings_to_search_in_1 = [str(value) for value in dict_1.get('content', {}).values()]
        strings_to_search_in_2 = [str(value) for value in dict_2.get('content', {}).values()]
        related = (
            search_keys_in_strings(search_keys_1, strings_to_search_in_2)
            or search_keys_in_strings(search_keys_2, strings_to_search_in_1)
        )

    if not related:
        return

    # Step 4: Cross-link. dict.fromkeys de-duplicates while keeping order.
    for target, source in ((dict_1, dict_2), (dict_2, dict_1)):
        link = source.get('link', '').strip()
        references = list(dict.fromkeys(target.get('references', [])))
        if link not in references:
            references.append(link)
        target['references'] = references

In [303]:
from itertools import combinations
from concurrent.futures import ThreadPoolExecutor

def process_all_game_sheets(game_content_dict):
    """
    Cross-reference every pair of entries across all games and sheets.

    Every unordered pair of entry dicts is passed once to
    `compare_and_update_references`, which updates the entries in place.
    A failing comparison is reported with `print` and does not stop the run.

    Args:
        game_content_dict (dict): game -> sheet -> entry name -> entry dict.

    Returns:
        None
    """
    # Collect all entry_dicts from all games and sheets
    all_entry_dicts = [
        entry
        for sheets in game_content_dict.values()
        for entries in sheets.values()
        for entry in entries.values()
    ]

    entry_count = len(all_entry_dicts)
    print(f"Total entries to compare: {entry_count}")  # Debugging log

    # n choose 2, computed directly so the pairs never have to be held in memory.
    print(f"Total pairs to compare: {entry_count * (entry_count - 1) // 2}")  # Debugging log

    # combinations() is consumed lazily, one pair at a time.
    for dict_1, dict_2 in combinations(all_entry_dicts, 2):
        try:
            compare_and_update_references(dict_1, dict_2)
        except Exception as e:
            # .get() so that a missing 'link' cannot make the error report itself fail.
            print(f"Error comparing '{dict_1.get('link')}' and '{dict_2.get('link')}': {e}")
In [304]:
def dict_to_markdown(data, vault_path):
    """
    Write one Markdown file per entry of a nested game/sheet/entry dictionary.

    Each file is written to ``<vault_path>/<link>.md`` (parent folders are
    created as needed) and contains, in order:
      - one ``**key**: value`` paragraph per content item (continuation lines
        of multi-line string values are indented by two spaces),
      - a ``**search keys**`` line when there are search keys,
      - a ``## References`` list of wiki links when there are references.

    Entries without a link are skipped with a printed notice, because they
    would otherwise all be written to a stray ``.md`` file in the vault root.

    Args:
        data (dict): Nested dictionary in the format:
            {game1: {sheet1: {entry1: {'title': title, 'link': link,
                    'content': {key1:value1, key2:value2,...}, 'search_keys': [...], 'references': [...]}}}, ...}
        vault_path (str): Root directory where files should be saved
    """
    vault_root = Path(vault_path)

    for game_data in data.values():
        for sheet_data in game_data.values():
            for entry_data in sheet_data.values():
                title = entry_data.get('title', 'Untitled')
                link = entry_data.get('link', '')
                content = entry_data.get('content', {})
                search_keys = entry_data.get('search_keys', [])
                references = entry_data.get('references', [])

                if not link:
                    print(f"Skipping entry '{title}': it has no link to derive a file path from")
                    continue

                file_path = vault_root / f"{link}.md"
                file_path.parent.mkdir(parents=True, exist_ok=True)

                with open(file_path, 'w', encoding='utf-8') as f:
                    for key, value in content.items():
                        # Handle multiline values by indenting subsequent lines
                        if isinstance(value, str) and '\n' in value:
                            value = value.replace('\n', '\n  ')
                        f.write(f"**{key}**: {value}\n\n")

                    if search_keys:
                        # str() guards against non-string keys, which join() rejects.
                        f.write(f"**search keys**: {', '.join(map(str, search_keys))}\n\n")

                    if references:
                        f.write("\n## References\n")
                        for ref in references:
                            f.write(f"- [[{ref}]]\n")

In [305]:
# from concurrent.futures import ThreadPoolExecutor
# from threading import Lock

# def dict_to_markdown(data, vault_path, batch_size=100):
#     """
#     Convert nested dictionary structure to Markdown files using multithreading with batch processing.

#     Args:
#         data (dict): Nested dictionary in the format:
#             {game1: {sheet1: {entry1: {'title': title, 'link': link, 
#                     'content': {key1:value1, key2:value2,...}, 'search_keys': [...], 'references': [...]}}}, ...}
#         vault_path (str): Root directory where files should be saved.
#         batch_size (int): Number of entries to process in each batch.
#     """
#     dir_lock = Lock()  # Lock for synchronizing directory creation

#     def process_batch(batch, vault_path):
#         """
#         Process a batch of entries and write them to Markdown files.

#         Args:
#             batch (list): List of entry dictionaries.
#             vault_path (str): Root directory where files should be saved.
#         """
#         for entry_data in batch:
#             # Get entry details
#             title = entry_data.get('title', 'Untitled')
#             link = entry_data.get('link', '')
#             content = entry_data.get('content', {})
#             search_keys = entry_data.get('search_keys', [])
#             references = entry_data.get('references', [])

#             # Create full file path
#             file_path = Path(vault_path) / f"{link}.md"

#             # Ensure parent directories exist
#             parent_dir = file_path.parent
#             with dir_lock:  # Synchronize directory creation
#                 if not parent_dir.exists():
#                     parent_dir.mkdir(parents=True, exist_ok=True)

#             # Write markdown content
#             with open(file_path, 'w', encoding='utf-8') as f:
#                 # Write each content key-value pair
#                 for key, value in content.items():
#                     # Handle multiline values by indenting subsequent lines
#                     if isinstance(value, str) and '\n' in value:
#                         value = value.replace('\n', '\n  ')
#                     f.write(f"**{key}**: {value}\n\n")

#                 # Write search keys
#                 if search_keys:
#                     f.write(f"**search keys**: {', '.join(search_keys)}\n\n")

#                 # Write references at the end
#                 if references:
#                     f.write("\n## References\n")
#                     for ref in references:
#                         f.write(f"- [[{ref}]]\n")

#     # Flatten the nested dictionary into a list of entries
#     entries = [
#         entry_data
#         for game_data in data.values()
#         for sheet_data in game_data.values()
#         for entry_data in sheet_data.values()
#     ]

#     # Process entries in batches using ThreadPoolExecutor
#     with ThreadPoolExecutor() as executor:
#         futures = []
#         for i in range(0, len(entries), batch_size):
#             batch = entries[i:i + batch_size]
#             futures.append(executor.submit(process_batch, batch, vault_path))

#         # Wait for all threads to complete
#         for future in futures:
#             future.result()

In [306]:
# Resolve the working paths. The vault is the 'Obsidian Vault' folder that sits
# next to the directory containing this notebook.
script_path = Path(get_script_directory())
master_path = script_path.parent
vault_path = master_path / 'Obsidian Vault'

# WARNING: destructive. Removes everything inside the vault except '.obsidian'
# so that the notes can be regenerated by the cells below.
delete_all_except_obsidian(vault_path)

# JSON inputs expected in the notebook's own directory.
config_path=script_path / 'config.json'
sheets_path=script_path / 'sheets.json'

# The remaining pipeline steps are left commented out here; the same calls are
# executed one per cell further down.
# config_dict=load_json(config_path)
# sheets_dict=load_json(sheets_path)

# master_url_dict=construct_master_url_dict(sheets_dict)

# game_content_dict=construct_game_content_dict(config_dict, master_url_dict)
# meta_lore_dict=construct_initial_meta_dict(master_url_dict)

# dict_to_markdown(game_content_dict, vault_path)

In [307]:
# Load the run configuration and the spreadsheet definitions from the two JSON files.
config_dict=load_json(config_path)
sheets_dict=load_json(sheets_path)

In [308]:
# Turn the spreadsheet definitions into export URLs: spreadsheet -> sheet -> URL.
master_url_dict=construct_master_url_dict(sheets_dict)

In [309]:
# Read and parse every sheet of the configured games (reads each sheet URL).
game_content_dict=construct_game_content_dict(config_dict, master_url_dict)

In [310]:
# Build the meta-lore structure from the sheets listed under 'Meta Lore'.
meta_lore_dict=construct_initial_meta_dict(master_url_dict)

In [311]:
# Cross-reference every pair of entries; updates the 'references' lists in place
# and prints the number of entries and pairs.
process_all_game_sheets(game_content_dict)

Total entries to compare: 1280
Total pairs to compare: 818560


In [312]:
# Write one Markdown file per entry into the vault.
dict_to_markdown(game_content_dict, vault_path)

In [313]:
# Inspect the entry names parsed from the 'Visitors' sheet of 'Book of Hours'.
game_content_dict["Book of Hours"]["Visitors"].keys()

dict_keys(['Mr Peter Agdistis', 'Dr Ibn Al-Adim', 'LT Arthur Moore', 'Ms Azita Bukhara', 'Lalla Chaima', 'MCO Constance Lee', 'Princess  Coquille Amirejibi', 'Sr. Corso Reverte', 'Dagmar von Nagelsburg', 'DI Douglas Moore', 'Mr Ehsan Fekri', 'Father Stanislav Schaller', 'Mr Fraser Strathcoyne', 'Magister Hokobald ', 'Mlle Margot Mtutine', 'Mme Olympe Bechet', 'Dr Arun Peel', 'Dr Serena Blackwood', 'Dr Yvette Southey', 'Mr Zachary Wakefield'])

In [314]:
# Manual check of enhance_search_keys on a single key.
search_keys = ["Forge of Days"]
enhanced_keys = enhance_search_keys(search_keys)
print("Enhanced Search Keys:", enhanced_keys)

Enhanced Search Keys: ['Forge of Days', 'Forge of Day']


In [None]:
# def create_smaller_game_content_dict(original_dict, max_entries=1000):
#     """
#     Create a smaller version of the game_content_dict for testing.
    
#     Args:
#         original_dict (dict): The original game_content_dict.
#         max_entries (int): The maximum number of entries to include in the smaller dictionary.
    
#     Returns:
#         dict: A smaller version of the game_content_dict.
#     """
#     smaller_dict = {}
#     included_entries = 0

#     # Ensure the required entries are included
#     required_entries = [
#         ("Cultist Simulator", "Expeditions", "Cater & Hero Limited"),
#         ("Cultist Simulator", "Ingredients", "Vital Pigment")
#     ]

#     # Add required entries first
#     for game, sheet, entry in required_entries:
#         if game in original_dict and sheet in original_dict[game] and entry in original_dict[game][sheet]:
#             if game not in smaller_dict:
#                 smaller_dict[game] = {}
#             if sheet not in smaller_dict[game]:
#                 smaller_dict[game][sheet] = {}
#             smaller_dict[game][sheet][entry] = original_dict[game][sheet][entry]
#             included_entries += 1

#     # Iterate over the original dictionary to add more entries
#     for game, sheets in original_dict.items():
#         if game not in smaller_dict:
#             smaller_dict[game] = {}
#         for sheet, entries in sheets.items():
#             if sheet not in smaller_dict[game]:
#                 smaller_dict[game][sheet] = {}
#             for entry, data in entries.items():
#                 # Skip if we've already added the required entries or reached the max limit
#                 if included_entries >= max_entries:
#                     break
#                 if entry not in smaller_dict[game][sheet]:
#                     smaller_dict[game][sheet][entry] = data
#                     included_entries += 1
#             if included_entries >= max_entries:
#                 break
#         if included_entries >= max_entries:
#             break

#     return smaller_dict

In [None]:
# # Example usage
# smaller_game_content_dict = create_smaller_game_content_dict(game_content_dict, max_entries=10000)
# print(json.dumps(smaller_game_content_dict, indent=4))

In [None]:
# Inspect a single parsed entry ('Barber' in the 'Assistance' sheet of 'Book of Hours').
game_content_dict['Book of Hours']['Assistance']['Barber']

In [None]:
# List the games that were parsed.
game_content_dict.keys()

In [None]:
# dict_1 = game_content_dict['Cultist Simulator']['Expeditions']['Cater & Hero Limited']
# dict_2 = game_content_dict['Cultist Simulator']['Ingredients']['Vital Pigment']
# compare_and_update_references(dict_1, dict_2)

In [None]:
# Inspect the meta-lore structure built for the 'Principles' sheet.
meta_lore_dict['Principles']

In [None]:
# Parent directory of script_path as a string (the same directory master_path refers to).
os.path.dirname(script_path)

In [None]:
# Debug: dump the whole parsed structure as plain text (large output for many entries).
print(game_content_dict)

In [None]:
# Debug: dump the whole meta-lore structure as plain text.
print(meta_lore_dict)

In [None]:
# Inspect the entry names parsed from the 'Books' sheet of 'Book of Hours'.
game_content_dict['Book of Hours']['Books'].keys()

In [None]:
import requests
import pandas as pd

def check_index_duplicates(df, detailed=False):
    """
    Look for repeated values in the first column of `df`.

    Args:
        df: pandas DataFrame
        detailed: If True, return how often each repeated value occurs.

    Returns:
        bool when detailed is False (True if any value repeats); otherwise a
        Series of occurrence counts for the repeated values, sorted descending.
    """
    index_column = df.iloc[:, 0]
    repeated = index_column[index_column.duplicated(keep=False)]

    if not detailed:
        return not repeated.empty
    return repeated.value_counts().sort_values(ascending=False)

# Main processing loop: report repeated first-column values for every sheet
# of every configured game.
for game in config_dict['games']:
    # A game listed in the config but absent from master_url_dict has no sheets
    # to check; .get() skips it instead of raising KeyError outside the try block
    # (construct_game_content_dict skips such games as well).
    for sheet, url in master_url_dict.get(game, {}).items():
        try:
            # Load the DataFrame directly (no need for header check)
            df = pd.read_csv(url)

            # Check only for index duplicates
            duplicates = check_index_duplicates(df, detailed=True)

            # Print results if duplicates found
            if not duplicates.empty:
                print(f'\n[INDICES] Duplicates found in {game} - {sheet}:')
                print(duplicates.to_string())

                # Optional: Show the headers for reference
                print("\nCurrent headers:", df.columns.tolist())

        except Exception as e:
            # Best effort: report the failing sheet and continue with the next one.
            print(f"\nError processing {game} - {sheet}: {str(e)}")

In [None]:
# Read the 'Crafting Recipe' sheet of 'Book of Hours' and show the column
# names as pandas parsed them.
df = pd.read_csv(master_url_dict['Book of Hours']['Crafting Recipe'])
print("Actual headers:", df.columns.tolist())

In [None]:
# Display the whole parsed structure (large output for many entries).
game_content_dict
    # print(game)
    # print(master_url_dict[game])

In [None]:
# Parse only the 'Memories' sheet of 'Book of Hours' to inspect the raw
# entry -> column -> value dictionary.
boh_memory_url=master_url_dict['Book of Hours']['Memories']
initial_content_dict_from_url(boh_memory_url)

In [None]:
# # Create output directory relative to script location
# script_dir = Path(get_script_directory())
# output_dir = script_dir.parent / "Obsidian/markdown_files"
# output_dir.mkdir(exist_ok=True)

# def row_value_pairs(row):
#     """Convert a pandas row to Obsidian-friendly markdown format"""
#     content = []
    
#     # Get the first column's value (regardless of other columns)
#     first_col_name = row.index[0]  # Name of the first column
    
#     # Add other columns as key-value pairs (skip the first column)
#     # content.append(f"- **Type**: {type}")
#     for col, val in row.items():
#         if col == first_col_name:  # Skip the first column (already used as heading)
#             content.append(f"- **Type**: {col}")
#         if pd.notna(val):
#             clean_val = str(val).strip().replace('\r\n', '\n').replace('\n', '<br>')
#             content.append(f"- **{col}**: {clean_val}")
    
#     return "\n".join(content)

# def md_files_from_df(df):
#     # Write each row to a markdown file
#     for index, row in df.iterrows():
#         # Create safe filename (remove special chars)
#         safe_filename = row_first_value(row).replace(':', ' -') + '.md'
#         filepath = output_dir / safe_filename
#         # print(f'filepath: {filepath}')
        
#         try:
#             with open(filepath, 'w', encoding='utf-8') as f:
#                 f.write(row_value_pairs(row))
#             print(f"✓ Created: {filepath.relative_to(output_dir)}")
#         except Exception as e:
#             print(f"✗ Error writing {filepath.name}: {str(e)}")