In [None]:
# imports 

import re 
import os

In [None]:
# Folder that remove_headers() writes the cleaned .md files into.
OUTPUT_PATH = '../documents/clean'

In [None]:
# remove header and footer

def remove_headers(folder_path, output_dir=None):
    """
    Extract the body of every .md file in a folder into an output folder.

    Each file is split on the '* * *' separator. When at least two separators
    are present, the text between the first and the second one is stripped of
    surrounding whitespace and written to ``output_dir`` under the same file
    name. The source files themselves are never modified. Files with fewer
    than two separators are reported and skipped.

    Args:
        folder_path (str): Folder containing the raw .md files.
        output_dir (str | None): Folder the cleaned files are written to.
            Defaults to the module-level OUTPUT_PATH. It is created when it
            does not exist yet.
    """
    # Check if the provided path is a valid directory
    if not os.path.isdir(folder_path):
        print(f"Error: Folder '{folder_path}' not found.")
        return

    if output_dir is None:
        output_dir = OUTPUT_PATH

    # Without this, every single write below fails when the folder is missing.
    try:
        os.makedirs(output_dir, exist_ok=True)
    except OSError as e:
        print(f"Error: Could not create output folder '{output_dir}': {e}")
        return

    print(f"Scanning folder: {folder_path}\n")

    for filename in os.listdir(folder_path):
        # Process only files that end with .md
        if not filename.endswith(".md"):
            continue

        file_path = os.path.join(folder_path, filename)

        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                original_content = file.read()

            # Only the first two separators matter, so stop splitting there.
            parts = original_content.split('* * *', 2)

            if len(parts) < 3:
                print(f"Skipped: {filename} (does not contain two '* * *' separators)")
                continue

            # parts[0] is the header, parts[1] the body, parts[2] the footer.
            cleaned_content = parts[1].strip()

            output_path = os.path.join(output_dir, filename)
            with open(output_path, 'w', encoding='utf-8') as file:
                file.write(cleaned_content)

            print(f"Processed: {filename}")

        except (OSError, UnicodeError) as e:
            # Best effort: report the file and carry on with the next one.
            print(f"Error processing {filename}: {e}")

    print("\nProcessing complete.")

In [None]:
# Extract the body of every raw .md file; the results are written to OUTPUT_PATH.
remove_headers('../documents/raw')

In [None]:
# Number of directory entries in the raw folder (os.listdir counts files and
# subdirectories alike). The bare name on the last line is there so the cell
# shows the value.
item_count = len(os.listdir('../documents/raw'))
item_count

In [None]:
def remove_bot_nav(folder_path, nav_marker='[ Previous]'):
    """
    Cut the bottom navigation off every .md file in a folder, in place.

    Everything from the first occurrence of ``nav_marker`` to the end of the
    file is dropped, and trailing whitespace left in front of it is removed.
    Files that do not contain the marker are left untouched.

    Args:
        folder_path (str): Folder containing the .md files to clean.
        nav_marker (str): Text that marks the start of the navigation footer.
    """
    # Validate the folder the same way the other cleaning steps do.
    if not os.path.isdir(folder_path):
        print(f"Error: Folder '{folder_path}' not found.")
        return

    # str.find('') returns 0, which would wipe every file completely.
    if not nav_marker:
        print("Error: 'nav_marker' must not be empty.")
        return

    for filename in os.listdir(folder_path):
        # Process only files that end with .md
        if not filename.endswith(".md"):
            continue

        file_path = os.path.join(folder_path, filename)

        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                original_content = file.read()

            footer_index = original_content.find(nav_marker)
            if footer_index == -1:
                continue

            # Keep everything BEFORE the marker, minus trailing blank lines.
            cleaned_content = original_content[:footer_index].rstrip()

            with open(file_path, 'w', encoding='utf-8') as file:
                file.write(cleaned_content)

            print(f"  -> Footer removed and file saved: {filename}")

        except (OSError, UnicodeError) as e:
            # One unreadable file must not abort the whole run.
            print(f"Error processing {filename}: {e}")

In [None]:
# Cut the navigation footer off the cleaned files (they are rewritten in place).
remove_bot_nav('../documents/clean')

In [None]:
def remove_special_chars(folder_path):
    """
    Read every .md file found directly inside ``folder_path``.

    NOTE: this is an unfinished stub. The text of each file is loaded, but
    nothing is modified or written back, and the function returns None.
    """
    markdown_names = (name for name in os.listdir(folder_path) if name.endswith(".md"))

    for name in markdown_names:
        with open(os.path.join(folder_path, name), 'r', encoding='utf-8') as handle:
            handle.read()

In [None]:
def remove_image_links(folder_path):
    """
    Strip Markdown image links that begin with '![_images/' from .md files.

    Every .md file in ``folder_path`` is scanned. Each span that starts at the
    marker and ends at the next ')' is cut out. A file is rewritten in place
    only when at least one span was removed.
    """
    if not os.path.isdir(folder_path):
        print(f"Error: Folder '{folder_path}' not found.")
        return

    marker = '![_images/'
    markdown_files = [name for name in os.listdir(folder_path) if name.endswith(".md")]

    for filename in markdown_files:
        file_path = os.path.join(folder_path, filename)

        try:
            with open(file_path, 'r', encoding='utf-8') as source:
                before = source.read()

            after = before
            while True:
                link_start = after.find(marker)
                if link_start == -1:
                    break

                # The link is taken to end at the first ')' after its start.
                link_end = after.find(')', link_start)
                if link_end == -1:
                    # Unterminated link: stop here, otherwise this never ends.
                    break

                after = after[:link_start] + after[link_end + 1:]

            if after == before:
                print(f"No changes needed for: {filename}")
                continue

            print(f"Processing and saving: {filename}")
            with open(file_path, 'w', encoding='utf-8') as target:
                target.write(after)

        except Exception as e:
            print(f"Error processing {filename}: {e}")

In [None]:
# Drop the '![_images/...' image links from the cleaned files (in place).
remove_image_links('../documents/clean')

In [None]:
def remove_char_from_all_files(folder_path, char_to_remove=''):
    """
    Remove every occurrence of ``char_to_remove`` from all files in a folder.

    Every regular file directly inside ``folder_path`` is read as UTF-8 and
    rewritten in place only when its content actually changed. Files that
    cannot be read or written are reported and skipped.

    Args:
        folder_path (str): Folder whose files are cleaned.
        char_to_remove (str): Text to delete. An empty string removes
            nothing, so the call is reported and no file is read.
    """
    # First, validate that the folder path exists
    if not os.path.isdir(folder_path):
        print(f"Error: Folder '{folder_path}' not found.")
        return

    # Replacing '' with '' can never change a file; say so instead of
    # silently reading the whole folder for nothing.
    if not char_to_remove:
        print("Nothing to remove: 'char_to_remove' is empty.")
        return

    print(f"Scanning folder: {folder_path}\n")

    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)

        # Make sure we are only processing files, not subdirectories
        if not os.path.isfile(file_path):
            continue

        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                original_content = file.read()

            cleaned_content = original_content.replace(char_to_remove, '')

            # Only write back to the file if a change was actually made
            if cleaned_content != original_content:
                with open(file_path, 'w', encoding='utf-8') as file:
                    file.write(cleaned_content)
                print(f"Processed: {filename}")

        except (OSError, UnicodeError) as e:
            # Permission problems or files that are not valid UTF-8
            print(f"Error processing {filename}: {e}")

In [None]:
# NOTE(review): no character is passed, so the default `char_to_remove` is used.
# As written in this file that default is an empty string, which removes
# nothing — confirm the intended character was not lost in a copy/paste.
remove_char_from_all_files('../documents/clean')

In [None]:
def remove_source_links(folder_path):
    """
    Remove all markdown links like '[[source]](...)' from the files in a folder.

    Every regular file directly inside ``folder_path`` is read as UTF-8 and
    rewritten in place only when at least one link was removed. Text that
    follows a link on the same line is kept, including text in parentheses.

    Args:
        folder_path (str): Folder whose files are cleaned.
    """
    # Validate that the folder path exists
    if not os.path.isdir(folder_path):
        print(f"Error: Folder '{folder_path}' not found.")
        return

    print(f"Scanning folder: {folder_path}\n")

    # \[\[source\]\]  the literal text "[[source]]"
    # \(              the opening parenthesis of the link target
    # [^)\n]*         the target: anything up to the FIRST ')' on this line.
    #                 A greedy '.*' would run to the LAST ')' on the line and
    #                 swallow whatever text follows the link.
    # \)              the closing parenthesis
    pattern = re.compile(r'\[\[source\]\]\([^)\n]*\)')

    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)

        # Process only files, not subdirectories
        if not os.path.isfile(file_path):
            continue

        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                original_content = file.read()

            cleaned_content = pattern.sub('', original_content)

            # Only write back to the file if a change was actually made
            if cleaned_content != original_content:
                with open(file_path, 'w', encoding='utf-8') as file:
                    file.write(cleaned_content)
                print(f"Processed: {filename}")

        except (OSError, UnicodeError) as e:
            print(f"Error processing {filename}: {e}")

In [None]:
# NOTE(review): the earlier steps clean '../documents/clean', while this and the
# following steps use '../knowledge/clean' — confirm the path is intended.
remove_source_links('../knowledge/clean')

In [None]:
def clean_markdown_syntax(file_path):
    """
    Removes various markdown syntax elements from a text file.

    This function reads a file and uses a regular expression to find and
    remove the following markdown patterns:
    1. Table syntax: separator rows such as '---|---' and every '|'
    2. Links, including their text (e.g., '[text](url "title")')
    3. Standalone arrows (→)

    The file is overwritten only when something was actually removed.

    Args:
        file_path (str): The full path to the file to be cleaned.

    Returns:
        bool: True if the file was processed without error (also when there
        was nothing to remove), False otherwise.
    """
    # The regex pattern is composed of four parts, separated by '|' (OR):
    # 1. (?:\s*-{3,}\s*\|)+(?:[ \t]*-{3,})?
    #        -- a table separator row: one or more 'dashes + pipe' cells and
    #           the final dash cell, which has no pipe after it. Matching only
    #           '---|' would leave that last '---' behind in the text.
    # 2. \|  -- any remaining literal pipe character '|'.
    # 3. →?\s*\[[^\]]+\]\([^)]+\)
    #        -- markdown links like '[text](url)', with an optional arrow
    #           and whitespace in front of them.
    # 4. →   -- any remaining literal arrow character '→'.
    pattern = r'(?:\s*-{3,}\s*\|)+(?:[ \t]*-{3,})?|\||→?\s*\[[^\]]+\]\([^)]+\)|→'

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        cleaned_content = re.sub(pattern, '', content)

        # Skip the write when nothing matched, like the other cleaning steps.
        if cleaned_content != content:
            with open(file_path, 'w', encoding='utf-8') as file:
                file.write(cleaned_content)

        print(f"Successfully cleaned the file: {file_path}")
        return True

    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
        return False
    except Exception as e:
        # Deliberately broad: callers rely on a bool, never on an exception.
        print(f"An unexpected error occurred: {e}")
        return False

In [None]:
# Run the cleaner on a single file from the test folder.
clean_markdown_syntax('../knowledge/test/nuke_splinewarp_CTransform.md')

In [None]:
# Apply clean_markdown_syntax() to every file in the folder.
folder_path = '../knowledge/clean'
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    # Subdirectories cannot be opened as files and would only produce an
    # "unexpected error" message, so skip them.
    if os.path.isfile(file_path):
        clean_markdown_syntax(file_path)

In [None]:
def clean_extra_lines(file_path, max_blank_lines=0):
    """
    Strip trailing whitespace and collapse runs of blank lines in a file.

    Trailing whitespace is removed from every line first, so that lines
    holding only spaces or tabs count as blank. Runs of blank lines are then
    reduced to at most ``max_blank_lines``. The file is overwritten only when
    its content changed.

    Args:
        file_path (str): The full path to the text file.
        max_blank_lines (int): Blank lines to keep between content lines.
            The default 0 removes every blank line; pass 1 to keep a single
            empty line between blocks.

    Raises:
        ValueError: If ``max_blank_lines`` is negative.
    """
    if max_blank_lines < 0:
        raise ValueError("max_blank_lines must be >= 0")

    # N blank lines between two content lines are N + 1 consecutive newlines.
    kept_newlines = max_blank_lines + 1
    excess_newlines = re.compile(r'\n{%d,}' % (kept_newlines + 1))

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Strip BEFORE collapsing: a whitespace-only line is not a run of
        # newlines until its spaces are gone, so the other order misses it.
        stripped = '\n'.join(line.rstrip() for line in content.split('\n'))
        cleaned_content = excess_newlines.sub('\n' * kept_newlines, stripped)

        if cleaned_content != content:
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(cleaned_content)

        print(f"Cleaned: {file_path}")

    except Exception as e:
        # Best effort: report the file instead of stopping a batch run.
        print(f"Could not process file {file_path}: {e}")

In [None]:
# Run the blank-line cleanup on a single file.
clean_extra_lines('../knowledge/clean/3D.md')

In [None]:
# Apply clean_extra_lines() to every file in the folder.
folder_path = '../knowledge/clean'
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    # Subdirectories cannot be opened as files and would only produce a
    # "Could not process file" message, so skip them.
    if os.path.isfile(file_path):
        clean_extra_lines(file_path)