In [124]:
import re
    
def clean_markdown(text):
    text = re.sub(r'```markdown+', '', text)
    
    # Remove Markdown backticks
    text = re.sub(r'```+', '', text)

    # Remove inline code backticks (`text`)
    text = re.sub(r'`+', '', text)

    text = re.sub(r'\[Print\]\(javascript:window\.print\(\)\)', '', text)
    
    # Remove list of links with same anchors
    text = re.sub(r'(?:(https?:\/\/[^\s]+)\s+){2,}', '', text)  # Remove repeated links

    # Replace [link](#) and [link](url) with link text only
    text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\1', text)

    # Remove lists of links to the same page (e.g., [All](#) [Web Pages](#))
    text = re.sub(r'(\[([^\]]+)\]\(#\))+(?:\s|,)*', '', text)
    
    # Regular expression to remove unnecessary text from 
    # knowledge base articles
    # Remove specific table headers
    text = re.sub(r'\| \*\*Bot Information\*\* \|\n\| --- \|', '', text)
    text = re.sub(r'\| \*\*Information\*\* \|\n\| --- \|', '', text)
    text = re.sub(r'Views:\n\n\|\s*Article Overview\s*\|\s*\n\|\s*---\s*\|\s*\n\|.*?\|','',text,flags=re.DOTALL)
    text = re.sub(r'\|\s*Information\s*\|\s*\n\|\s*---\s*\|\s*\n\|.*?\|', '', text, flags=re.DOTALL)
    text = re.sub(r'\|\s*Bot Information\s*\|\s*\n\|\s*---\s*\|\s*\n\|.*?\|', '', text, flags=re.DOTALL)
    text = re.sub(r'\n\s*\*\*Information\*\*\s*\n', '\n', text)
    text = re.sub(r'##? Views:\n\n\| \*\*Article Overview\*\* \|\n\| --- \|\n\|.*?\|', '', text, flags=re.DOTALL)
    text = re.sub(r'Views:\n\n\| \*\*Article Overview\*\* \|\n\| --- \|\n\|.*?\|', '', text, flags=re.DOTALL)
    text = re.sub(r'^\| Information \|\n', '', text, flags=re.MULTILINE)
    text = re.sub(r'\*\s*(Home|Knowledge Base - Home|KA-\d+)\s*\n', '', text)
    text = re.sub(r"(You’re offline.*?Knowledge Articles|Contoso, Ltd\.|BYU-Pathway Worldwide|Toggle navigation[.\w\s\*\+\-\:]+|Search Filter|Search\n|Knowledge Article Key:)", '', text)
    text = re.sub(r"You’re offline\. This is a read only version of the page\.", '', text)
    
    # Others regular expressions to remove unnecessary text
    # Remove empty headers
    text = re.sub(r'^#+\s*$', '', text, flags=re.MULTILINE)
    
    # Remove text from WhatsApp navigation
    text = re.sub(r"Copy link\S*", 'Copy link', text)
    
    # Remove text from the hall foundation menu
    text = re.sub(r"(\*|\+|\-)\s+[A-Za-z\s]+", '', text, flags=re.MULTILINE)

    # Remove broken links
    text = re.sub(r'\[([^\]]+)\]\.\n\n\((http[^\)]+)\) \(([^)]+)\)\.', r'\1 (\3).',  text)
    
    # Remove consecutive blank lines
    text = re.sub(r'\n\s*\n\s*\n', '\n\n', text)

    return text

In [125]:
import os

def clean_md(input_path, output_path):
    # Create the new directory if it doesn't exist
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    
    for file in os.listdir(input_path):
        if file.endswith('.md'):
            print(f'Cleaning file: {file}')
            input_file_path = os.path.join(input_path, file)
            
            # Read the original markdown file
            with open(input_file_path, 'r', encoding='utf-8') as f:
                content = f.read()
                
            # Clean the markdown content
            cleaned_content = clean_markdown(content)
            
            # Define the new file path in the output folder
            output_file_path = os.path.join(output_path, file)
            
            # Write the cleaned content to the new file
            with open(output_file_path, 'w', encoding='utf-8') as f:
                f.write(cleaned_content)
            print(f'Cleaned file saved as: {output_file_path}')

In [126]:
input_directory_path = '../data/data_16_09_24/out/from_html/'
output_directory_path = '../data/data_16_09_24/clean_md/out/' 
clean_md(input_directory_path, output_directory_path)

Cleaning file: 3dfa6235.md
Cleaned file saved as: ../data/data_16_09_24/clean_md/out/3dfa6235.md
Cleaning file: e0b5c0d2.md
Cleaned file saved as: ../data/data_16_09_24/clean_md/out/e0b5c0d2.md
Cleaning file: 68ae39cc.md
Cleaned file saved as: ../data/data_16_09_24/clean_md/out/68ae39cc.md
Cleaning file: 7e576bfb.md
Cleaned file saved as: ../data/data_16_09_24/clean_md/out/7e576bfb.md
Cleaning file: b494d37a.md
Cleaned file saved as: ../data/data_16_09_24/clean_md/out/b494d37a.md
Cleaning file: dac4d8ce.md
Cleaned file saved as: ../data/data_16_09_24/clean_md/out/dac4d8ce.md
Cleaning file: 91d1baa2.md
Cleaned file saved as: ../data/data_16_09_24/clean_md/out/91d1baa2.md
Cleaning file: 9957e5da.md
Cleaned file saved as: ../data/data_16_09_24/clean_md/out/9957e5da.md
Cleaning file: 511f8bf1.md
Cleaned file saved as: ../data/data_16_09_24/clean_md/out/511f8bf1.md
Cleaning file: 699f08af.md
Cleaned file saved as: ../data/data_16_09_24/clean_md/out/699f08af.md
Cleaning file: 49b390b3.md
Cle