In [24]:
import os
import csv

def associate_markdown_with_urls(markdown_dir, csv_file):
    # Read the CSV file and store the file paths and URLs in a dictionary
    file_url_mapping = {}
    with open(csv_file, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for row in reader:
            # Extract the filename without the extension and use it as the key
            filepath = os.path.basename(row['Filepath'])  # Get the file name from the path
            filename_without_ext = os.path.splitext(filepath)[0]
            file_url_mapping[filename_without_ext] = row['URL']
            

    # Now go through the markdown files and associate them with the URLs
    markdown_url_mapping = {}
    for markdown_filename in os.listdir(markdown_dir):
        # Get the markdown filename without the extension
        filename_without_ext = os.path.splitext(markdown_filename)[0]
        
        # Check if the filename matches any entry in the CSV dictionary
        if filename_without_ext in file_url_mapping:
            markdown_url_mapping[markdown_filename] = {
            "heading" : row['Heading'],
            "subheading" : row['Subheading'],
            "title" : row['Title'],
            "url" : file_url_mapping[filename_without_ext]
            }
            

    return markdown_url_mapping




In [25]:

markdown_dir = 'data/testdata'
csv_file = 'output_data.csv'
markdown_url_mapping = associate_markdown_with_urls(markdown_dir, csv_file)

# Display the result
for markdown_file, url in markdown_url_mapping.items():
    # print()
    print(f"Markdown File: {markdown_file}, URL: {url}")
   

Markdown File: ACM-Vetting-Recommendation-Instructions.md, URL: {'heading': 'Software Systems', 'subheading': 'Technical Support', 'title': 'Account Access Error Support Knowledge Article', 'url': 'https://missionaries.prod.byu-pathway.psdops.com/ACM-Vetting-Recommendation-Instructions'}
Markdown File: BYU-Idaho-Academic-Calendars.md, URL: {'heading': 'Software Systems', 'subheading': 'Technical Support', 'title': 'Account Access Error Support Knowledge Article', 'url': 'https://www.byui.edu/academic-calendar/'}
Markdown File: Printable-Interest-List.md, URL: {'heading': 'Software Systems', 'subheading': 'Technical Support', 'title': 'Account Access Error Support Knowledge Article', 'url': 'https://missionaries.prod.byu-pathway.psdops.com/BYUPW-Interest-List'}
Markdown File: Financial-Holds-BYU-Idaho-Support-Knowledge-Article.md, URL: {'heading': 'Software Systems', 'subheading': 'Technical Support', 'title': 'Account Access Error Support Knowledge Article', 'url': 'https://pathway-mis

In [26]:
import os

def count_files_in_directory(directory):
    # List all the entries in the directory
    entries = os.listdir(directory)
    
    # Count only the files
    file_count = sum(1 for entry in entries if os.path.isfile(os.path.join(directory, entry)))
    
    return file_count

In [27]:
print(count_files_in_directory(markdown_dir))

484


In [28]:
import yaml

def attach_metadata_to_markdown_directory(directory_path, metadata_dict):
    # Loop through each file in the directory
    for filename in os.listdir(directory_path):
        if filename.endswith('.md'):
            file_path = os.path.join(directory_path, filename)
            if filename in metadata_dict:
                metadata = {
                    "heading" : metadata_dict[filename]["heading"],
                    "subheading" : metadata_dict[filename]["subheading"],
                    "title": filename,
                    "url": metadata_dict[filename]["url"]
                }
                
                # Open the markdown file and prepend the metadata
                with open(file_path, 'r+') as file:
                    content = file.read()
                    file.seek(0, 0)
                    yaml_metadata = yaml.dump(metadata, default_flow_style=False)
                    front_matter = f"---\n{yaml_metadata}---\n"
                    file.write(front_matter + content)

In [29]:
directory_path = 'data/testdata'

# Attach metadata to each markdown file in the directory
attach_metadata_to_markdown_directory(directory_path, markdown_url_mapping)

In [30]:
print(markdown_url_mapping)

{'ACM-Vetting-Recommendation-Instructions.md': {'heading': 'Software Systems', 'subheading': 'Technical Support', 'title': 'Account Access Error Support Knowledge Article', 'url': 'https://missionaries.prod.byu-pathway.psdops.com/ACM-Vetting-Recommendation-Instructions'}, 'BYU-Idaho-Academic-Calendars.md': {'heading': 'Software Systems', 'subheading': 'Technical Support', 'title': 'Account Access Error Support Knowledge Article', 'url': 'https://www.byui.edu/academic-calendar/'}, 'Printable-Interest-List.md': {'heading': 'Software Systems', 'subheading': 'Technical Support', 'title': 'Account Access Error Support Knowledge Article', 'url': 'https://missionaries.prod.byu-pathway.psdops.com/BYUPW-Interest-List'}, 'Financial-Holds-BYU-Idaho-Support-Knowledge-Article.md': {'heading': 'Software Systems', 'subheading': 'Technical Support', 'title': 'Account Access Error Support Knowledge Article', 'url': 'https://pathway-missionary.powerappsportals.com/knowledgebase/article/KA-01472/en-us'},

In [31]:
import yaml

def extract_url_from_markdown(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
        
        # Check if the file is not empty and starts with YAML front matter
        if lines and lines[0].strip() == '---':
            # Find where the front matter ends
            end_index = None
            for i in range(1, len(lines)):
                if lines[i].strip() == '---':
                    end_index = i
                    break
            
            if end_index:
                # Extract the YAML front matter
                yaml_content = ''.join(lines[1:end_index])
                metadata = yaml.safe_load(yaml_content)
                
                # Extract the URL from the metadata if it exists
                if 'url' in metadata:
                    return metadata['url']
    
    return None  # Return None if no URL is found or no valid YAML front matter is present




In [32]:
for filename in os.listdir(directory_path):
        if filename.endswith('.md'):
            file_path = os.path.join(directory_path, filename)
            url = extract_url_from_markdown(file_path)
            if url:
                print(url)
            else:
                print("No URL found")


https://missionaries.prod.byu-pathway.psdops.com/ACM-Vetting-Recommendation-Instructions
https://www.byui.edu/academic-calendar/
https://missionaries.prod.byu-pathway.psdops.com/BYUPW-Interest-List
https://pathway-missionary.powerappsportals.com/knowledgebase/article/KA-01472/en-us
https://missionaries.prod.byu-pathway.psdops.com/Troubleshoot-Platform-Issues
https://missionaries.prod.byu-pathway.psdops.com/Learn-the-Basic-Features-of-Zoom
https://missionaries.prod.byu-pathway.psdops.com/BYUPW-Interest-List
https://missionaries.prod.byu-pathway.psdops.com/Share-Multiple-Screens
https://www.byupathway.edu/policies/handbook/11-4-communication-resources-r
https://missionaries.prod.byu-pathway.psdops.com/Student-Transfer-FAQ
https://missionaries.prod.byu-pathway.psdops.com/Best-Practices-for-an-In-Person-Gathering
https://missionaries.prod.byu-pathway.psdops.com/Mac-Zoom-Installation
https://missionaries.prod.byu-pathway.psdops.com/BYU-Idaho-Tutoring-Services
https://www.byupathway.edu/degr

In [40]:
def extract_metadata_from_markdown(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
        
        # Check if the file is not empty and starts with YAML front matter
        if lines and lines[0].strip() == '---':
            # Find where the front matter ends
            end_index = None
            for i in range(1, len(lines)):
                if lines[i].strip() == '---':
                    end_index = i
                    break
            
            if end_index:
                # Extract the YAML front matter
                yaml_content = ''.join(lines[1:end_index])
                metadata = yaml.safe_load(yaml_content)
                
                # Extract the URL from the metadata if it exists
                # if 'url' in metadata:
                #     url = metadata['url']
                # if 'heading' in metadata:
                #     heading = metadata['heading']
                # if 'subheading' in metadata:
                #     subheading = metadata['subheading']
                # if 'title' in metadata:
                #     title = metadata['title']
                # # print(f"Heading: {heading}, Subheading: {subheading}, Title: {title}, URL: {url}")
    
    return metadata

In [41]:
for filename in os.listdir(directory_path):
        if filename.endswith('.md'):
            file_path = os.path.join(directory_path, filename)
            print(extract_metadata_from_markdown(file_path))

{'heading': 'Software Systems', 'subheading': 'Technical Support', 'title': 'ACM-Vetting-Recommendation-Instructions.md', 'url': 'https://missionaries.prod.byu-pathway.psdops.com/ACM-Vetting-Recommendation-Instructions'}
{'heading': 'Software Systems', 'subheading': 'Technical Support', 'title': 'BYU-Idaho-Academic-Calendars.md', 'url': 'https://www.byui.edu/academic-calendar/'}
{'heading': 'Software Systems', 'subheading': 'Technical Support', 'title': 'Printable-Interest-List.md', 'url': 'https://missionaries.prod.byu-pathway.psdops.com/BYUPW-Interest-List'}
{'heading': 'Software Systems', 'subheading': 'Technical Support', 'title': 'Financial-Holds-BYU-Idaho-Support-Knowledge-Article.md', 'url': 'https://pathway-missionary.powerappsportals.com/knowledgebase/article/KA-01472/en-us'}
{'heading': 'Software Systems', 'subheading': 'Technical Support', 'title': 'Troubleshoot-platform-issues.md', 'url': 'https://missionaries.prod.byu-pathway.psdops.com/Troubleshoot-Platform-Issues'}
{'hea