In [2]:
# Let's check the parsing library
import frontmatter

# Load example.md and let the frontmatter library split it into
# metadata and markdown body
with open('example.md', 'r', encoding='utf-8') as md_file:
    post = frontmatter.load(md_file)

# Access metadata: show the title first, then the tags
for key in ('title', 'tags'):
    print(post.metadata[key])

# Access content: the markdown body without the frontmatter
print(post.content)

Building AI Agents
['ai', 'machine-learning', 'Ai-agents']
# Getting Started with AI-Agents

This is the main content of the document written in **Markdown**.

You can include code blocks, links, and other formatting here.


In [3]:
# Get all the metadata and the content at the same time, as a single dict

post.to_dict()

{'title': 'Building AI Agents',
 'author': 'Filmon H. Assefa',
 'date': '2025-11-03',
 'tags': ['ai', 'machine-learning', 'Ai-agents'],
 'difficulty': 'beginner',
 'content': '# Getting Started with AI-Agents\n\nThis is the main content of the document written in **Markdown**.\n\nYou can include code blocks, links, and other formatting here.'}

In [4]:
import io
import zipfile
import requests
import frontmatter


#Next, we download the repository as a zip file. GitHub provides a convenient URL format for this:


url = 'https://codeload.github.com/DataTalksClub/faq/zip/refs/heads/main'
# A timeout keeps the cell from hanging indefinitely if the server stalls
resp = requests.get(url, timeout=60)
# Fail here on an HTTP error instead of later, when an error page would be
# handed to zipfile as if it were the archive
resp.raise_for_status()


### Now process the zip file in memory without saving it to disk:

In [6]:
repository_data = []

# Open the downloaded bytes as an in-memory zip archive; the `with` block
# closes it even if parsing one of the files raises
with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
    for file_info in zf.infolist():
        # Keep the archive path exactly as stored (paths are case-sensitive);
        # lowercase is used only for the extension check below
        filename = file_info.filename

        # Only process markdown files
        if not filename.lower().endswith('.md'):
            continue

        # Read and parse each file
        with zf.open(file_info) as f_in:
            content = f_in.read()

        post = frontmatter.loads(content)
        data = post.to_dict()
        data['filename'] = filename
        repository_data.append(data)


### Let's look at what we got:

In [8]:
# Inspect one parsed record (index 1, i.e. the second markdown file read from the archive)
print(repository_data[1])

{'content': '# DataTalks.Club FAQ\n\nA static site generator for DataTalks.Club course FAQs with automated AI-powered FAQ maintenance.\n\n## Features\n\n- **Static Site Generation**: Converts markdown FAQs to a beautiful, searchable HTML site\n- **Automated FAQ Management**: AI-powered bot that processes new FAQ proposals\n- **Intelligent Triage**: Automatically determines if proposals should create new entries, update existing ones, or are duplicates\n- **GitHub Integration**: Seamless workflow via GitHub Issues and Pull Requests\n\n## Project Structure\n\n```\nfaq/\n├── _questions/              # FAQ content organized by course\n│   ├── machine-learning-zoomcamp/\n│   │   ├── _metadata.yaml   # Course configuration\n│   │   ├── general/         # General course questions\n│   │   ├── module-1/        # Module-specific questions\n│   │   └── ...\n│   ├── data-engineering-zoomcamp/\n│   └── ...\n├── _layouts/                # Jinja2 HTML templates\n│   ├── base.html\n│   ├── course.htm

#### For processing Evidently docs we also need .mdx files (React markdown), so we can modify the code like this:


```python
for file_info in zf.infolist():
    filename = file_info.filename.lower()

    if not (filename.endswith('.md') or filename.endswith('.mdx')):
        continue

    # rest remains the same...
```

In [9]:
### Let's put everything together

import io
import zipfile
import requests
import frontmatter

def read_repo_data(repo_owner, repo_name, branch='main',
                   extensions=('.md', '.mdx'), timeout=60):
    """
    Download and parse markdown files from a GitHub repository.

    The repository is fetched as a zip archive from codeload.github.com and
    processed in memory, without writing anything to disk.

    Args:
        repo_owner: GitHub username or organization
        repo_name: Repository name
        branch: Branch to download (defaults to 'main')
        extensions: File suffix, or iterable of suffixes, to parse;
            matched case-insensitively (defaults to '.md' and '.mdx')
        timeout: Timeout value passed to requests.get

    Returns:
        List of dictionaries, one per successfully parsed file. Each holds
        the parsed post's to_dict() result plus a 'filename' key with the
        file's path inside the archive (original case preserved).

    Raises:
        Exception: If the download does not return HTTP status 200.
    """
    prefix = 'https://codeload.github.com'
    url = f'{prefix}/{repo_owner}/{repo_name}/zip/refs/heads/{branch}'
    resp = requests.get(url, timeout=timeout)

    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: {resp.status_code}")

    # Accept a bare string so that extensions='.md' is not iterated
    # character by character
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)

    repository_data = []

    # The context manager closes the archive even if something unexpected
    # is raised while iterating
    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
        for file_info in zf.infolist():
            filename = file_info.filename

            # Lowercase only for matching; the stored name keeps its case
            if not filename.lower().endswith(suffixes):
                continue

            # Best effort: a file that fails to parse is reported and
            # skipped instead of aborting the whole download
            try:
                with zf.open(file_info) as f_in:
                    content = f_in.read().decode('utf-8', errors='ignore')
                post = frontmatter.loads(content)
                data = post.to_dict()
                data['filename'] = filename
                repository_data.append(data)
            except Exception as e:
                print(f"Error processing {filename}: {e}")
                continue

    return repository_data


In [10]:
# Fetch both documentation sets with the shared helper
dtc_faq = read_repo_data(repo_owner='DataTalksClub', repo_name='faq')
evidently_docs = read_repo_data(repo_owner='evidentlyai', repo_name='docs')

# Report how many documents were parsed from each repository
print(
    f"FAQ documents: {len(dtc_faq)}",
    f"Evidently documents: {len(evidently_docs)}",
    sep="\n",
)


FAQ documents: 1222
Evidently documents: 95
