In [1]:
import io
import zipfile
import requests
import frontmatter

from IPython.display import display, Markdown

In [2]:
# Parse the sample document; `post` exposes its front matter and its body.
with open('./example.md', encoding='utf-8') as md_file:
    post = frontmatter.load(md_file)

In [3]:
# Show the metadata that frontmatter attached to the loaded post.
post.metadata

{'title': 'Getting Started with AI',
 'author': 'John Doe',
 'date': '2024-01-15',
 'tags': ['ai', 'machine-learning', 'tutorial'],
 'difficulty': 'beginner'}

In [4]:
# Render the post body as Markdown instead of echoing the raw string.
display(Markdown(post.content))

# Getting Started with AI

This is the main content of the document written in **Markdown**.

You can include code blocks, links, and other formatting here.

In [5]:
# codeload.github.com URL for a zip archive of the DataTalksClub/faq
# repository at refs/heads/main.
url = 'https://codeload.github.com/DataTalksClub/faq/zip/refs/heads/main'

In [6]:
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() fails here on an HTTP error instead of letting a later
# cell try to open an error page as a zip archive.
resp = requests.get(url, timeout=30)
resp.raise_for_status()

In [7]:
# Collect one dict per markdown file found in the downloaded archive.
repository_data = []

with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
    for file_info in zf.infolist():
        # Keep the archive path exactly as stored; the lowercased copy is
        # used only for the case-insensitive extension check.
        filename = file_info.filename
        filename_lower = filename.lower()

        # Only process markdown files
        if not filename_lower.endswith(('.md', '.mdx')):
            continue

        # Read and parse each file
        with zf.open(file_info) as f_in:
            content = f_in.read()
            post = frontmatter.loads(content)
            data = post.to_dict()
            data['filename'] = filename
            repository_data.append(data)

In [8]:
# Number of markdown documents collected into repository_data.
len(repository_data)

1240

In [9]:
def read_repo_data(repo_owner, repo_name, branch='main', timeout=30):
    """
    Download and parse all markdown files from a GitHub repository.

    The repository is fetched as one zip archive from codeload.github.com
    and read from memory. Every archive member whose name ends in .md or
    .mdx (case-insensitive) is parsed with frontmatter.

    Args:
        repo_owner: GitHub username or organization
        repo_name: Repository name
        branch: Branch whose archive is downloaded (defaults to 'main')
        timeout: Value passed to requests.get as its timeout, so a
            stalled connection cannot block forever

    Returns:
        List of dictionaries, one per markdown file: the result of the
        parsed post's to_dict() plus a 'filename' key holding the member's
        path inside the archive (original case preserved)

    Raises:
        Exception: If the download responds with a status other than 200
    """
    prefix = 'https://codeload.github.com'
    url = f'{prefix}/{repo_owner}/{repo_name}/zip/refs/heads/{branch}'
    resp = requests.get(url, timeout=timeout)

    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: {resp.status_code}")

    markdown_suffixes = ('.md', '.mdx')
    repository_data = []

    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
        for file_info in zf.infolist():
            filename = file_info.filename

            # Compare on a lowercased copy so README.MD and similar match,
            # while the stored filename keeps its original case.
            if not filename.lower().endswith(markdown_suffixes):
                continue

            try:
                with zf.open(file_info) as f_in:
                    # Undecodable bytes are dropped rather than aborting.
                    content = f_in.read().decode('utf-8', errors='ignore')

                post = frontmatter.loads(content)
                data = post.to_dict()
                data['filename'] = filename
                repository_data.append(data)
            except Exception as e:
                # Best effort: report the failing file and keep going so one
                # malformed document does not discard the whole repository.
                print(f"Error processing {filename}: {e}")
                continue

    return repository_data

In [10]:
# Fetch both documentation sets with the helper defined above.
dtc_faq = read_repo_data(repo_owner='DataTalksClub', repo_name='faq')
evidently_docs = read_repo_data(repo_owner='evidentlyai', repo_name='docs')

# Report how many markdown documents each repository produced.
print(f"FAQ documents: {len(dtc_faq)}")
print(f"Evidently documents: {len(evidently_docs)}")

FAQ documents: 1240
Evidently documents: 95
