**Recursively counting words in Markdown files within a folder**

In [1]:
import os
import re
import pathlib
from typing import List
import nbformat
import io
from nbformat import current

In [2]:
# Source: https://github.com/gandreadis/markdown-word-count
def count_words_in_markdown(filePath: str) -> int:
    """Count the words in a Markdown file after stripping markup.

    The file is read as UTF-8 and passed through a fixed sequence of
    regex/replace steps that drop HTML comments, footnote definitions,
    indented code blocks, custom header IDs, images, HTML tags, markup
    punctuation, numeric footnote references and enumeration markers.
    Whatever remains is split on whitespace and counted.

    Args:
        filePath: Path of the Markdown file. The value is handed straight to
            ``open()``, so ``pathlib.Path`` objects work as well as strings.

    Returns:
        The number of whitespace-separated tokens left after stripping.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filePath, 'r', encoding='utf8') as f:
        text = f.read()

    # Comments. DOTALL lets '.' cross line breaks so that a comment spanning
    # several lines is removed as a whole instead of leaking its words.
    text = re.sub(r'<!--(.*?)-->', '', text, flags=re.DOTALL)
    # Tabs to spaces
    text = text.replace('\t', '    ')
    # More than 1 space to 4 spaces (a run at the start of a line therefore
    # counts as code-block indentation in the step after the next one)
    text = re.sub(r'[ ]{2,}', '    ', text)
    # Footnotes
    text = re.sub(r'^\[[^]]*\][^(].*', '', text, flags=re.MULTILINE)
    # Indented blocks of code (indented list items starting with - or * are kept)
    text = re.sub(r'^( {4,}[^-*]).*', '', text, flags=re.MULTILINE)
    # Replace newlines with spaces for uniform handling
    text = text.replace('\n', ' ')
    # Custom header IDs. The text is a single line at this point, so the match
    # must stop at the first closing brace; a greedy '.*' would delete
    # everything between the first '{#' and the last '}' of the document.
    text = re.sub(r'\{#[^}]*\}', '', text)
    # Remove images
    text = re.sub(r'!\[[^\]]*\]\([^)]*\)', '', text)
    # Remove HTML tags
    text = re.sub(r'</?[^>]*>', '', text)
    # Remove special characters
    text = re.sub(r'[#*`~\-–^=<>+|/:]', '', text)
    # Remove footnote references
    text = re.sub(r'\[[0-9]*\]', '', text)
    # Remove enumerations
    text = re.sub(r'[0-9#]*\.', '', text)

    return len(text.split())

In [3]:
# Top directory to search through: the 'content' folder located four levels
# above the current working directory.
topFolder: pathlib.Path = pathlib.Path.cwd().parent.parent.parent.parent.joinpath('content')


def find_markdown_files(root: pathlib.Path) -> List[pathlib.Path]:
    """Return all files below ``root`` whose suffix is exactly '.md'.

    The search is recursive. Directories whose name happens to end in '.md'
    are skipped, because they cannot be opened and read as Markdown files.
    The result is sorted so the order does not depend on the file system.

    Args:
        root: Directory to search.

    Returns:
        Sorted list of paths to the Markdown files that were found.
    """
    return sorted(
        candidate
        for candidate in root.glob('**/*')
        if candidate.suffix == '.md' and candidate.is_file()
    )


allMarkdown: List[pathlib.Path] = find_markdown_files(topFolder)

print(len(allMarkdown))

57


In [4]:
# Add up the word counts of every collected Markdown file
totalWordCount: int = sum(map(count_words_in_markdown, allMarkdown))

print(totalWordCount)

31623
