In [41]:
from google.cloud import storage
from firebase_admin import firestore, initialize_app

# Establish a connection to the Google Cloud Storage and Firestore
storage_client = storage.Client()
bucket = storage_client.bucket('website-content12345')
initialize_app()
db = firestore.client()

In [2]:
from datetime import datetime
# Okay let's insert some data into Firestore
doc_ref = db.collection('feed').document('content-log')

timestamp = datetime.now()
timestamp = timestamp.strftime("%Y-%m-%d %H:%M:%S")

data = {
    timestamp: {
        'title': 'Test Blog',
        'location': 'blogs/test-blog.md',
    }
}

# Upload it
doc_ref.set(data)

update_time {
  seconds: 1720282478
  nanos: 194105000
}

In [4]:
feed = db.collection('feed').document('content-log')
data = feed.get().to_dict()

# Sort by key (timestamp) desc
data = dict(sorted(data.items(), key=lambda item: item[0], reverse=True))
data

{'2024-07-06 17:14:38': {'location': 'blogs/test-blog.md'}}

In [9]:
for key, value in data.items():
    blob = bucket.blob(value['location'])
    md = blob.download_as_string().decode('utf-8')

# We can capture the section between --- and --- and use it as metadata
metadata = md.split('---')[1]
description = {}
for line in metadata.split('\n'):
    if line:
        split_line = line.split(':')
        key = split_line[0].strip()
        # Remove the quotes
        value = ":".join(split_line[1:]).strip()[1:-1]
        description[key.strip()] = value

description

{'title': 'Test Post',
 'author': 'Ed',
 'date': '2024-07-06',
 'tags': '"coding", "python"',
 'type': 'blog',
 'description': 'This is a blog post about coding in Python.',
 'thumbnail': 'images/awesome-blog-post-thumbnail.jpg',
 'og_title': 'Awesome Blog Post',
 'og_description': 'An amazing blog post about coding in Python.',
 'og_image': 'images/awesome-blog-post-og.jpg'}

In [2]:
# Find md files in test-dir
import os
md_files = []
images = []
for root, dirs, files in os.walk('test-dir'):
    for file in files:
        if file.endswith('.md'):
            md_files.append(os.path.join(root, file))
        elif file.endswith('.png') :
            images.append(os.path.join(root, file))

print(md_files)
print("-"*10)
print(images)
print("-"*10)
print(len(md_files))
print("-"*10)
print(len(images))

['test-dir/2024/January/books.md', 'test-dir/2024/February/hip.md', 'test-dir/2024/May/discord.md', 'test-dir/2024/April/internet.md', 'test-dir/2024/March/horses.md', 'test-dir/2023/December/depressed.md', 'test-dir/2023/December/charlmes.md', 'test-dir/2023/July/burnout.md', 'test-dir/2023/July/onrss.md', 'test-dir/2023/July/dustydrawers.md', 'test-dir/2023/August/stress.md', 'test-dir/2023/August/prank.md', 'test-dir/2023/February/stumbleupon.md', 'test-dir/2023/June/alcoholism.md', 'test-dir/2023/June/oldwebsites.md', 'test-dir/2023/April/bananas.md']
----------
['test-dir/blog/2024/phone.png', 'test-dir/blog/2024/horse.png', 'test-dir/blog/2024/skateboard.png', 'test-dir/blog/2024/discord.png', 'test-dir/blog/2023/rss.png', 'test-dir/blog/2023/stumbleUpon.png', 'test-dir/blog/2023/depression.png', 'test-dir/blog/2023/duck.png', 'test-dir/blog/2023/banana.png', 'test-dir/blog/2023/outside.png', 'test-dir/blog/2023/computer.png', 'test-dir/blog/2023/stress.png', 'test-dir/blog/2023/

In [4]:
# Now we need to add metadata to the md files and upload them to the bucket
# The thumbnails can be compressed jpgs of the images
# Oh we also need to link the images in the md files to the image files
related_images = {}
# Go through our markdown files - md file is the key and any images in the doc are the values
for md_file in md_files:
    with open(md_file, 'r') as f:
        md = f.read()
    images = []
    for line in md.split('\n'):
        if '![' in line:
            images.append(line.split('(')[1].split(')')[0])
    related_images[md_file] = {
        'images': images
    }

print(related_images)

{'test-dir/2024/January/books.md': {'images': []}, 'test-dir/2024/February/hip.md': {'images': ['/images/blog/2024/skateboard.png']}, 'test-dir/2024/May/discord.md': {'images': ['/images/blog/2024/discord.png']}, 'test-dir/2024/April/internet.md': {'images': ['/images/blog/2024/phone.png']}, 'test-dir/2024/March/horses.md': {'images': ['/images/blog/2024/horse.png']}, 'test-dir/2023/December/depressed.md': {'images': ['/images/blog/2023/depression.png']}, 'test-dir/2023/December/charlmes.md': {'images': ['/images/blog/2023/duck.png']}, 'test-dir/2023/July/burnout.md': {'images': ['/images/blog/2023/burnout.png']}, 'test-dir/2023/July/onrss.md': {'images': ['/images/blog/2023/rss.png']}, 'test-dir/2023/July/dustydrawers.md': {'images': ['/images/blog/2023/hobbies.png', '/images/blog/2023/outside.png']}, 'test-dir/2023/August/stress.md': {'images': ['/images/blog/2023/stress.png']}, 'test-dir/2023/August/prank.md': {'images': ['/images/blog/2023/computer.png']}, 'test-dir/2023/February/s

In [7]:
from bs4 import BeautifulSoup
metadata = {}
# Cool now we can get the date these were published and the titles
for md_file in md_files:
    with open(md_file, 'r') as f:
        md = f.read()
    # First line is date, rm the # and strip
    date = md.split('\n')[0].strip('#').strip().replace("/", "-")
    # Second line is title, rm the # and strip
    title = md.split('\n')[1].strip('#').strip()

    # We can open the corresponding html file to get the og tags
    html_file = md_file.replace('.md', '.html')
    with open(html_file, 'r') as f:
        html = f.read()

    # Use bs
    soup = BeautifulSoup(html, 'html.parser')
    # Get og tags
    og_tags = {}
    for tag in soup.find_all('meta'):
        if tag.get('property') and tag.get('content'):
            og_tags[tag.get('property').replace(":", "_")] = tag.get('content')

    # Author is always Ed
    author = 'Ed'

    # We need some tags
    # We can set some default tags since this is my peronsal blog about silly things
    tags = ['Silly', 'Personal', 'Lifestyle']

    # Type is blog
    _type = 'blog'

    # Now we just need a thumbnail which we'll copy the first image name, add _thumbnail, compress to 64x64 and make a .jpg
    try:
        thumbnail = related_images[md_file]['images'][0].replace('.png', '_thumbnail.jpg')
    except:
        thumbnail = None

    # Now set all this data
    data = {
        'date': date,
        'title': title,
        'author': author,
        'tags': tags,
        'type': _type,
        'thumbnail': thumbnail,
        'og_tags': og_tags
    }

    metadata[md_file] = data
    metadata[md_file]['images'] = related_images[md_file]['images']

metadata

{'test-dir/2024/January/books.md': {'date': '04-01-2024',
  'title': 'Best Books of 2023',
  'author': 'Ed',
  'tags': ['Silly', 'Personal', 'Lifestyle'],
  'type': 'blog',
  'thumbnail': None,
  'og_tags': {'og_title': 'Best Books of 2023',
   'og_description': 'I started 2023 off strong with my reading, absolutely ploughing through books at a rate of one every other day, but this significantly slowed down in September ...',
   'og_type': 'article'},
  'images': []},
 'test-dir/2024/February/hip.md': {'date': '01-02-2024',
  'title': 'Why Do I Keep Injuring My Left Hip Specifically?',
  'author': 'Ed',
  'tags': ['Silly', 'Personal', 'Lifestyle'],
  'type': 'blog',
  'thumbnail': '/images/blog/2024/skateboard_thumbnail.jpg',
  'og_tags': {'og_title': 'Why Do I Keep Injuring My Left Hip Specifically?',
   'og_image': '/images/blog/2024/skateboard.png',
   'og_description': "When I was 30 years old I had a great idea. In spite of having never rollerskated in my life (apart from apparent

In [14]:
# Now we can insert this as metadata into our blog markdown and move them up to the root directory and also strip all the path from the image paths except filename
# Also images doesn't need to be included in the metadata it's just so we can track what goes in the bucket
# All our md files need the image paths setting to /assets/images/imagename.png
# With that in mind let's go ahead and do it
for md_file in md_files:
    with open(md_file, 'r') as f:
        md = f.read()

    # Remove the first two lines as these are title/date which we'll render from the metadata
    md = "\n".join(md.split('\n')[2:])

    # Now strip any excess lines
    md = md.strip()

    # Replace the image paths
    for image in metadata[md_file]['images']:
        md = md.replace(image, f"/assets/images/{image.split('/')[-1]}")

    # Get the metadata
    data = metadata[md_file]

    # Now we need to update the md file with the metadata
    new_md = f"""---
date: {data['date']}
title: {data['title']}
author: {data['author']}
tags: {data['tags']}
type: {data['type']}
thumbnail: /assets/images/{data['thumbnail'].split('/')[-1] if data['thumbnail'] else ''}
og_title: {data['og_tags'].get('og_title', '')}
og_description: {data['og_tags'].get('og_description', '')}
og_image: {data['og_tags'].get('og_image', '')}
og_type: {data['og_tags'].get('og_type', '')}
---
{md}
"""

    if not os.path.exists('test-dir/blogs'):
        os.makedirs('test-dir/blogs')

    # Write this new md to test-dir/blogs
    with open(f"test-dir/blogs/{md_file.split('/')[-1]}", 'w') as f:
        f.write(new_md)

# Now let's copy all our images to test-dir/images
import shutil
if not os.path.exists('test-dir/images'):
    os.makedirs('test-dir/images')

for image in images:
    shutil.copy(os.path.join('test-dir', *image.split('/')[2:]), f"test-dir/images/{image.split('/')[-1]}")

In [16]:
images = []
for root, dirs, files in os.walk('test-dir'):
    for file in files:
        if file.endswith('.md'):
            md_files.append(os.path.join(root, file))
        elif file.endswith('.png') :
            images.append(os.path.join(root, file))

In [19]:

for image in images:
    try:
        shutil.copy(os.path.join('test-dir', *image.split('/')[1:]), f"test-dir/images/{image.split('/')[-1]}")
    except Exception as e:
        print(e)


'test-dir/images/banana.png' and 'test-dir/images/banana.png' are the same file


In [20]:
# Date format is incorrect it goes dd-mm-yyyy instead of yyyy-mm-dd
# Let's fix that
for root, dirs, files in os.walk('test-dir/blogs'):
    for file in files:
        with open(os.path.join(root, file), 'r') as f:
            md = f.read()
        date = md.split('date: ')[1].split('\n')[0]
        date = date.split('-')
        date = f"{date[2]}-{date[1]}-{date[0]}"
        md = md.replace(md.split('date: ')[1].split('\n')[0], date)
        with open(os.path.join(root, file), 'w') as f:
            f.write(md)


In [21]:
# Oh let's use PIL to make our thumbnail images
images = os.listdir('test-dir/images')
images

['phone.png',
 'horse.png',
 'rss.png',
 'stumbleUpon.png',
 'depression.png',
 'duck.png',
 'banana.png',
 'outside.png',
 'computer.png',
 'stress.png',
 'burnout.png',
 'hu.png',
 'skateboard.png',
 'stairs.png',
 'hobbies.png',
 'discord.png']

In [27]:
from PIL import Image
for image in images:
    img = Image.open(f"test-dir/images/{image}")
    img.thumbnail((256, 256))
    img = img.convert("RGB")
    img.save(f"test-dir/images/{image.replace('.png', '_thumbnail.jpg')}")

In [39]:
import re

pattern = r'(og_image: )(/images/blog/\d{4}/)'

# Our md files have incorrect image paths for og_image
for md_file in os.listdir(os.path.join('test-dir', 'blogs')):
    with open(os.path.join('test-dir', 'blogs', md_file), 'r') as f:
        md = f.read()

    # Regex replace the capture group with /assets/images/
    md = re.sub(pattern, r'\1/assets/images/', md)

    # Write it back
    with open(os.path.join('test-dir', 'blogs', md_file), 'w') as f:
        f.write(md)


In [42]:
# Cool now we need to write our firestore log
feed = db.collection('feed').document('content-log')
data = feed.get().to_dict()
data

{'2024-07-06 17:14:38': {'location': 'blogs/test-blog.md'}}

In [44]:
from datetime import datetime as dt
from datetime import timedelta

data = {}
# So we will overwrite this with all our blogs - we do actually have a last edit date in the original file metadata we can use
# But if it is prior to the actual date we will use the actual date at midnight
# So let's go ahead and do that
# First go through and get datetime objects for all the blogs
for year in ['2023', '2024']:
    for root, dirs, files in os.walk(os.path.join('test-dir', year)):
        for file in files:
            if file.endswith('.md'):
                with open(os.path.join(root, file), 'r') as f:
                    md = f.read()
                actual_date = md.split('\n')[0].strip('#').strip().replace("/", "-")
                # Parse the actual date
                actual_date = dt.strptime(actual_date, "%d-%m-%Y")
                # Set time to midnight
                actual_date = actual_date.replace(hour=0, minute=0, second=0, microsecond=0)

                # Now get the file edit datetime
                edited_time = dt.fromtimestamp(os.path.getmtime(os.path.join(root, file)))

                # Make sure that edited_time isn't more than 1 day ahead of actual_date
                if edited_time <= actual_date + timedelta(days=1):
                    actual_date = edited_time

                data[actual_date.strftime("%Y-%m-%d %H:%M:%S")] = {
                    'location': os.path.join('blogs', file),
                }

print(data)

{'2023-12-17 16:08:05': {'location': 'blogs/depressed.md'}, '2023-12-23 11:03:45': {'location': 'blogs/charlmes.md'}, '2023-07-23 10:22:07': {'location': 'blogs/burnout.md'}, '2023-07-18 08:52:17': {'location': 'blogs/onrss.md'}, '2023-07-29 09:05:47': {'location': 'blogs/dustydrawers.md'}, '2023-08-08 12:02:16': {'location': 'blogs/stress.md'}, '2023-08-22 15:15:32': {'location': 'blogs/prank.md'}, '2023-02-22 00:00:00': {'location': 'blogs/stumbleupon.md'}, '2023-06-05 00:00:00': {'location': 'blogs/alcoholism.md'}, '2023-06-20 14:28:27': {'location': 'blogs/oldwebsites.md'}, '2023-04-17 00:00:00': {'location': 'blogs/bananas.md'}, '2024-01-04 11:35:56': {'location': 'blogs/books.md'}, '2024-02-01 17:08:24': {'location': 'blogs/hip.md'}, '2024-05-06 09:38:07': {'location': 'blogs/discord.md'}, '2024-04-23 08:17:04': {'location': 'blogs/internet.md'}, '2024-03-14 00:00:00': {'location': 'blogs/horses.md'}}


In [45]:
# Banging job now let's just order it such that the most recent is first
data = dict(sorted(data.items(), key=lambda item: item[0], reverse=True))
data

{'2024-05-06 09:38:07': {'location': 'blogs/discord.md'},
 '2024-04-23 08:17:04': {'location': 'blogs/internet.md'},
 '2024-03-14 00:00:00': {'location': 'blogs/horses.md'},
 '2024-02-01 17:08:24': {'location': 'blogs/hip.md'},
 '2024-01-04 11:35:56': {'location': 'blogs/books.md'},
 '2023-12-23 11:03:45': {'location': 'blogs/charlmes.md'},
 '2023-12-17 16:08:05': {'location': 'blogs/depressed.md'},
 '2023-08-22 15:15:32': {'location': 'blogs/prank.md'},
 '2023-08-08 12:02:16': {'location': 'blogs/stress.md'},
 '2023-07-29 09:05:47': {'location': 'blogs/dustydrawers.md'},
 '2023-07-23 10:22:07': {'location': 'blogs/burnout.md'},
 '2023-07-18 08:52:17': {'location': 'blogs/onrss.md'},
 '2023-06-20 14:28:27': {'location': 'blogs/oldwebsites.md'},
 '2023-06-05 00:00:00': {'location': 'blogs/alcoholism.md'},
 '2023-04-17 00:00:00': {'location': 'blogs/bananas.md'},
 '2023-02-22 00:00:00': {'location': 'blogs/stumbleupon.md'}}

In [46]:
# Slam it into firestore
feed.set(data)

update_time {
  seconds: 1720885488
  nanos: 98901000
}

In [51]:
# Go through and copy the og_description to the description field
for md_file in os.listdir(os.path.join('test-dir', 'blogs')):
    with open(os.path.join('test-dir', 'blogs', md_file), 'r') as f:
        md = f.read()

    # Get the og_description
    og_description = md.split('og_description: ')[1].split('\n')[0]

    # Add a description field under title
    title = md.split('title: ')[1].split('\n')[0]
    title_with_description = f"{title}\ndescription: {og_description}"
    md = md.replace(title, title_with_description)

    # Write it back
    with open(os.path.join('test-dir', 'blogs', md_file), 'w') as f:
        f.write(md)

In [53]:
# Rename our music files to add underscores in place of spaces
for root, dirs, files in os.walk('test-dir/music'):
    for file in files:
        if ' ' in file:
            os.rename(os.path.join(root, file), os.path.join(root, file.replace(' ', '_')))
        if '[' in file or ']' in file:
            os.rename(os.path.join(root, file), os.path.join(root, file.replace('[', '').replace(']', '')))

In [54]:
# Let's write a markdown file for our Planet Ed Album
metadata = {
    'date': '2005-01-01',
    'title': 'Planet Ed',
    'description': 'My first album, Planet Ed, was released in 2005. Nobody really cared. I was 12 going on 13.',
    'author': 'Ed',
    'tags': ['Music', 'Planet Ed'],
    'type': 'music',
    'thumbnail': '/assets/images/planet_ed.jpg',
    'og_title': 'Planet Ed',
    'og_description': 'My first album, Planet Ed, was released in 2005. Nobody really cared. I was 12 going on 13.',
    'og_image': '/assets/images/planet_ed.jpg',
    'og_type': 'music'
}

metadata_md = f"""---
date: {metadata['date']}
title: {metadata['title']}
description: {metadata['description']}
author: {metadata['author']}
tags: {metadata['tags']}
type: {metadata['type']}
thumbnail: {metadata['thumbnail']}
og_title: {metadata['og_title']}
og_description: {metadata['og_description']}
og_image: {metadata['og_image']}
og_type: {metadata['og_type']}
---
"""

# Now the actual content
content = """
---
When I was a young 12 year old, I was playing around with Gamemaker.

I took copyright law very seriously, and decided I must make my own music for my games otherwise I would be a criminal.

Thus started my music career, under the moniker Planet Ed (at some point stylised instead as Planet 'ed, a contraction of Planet Head). I didn't know anything about music theory. I had a light background in jazz piano, but I generally couldn't be bothered to learn anything properly.

Destroyed World was the first song I made, I was playing Jak and Daxter at the time so I was inspired by the music in that game. It is unexpectedly a very good piece of music.

The rest of the music may seem weird and disjointed, but I was 12 and to reiterate, I didn't know anything about music theory.

Armed with a copy of Sibelius 3 and an EMU Proteus 2000, I set out to make my first album. I was very proud of it at the time. Now I just look back at it with nostalgia.

I hope you enjoy it.

PS. The recording is terrible because I didn't know how to record properly. I didn't know what a DAW was.
---
title: Blips
file: /assets/music/01_Blips.mp3
title: Heaven
file: /assets/music/02_Heaven.mp3
title: Drum Pie and Peas
file: /assets/music/03_Drum_Pie_and_Peas.mp3
title: Folk Thing
file: /assets/music/04_Folk_Thing.mp3
title: Power Plant [Power Failure Remix]
file: /assets/music/05_Power_Plant_Power_Failure_Remix.mp3
title: Heavy Machinery
file: /assets/music/06_Heavy_Machinery.mp3
title: Aliens
file: /assets/music/07_Aliens.mp3
title: Epic Song
file: /assets/music/08_Epic_Song.mp3
title: Destroyed World
file: /assets/music/09_Destroyed_World.mp3
title: The Banjo Experience
file: /assets/music/10_The_Banjo_Experience.mp3
title: Taking a Ride
file: /assets/music/11_Taking_a_Ride.mp3
title: Power Plant
file: /assets/music/12_Power_Plant.mp3
"""

with open('test-dir/planet_ed.md', 'w') as f:
    f.write(metadata_md + content)

In [58]:
# Alright we gonna add collection metadata to our markdown files
for f in os.listdir(os.path.join('test-dir', 'blogs')):
    with open(os.path.join('test-dir', 'blogs', f), 'r') as file:
        md = file.read()

    # Add collection metadata
    collection_metadata = "collection: Ed's Blog"
    metadata = md.split('---')[1]
    md = md.replace(metadata, f"{metadata}{collection_metadata}\n")
    # Write it back
    with open(os.path.join('test-dir', 'blogs', f), 'w') as file:
        file.write(md)

# Now same in the music directory
with open('test-dir/planet_ed.md', 'r') as file:
    md = file.read()
metadata = md.split('---')[1]
collection_metadata = "collection: Ed's Music"
# Add collection metadata
md = md.replace(metadata, f"{metadata}{collection_metadata}\n")
# Write it back
with open('test-dir/planet_ed.md', 'w') as file:
    file.write(md)

In [60]:
# Now let's get our feed data and create a list of all our blog posts in reverse chronological order
# We can use this to write to our new firestore collection
feed = db.collection('feed').document('content-log').get().to_dict()
feed

{'2023-02-22 00:00:00': {'location': 'blogs/stumbleupon.md'},
 '2024-03-14 00:00:00': {'location': 'blogs/horses.md'},
 '2023-12-23 11:03:45': {'location': 'blogs/charlmes.md'},
 '2024-04-23 08:17:04': {'location': 'blogs/internet.md'},
 '2023-06-20 14:28:27': {'location': 'blogs/oldwebsites.md'},
 '2024-02-01 17:08:24': {'location': 'blogs/hip.md'},
 '2023-12-17 16:08:05': {'location': 'blogs/depressed.md'},
 '2023-04-17 00:00:00': {'location': 'blogs/bananas.md'},
 '2024-01-04 11:35:56': {'location': 'blogs/books.md'},
 '2023-07-29 09:05:47': {'location': 'blogs/dustydrawers.md'},
 '2023-07-18 08:52:17': {'location': 'blogs/onrss.md'},
 '2023-08-22 15:15:32': {'location': 'blogs/prank.md'},
 '2023-08-08 12:02:16': {'location': 'blogs/stress.md'},
 '2023-06-05 00:00:00': {'location': 'blogs/alcoholism.md'},
 '2023-07-23 10:22:07': {'location': 'blogs/burnout.md'},
 '2024-05-06 09:38:07': {'location': 'blogs/discord.md'}}

In [61]:
new_doc = db.collection('collections').document('eds_blog')
# Let's write a list from our feed data
data = [
    v['location'] for v in feed.values()
]
data

['blogs/stumbleupon.md',
 'blogs/horses.md',
 'blogs/charlmes.md',
 'blogs/internet.md',
 'blogs/oldwebsites.md',
 'blogs/hip.md',
 'blogs/depressed.md',
 'blogs/bananas.md',
 'blogs/books.md',
 'blogs/dustydrawers.md',
 'blogs/onrss.md',
 'blogs/prank.md',
 'blogs/stress.md',
 'blogs/alcoholism.md',
 'blogs/burnout.md',
 'blogs/discord.md']

In [62]:
# Cool now we can write this to our new collection
new_doc.set({'content': data})

update_time {
  seconds: 1720953389
  nanos: 391888000
}