In [7]:
from google.cloud import storage
from firebase_admin import firestore, initialize_app

# Establish a connection to the Google Cloud Storage and Firestore
storage_client = storage.Client()
bucket = storage_client.bucket('website-content12345')
initialize_app()
db = firestore.client()

In [2]:
feed = db.collection('feed').document('content-log')
data = feed.get().to_dict()

# Sort by key (timestamp) desc
data = dict(sorted(data.items(), key=lambda item: item[0], reverse=True))
data

{'2024-07-18 17:14:21': {'location': 'blogs/keyboard_cat.md'},
 '2024-06-29 00:00:00': {'location': 'blogs/sql_recursive.md'},
 '2024-06-07 12:23:34': {'location': 'comics/pp_comic17.md'},
 '2024-05-14 09:28:11': {'location': 'videos/Ted_Crusty_a_lost_legend_of_Youtube.md'},
 '2024-05-06 09:38:07': {'location': 'blogs/discord.md'},
 '2024-04-23 08:17:04': {'location': 'blogs/internet.md'},
 '2024-04-15 06:35:07': {'location': 'videos/I_waited_nearly_30_years_to_beat_this_game.md'},
 '2024-03-14 00:00:00': {'location': 'blogs/horses.md'},
 '2024-03-13 00:00:26': {'location': 'videos/A_Game_Too_Obscure_to_Beat.md'},
 '2024-02-12 00:00:01': {'location': 'videos/The_People_Who_Make_Bad_Games.md'},
 '2024-02-01 17:08:24': {'location': 'blogs/hip.md'},
 '2024-01-13 13:33:15': {'location': 'videos/What_Happened_to_That39s_My_Sonic.md'},
 '2024-01-04 16:58:00': {'location': 'blogs/Soviet_Burgers_Big_Red_Adventure.md'},
 '2024-01-04 11:35:56': {'location': 'blogs/books.md'},
 '2023-12-28 15:56:

In [9]:
for key, value in data.items():
    blob = bucket.blob(value['location'])
    md = blob.download_as_string().decode('utf-8')

# We can capture the section between --- and --- and use it as metadata
metadata = md.split('---')[1]
description = {}
for line in metadata.split('\n'):
    if line:
        split_line = line.split(':')
        key = split_line[0].strip()
        # Remove the quotes
        value = ":".join(split_line[1:]).strip()[1:-1]
        description[key.strip()] = value

description

{'title': 'Test Post',
 'author': 'Ed',
 'date': '2024-07-06',
 'tags': '"coding", "python"',
 'type': 'blog',
 'description': 'This is a blog post about coding in Python.',
 'thumbnail': 'images/awesome-blog-post-thumbnail.jpg',
 'og_title': 'Awesome Blog Post',
 'og_description': 'An amazing blog post about coding in Python.',
 'og_image': 'images/awesome-blog-post-og.jpg'}

In [2]:
# Find md files in test-dir
import os
md_files = []
images = []
for root, dirs, files in os.walk('test-dir'):
    for file in files:
        if file.endswith('.md'):
            md_files.append(os.path.join(root, file))
        elif file.endswith('.png') :
            images.append(os.path.join(root, file))

print(md_files)
print("-"*10)
print(images)
print("-"*10)
print(len(md_files))
print("-"*10)
print(len(images))

['test-dir/2024/January/books.md', 'test-dir/2024/February/hip.md', 'test-dir/2024/May/discord.md', 'test-dir/2024/April/internet.md', 'test-dir/2024/March/horses.md', 'test-dir/2023/December/depressed.md', 'test-dir/2023/December/charlmes.md', 'test-dir/2023/July/burnout.md', 'test-dir/2023/July/onrss.md', 'test-dir/2023/July/dustydrawers.md', 'test-dir/2023/August/stress.md', 'test-dir/2023/August/prank.md', 'test-dir/2023/February/stumbleupon.md', 'test-dir/2023/June/alcoholism.md', 'test-dir/2023/June/oldwebsites.md', 'test-dir/2023/April/bananas.md']
----------
['test-dir/blog/2024/phone.png', 'test-dir/blog/2024/horse.png', 'test-dir/blog/2024/skateboard.png', 'test-dir/blog/2024/discord.png', 'test-dir/blog/2023/rss.png', 'test-dir/blog/2023/stumbleUpon.png', 'test-dir/blog/2023/depression.png', 'test-dir/blog/2023/duck.png', 'test-dir/blog/2023/banana.png', 'test-dir/blog/2023/outside.png', 'test-dir/blog/2023/computer.png', 'test-dir/blog/2023/stress.png', 'test-dir/blog/2023/

In [4]:
# Now we need to add metadata to the md files and upload them to the bucket
# The thumbnails can be compressed jpgs of the images
# Oh we also need to link the images in the md files to the image files
related_images = {}
# Go through our markdown files - md file is the key and any images in the doc are the values
for md_file in md_files:
    with open(md_file, 'r') as f:
        md = f.read()
    images = []
    for line in md.split('\n'):
        if '![' in line:
            images.append(line.split('(')[1].split(')')[0])
    related_images[md_file] = {
        'images': images
    }

print(related_images)

{'test-dir/2024/January/books.md': {'images': []}, 'test-dir/2024/February/hip.md': {'images': ['/images/blog/2024/skateboard.png']}, 'test-dir/2024/May/discord.md': {'images': ['/images/blog/2024/discord.png']}, 'test-dir/2024/April/internet.md': {'images': ['/images/blog/2024/phone.png']}, 'test-dir/2024/March/horses.md': {'images': ['/images/blog/2024/horse.png']}, 'test-dir/2023/December/depressed.md': {'images': ['/images/blog/2023/depression.png']}, 'test-dir/2023/December/charlmes.md': {'images': ['/images/blog/2023/duck.png']}, 'test-dir/2023/July/burnout.md': {'images': ['/images/blog/2023/burnout.png']}, 'test-dir/2023/July/onrss.md': {'images': ['/images/blog/2023/rss.png']}, 'test-dir/2023/July/dustydrawers.md': {'images': ['/images/blog/2023/hobbies.png', '/images/blog/2023/outside.png']}, 'test-dir/2023/August/stress.md': {'images': ['/images/blog/2023/stress.png']}, 'test-dir/2023/August/prank.md': {'images': ['/images/blog/2023/computer.png']}, 'test-dir/2023/February/s

In [7]:
from bs4 import BeautifulSoup
metadata = {}
# Cool now we can get the date these were published and the titles
for md_file in md_files:
    with open(md_file, 'r') as f:
        md = f.read()
    # First line is date, rm the # and strip
    date = md.split('\n')[0].strip('#').strip().replace("/", "-")
    # Second line is title, rm the # and strip
    title = md.split('\n')[1].strip('#').strip()

    # We can open the corresponding html file to get the og tags
    html_file = md_file.replace('.md', '.html')
    with open(html_file, 'r') as f:
        html = f.read()

    # Use bs
    soup = BeautifulSoup(html, 'html.parser')
    # Get og tags
    og_tags = {}
    for tag in soup.find_all('meta'):
        if tag.get('property') and tag.get('content'):
            og_tags[tag.get('property').replace(":", "_")] = tag.get('content')

    # Author is always Ed
    author = 'Ed'

    # We need some tags
    # We can set some default tags since this is my peronsal blog about silly things
    tags = ['Silly', 'Personal', 'Lifestyle']

    # Type is blog
    _type = 'blog'

    # Now we just need a thumbnail which we'll copy the first image name, add _thumbnail, compress to 64x64 and make a .jpg
    try:
        thumbnail = related_images[md_file]['images'][0].replace('.png', '_thumbnail.jpg')
    except:
        thumbnail = None

    # Now set all this data
    data = {
        'date': date,
        'title': title,
        'author': author,
        'tags': tags,
        'type': _type,
        'thumbnail': thumbnail,
        'og_tags': og_tags
    }

    metadata[md_file] = data
    metadata[md_file]['images'] = related_images[md_file]['images']

metadata

{'test-dir/2024/January/books.md': {'date': '04-01-2024',
  'title': 'Best Books of 2023',
  'author': 'Ed',
  'tags': ['Silly', 'Personal', 'Lifestyle'],
  'type': 'blog',
  'thumbnail': None,
  'og_tags': {'og_title': 'Best Books of 2023',
   'og_description': 'I started 2023 off strong with my reading, absolutely ploughing through books at a rate of one every other day, but this significantly slowed down in September ...',
   'og_type': 'article'},
  'images': []},
 'test-dir/2024/February/hip.md': {'date': '01-02-2024',
  'title': 'Why Do I Keep Injuring My Left Hip Specifically?',
  'author': 'Ed',
  'tags': ['Silly', 'Personal', 'Lifestyle'],
  'type': 'blog',
  'thumbnail': '/images/blog/2024/skateboard_thumbnail.jpg',
  'og_tags': {'og_title': 'Why Do I Keep Injuring My Left Hip Specifically?',
   'og_image': '/images/blog/2024/skateboard.png',
   'og_description': "When I was 30 years old I had a great idea. In spite of having never rollerskated in my life (apart from apparent

In [14]:
# Now we can insert this as metadata into our blog markdown and move them up to the root directory and also strip all the path from the image paths except filename
# Also images doesn't need to be included in the metadata it's just so we can track what goes in the bucket
# All our md files need the image paths setting to /assets/images/imagename.png
# With that in mind let's go ahead and do it
for md_file in md_files:
    with open(md_file, 'r') as f:
        md = f.read()

    # Remove the first two lines as these are title/date which we'll render from the metadata
    md = "\n".join(md.split('\n')[2:])

    # Now strip any excess lines
    md = md.strip()

    # Replace the image paths
    for image in metadata[md_file]['images']:
        md = md.replace(image, f"/assets/images/{image.split('/')[-1]}")

    # Get the metadata
    data = metadata[md_file]

    # Now we need to update the md file with the metadata
    new_md = f"""---
date: {data['date']}
title: {data['title']}
author: {data['author']}
tags: {data['tags']}
type: {data['type']}
thumbnail: /assets/images/{data['thumbnail'].split('/')[-1] if data['thumbnail'] else ''}
og_title: {data['og_tags'].get('og_title', '')}
og_description: {data['og_tags'].get('og_description', '')}
og_image: {data['og_tags'].get('og_image', '')}
og_type: {data['og_tags'].get('og_type', '')}
---
{md}
"""

    if not os.path.exists('test-dir/blogs'):
        os.makedirs('test-dir/blogs')

    # Write this new md to test-dir/blogs
    with open(f"test-dir/blogs/{md_file.split('/')[-1]}", 'w') as f:
        f.write(new_md)

# Now let's copy all our images to test-dir/images
import shutil
if not os.path.exists('test-dir/images'):
    os.makedirs('test-dir/images')

for image in images:
    shutil.copy(os.path.join('test-dir', *image.split('/')[2:]), f"test-dir/images/{image.split('/')[-1]}")

In [16]:
images = []
for root, dirs, files in os.walk('test-dir'):
    for file in files:
        if file.endswith('.md'):
            md_files.append(os.path.join(root, file))
        elif file.endswith('.png') :
            images.append(os.path.join(root, file))

In [19]:

for image in images:
    try:
        shutil.copy(os.path.join('test-dir', *image.split('/')[1:]), f"test-dir/images/{image.split('/')[-1]}")
    except Exception as e:
        print(e)


'test-dir/images/banana.png' and 'test-dir/images/banana.png' are the same file


In [20]:
# Date format is incorrect it goes dd-mm-yyyy instead of yyyy-mm-dd
# Let's fix that
for root, dirs, files in os.walk('test-dir/blogs'):
    for file in files:
        with open(os.path.join(root, file), 'r') as f:
            md = f.read()
        date = md.split('date: ')[1].split('\n')[0]
        date = date.split('-')
        date = f"{date[2]}-{date[1]}-{date[0]}"
        md = md.replace(md.split('date: ')[1].split('\n')[0], date)
        with open(os.path.join(root, file), 'w') as f:
            f.write(md)


In [21]:
# Oh let's use PIL to make our thumbnail images
images = os.listdir('test-dir/images')
images

['phone.png',
 'horse.png',
 'rss.png',
 'stumbleUpon.png',
 'depression.png',
 'duck.png',
 'banana.png',
 'outside.png',
 'computer.png',
 'stress.png',
 'burnout.png',
 'hu.png',
 'skateboard.png',
 'stairs.png',
 'hobbies.png',
 'discord.png']

In [27]:
from PIL import Image
for image in images:
    img = Image.open(f"test-dir/images/{image}")
    img.thumbnail((256, 256))
    img = img.convert("RGB")
    img.save(f"test-dir/images/{image.replace('.png', '_thumbnail.jpg')}")

In [39]:
import re

pattern = r'(og_image: )(/images/blog/\d{4}/)'

# Our md files have incorrect image paths for og_image
for md_file in os.listdir(os.path.join('test-dir', 'blogs')):
    with open(os.path.join('test-dir', 'blogs', md_file), 'r') as f:
        md = f.read()

    # Regex replace the capture group with /assets/images/
    md = re.sub(pattern, r'\1/assets/images/', md)

    # Write it back
    with open(os.path.join('test-dir', 'blogs', md_file), 'w') as f:
        f.write(md)


In [42]:
# Cool now we need to write our firestore log
feed = db.collection('feed').document('content-log')
data = feed.get().to_dict()
data

{'2024-07-06 17:14:38': {'location': 'blogs/test-blog.md'}}

In [44]:
from datetime import datetime as dt
from datetime import timedelta

data = {}
# So we will overwrite this with all our blogs - we do actually have a last edit date in the original file metadata we can use
# But if it is prior to the actual date we will use the actual date at midnight
# So let's go ahead and do that
# First go through and get datetime objects for all the blogs
for year in ['2023', '2024']:
    for root, dirs, files in os.walk(os.path.join('test-dir', year)):
        for file in files:
            if file.endswith('.md'):
                with open(os.path.join(root, file), 'r') as f:
                    md = f.read()
                actual_date = md.split('\n')[0].strip('#').strip().replace("/", "-")
                # Parse the actual date
                actual_date = dt.strptime(actual_date, "%d-%m-%Y")
                # Set time to midnight
                actual_date = actual_date.replace(hour=0, minute=0, second=0, microsecond=0)

                # Now get the file edit datetime
                edited_time = dt.fromtimestamp(os.path.getmtime(os.path.join(root, file)))

                # Make sure that edited_time isn't more than 1 day ahead of actual_date
                if edited_time <= actual_date + timedelta(days=1):
                    actual_date = edited_time

                data[actual_date.strftime("%Y-%m-%d %H:%M:%S")] = {
                    'location': os.path.join('blogs', file),
                }

print(data)

{'2023-12-17 16:08:05': {'location': 'blogs/depressed.md'}, '2023-12-23 11:03:45': {'location': 'blogs/charlmes.md'}, '2023-07-23 10:22:07': {'location': 'blogs/burnout.md'}, '2023-07-18 08:52:17': {'location': 'blogs/onrss.md'}, '2023-07-29 09:05:47': {'location': 'blogs/dustydrawers.md'}, '2023-08-08 12:02:16': {'location': 'blogs/stress.md'}, '2023-08-22 15:15:32': {'location': 'blogs/prank.md'}, '2023-02-22 00:00:00': {'location': 'blogs/stumbleupon.md'}, '2023-06-05 00:00:00': {'location': 'blogs/alcoholism.md'}, '2023-06-20 14:28:27': {'location': 'blogs/oldwebsites.md'}, '2023-04-17 00:00:00': {'location': 'blogs/bananas.md'}, '2024-01-04 11:35:56': {'location': 'blogs/books.md'}, '2024-02-01 17:08:24': {'location': 'blogs/hip.md'}, '2024-05-06 09:38:07': {'location': 'blogs/discord.md'}, '2024-04-23 08:17:04': {'location': 'blogs/internet.md'}, '2024-03-14 00:00:00': {'location': 'blogs/horses.md'}}


In [45]:
# Banging job now let's just order it such that the most recent is first
data = dict(sorted(data.items(), key=lambda item: item[0], reverse=True))
data

{'2024-05-06 09:38:07': {'location': 'blogs/discord.md'},
 '2024-04-23 08:17:04': {'location': 'blogs/internet.md'},
 '2024-03-14 00:00:00': {'location': 'blogs/horses.md'},
 '2024-02-01 17:08:24': {'location': 'blogs/hip.md'},
 '2024-01-04 11:35:56': {'location': 'blogs/books.md'},
 '2023-12-23 11:03:45': {'location': 'blogs/charlmes.md'},
 '2023-12-17 16:08:05': {'location': 'blogs/depressed.md'},
 '2023-08-22 15:15:32': {'location': 'blogs/prank.md'},
 '2023-08-08 12:02:16': {'location': 'blogs/stress.md'},
 '2023-07-29 09:05:47': {'location': 'blogs/dustydrawers.md'},
 '2023-07-23 10:22:07': {'location': 'blogs/burnout.md'},
 '2023-07-18 08:52:17': {'location': 'blogs/onrss.md'},
 '2023-06-20 14:28:27': {'location': 'blogs/oldwebsites.md'},
 '2023-06-05 00:00:00': {'location': 'blogs/alcoholism.md'},
 '2023-04-17 00:00:00': {'location': 'blogs/bananas.md'},
 '2023-02-22 00:00:00': {'location': 'blogs/stumbleupon.md'}}

In [46]:
# Slam it into firestore
feed.set(data)

update_time {
  seconds: 1720885488
  nanos: 98901000
}

In [51]:
# Go through and copy the og_description to the description field
for md_file in os.listdir(os.path.join('test-dir', 'blogs')):
    with open(os.path.join('test-dir', 'blogs', md_file), 'r') as f:
        md = f.read()

    # Get the og_description
    og_description = md.split('og_description: ')[1].split('\n')[0]

    # Add a description field under title
    title = md.split('title: ')[1].split('\n')[0]
    title_with_description = f"{title}\ndescription: {og_description}"
    md = md.replace(title, title_with_description)

    # Write it back
    with open(os.path.join('test-dir', 'blogs', md_file), 'w') as f:
        f.write(md)

In [53]:
# Rename our music files to add underscores in place of spaces
for root, dirs, files in os.walk('test-dir/music'):
    for file in files:
        if ' ' in file:
            os.rename(os.path.join(root, file), os.path.join(root, file.replace(' ', '_')))
        if '[' in file or ']' in file:
            os.rename(os.path.join(root, file), os.path.join(root, file.replace('[', '').replace(']', '')))

In [54]:
# Let's write a markdown file for our Planet Ed Album
metadata = {
    'date': '2005-01-01',
    'title': 'Planet Ed',
    'description': 'My first album, Planet Ed, was released in 2005. Nobody really cared. I was 12 going on 13.',
    'author': 'Ed',
    'tags': ['Music', 'Planet Ed'],
    'type': 'music',
    'thumbnail': '/assets/images/planet_ed.jpg',
    'og_title': 'Planet Ed',
    'og_description': 'My first album, Planet Ed, was released in 2005. Nobody really cared. I was 12 going on 13.',
    'og_image': '/assets/images/planet_ed.jpg',
    'og_type': 'music'
}

metadata_md = f"""---
date: {metadata['date']}
title: {metadata['title']}
description: {metadata['description']}
author: {metadata['author']}
tags: {metadata['tags']}
type: {metadata['type']}
thumbnail: {metadata['thumbnail']}
og_title: {metadata['og_title']}
og_description: {metadata['og_description']}
og_image: {metadata['og_image']}
og_type: {metadata['og_type']}
---
"""

# Now the actual content
content = """
---
When I was a young 12 year old, I was playing around with Gamemaker.

I took copyright law very seriously, and decided I must make my own music for my games otherwise I would be a criminal.

Thus started my music career, under the moniker Planet Ed (at some point stylised instead as Planet 'ed, a contraction of Planet Head). I didn't know anything about music theory. I had a light background in jazz piano, but I generally couldn't be bothered to learn anything properly.

Destroyed World was the first song I made, I was playing Jak and Daxter at the time so I was inspired by the music in that game. It is unexpectedly a very good piece of music.

The rest of the music may seem weird and disjointed, but I was 12 and to reiterate, I didn't know anything about music theory.

Armed with a copy of Sibelius 3 and an EMU Proteus 2000, I set out to make my first album. I was very proud of it at the time. Now I just look back at it with nostalgia.

I hope you enjoy it.

PS. The recording is terrible because I didn't know how to record properly. I didn't know what a DAW was.
---
title: Blips
file: /assets/music/01_Blips.mp3
title: Heaven
file: /assets/music/02_Heaven.mp3
title: Drum Pie and Peas
file: /assets/music/03_Drum_Pie_and_Peas.mp3
title: Folk Thing
file: /assets/music/04_Folk_Thing.mp3
title: Power Plant [Power Failure Remix]
file: /assets/music/05_Power_Plant_Power_Failure_Remix.mp3
title: Heavy Machinery
file: /assets/music/06_Heavy_Machinery.mp3
title: Aliens
file: /assets/music/07_Aliens.mp3
title: Epic Song
file: /assets/music/08_Epic_Song.mp3
title: Destroyed World
file: /assets/music/09_Destroyed_World.mp3
title: The Banjo Experience
file: /assets/music/10_The_Banjo_Experience.mp3
title: Taking a Ride
file: /assets/music/11_Taking_a_Ride.mp3
title: Power Plant
file: /assets/music/12_Power_Plant.mp3
"""

with open('test-dir/planet_ed.md', 'w') as f:
    f.write(metadata_md + content)

In [58]:
# Alright we gonna add collection metadata to our markdown files
for f in os.listdir(os.path.join('test-dir', 'blogs')):
    with open(os.path.join('test-dir', 'blogs', f), 'r') as file:
        md = file.read()

    # Add collection metadata
    collection_metadata = "collection: Ed's Blog"
    metadata = md.split('---')[1]
    md = md.replace(metadata, f"{metadata}{collection_metadata}\n")
    # Write it back
    with open(os.path.join('test-dir', 'blogs', f), 'w') as file:
        file.write(md)

# Now same in the music directory
with open('test-dir/planet_ed.md', 'r') as file:
    md = file.read()
metadata = md.split('---')[1]
collection_metadata = "collection: Ed's Music"
# Add collection metadata
md = md.replace(metadata, f"{metadata}{collection_metadata}\n")
# Write it back
with open('test-dir/planet_ed.md', 'w') as file:
    file.write(md)

In [60]:
# Now let's get our feed data and create a list of all our blog posts in reverse chronological order
# We can use this to write to our new firestore collection
feed = db.collection('feed').document('content-log').get().to_dict()
feed

{'2023-02-22 00:00:00': {'location': 'blogs/stumbleupon.md'},
 '2024-03-14 00:00:00': {'location': 'blogs/horses.md'},
 '2023-12-23 11:03:45': {'location': 'blogs/charlmes.md'},
 '2024-04-23 08:17:04': {'location': 'blogs/internet.md'},
 '2023-06-20 14:28:27': {'location': 'blogs/oldwebsites.md'},
 '2024-02-01 17:08:24': {'location': 'blogs/hip.md'},
 '2023-12-17 16:08:05': {'location': 'blogs/depressed.md'},
 '2023-04-17 00:00:00': {'location': 'blogs/bananas.md'},
 '2024-01-04 11:35:56': {'location': 'blogs/books.md'},
 '2023-07-29 09:05:47': {'location': 'blogs/dustydrawers.md'},
 '2023-07-18 08:52:17': {'location': 'blogs/onrss.md'},
 '2023-08-22 15:15:32': {'location': 'blogs/prank.md'},
 '2023-08-08 12:02:16': {'location': 'blogs/stress.md'},
 '2023-06-05 00:00:00': {'location': 'blogs/alcoholism.md'},
 '2023-07-23 10:22:07': {'location': 'blogs/burnout.md'},
 '2024-05-06 09:38:07': {'location': 'blogs/discord.md'}}

In [63]:
new_doc = db.collection('collections').document('eds_blog')
# Let's write a list from our feed data
data = [
    v['location'].split('/')[-1] for v in feed.values()
]
data

['stumbleupon.md',
 'horses.md',
 'charlmes.md',
 'internet.md',
 'oldwebsites.md',
 'hip.md',
 'depressed.md',
 'bananas.md',
 'books.md',
 'dustydrawers.md',
 'onrss.md',
 'prank.md',
 'stress.md',
 'alcoholism.md',
 'burnout.md',
 'discord.md']

In [64]:
# Cool now we can write this to our new collection
new_doc.set({'content': data})

update_time {
  seconds: 1720953587
  nanos: 833201000
}

In [69]:
# We should also add the music album to the music collection
new_doc = db.collection('collections').document('eds_music')
new_doc.set({'content': ['planet_ed.md']})


update_time {
  seconds: 1720957229
  nanos: 499697000
}

In [73]:
# Let's add our music to the feed
feed = db.collection('feed').document('content-log')
data = feed.get().to_dict()
data['2005-01-01 00:00:00'] = {
    'location': 'music/planet_ed.md'
}

data

{'2023-02-22 00:00:00': {'location': 'blogs/stumbleupon.md'},
 '2024-03-14 00:00:00': {'location': 'blogs/horses.md'},
 '2023-12-23 11:03:45': {'location': 'blogs/charlmes.md'},
 '2024-04-23 08:17:04': {'location': 'blogs/internet.md'},
 '2023-06-20 14:28:27': {'location': 'blogs/oldwebsites.md'},
 '2024-02-01 17:08:24': {'location': 'blogs/hip.md'},
 '2023-12-17 16:08:05': {'location': 'blogs/depressed.md'},
 '2023-04-17 00:00:00': {'location': 'blogs/bananas.md'},
 '2024-01-04 11:35:56': {'location': 'blogs/books.md'},
 '2023-07-29 09:05:47': {'location': 'blogs/dustydrawers.md'},
 '2023-07-18 08:52:17': {'location': 'blogs/onrss.md'},
 '2023-08-22 15:15:32': {'location': 'blogs/prank.md'},
 '2023-08-08 12:02:16': {'location': 'blogs/stress.md'},
 '2023-06-05 00:00:00': {'location': 'blogs/alcoholism.md'},
 '2023-07-23 10:22:07': {'location': 'blogs/burnout.md'},
 '2024-05-06 09:38:07': {'location': 'blogs/discord.md'},
 '2005-01-01 00:00:00': {'location': 'music/planet_ed.md'}}

In [74]:
# Put it back
feed.set(data)

update_time {
  seconds: 1720957492
  nanos: 514354000
}

In [82]:
from random import randint
# Let's try add some comics (Hewligg Urubokkle the old shit comics I made when I was 12)
# Alright we need to go through images and create markdown documents for every page, they don't have names so just call them 'Hewligg Urobokkle 1' etc
for im in os.listdir(os.path.join('STAGING', 'images')):
    filename = im.split(".")[0]

    new_filename = 'hewligg_urobokkle_' + im
    # Rename the image using shutil
    shutil.move(os.path.join('STAGING','images', im), os.path.join('STAGING', 'images', 'hewligg_urobokkle_' + im))
    im = new_filename
    # Make a thumbnail by sampling some of the comic
    img = Image.open(os.path.join('STAGING', 'images', im))
    img = img.convert('RGB')
    # Subsample 256x256 of the page
    width, height = img.size
    max_width = width - 256
    max_height = height - 256
    x0 = randint(0, max_width)
    y0 = randint(0, max_height)
    x1 = x0 + 256
    y1 = y0 + 256
    bounds = (x0, y0, x1, y1)
    cropped_img = img.crop(bounds)
    cropped_img.thumbnail((256,256))

    # Save into the staging area
    cropped_img.save(os.path.join("STAGING", "images", im.split('.')[0] + '_thumbnail.jpg'))
    if '_' in filename:
        filename = " Part ".join(filename.split("_"))
    md = f"""
---
date: YYYY-MM-DD
title: Hewligg Urobokkle {filename}
description: Description
author: Ed
tags: ['Sprite Comic', 'Archive', 'Hewligg Urobokkle']
type: comic
thumbnail: /assets/images/{im.split('.')[0]}_thumbnail.jpg
og_title: Hewligg Urobokkle {filename}
og_description: Description
og_image: /assets/images/{im.split('.')[0]}_thumbnail.jpg
og_type: article
collection: Hewligg Urobokkle
---
# Hewligg Urobokkle {filename}

![Hewligg Urobokkle {filename}](/assets/images/{im})
"""

    with open(os.path.join('STAGING', 'comics', im.split('.')[0] + '.md'), 'w') as f:
        f.write(md)

In [85]:
# Okay so we can now write the metadata for the Hewligg Urobokkle collection
collection = db.collection('collections').document('hewligg_urobokkle')
data = [
    f for f in os.listdir(os.path.join('STAGING', 'comics'))
]
# Sort it based on the number in the title
data = sorted(data, key=lambda x: float('_'.join(x.split('_')[2:]).split('.')[0].replace('_','.')))
data

['hewligg_urobokkle_1.md',
 'hewligg_urobokkle_2.md',
 'hewligg_urobokkle_3.md',
 'hewligg_urobokkle_4.md',
 'hewligg_urobokkle_5.md',
 'hewligg_urobokkle_6.md',
 'hewligg_urobokkle_7.md',
 'hewligg_urobokkle_8.md',
 'hewligg_urobokkle_9.md',
 'hewligg_urobokkle_10.md',
 'hewligg_urobokkle_11.md',
 'hewligg_urobokkle_12.md',
 'hewligg_urobokkle_13.md',
 'hewligg_urobokkle_14.md',
 'hewligg_urobokkle_15.md',
 'hewligg_urobokkle_16.md',
 'hewligg_urobokkle_17.md',
 'hewligg_urobokkle_18.md',
 'hewligg_urobokkle_19.md',
 'hewligg_urobokkle_20_1.md',
 'hewligg_urobokkle_20_2.md',
 'hewligg_urobokkle_20_3.md',
 'hewligg_urobokkle_21_1.md',
 'hewligg_urobokkle_21_2.md',
 'hewligg_urobokkle_21_3.md',
 'hewligg_urobokkle_22_1.md',
 'hewligg_urobokkle_22_2.md',
 'hewligg_urobokkle_23.md',
 'hewligg_urobokkle_24.md',
 'hewligg_urobokkle_25.md',
 'hewligg_urobokkle_26.md',
 'hewligg_urobokkle_27.md',
 'hewligg_urobokkle_28.md',
 'hewligg_urobokkle_29.md',
 'hewligg_urobokkle_30.md',
 'hewligg_uro

In [86]:
# Write it back
collection.set({'content': data})

update_time {
  seconds: 1720964939
  nanos: 861734000
}

In [87]:
# Great now let's add our comments to the feed based on the date of the comic
# But this has slightly more nuance we need a datetime object for the date of the comic
# Some share a date so we need to add a second to the time
feed = db.collection('feed').document('content-log')
feed_data = feed.get().to_dict()
feed_data

{'2023-02-22 00:00:00': {'location': 'blogs/stumbleupon.md'},
 '2024-03-14 00:00:00': {'location': 'blogs/horses.md'},
 '2024-04-23 08:17:04': {'location': 'blogs/internet.md'},
 '2023-12-23 11:03:45': {'location': 'blogs/charlmes.md'},
 '2023-06-20 14:28:27': {'location': 'blogs/oldwebsites.md'},
 '2024-02-01 17:08:24': {'location': 'blogs/hip.md'},
 '2023-12-17 16:08:05': {'location': 'blogs/depressed.md'},
 '2023-04-17 00:00:00': {'location': 'blogs/bananas.md'},
 '2024-01-04 11:35:56': {'location': 'blogs/books.md'},
 '2023-07-29 09:05:47': {'location': 'blogs/dustydrawers.md'},
 '2023-07-18 08:52:17': {'location': 'blogs/onrss.md'},
 '2023-08-22 15:15:32': {'location': 'blogs/prank.md'},
 '2023-08-08 12:02:16': {'location': 'blogs/stress.md'},
 '2023-06-05 00:00:00': {'location': 'blogs/alcoholism.md'},
 '2005-01-01 00:00:00': {'location': 'music/planet_ed.md'},
 '2024-05-06 09:38:07': {'location': 'blogs/discord.md'},
 '2023-07-23 10:22:07': {'location': 'blogs/burnout.md'}}

In [91]:
# Now let's create a dict of the comic data
new_feed_data = {}
for comic in data:
    with open(os.path.join('STAGING', 'comics', comic), 'r') as f:
        md = f.read()

    # Get the date
    date = md.split('date: ')[1].split('\n')[0]
    date = dt.strptime(date, "%Y-%m-%d")
    while date.strftime("%Y-%m-%d %H:%M:%S") in new_feed_data:
        date = date + timedelta(seconds=1)

    new_feed_data[date.strftime("%Y-%m-%d %H:%M:%S")] = {
        'location': os.path.join('comics', comic)
    }

assert len(new_feed_data.keys()) == len(data)

In [93]:
# Now extend feed_data
feed_data.update(new_feed_data)
feed_data

{'2023-02-22 00:00:00': {'location': 'blogs/stumbleupon.md'},
 '2024-03-14 00:00:00': {'location': 'blogs/horses.md'},
 '2024-04-23 08:17:04': {'location': 'blogs/internet.md'},
 '2023-12-23 11:03:45': {'location': 'blogs/charlmes.md'},
 '2023-06-20 14:28:27': {'location': 'blogs/oldwebsites.md'},
 '2024-02-01 17:08:24': {'location': 'blogs/hip.md'},
 '2023-12-17 16:08:05': {'location': 'blogs/depressed.md'},
 '2023-04-17 00:00:00': {'location': 'blogs/bananas.md'},
 '2024-01-04 11:35:56': {'location': 'blogs/books.md'},
 '2023-07-29 09:05:47': {'location': 'blogs/dustydrawers.md'},
 '2023-07-18 08:52:17': {'location': 'blogs/onrss.md'},
 '2023-08-22 15:15:32': {'location': 'blogs/prank.md'},
 '2023-08-08 12:02:16': {'location': 'blogs/stress.md'},
 '2023-06-05 00:00:00': {'location': 'blogs/alcoholism.md'},
 '2005-01-01 00:00:00': {'location': 'music/planet_ed.md'},
 '2024-05-06 09:38:07': {'location': 'blogs/discord.md'},
 '2023-07-23 10:22:07': {'location': 'blogs/burnout.md'},
 '20

In [94]:
# Cool let's put it back
feed.set(feed_data)

update_time {
  seconds: 1720965142
  nanos: 440245000
}

In [97]:
# We need to update the feed again because hewligg_urobokkle_21_3.md said 2024 instead of 2004
feed = db.collection('feed').document('content-log')
feed_data = feed.get().to_dict()
feed_data['2024-08-23 00:00:00']

{'location': 'comics/hewligg_urobokkle_21_3.md'}

In [98]:
# Let's pop that key and reinsert it
old_data = feed_data.pop('2024-08-23 00:00:00')
feed_data['2004-08-23 00:00:00'] = old_data
feed_data


{'2004-08-16 00:00:07': {'location': 'comics/hewligg_urobokkle_8.md'},
 '2024-04-23 08:17:04': {'location': 'blogs/internet.md'},
 '2004-08-16 00:00:01': {'location': 'comics/hewligg_urobokkle_2.md'},
 '2024-02-01 17:08:24': {'location': 'blogs/hip.md'},
 '2004-10-15 00:00:00': {'location': 'comics/hewligg_urobokkle_38.md'},
 '2023-07-18 08:52:17': {'location': 'blogs/onrss.md'},
 '2024-01-04 11:35:56': {'location': 'blogs/books.md'},
 '2004-10-21 00:00:00': {'location': 'comics/hewligg_urobokkle_44.md'},
 '2004-08-17 00:00:03': {'location': 'comics/hewligg_urobokkle_13.md'},
 '2004-10-06 00:00:00': {'location': 'comics/hewligg_urobokkle_28.md'},
 '2004-10-27 00:00:00': {'location': 'comics/hewligg_urobokkle_50.md'},
 '2004-10-10 00:00:00': {'location': 'comics/hewligg_urobokkle_32.md'},
 '2004-08-18 00:00:01': {'location': 'comics/hewligg_urobokkle_17.md'},
 '2004-08-19 00:00:00': {'location': 'comics/hewligg_urobokkle_20_3.md'},
 '2023-02-22 00:00:00': {'location': 'blogs/stumbleupon

In [99]:
assert len(feed.get().to_dict()) == len(feed_data)

In [101]:
# Cool let's put it back
feed.set(feed_data)

update_time {
  seconds: 1720965936
  nanos: 530778000
}

In [106]:
# The list of blogs is out of order so let's fix that
blogs_list = []
# Sort feed_data by date - oldest first
feed_data = dict(sorted(feed_data.items(), key=lambda item: item[0]))
for k, v in feed_data.items():
    location = v['location']
    if 'blogs' in location:
        print(k)
        blogs_list.append(location.split('/')[-1])

blogs_list

2023-02-22 00:00:00
2023-04-17 00:00:00
2023-06-05 00:00:00
2023-06-20 14:28:27
2023-07-18 08:52:17
2023-07-23 10:22:07
2023-07-29 09:05:47
2023-08-08 12:02:16
2023-08-22 15:15:32
2023-12-17 16:08:05
2023-12-23 11:03:45
2024-01-04 11:35:56
2024-02-01 17:08:24
2024-03-14 00:00:00
2024-04-23 08:17:04
2024-05-06 09:38:07


['stumbleupon.md',
 'bananas.md',
 'alcoholism.md',
 'oldwebsites.md',
 'onrss.md',
 'burnout.md',
 'dustydrawers.md',
 'stress.md',
 'prank.md',
 'depressed.md',
 'charlmes.md',
 'books.md',
 'hip.md',
 'horses.md',
 'internet.md',
 'discord.md']

In [107]:
# Okay now blogs list looks right let's put it back
new_doc = db.collection('collections').document('eds_blog')
new_doc.set({'content': blogs_list})

update_time {
  seconds: 1720968778
  nanos: 830024000
}

In [116]:
# Okay so currently our comics have a headig but the jinja2 layout deals with the heading so let's strip that out our markdown files
for md_file in os.listdir(os.path.join('CONTENT', 'comics')):
    with open(os.path.join('CONTENT', 'comics', md_file), 'r') as f:
        md = f.read()

    # Split on ---
    nothing, metadata, content = md.split('---')
    reconstructed_content = ''
    for line in content.split('\n'):
        if not line:
            continue
        elif line.startswith('#'):
            continue
        reconstructed_content += line + '\n'

    # Strip newlines
    reconstructed_content = '\n' + reconstructed_content.strip()

    # Join it back together
    md = '---'.join([nothing, metadata, reconstructed_content])

    # Write it back
    with open(os.path.join('CONTENT', 'comics', md_file), 'w') as f:
        f.write(md)


In [117]:
# Now deal with pixelated peculirarities
import json
with open(os.path.join('STAGING', 'comic_data.json'), 'r') as f:
    comic_data = json.loads(f.read())
comic_data

[{'name': 'Sonic the Hedgehog in Chilli Dog Zone',
  'description': "Sonic the Hedgehog tells Commander Keen to have a chilli dog but finds out he doesn't know what they are, so they go to Chilli Dog Zone.",
  'date': '2023-07-03',
  'src': 'comic1.png',
  'tooltip': 'Sonk 3'},
 {'name': 'Gotta Grow Pasta',
  'description': "Knuckles the Echidna tries to punch Sonic but it doesn't work because he's too fast, so he grows pasta.",
  'date': '2023-07-04',
  'src': 'comic2.png',
  'tooltip': 'BBC pasta woman'},
 {'name': "Alex Kidd's Whacky Wheelie Adventure",
  'description': "Alex Kidd drives a car and it's whacky and wheelie.",
  'date': '2023-07-04',
  'src': 'comic3.png',
  'tooltip': 'Read Alex Kidd Buys Heroin'},
 {'name': 'Guybrush Threepwood and the Ordinary Rubber Chicken With a Pulley in the Middle of Th eK',
  'description': "Guybrush Threepwood finds a perfectly normal rurbber chicken with a pulley in the middle. OR DOES HE? (He does.) Oh, and there's a monkey. And a pirate. n

In [121]:
# Okay so we gotta loop over this, generate metadata, generate content, add hover_text, and link up the thumbnail and og tag
feed = {}
for comic in comic_data:
    metadata = {}
    metadata['title'] = comic['name']
    metadata['description'] = comic['description']
    metadata['date'] = comic['date']
    metadata['hover_text'] = comic['tooltip']
    metadata['author'] = 'Ed'
    metadata['tags'] = ['Sprite Comic', 'Irony']
    metadata['type'] = 'comic'
    metadata['og_title'] = comic['name']
    metadata['og_description'] = comic['description']
    metadata['og_type'] = 'article'
    metadata['collection'] = 'Pixelated Peculirarities'

    # Now generate the thumbnail and organise the images
    og_name = 'comic_' + comic['src'].split('comic')[1]
    og_img = os.path.join('STAGING', 'og', og_name)
    new_name = 'pp_'+og_name.split('.')[0] + '_og.jpg'
    shutil.copy(og_img, os.path.join('STAGING', 'images', new_name))
    metadata['og_image'] = f"/assets/images/{new_name}"

    # Thumbnail
    img = Image.open(og_img)
    img.thumbnail((256, 256))
    img = img.convert('RGB')

    thumbnail_name = 'pp_'+og_name.split('.')[0] + '_thumbnail.jpg'
    img.save(os.path.join('STAGING', 'images', thumbnail_name))

    metadata['thumbnail'] = f"/assets/images/{thumbnail_name}"

    # Move the src image to the images directory with a new name
    comic_filename = f'pp_{comic["src"]}'
    shutil.copy(os.path.join('STAGING', 'img', comic['src']), os.path.join('STAGING', 'images', comic_filename))

    # Now finally generate the markdown doc
    content = f"""---
date: {metadata['date']}
title: {metadata['title']}
description: {metadata['description']}
author: {metadata['author']}
tags: {metadata['tags']}
type: {metadata['type']}
thumbnail: {metadata['thumbnail']}
og_title: {metadata['og_title']}
og_description: {metadata['og_description']}
og_image: {metadata['og_image']}
og_type: {metadata['og_type']}
collection: {metadata['collection']}
hover_text: {metadata['hover_text']}
---
![{metadata['title']}](/assets/images/{comic_filename})
"""

    with open(os.path.join('STAGING', 'comics', comic_filename.split('.')[0] + '.md'), 'w') as f:
        f.write(content)

    # Now get the datetime of edit from the file src
    edited_time = dt.fromtimestamp(os.path.getmtime(os.path.join('STAGING', 'img', comic['src'])))

    str_time = edited_time.strftime("%Y-%m-%d %H:%M:%S")

    # Add to feed
    feed[str_time] = {
        'location': os.path.join('comics', comic_filename.split('.')[0] + '.md')
    }

In [122]:
# Finally we need a content feed for the Pixelated Peculirarities collection
new_doc = db.collection('collections').document('pixelated_peculirarities')
data = [
    f for f in os.listdir(os.path.join('STAGING', 'comics')) if not 'thumbnail' in f and not 'og' in f
]
data

['pp_comic12.md',
 'pp_comic17.md',
 'pp_comic8.md',
 'pp_comic11.md',
 'pp_comic5.md',
 'pp_comic4.md',
 'pp_comic10.md',
 'pp_comic6.md',
 'pp_comic9.md',
 'pp_comic7.md',
 'pp_comic3.md',
 'pp_comic16.md',
 'pp_comic2.md',
 'pp_comic15.md',
 'pp_comic13.md',
 'pp_comic1.md',
 'pp_comic14.md']

In [124]:
# This data needs to be sorted by date
data = sorted(data, key=lambda x: int(x.split('comic')[1].split('.')[0]))
data

['pp_comic1.md',
 'pp_comic2.md',
 'pp_comic3.md',
 'pp_comic4.md',
 'pp_comic5.md',
 'pp_comic6.md',
 'pp_comic7.md',
 'pp_comic8.md',
 'pp_comic9.md',
 'pp_comic10.md',
 'pp_comic11.md',
 'pp_comic12.md',
 'pp_comic13.md',
 'pp_comic14.md',
 'pp_comic15.md',
 'pp_comic16.md',
 'pp_comic17.md']

In [125]:
# Write this to the collection
new_doc.set({'content': data})

update_time {
  seconds: 1720974531
  nanos: 597079000
}

In [126]:
feed

{'2023-07-03 21:52:08': {'location': 'comics/pp_comic1.md'},
 '2023-07-04 12:53:11': {'location': 'comics/pp_comic2.md'},
 '2023-07-04 17:10:46': {'location': 'comics/pp_comic3.md'},
 '2023-07-06 07:23:12': {'location': 'comics/pp_comic4.md'},
 '2023-07-06 09:49:26': {'location': 'comics/pp_comic5.md'},
 '2023-07-06 14:10:46': {'location': 'comics/pp_comic6.md'},
 '2023-07-08 15:20:42': {'location': 'comics/pp_comic7.md'},
 '2023-07-09 14:28:02': {'location': 'comics/pp_comic8.md'},
 '2023-07-13 10:00:22': {'location': 'comics/pp_comic9.md'},
 '2023-07-18 13:16:07': {'location': 'comics/pp_comic10.md'},
 '2023-07-21 18:02:45': {'location': 'comics/pp_comic11.md'},
 '2023-07-22 16:33:22': {'location': 'comics/pp_comic12.md'},
 '2023-07-30 08:18:48': {'location': 'comics/pp_comic13.md'},
 '2023-12-11 10:10:20': {'location': 'comics/pp_comic14.md'},
 '2023-12-12 17:11:52': {'location': 'comics/pp_comic15.md'},
 '2023-12-26 14:03:03': {'location': 'comics/pp_comic16.md'},
 '2024-06-07 12:2

In [127]:
# Let's get the feed, update it with this and put it back
old_feed = db.collection('feed').document('content-log')
old_feed = old_feed.get().to_dict()
old_feed


{'2004-08-16 00:00:07': {'location': 'comics/hewligg_urobokkle_8.md'},
 '2024-04-23 08:17:04': {'location': 'blogs/internet.md'},
 '2004-08-23 00:00:00': {'location': 'comics/hewligg_urobokkle_21_3.md'},
 '2004-08-16 00:00:01': {'location': 'comics/hewligg_urobokkle_2.md'},
 '2024-02-01 17:08:24': {'location': 'blogs/hip.md'},
 '2004-10-15 00:00:00': {'location': 'comics/hewligg_urobokkle_38.md'},
 '2023-07-18 08:52:17': {'location': 'blogs/onrss.md'},
 '2024-01-04 11:35:56': {'location': 'blogs/books.md'},
 '2004-10-21 00:00:00': {'location': 'comics/hewligg_urobokkle_44.md'},
 '2004-08-17 00:00:03': {'location': 'comics/hewligg_urobokkle_13.md'},
 '2004-10-06 00:00:00': {'location': 'comics/hewligg_urobokkle_28.md'},
 '2004-10-27 00:00:00': {'location': 'comics/hewligg_urobokkle_50.md'},
 '2004-10-10 00:00:00': {'location': 'comics/hewligg_urobokkle_32.md'},
 '2004-08-18 00:00:01': {'location': 'comics/hewligg_urobokkle_17.md'},
 '2004-08-19 00:00:00': {'location': 'comics/hewligg_ur

In [128]:
# Update with the new feed
old_feed.update(feed)
old_feed

{'2004-08-16 00:00:07': {'location': 'comics/hewligg_urobokkle_8.md'},
 '2024-04-23 08:17:04': {'location': 'blogs/internet.md'},
 '2004-08-23 00:00:00': {'location': 'comics/hewligg_urobokkle_21_3.md'},
 '2004-08-16 00:00:01': {'location': 'comics/hewligg_urobokkle_2.md'},
 '2024-02-01 17:08:24': {'location': 'blogs/hip.md'},
 '2004-10-15 00:00:00': {'location': 'comics/hewligg_urobokkle_38.md'},
 '2023-07-18 08:52:17': {'location': 'blogs/onrss.md'},
 '2024-01-04 11:35:56': {'location': 'blogs/books.md'},
 '2004-10-21 00:00:00': {'location': 'comics/hewligg_urobokkle_44.md'},
 '2004-08-17 00:00:03': {'location': 'comics/hewligg_urobokkle_13.md'},
 '2004-10-06 00:00:00': {'location': 'comics/hewligg_urobokkle_28.md'},
 '2004-10-27 00:00:00': {'location': 'comics/hewligg_urobokkle_50.md'},
 '2004-10-10 00:00:00': {'location': 'comics/hewligg_urobokkle_32.md'},
 '2004-08-18 00:00:01': {'location': 'comics/hewligg_urobokkle_17.md'},
 '2004-08-19 00:00:00': {'location': 'comics/hewligg_ur

In [129]:
# Cool let's put it back
feed = db.collection('feed').document('content-log')
feed.set(old_feed)

update_time {
  seconds: 1720974593
  nanos: 988740000
}

In [130]:
# Sick now let's move staging to content
for root, dirs, files in os.walk('STAGING'):
    for file in files:
        shutil.move(os.path.join(root, file), os.path.join('CONTENT', root.split('/')[-1], file))
        print('Moved', file, 'to', os.path.join('CONTENT', root.split('/')[-1], file))

Moved pp_comic4.png to CONTENT/images/pp_comic4.png
Moved pp_comic_13_thumbnail.jpg to CONTENT/images/pp_comic_13_thumbnail.jpg
Moved pp_comic_8_thumbnail.jpg to CONTENT/images/pp_comic_8_thumbnail.jpg
Moved pp_comic_12_thumbnail.jpg to CONTENT/images/pp_comic_12_thumbnail.jpg
Moved pp_comic11.png to CONTENT/images/pp_comic11.png
Moved pp_comic_2_og.jpg to CONTENT/images/pp_comic_2_og.jpg
Moved pp_comic_5_thumbnail.jpg to CONTENT/images/pp_comic_5_thumbnail.jpg
Moved pp_comic14.png to CONTENT/images/pp_comic14.png
Moved pp_comic_13_og.jpg to CONTENT/images/pp_comic_13_og.jpg
Moved pp_comic_11_thumbnail.jpg to CONTENT/images/pp_comic_11_thumbnail.jpg
Moved pp_comic3.png to CONTENT/images/pp_comic3.png
Moved pp_comic16.png to CONTENT/images/pp_comic16.png
Moved pp_comic_1_thumbnail.jpg to CONTENT/images/pp_comic_1_thumbnail.jpg
Moved pp_comic_15_thumbnail.jpg to CONTENT/images/pp_comic_15_thumbnail.jpg
Moved pp_comic_16_thumbnail.jpg to CONTENT/images/pp_comic_16_thumbnail.jpg
Moved pp_c

In [131]:
from bs4 import BeautifulSoup
import requests
from markdownify import markdownify as md
# Time to do some scraping to get my coding heaven blog
homepage = 'https://codingheaven.btw.so/'

res = requests.get(homepage)
if res.status_code == 200:
    soup = BeautifulSoup(res.text, 'html.parser')
    # Get all the blog posts
    blog_posts = soup.find_all('a', class_='post-item')
    print('Found', len(blog_posts), 'blog posts')

Found 6 blog posts


In [139]:
from time import sleep
# Now let's request each blog post and get the content
for post in blog_posts:
    print('Requesting', post.get('href'))
    post_url = post.get('href')
    post_res = requests.get(post_url)
    if post_res.status_code == 200:
        post_soup = BeautifulSoup(post_res.text, 'html.parser')
        title = post_soup.find('h1').text
        date = post_soup.find(id='post-date-dd-mm-yyyy').text
        content = post_soup.find('article')
        img = content.find('img')
        content = md(str(content))
        # Request the image
        img_res = requests.get(img.get('src'))
        # Save to STAGING/images
        with open(f'STAGING/images/{title}.png', 'wb') as f:
            f.write(img_res.content)
        print('Got image', img.get('src'))
        # Construct markdown
        # We'll deal with thumbnails, og_image and description later
        markdown = f"""---
date: {date}
title: {title}
description: DESCRIPTION
author: Ed
tags: ['Coding Heaven', 'Blog']
type: blog
thumbnail: THUMBNAIL
og_title: {title}
og_description: DESCRIPTION
og_image: OG_IMAGE
og_type: article
collection: Coding Heaven
---
{content}
"""
        with open(f'STAGING/blogs/{title}.md', 'w') as f:
            f.write(markdown)
        print('Got content', title)
    else:
        print('Failed to request', post_url)
    print('Sleeping for 5 seconds')
    sleep(5)

Requesting https://codingheaven.btw.so/sql-queries-that-youll-never-need-but-should-try-anyway
Got image https://nyc3.digitaloceanspaces.com/btw-writer-prod/1719652767457%2FrecursiveCTE.png
Got content SQL Queries That You‚Äôll Never Need (But Should Try Anyway)
Sleeping for 5 seconds
Requesting https://codingheaven.btw.so/making-an-image-crappifier
Got image https://nyc3.digitaloceanspaces.com/btw-writer-prod/1702983788357%2Fballdude.png
Got content Making an Image Crappifier
Sleeping for 5 seconds
Requesting https://codingheaven.btw.so/the-agony-of-bash-math
Got image https://nyc3.digitaloceanspaces.com/btw-writer-prod/1697641247914%2Fbashagony.png
Got content The Agony of Bash Math
Sleeping for 5 seconds
Requesting https://codingheaven.btw.so/fizz-buzz-but-its-excessively-overengineered
Got image https://nyc3.digitaloceanspaces.com/btw-writer-prod/1692982462664%2Ffizzbuzz.png
Got content Fizz Buzz but it‚Äôs Excessively Overengineered
Sleeping for 5 seconds
Requesting https://coding

In [142]:
# Since there's only 6 blogs we can just manually add the metadata and deal with the images
# Note the images in post are still linking to btw so we need to change that
# Also no time included in the date so we'll just add 00:00:00 boooo btw booooooo

# First things first lets find the descriptions for each tool and pop em in a list

homepage = 'https://codingheaven.btw.so/'
res = requests.get(homepage)
descriptions = []
if res.status_code == 200:
    soup = BeautifulSoup(res.text, 'html.parser')
    # Get all the blog posts
    blog_posts = soup.find_all('a', class_='post-item')
    # The description is the only p tag in the post-item
    for blog in blog_posts:
        descriptions.append(blog.find('p').text)

descriptions

['Let‚Äôs head to Codewars, a user-run site for creating and solving technical problems.',
 'You may read that title and wonder, why would anyone want to make an image crappifier?',
 'Let‚Äôs take a look at a coding problem that has plagued me for months on end:',
 "Ever since FizzBuzz first emerged in the famous piece 'Why Can't Programmers.. Program?', it's become a rite of passage. It's a simple program requiring bu...",
 "Humans count from 1, and yet arrays are indexed from 0 (unless you have the misfortune of using MATLAB). There's no wonder we suffer from the infamous off-by-one errors so oft...",
 'Hello World is usually the first program written by the budding programmer.']

In [143]:
# Let's link em to the md files
descriptions = {
    'bash_math.md': descriptions[2],
    'fizz_buzz.md': descriptions[3],
    'hello_world.md': descriptions[-1],
    'image_crappifier.md': descriptions[1],
    'off_by_one.md': descriptions[-2],
    'sql_injection.md': descriptions[0]
}

In [144]:
descriptions

{'bash_math.md': 'Let‚Äôs take a look at a coding problem that has plagued me for months on end:',
 'fizz_buzz.md': "Ever since FizzBuzz first emerged in the famous piece 'Why Can't Programmers.. Program?', it's become a rite of passage. It's a simple program requiring bu...",
 'hello_world.md': 'Hello World is usually the first program written by the budding programmer.',
 'image_crappifier.md': 'You may read that title and wonder, why would anyone want to make an image crappifier?',
 'off_by_one.md': "Humans count from 1, and yet arrays are indexed from 0 (unless you have the misfortune of using MATLAB). There's no wonder we suffer from the infamous off-by-one errors so oft...",
 'sql_injection.md': 'Let‚Äôs head to Codewars, a user-run site for creating and solving technical problems.'}

In [145]:
sql_recursive = descriptions.pop('sql_injection.md')
descriptions['sql_recursive.md'] = sql_recursive

In [157]:
import re
img_re = r'\!\[\]\((.*)\)'
# Cool now let's go update our md files
for f in os.listdir(os.path.join('STAGING', 'blogs')):
    # Split
    with open(os.path.join('STAGING', 'blogs', f), 'r') as file:
        md = file.read()

    split = md.split('---')

    metadata = split[1]
    content = '---'.join(split[2:])

    # Get rid of the first 4 lines in content - that's the title and other crap
    content = '\n'.join(content.split('\n')[4:])

    # Sub the image path into the markdown
    img = re.search(img_re, content)
    content = content.replace(img.group(1), f"/assets/images/{f.split('.')[0]}.png")

    # Sub the description
    metadata = metadata.replace('DESCRIPTION', descriptions[f])

    # Sub og_image
    metadata = metadata.replace('OG_IMAGE', f"/assets/images/{f.split('.')[0]}.png")

    # Create a thumbnail and sub the thumbnail
    img = Image.open(os.path.join('STAGING', 'images', f.split('.')[0] + '.png'))
    img.thumbnail((256, 256))
    img = img.convert('RGB')

    img.save(os.path.join('STAGING', 'images', f.split('.')[0] + '_thumbnail.jpg'))

    # Sub the thumbnail
    metadata = metadata.replace('THUMBNAIL', f"/assets/images/{f.split('.')[0]}_thumbnail.jpg")

    new_md = f"""---
{metadata.strip()}
---
{content.strip()}"""

    # Write it back
    with open(os.path.join('STAGING', 'blogs', f), 'w') as file:
        file.write(new_md)


In [158]:
# Cool now we can add these blogs to our feed
feed = db.collection('feed').document('content-log')
feed = feed.get().to_dict()

In [160]:
print('Feed length', len(feed))
for md_file in os.listdir(os.path.join('STAGING', 'blogs')):
    with open(os.path.join('STAGING', 'blogs', md_file), 'r') as f:
        md = f.read()

    # Get the date
    date = md.split('date: ')[1].split('\n')[0]
    date = dt.strptime(date, "%Y-%m-%d")
    while date.strftime("%Y-%m-%d %H:%M:%S") in feed:
        date = date + timedelta(seconds=1)

    feed[date.strftime("%Y-%m-%d %H:%M:%S")] = {
        'location': os.path.join('blogs', md_file)
    }

    print('Added', md_file)
    print('Time', date.strftime("%Y-%m-%d %H:%M:%S"))
    print('Location', os.path.join('blogs', md_file))

print('Feed length', len(feed))

Feed length 88
Added sql_recursive.md
Time 2024-06-29 00:00:00
Location blogs/sql_recursive.md
Added fizz_buzz.md
Time 2023-08-25 00:00:00
Location blogs/fizz_buzz.md
Added hello_world.md
Time 2023-07-29 00:00:00
Location blogs/hello_world.md
Added off_by_one.md
Time 2023-08-05 00:00:00
Location blogs/off_by_one.md
Added image_crappifier.md
Time 2023-12-19 00:00:00
Location blogs/image_crappifier.md
Added bash_math.md
Time 2023-10-18 00:00:00
Location blogs/bash_math.md
Feed length 94


In [161]:
feed

{'2023-07-06 14:10:46': {'location': 'comics/pp_comic6.md'},
 '2004-08-16 00:00:07': {'location': 'comics/hewligg_urobokkle_8.md'},
 '2024-04-23 08:17:04': {'location': 'blogs/internet.md'},
 '2004-08-23 00:00:00': {'location': 'comics/hewligg_urobokkle_21_3.md'},
 '2004-08-16 00:00:01': {'location': 'comics/hewligg_urobokkle_2.md'},
 '2024-02-01 17:08:24': {'location': 'blogs/hip.md'},
 '2004-10-15 00:00:00': {'location': 'comics/hewligg_urobokkle_38.md'},
 '2023-07-18 08:52:17': {'location': 'blogs/onrss.md'},
 '2024-01-04 11:35:56': {'location': 'blogs/books.md'},
 '2004-10-21 00:00:00': {'location': 'comics/hewligg_urobokkle_44.md'},
 '2004-08-17 00:00:03': {'location': 'comics/hewligg_urobokkle_13.md'},
 '2023-07-21 18:02:45': {'location': 'comics/pp_comic11.md'},
 '2023-07-09 14:28:02': {'location': 'comics/pp_comic8.md'},
 '2004-10-06 00:00:00': {'location': 'comics/hewligg_urobokkle_28.md'},
 '2023-07-18 13:16:07': {'location': 'comics/pp_comic10.md'},
 '2023-07-03 21:52:08': {

In [162]:
# Write it back
feed_ref = db.collection('feed').document('content-log')
feed_ref.set(feed)

update_time {
  seconds: 1721040246
  nanos: 49500000
}

In [164]:
# Oh we also need to write the collection
new_doc = db.collection('collections').document('coding_heaven')
content = [
    'hello_world.md',
    'off_by_one.md',
    'fizz_buzz.md',
    'bash_math.md',
    'image_crappifier.md',
    'sql_recursive.md'
]
content

['hello_world.md',
 'off_by_one.md',
 'fizz_buzz.md',
 'bash_math.md',
 'image_crappifier.md',
 'sql_recursive.md']

In [165]:
# Write it back
new_doc.set({'content': content})

update_time {
  seconds: 1721040452
  nanos: 997726000
}

In [166]:
# Issues with hello_world.md, let's open and save with utf-8 encoding
with open('CONTENT/blogs/hello_world.md', 'r', encoding='utf-8') as f:
    md = f.read()

with open('CONTENT/blogs/hello_world.md', 'w', encoding='utf-8') as f:
    f.write(md)

In [5]:
# Let's scrape weird indie shit
url = "https://weirdindieshit.blogspot.com/"

res = requests.get(url)
soup = BeautifulSoup(res.content, "html.parser")

In [7]:
post_titles = soup.find_all(name="h3", class_="post-title")
post_titles

[<h3 class="post-title"><a href="https://weirdindieshit.blogspot.com/2024/01/soviet-burgers-big-red-adventure.html">Soviet Burger's Big Red Adventure</a></h3>,
 <h3 class="post-title entry-title">
 <a href="https://weirdindieshit.blogspot.com/2023/12/mr-yuck-adventuers-in-jpeg-land.html">Mr Yuck Adventuers in JPEG Land</a>
 </h3>,
 <h3 class="post-title entry-title">
 <a href="https://weirdindieshit.blogspot.com/2023/08/work-at-mcdonald-authentic-fast-food.html">Work at McDonald, the Authentic Fast Food Worker Experience</a>
 </h3>,
 <h3 class="post-title entry-title">
 <a href="https://weirdindieshit.blogspot.com/2023/08/zapman-goes-on-slide-slide-into-chaos.html">Zapman Goes On a Slide, A Slide Into Chaos</a>
 </h3>,
 <h3 class="post-title entry-title">
 <a href="https://weirdindieshit.blogspot.com/2023/08/mouses-playground-affront-to-all-mice.html">Mouse's Playground, An Affront To All Mice Everywhere</a>
 </h3>,
 <h3 class="post-title entry-title">
 <a href="https://weirdindieshit.

In [10]:
# Start a dict
blogs = {
    title.find("a").text: title.find("a")['href']
    for title in post_titles
}
blogs

{"Soviet Burger's Big Red Adventure": 'https://weirdindieshit.blogspot.com/2024/01/soviet-burgers-big-red-adventure.html',
 'Mr Yuck Adventuers in JPEG Land': 'https://weirdindieshit.blogspot.com/2023/12/mr-yuck-adventuers-in-jpeg-land.html',
 'Work at McDonald, the Authentic Fast Food Worker Experience': 'https://weirdindieshit.blogspot.com/2023/08/work-at-mcdonald-authentic-fast-food.html',
 'Zapman Goes On a Slide, A Slide Into Chaos': 'https://weirdindieshit.blogspot.com/2023/08/zapman-goes-on-slide-slide-into-chaos.html',
 "Mouse's Playground, An Affront To All Mice Everywhere": 'https://weirdindieshit.blogspot.com/2023/08/mouses-playground-affront-to-all-mice.html',
 'FUCK HOUSE, When Bitsy Goes Weird': 'https://weirdindieshit.blogspot.com/2023/08/fuck-house-when-bitsy-goes-weird.html'}

In [12]:
from utils.string_utils import strip_punctuation
blog_content = {}
# Now let's make markdown files
for title, url in blogs.items():
    res = requests.get(url)
    blog_content[title] = res.content

blog_content

 'Mr Yuck Adventuers in JPEG Land': b'<!DOCTYPE html>\n<html dir=\'ltr\' lang=\'en-GB\'>\n<head>\n<meta content=\'width=device-width, initial-scale=1\' name=\'viewport\'/>\n<title>Mr Yuck Adventuers in JPEG Land</title>\n<meta content=\'text/html; charset=UTF-8\' http-equiv=\'Content-Type\'/>\n<!-- Chrome, Firefox OS and Opera -->\n<meta content=\'#eeeeee\' name=\'theme-color\'/>\n<!-- Windows Phone -->\n<meta content=\'#eeeeee\' name=\'msapplication-navbutton-color\'/>\n<meta content=\'blogger\' name=\'generator\'/>\n<link href=\'https://weirdindieshit.blogspot.com/favicon.ico\' rel=\'icon\' type=\'image/x-icon\'/>\n<link href=\'https://weirdindieshit.blogspot.com/2023/12/mr-yuck-adventuers-in-jpeg-land.html\' rel=\'canonical\'/>\n<link rel="alternate" type="application/atom+xml" title="Weird Indie Shit - Atom" href="https://weirdindieshit.blogspot.com/feeds/posts/default" />\n<link rel="alternate" type="application/rss+xml" title="Weird Indie Shit - RSS" href="https://weirdindieshit.

In [19]:
# Well we need to download the images
for title, html in blog_content.items():
    soup = BeautifulSoup(html, 'html.parser')
    article = soup.find("article")
    for i, img in enumerate(article.find_all("img")):
        src = img['src']
        res = requests.get(src)
        image = res.content
        with open(os.path.join("STAGING", "images", strip_punctuation(title).replace(" ", "_") + str(i) + ".png"), "wb") as f:
            f.write(image)
        print("Got image", src)


Got image https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEj5uCejHypuxrQy9YbCt6bdeuoLIM-99UrUQ9FFABxKut0Nr-RQBsZcBrqwQmswQZSdfhb-wybkAD4CSZUZurZaLWrzF9rBK011upLd9jsNyZad8GhpH8KaJgWQabDchlcfgPAn-WF9nSLTf5N_fmLZqUnNHuNYs8jZeLrtv3ZOzn6RaYd0BCPmU6ipTdlA/s16000/sovietburger2.png
Got image https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjTD_Z0cyZie48LE-gdjjZSvu5jxRrLhFkuPxvm6JPMs5Ad4k9h_x0vIINgDuwzSloMSNy5RIFH2YbvmdrgJOO_BpZK8TEge-F7hDAJYMb7db-s4BpIkVWrJYeoR7wYfB30xclx0aC7YQ92WRupl8ftaeYnoAi4qTkWKy9CB0O9z7sRP34Wv03LW4SKa5hu/s16000/sovietburger1.png
Got image https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgEpm7VwnNS9-PJIxzS26qlhtPxQQ6kxgQCJepmTlr2wTQmUBfAumx7faYRBA2c1xMbC6DyoRQb5hi-cNkKwzyBsdhUq91dETiRIQGjHHOevJiLXCNrugy0Gxb78EShS7j1_pWKlZwvWNnGveBtg-0QDnjju34qTI7UbqHLiWNtkOZOhdiaPj-m77pZdJJF/s16000/mryuck1.png
Got image https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhGH5amnJTU8r4FhJiosDKrgiOL0rF9bpVAjs3xLrxuIJwbYAD3tH8xoPG31DXinNbEvPzdBWFnZlcLiCvH

In [25]:
published_dts = {}
# Great let's start wrtiting our markdown docs
for title, html in blog_content.items():
    clean_title = strip_punctuation(title).replace(" ", "_")
    soup = BeautifulSoup(html, 'html.parser')
    article = soup.find("article")
    title = article.find("h3").text
    date_time = article.find('time')['datetime']
    date = date_time.split('T')[0]
    body = article.find("div", class_="post-body")
    post_content = md(str(body))
    description = body.find("p").text
    published_dts[clean_title] = date_time
    # Create a thumbnail
    img = Image.open(os.path.join("STAGING", "images", clean_title + "0.png"))
    img.thumbnail((256, 256))
    img = img.convert("RGB")
    img.save(os.path.join("STAGING", "images", clean_title + "_thumbnail.jpg"))

    with open(os.path.join("STAGING", "blogs", clean_title + ".md"), "w") as f:
        f.write(f"""---
date: {date}
title: {title}
description: {description}
author: Ed
tags: ['Video Game', 'Indie Game', 'Review']
type: blog
thumbnail: /assets/images/{clean_title}_thumbail.jpg
og_title: {title}
og_description: {description}
og_image: /asset/images/{clean_title}0.png
og_type: article
collection: Weird Indie Shit
---
{post_content}
""")
    print("Written blog", title)

Written blog 
Soviet Burger's Big Red Adventure

Written blog 
Mr Yuck Adventuers in JPEG Land

Written blog 
Work at McDonald, the Authentic Fast Food Worker Experience

Written blog 
Zapman Goes On a Slide, A Slide Into Chaos

Written blog 
Mouse's Playground, An Affront To All Mice Everywhere

Written blog 
FUCK HOUSE, When Bitsy Goes Weird



In [26]:
published_dts

{'Soviet_Burgers_Big_Red_Adventure': '2024-01-04T08:58:00-08:00',
 'Mr_Yuck_Adventuers_in_JPEG_Land': '2023-12-18T12:23:00-08:00',
 'Work_at_McDonald_the_Authentic_Fast_Food_Worker_Experience': '2023-08-22T05:47:00-07:00',
 'Zapman_Goes_On_a_Slide_A_Slide_Into_Chaos': '2023-08-14T23:49:00-07:00',
 'Mouses_Playground_An_Affront_To_All_Mice_Everywhere': '2023-08-08T01:50:00-07:00',
 'FUCK_HOUSE_When_Bitsy_Goes_Weird': '2023-08-05T03:23:00-07:00'}

In [34]:
# Now we can convert these to the correct format and put them in the feed
feed_ref = db.collection("collections").document("weird_indie_shit")
published_dts = {
    t+".md": _dt.replace("T", " ")
    for t, _dt in published_dts.items()
}
published_dts

{'Soviet_Burgers_Big_Red_Adventure.md': '2024-01-04 08:58:00-08:00',
 'Mr_Yuck_Adventuers_in_JPEG_Land.md': '2023-12-18 12:23:00-08:00',
 'Work_at_McDonald_the_Authentic_Fast_Food_Worker_Experience.md': '2023-08-22 05:47:00-07:00',
 'Zapman_Goes_On_a_Slide_A_Slide_Into_Chaos.md': '2023-08-14 23:49:00-07:00',
 'Mouses_Playground_An_Affront_To_All_Mice_Everywhere.md': '2023-08-08 01:50:00-07:00',
 'FUCK_HOUSE_When_Bitsy_Goes_Weird.md': '2023-08-05 03:23:00-07:00'}

In [43]:
# Includes timezones so let's loop over
for k, v in published_dts.items():
    hour_offset=v.split('-')[-1].split(':')[0]
    hour_offset = int(hour_offset)
    hour = v.split(' ')[1].split(':')[0]
    int_hour = int(hour)
    int_hour += hour_offset
    v = v.replace(f" {hour}", f" {str(int_hour)}")
    v = "-".join(v.split("-")[:-1])
    published_dts[k] = v

published_dts

{'Soviet_Burgers_Big_Red_Adventure.md': '2024-01-04 16:58:00',
 'Mr_Yuck_Adventuers_in_JPEG_Land.md': '2023-12-18 20:23:00',
 'Work_at_McDonald_the_Authentic_Fast_Food_Worker_Experience.md': '2023-08-22 12:47:00',
 'Zapman_Goes_On_a_Slide_A_Slide_Into_Chaos.md': '2023-08-14 30:49:00',
 'Mouses_Playground_An_Affront_To_All_Mice_Everywhere.md': '2023-08-08 8:50:00',
 'FUCK_HOUSE_When_Bitsy_Goes_Weird.md': '2023-08-05 10:23:00'}

In [31]:
# I deleted the feed so let's recover it fortunately I printed it out
feed = {'2023-07-06 14:10:46': {'location': 'comics/pp_comic6.md'},
 '2004-08-16 00:00:07': {'location': 'comics/hewligg_urobokkle_8.md'},
 '2024-04-23 08:17:04': {'location': 'blogs/internet.md'},
 '2004-08-23 00:00:00': {'location': 'comics/hewligg_urobokkle_21_3.md'},
 '2004-08-16 00:00:01': {'location': 'comics/hewligg_urobokkle_2.md'},
 '2024-02-01 17:08:24': {'location': 'blogs/hip.md'},
 '2004-10-15 00:00:00': {'location': 'comics/hewligg_urobokkle_38.md'},
 '2023-07-18 08:52:17': {'location': 'blogs/onrss.md'},
 '2024-01-04 11:35:56': {'location': 'blogs/books.md'},
 '2004-10-21 00:00:00': {'location': 'comics/hewligg_urobokkle_44.md'},
 '2004-08-17 00:00:03': {'location': 'comics/hewligg_urobokkle_13.md'},
 '2023-07-21 18:02:45': {'location': 'comics/pp_comic11.md'},
 '2023-07-09 14:28:02': {'location': 'comics/pp_comic8.md'},
 '2004-10-06 00:00:00': {'location': 'comics/hewligg_urobokkle_28.md'},
 '2023-07-18 13:16:07': {'location': 'comics/pp_comic10.md'},
 '2023-07-03 21:52:08': {'location': 'comics/pp_comic1.md'},
 '2004-10-27 00:00:00': {'location': 'comics/hewligg_urobokkle_50.md'},
 '2004-10-10 00:00:00': {'location': 'comics/hewligg_urobokkle_32.md'},
 '2004-08-18 00:00:01': {'location': 'comics/hewligg_urobokkle_17.md'},
 '2004-08-19 00:00:00': {'location': 'comics/hewligg_urobokkle_20_3.md'},
 '2023-02-22 00:00:00': {'location': 'blogs/stumbleupon.md'},
 '2004-10-04 00:00:00': {'location': 'comics/hewligg_urobokkle_26.md'},
 '2004-08-17 00:00:05': {'location': 'comics/hewligg_urobokkle_15.md'},
 '2004-08-17 00:00:04': {'location': 'comics/hewligg_urobokkle_14.md'},
 '2004-08-16 00:00:08': {'location': 'comics/hewligg_urobokkle_9.md'},
 '2004-09-30 00:00:00': {'location': 'comics/hewligg_urobokkle_22_2.md'},
 '2004-08-17 00:00:02': {'location': 'comics/hewligg_urobokkle_12.md'},
 '2004-10-14 00:00:00': {'location': 'comics/hewligg_urobokkle_37.md'},
 '2023-12-17 16:08:05': {'location': 'blogs/depressed.md'},
 '2004-10-24 00:00:00': {'location': 'comics/hewligg_urobokkle_47.md'},
 '2023-12-12 17:11:52': {'location': 'comics/pp_comic15.md'},
 '2023-07-22 16:33:22': {'location': 'comics/pp_comic12.md'},
 '2004-08-18 00:00:00': {'location': 'comics/hewligg_urobokkle_16.md'},
 '2004-08-16 00:00:00': {'location': 'comics/hewligg_urobokkle_1.md'},
 '2004-10-25 00:00:00': {'location': 'comics/hewligg_urobokkle_48.md'},
 '2004-08-16 00:00:06': {'location': 'comics/hewligg_urobokkle_7.md'},
 '2005-01-01 00:00:00': {'location': 'music/planet_ed.md'},
 '2024-05-06 09:38:07': {'location': 'blogs/discord.md'},
 '2004-10-07 00:00:00': {'location': 'comics/hewligg_urobokkle_29.md'},
 '2004-10-26 00:00:00': {'location': 'comics/hewligg_urobokkle_49.md'},
 '2004-10-09 00:00:00': {'location': 'comics/hewligg_urobokkle_31.md'},
 '2004-08-16 00:00:05': {'location': 'comics/hewligg_urobokkle_6.md'},
 '2023-07-23 10:22:07': {'location': 'blogs/burnout.md'},
 '2023-12-26 14:03:03': {'location': 'comics/pp_comic16.md'},
 '2004-10-16 00:00:00': {'location': 'comics/hewligg_urobokkle_39.md'},
 '2004-10-23 00:00:00': {'location': 'comics/hewligg_urobokkle_46.md'},
 '2024-03-14 00:00:00': {'location': 'blogs/horses.md'},
 '2004-08-17 00:00:01': {'location': 'comics/hewligg_urobokkle_11.md'},
 '2023-12-23 11:03:45': {'location': 'blogs/charlmes.md'},
 '2004-08-22 00:00:00': {'location': 'comics/hewligg_urobokkle_21_2.md'},
 '2004-10-11 00:00:00': {'location': 'comics/hewligg_urobokkle_33.md'},
 '2004-08-16 00:00:02': {'location': 'comics/hewligg_urobokkle_3.md'},
 '2004-08-16 00:00:04': {'location': 'comics/hewligg_urobokkle_5.md'},
 '2004-10-08 00:00:00': {'location': 'comics/hewligg_urobokkle_23.md'},
 '2023-07-13 10:00:22': {'location': 'comics/pp_comic9.md'},
 '2023-08-08 12:02:16': {'location': 'blogs/stress.md'},
 '2004-08-21 00:00:00': {'location': 'comics/hewligg_urobokkle_21_1.md'},
 '2004-08-18 00:00:03': {'location': 'comics/hewligg_urobokkle_19.md'},
 '2004-08-18 00:00:02': {'location': 'comics/hewligg_urobokkle_18.md'},
 '2004-08-16 00:00:03': {'location': 'comics/hewligg_urobokkle_4.md'},
 '2004-08-17 00:00:00': {'location': 'comics/hewligg_urobokkle_10.md'},
 '2004-10-19 00:00:00': {'location': 'comics/hewligg_urobokkle_42.md'},
 '2004-08-18 00:00:05': {'location': 'comics/hewligg_urobokkle_20_2.md'},
 '2004-10-13 00:00:00': {'location': 'comics/hewligg_urobokkle_36.md'},
 '2023-07-08 15:20:42': {'location': 'comics/pp_comic7.md'},
 '2004-10-20 00:00:00': {'location': 'comics/hewligg_urobokkle_43.md'},
 '2023-06-20 14:28:27': {'location': 'blogs/oldwebsites.md'},
 '2004-10-12 00:00:00': {'location': 'comics/hewligg_urobokkle_35.md'},
 '2004-10-02 00:00:00': {'location': 'comics/hewligg_urobokkle_24.md'},
 '2023-07-30 08:18:48': {'location': 'comics/pp_comic13.md'},
 '2023-04-17 00:00:00': {'location': 'blogs/bananas.md'},
 '2004-10-17 00:00:00': {'location': 'comics/hewligg_urobokkle_40.md'},
 '2023-07-29 09:05:47': {'location': 'blogs/dustydrawers.md'},
 '2023-07-06 09:49:26': {'location': 'comics/pp_comic5.md'},
 '2004-10-22 00:00:00': {'location': 'comics/hewligg_urobokkle_45.md'},
 '2023-07-06 07:23:12': {'location': 'comics/pp_comic4.md'},
 '2023-08-22 15:15:32': {'location': 'blogs/prank.md'},
 '2023-06-05 00:00:00': {'location': 'blogs/alcoholism.md'},
 '2004-10-18 00:00:00': {'location': 'comics/hewligg_urobokkle_41.md'},
 '2004-10-05 00:00:00': {'location': 'comics/hewligg_urobokkle_27.md'},
 '2023-07-04 17:10:46': {'location': 'comics/pp_comic3.md'},
 '2004-09-29 00:00:00': {'location': 'comics/hewligg_urobokkle_22_1.md'},
 '2004-10-08 00:00:01': {'location': 'comics/hewligg_urobokkle_30.md'},
 '2024-06-07 12:23:34': {'location': 'comics/pp_comic17.md'},
 '2023-07-04 12:53:11': {'location': 'comics/pp_comic2.md'},
 '2004-10-03 00:00:00': {'location': 'comics/hewligg_urobokkle_25.md'},
 '2004-08-18 00:00:04': {'location': 'comics/hewligg_urobokkle_20_1.md'},
 '2023-12-11 10:10:20': {'location': 'comics/pp_comic14.md'},
 '2024-06-29 00:00:00': {'location': 'blogs/sql_recursive.md'},
 '2023-08-25 00:00:00': {'location': 'blogs/fizz_buzz.md'},
 '2023-07-29 00:00:00': {'location': 'blogs/hello_world.md'},
 '2023-08-05 00:00:00': {'location': 'blogs/off_by_one.md'},
 '2023-12-19 00:00:00': {'location': 'blogs/image_crappifier.md'},
 '2023-10-18 00:00:00': {'location': 'blogs/bash_math.md'}}
feed

{'2023-07-06 14:10:46': {'location': 'comics/pp_comic6.md'},
 '2004-08-16 00:00:07': {'location': 'comics/hewligg_urobokkle_8.md'},
 '2024-04-23 08:17:04': {'location': 'blogs/internet.md'},
 '2004-08-23 00:00:00': {'location': 'comics/hewligg_urobokkle_21_3.md'},
 '2004-08-16 00:00:01': {'location': 'comics/hewligg_urobokkle_2.md'},
 '2024-02-01 17:08:24': {'location': 'blogs/hip.md'},
 '2004-10-15 00:00:00': {'location': 'comics/hewligg_urobokkle_38.md'},
 '2023-07-18 08:52:17': {'location': 'blogs/onrss.md'},
 '2024-01-04 11:35:56': {'location': 'blogs/books.md'},
 '2004-10-21 00:00:00': {'location': 'comics/hewligg_urobokkle_44.md'},
 '2004-08-17 00:00:03': {'location': 'comics/hewligg_urobokkle_13.md'},
 '2023-07-21 18:02:45': {'location': 'comics/pp_comic11.md'},
 '2023-07-09 14:28:02': {'location': 'comics/pp_comic8.md'},
 '2004-10-06 00:00:00': {'location': 'comics/hewligg_urobokkle_28.md'},
 '2023-07-18 13:16:07': {'location': 'comics/pp_comic10.md'},
 '2023-07-03 21:52:08': {

In [53]:
feed_ref = db.collection('feed').document('content-log')
feed = feed_ref.get().to_dict()
feed

{'2004-10-17 00:00:00': {'location': 'comics/hewligg_urobokkle_40.md'},
 '2023-07-06 14:10:46': {'location': 'comics/pp_comic6.md'},
 '2004-08-16 00:00:07': {'location': 'comics/hewligg_urobokkle_8.md'},
 '2024-04-23 08:17:04': {'location': 'blogs/internet.md'},
 '2004-08-23 00:00:00': {'location': 'comics/hewligg_urobokkle_21_3.md'},
 '2004-08-16 00:00:01': {'location': 'comics/hewligg_urobokkle_2.md'},
 '2024-02-01 17:08:24': {'location': 'blogs/hip.md'},
 '2004-10-15 00:00:00': {'location': 'comics/hewligg_urobokkle_38.md'},
 '2023-07-18 08:52:17': {'location': 'blogs/onrss.md'},
 '2024-01-04 11:35:56': {'location': 'blogs/books.md'},
 '2004-10-21 00:00:00': {'location': 'comics/hewligg_urobokkle_44.md'},
 '2004-08-17 00:00:03': {'location': 'comics/hewligg_urobokkle_13.md'},
 '2023-07-21 18:02:45': {'location': 'comics/pp_comic11.md'},
 '2023-07-09 14:28:02': {'location': 'comics/pp_comic8.md'},
 '2004-10-06 00:00:00': {'location': 'comics/hewligg_urobokkle_28.md'},
 '2023-07-18 13

In [47]:
# Now let's update the feed
new_items = {
    v: {'location': f'blogs/{k}'}
    for k, v in published_dts.items()
}
new_items

{'2024-01-04 16:58:00': {'location': 'blogs/Soviet_Burgers_Big_Red_Adventure.md'},
 '2023-12-18 20:23:00': {'location': 'blogs/Mr_Yuck_Adventuers_in_JPEG_Land.md'},
 '2023-08-22 12:47:00': {'location': 'blogs/Work_at_McDonald_the_Authentic_Fast_Food_Worker_Experience.md'},
 '2023-08-14 30:49:00': {'location': 'blogs/Zapman_Goes_On_a_Slide_A_Slide_Into_Chaos.md'},
 '2023-08-08 8:50:00': {'location': 'blogs/Mouses_Playground_An_Affront_To_All_Mice_Everywhere.md'},
 '2023-08-05 10:23:00': {'location': 'blogs/FUCK_HOUSE_When_Bitsy_Goes_Weird.md'}}

In [48]:
new_items['2023-08-08 08:50:00'] = new_items.pop('2023-08-08 8:50:00')
new_items

{'2024-01-04 16:58:00': {'location': 'blogs/Soviet_Burgers_Big_Red_Adventure.md'},
 '2023-12-18 20:23:00': {'location': 'blogs/Mr_Yuck_Adventuers_in_JPEG_Land.md'},
 '2023-08-22 12:47:00': {'location': 'blogs/Work_at_McDonald_the_Authentic_Fast_Food_Worker_Experience.md'},
 '2023-08-14 30:49:00': {'location': 'blogs/Zapman_Goes_On_a_Slide_A_Slide_Into_Chaos.md'},
 '2023-08-05 10:23:00': {'location': 'blogs/FUCK_HOUSE_When_Bitsy_Goes_Weird.md'},
 '2023-08-08 08:50:00': {'location': 'blogs/Mouses_Playground_An_Affront_To_All_Mice_Everywhere.md'}}

In [54]:
feed.update(new_items)
feed

{'2004-10-17 00:00:00': {'location': 'comics/hewligg_urobokkle_40.md'},
 '2023-07-06 14:10:46': {'location': 'comics/pp_comic6.md'},
 '2004-08-16 00:00:07': {'location': 'comics/hewligg_urobokkle_8.md'},
 '2024-04-23 08:17:04': {'location': 'blogs/internet.md'},
 '2004-08-23 00:00:00': {'location': 'comics/hewligg_urobokkle_21_3.md'},
 '2004-08-16 00:00:01': {'location': 'comics/hewligg_urobokkle_2.md'},
 '2024-02-01 17:08:24': {'location': 'blogs/hip.md'},
 '2004-10-15 00:00:00': {'location': 'comics/hewligg_urobokkle_38.md'},
 '2023-07-18 08:52:17': {'location': 'blogs/onrss.md'},
 '2024-01-04 11:35:56': {'location': 'blogs/books.md'},
 '2004-10-21 00:00:00': {'location': 'comics/hewligg_urobokkle_44.md'},
 '2004-08-17 00:00:03': {'location': 'comics/hewligg_urobokkle_13.md'},
 '2023-07-21 18:02:45': {'location': 'comics/pp_comic11.md'},
 '2023-07-09 14:28:02': {'location': 'comics/pp_comic8.md'},
 '2004-10-06 00:00:00': {'location': 'comics/hewligg_urobokkle_28.md'},
 '2023-07-18 13

In [55]:
# Write it back
feed_ref.set(feed)

update_time {
  seconds: 1721303500
  nanos: 31651000
}

In [56]:
# We need a list of conteent for the weird indie shit collection
doc_ref = db.collection('collections').document('weird_indie_shit')
content = [
    r.split('/')[-1] for r in published_dts.keys()
]
content

['Soviet_Burgers_Big_Red_Adventure.md',
 'Mr_Yuck_Adventuers_in_JPEG_Land.md',
 'Work_at_McDonald_the_Authentic_Fast_Food_Worker_Experience.md',
 'Zapman_Goes_On_a_Slide_A_Slide_Into_Chaos.md',
 'Mouses_Playground_An_Affront_To_All_Mice_Everywhere.md',
 'FUCK_HOUSE_When_Bitsy_Goes_Weird.md']

In [57]:
content.reverse()
content

['FUCK_HOUSE_When_Bitsy_Goes_Weird.md',
 'Mouses_Playground_An_Affront_To_All_Mice_Everywhere.md',
 'Zapman_Goes_On_a_Slide_A_Slide_Into_Chaos.md',
 'Work_at_McDonald_the_Authentic_Fast_Food_Worker_Experience.md',
 'Mr_Yuck_Adventuers_in_JPEG_Land.md',
 'Soviet_Burgers_Big_Red_Adventure.md']

In [58]:
# Put it up there
doc_ref.set({
    'content':
        content
})

update_time {
  seconds: 1721307072
  nanos: 580100000
}

In [101]:
vid_json = {
  "kind": "youtube#searchListResponse",
  "etag": "2xpFVYMVMlapO5mRD13Toy2HCSE",
  "regionCode": "GB",
  "pageInfo": {
    "totalResults": 15,
    "resultsPerPage": 15
  },
  "items": [
    {
      "kind": "youtube#searchResult",
      "etag": "r2UVk9RBj6agtCl9RBjhCe66Fhk",
      "id": {
        "kind": "youtube#video",
        "videoId": "LWj14ITVOrU"
      },
      "snippet": {
        "publishedAt": "2022-08-13T02:10:27Z",
        "channelId": "UClCyxLNDIuY5VIt9r0LrJow",
        "title": "Can&#39;t Cast Ep The LEE Thing",
        "description": "Subscribe Please : https://www.youtube.com/channel/UClCyxLNDIuY5VIt9r0LrJow/ Since both sharks and hippos share some ...",
        "thumbnails": {
          "default": {
            "url": "https://i.ytimg.com/vi/LWj14ITVOrU/default.jpg",
            "width": 120,
            "height": 90
          },
          "medium": {
            "url": "https://i.ytimg.com/vi/LWj14ITVOrU/mqdefault.jpg",
            "width": 320,
            "height": 180
          },
          "high": {
            "url": "https://i.ytimg.com/vi/LWj14ITVOrU/hqdefault.jpg",
            "width": 480,
            "height": 360
          }
        },
        "channelTitle": "Can't Cast",
        "liveBroadcastContent": "none",
        "publishTime": "2022-08-13T02:10:27Z"
      }
    },
    {
      "kind": "youtube#searchResult",
      "etag": "tKGq2nfupVPRo50o2zPyYKMEatw",
      "id": {
        "kind": "youtube#video",
        "videoId": "0GHObygssjM"
      },
      "snippet": {
        "publishedAt": "2022-08-06T01:53:55Z",
        "channelId": "UClCyxLNDIuY5VIt9r0LrJow",
        "title": "Can&#39;t Cast Ep Different English Aussie Flag Day",
        "description": "Subscribe Please : https://www.youtube.com/channel/UClCyxLNDIuY5VIt9r0LrJow/ Since both sharks and hippos share some ...",
        "thumbnails": {
          "default": {
            "url": "https://i.ytimg.com/vi/0GHObygssjM/default.jpg",
            "width": 120,
            "height": 90
          },
          "medium": {
            "url": "https://i.ytimg.com/vi/0GHObygssjM/mqdefault.jpg",
            "width": 320,
            "height": 180
          },
          "high": {
            "url": "https://i.ytimg.com/vi/0GHObygssjM/hqdefault.jpg",
            "width": 480,
            "height": 360
          }
        },
        "channelTitle": "Can't Cast",
        "liveBroadcastContent": "none",
        "publishTime": "2022-08-06T01:53:55Z"
      }
    },
    {
      "kind": "youtube#searchResult",
      "etag": "Xl9pbXjE2D5ZddSMx3jok2Mb6kw",
      "id": {
        "kind": "youtube#video",
        "videoId": "mHwATtVracM"
      },
      "snippet": {
        "publishedAt": "2022-07-28T22:27:37Z",
        "channelId": "UClCyxLNDIuY5VIt9r0LrJow",
        "title": "Can&#39;t Cast Ep 3 (Friday podcast ) also we don&#39;t talk about Lee",
        "description": "Subscribe Please : https://www.youtube.com/channel/UClCyxLNDIuY5VIt9r0LrJow/ It's Friday so I'm going to go get something for ...",
        "thumbnails": {
          "default": {
            "url": "https://i.ytimg.com/vi/mHwATtVracM/default.jpg",
            "width": 120,
            "height": 90
          },
          "medium": {
            "url": "https://i.ytimg.com/vi/mHwATtVracM/mqdefault.jpg",
            "width": 320,
            "height": 180
          },
          "high": {
            "url": "https://i.ytimg.com/vi/mHwATtVracM/hqdefault.jpg",
            "width": 480,
            "height": 360
          }
        },
        "channelTitle": "Can't Cast",
        "liveBroadcastContent": "none",
        "publishTime": "2022-07-28T22:27:37Z"
      }
    },
    {
      "kind": "youtube#searchResult",
      "etag": "TdMijYG-bbJQx47sgW9XbYFox2U",
      "id": {
        "kind": "youtube#video",
        "videoId": "YfZMxVcAr-k"
      },
      "snippet": {
        "publishedAt": "2022-07-25T00:40:54Z",
        "channelId": "UClCyxLNDIuY5VIt9r0LrJow",
        "title": "Street Fashions USA but we didn&#39;t want Tommy Wiseau to sue us to the fullest extent of the law",
        "description": "Subscribe Please : https://www.youtube.com/channel/UClCyxLNDIuY5VIt9r0LrJow/ We love Tommy Wiseau's original ad so we ...",
        "thumbnails": {
          "default": {
            "url": "https://i.ytimg.com/vi/YfZMxVcAr-k/default.jpg",
            "width": 120,
            "height": 90
          },
          "medium": {
            "url": "https://i.ytimg.com/vi/YfZMxVcAr-k/mqdefault.jpg",
            "width": 320,
            "height": 180
          },
          "high": {
            "url": "https://i.ytimg.com/vi/YfZMxVcAr-k/hqdefault.jpg",
            "width": 480,
            "height": 360
          }
        },
        "channelTitle": "Can't Cast",
        "liveBroadcastContent": "none",
        "publishTime": "2022-07-25T00:40:54Z"
      }
    },
    {
      "kind": "youtube#searchResult",
      "etag": "jkqg8M4bNHw0CLoLa1_4jya2xnY",
      "id": {
        "kind": "youtube#video",
        "videoId": "Hkgrb_e5XQc"
      },
      "snippet": {
        "publishedAt": "2022-07-23T02:06:48Z",
        "channelId": "UClCyxLNDIuY5VIt9r0LrJow",
        "title": "Can&#39;t Cast Ep 2 (Drive thru with Mr Beast ) also we don&#39;t talk about Lee",
        "description": "Subscribe Please : https://www.youtube.com/channel/UClCyxLNDIuY5VIt9r0LrJow/ Jeremy Clarkson drove through a McDonald's ...",
        "thumbnails": {
          "default": {
            "url": "https://i.ytimg.com/vi/Hkgrb_e5XQc/default.jpg",
            "width": 120,
            "height": 90
          },
          "medium": {
            "url": "https://i.ytimg.com/vi/Hkgrb_e5XQc/mqdefault.jpg",
            "width": 320,
            "height": 180
          },
          "high": {
            "url": "https://i.ytimg.com/vi/Hkgrb_e5XQc/hqdefault.jpg",
            "width": 480,
            "height": 360
          }
        },
        "channelTitle": "Can't Cast",
        "liveBroadcastContent": "none",
        "publishTime": "2022-07-23T02:06:48Z"
      }
    },
    {
      "kind": "youtube#searchResult",
      "etag": "ACxfWpoNnB2KjlukniAQHzrjEhI",
      "id": {
        "kind": "youtube#video",
        "videoId": "jVyje8c8EvQ"
      },
      "snippet": {
        "publishedAt": "2022-07-20T07:15:01Z",
        "channelId": "UClCyxLNDIuY5VIt9r0LrJow",
        "title": "Chat News Highlights, Fire On the bottom oval - Can&#39;t Cast",
        "description": "Subscribe Please : https://www.youtube.com/channel/UClCyxLNDIuY5VIt9r0LrJow/ Chat News Highlights is a weekly podcast that ...",
        "thumbnails": {
          "default": {
            "url": "https://i.ytimg.com/vi/jVyje8c8EvQ/default.jpg",
            "width": 120,
            "height": 90
          },
          "medium": {
            "url": "https://i.ytimg.com/vi/jVyje8c8EvQ/mqdefault.jpg",
            "width": 320,
            "height": 180
          },
          "high": {
            "url": "https://i.ytimg.com/vi/jVyje8c8EvQ/hqdefault.jpg",
            "width": 480,
            "height": 360
          }
        },
        "channelTitle": "Can't Cast",
        "liveBroadcastContent": "none",
        "publishTime": "2022-07-20T07:15:01Z"
      }
    },
    {
      "kind": "youtube#searchResult",
      "etag": "eDsDT2HvCbZrAA9W2V1Y19HpXGA",
      "id": {
        "kind": "youtube#video",
        "videoId": "InNEbdTY4Oo"
      },
      "snippet": {
        "publishedAt": "2022-07-19T07:15:00Z",
        "channelId": "UClCyxLNDIuY5VIt9r0LrJow",
        "title": "Ep: 1 auto rap off against a fire train - Can&#39;t Cast Clip",
        "description": "Subscribe Please : https://www.youtube.com/channel/UClCyxLNDIuY5VIt9r0LrJow/ There is no better way to start a day than by ...",
        "thumbnails": {
          "default": {
            "url": "https://i.ytimg.com/vi/InNEbdTY4Oo/default.jpg",
            "width": 120,
            "height": 90
          },
          "medium": {
            "url": "https://i.ytimg.com/vi/InNEbdTY4Oo/mqdefault.jpg",
            "width": 320,
            "height": 180
          },
          "high": {
            "url": "https://i.ytimg.com/vi/InNEbdTY4Oo/hqdefault.jpg",
            "width": 480,
            "height": 360
          }
        },
        "channelTitle": "Can't Cast",
        "liveBroadcastContent": "none",
        "publishTime": "2022-07-19T07:15:00Z"
      }
    },
    {
      "kind": "youtube#searchResult",
      "etag": "9xNecbLxQw3DEl01Js1k-k3d0g8",
      "id": {
        "kind": "youtube#video",
        "videoId": "qIky6lBfDcs"
      },
      "snippet": {
        "publishedAt": "2022-07-18T07:15:03Z",
        "channelId": "UClCyxLNDIuY5VIt9r0LrJow",
        "title": "What does LOL really mean?",
        "description": "Subscribe Please : https://www.youtube.com/channel/UClCyxLNDIuY5VIt9r0LrJow/ lol, or laugh out loud, is a common text term ...",
        "thumbnails": {
          "default": {
            "url": "https://i.ytimg.com/vi/qIky6lBfDcs/default.jpg",
            "width": 120,
            "height": 90
          },
          "medium": {
            "url": "https://i.ytimg.com/vi/qIky6lBfDcs/mqdefault.jpg",
            "width": 320,
            "height": 180
          },
          "high": {
            "url": "https://i.ytimg.com/vi/qIky6lBfDcs/hqdefault.jpg",
            "width": 480,
            "height": 360
          }
        },
        "channelTitle": "Can't Cast",
        "liveBroadcastContent": "none",
        "publishTime": "2022-07-18T07:15:03Z"
      }
    },
    {
      "kind": "youtube#searchResult",
      "etag": "utpdfUQeoYXAHeAaDd9NiwT0VlU",
      "id": {
        "kind": "youtube#video",
        "videoId": "AaXzO_9CU2g"
      },
      "snippet": {
        "publishedAt": "2022-07-17T07:15:00Z",
        "channelId": "UClCyxLNDIuY5VIt9r0LrJow",
        "title": "Ant&#39;s wife threatens to divorce him but it doesn&#39;t really go that well",
        "description": "Subscribe Please : https://www.youtube.com/channel/UClCyxLNDIuY5VIt9r0LrJow/ The wife had planned to make her husband ...",
        "thumbnails": {
          "default": {
            "url": "https://i.ytimg.com/vi/AaXzO_9CU2g/default.jpg",
            "width": 120,
            "height": 90
          },
          "medium": {
            "url": "https://i.ytimg.com/vi/AaXzO_9CU2g/mqdefault.jpg",
            "width": 320,
            "height": 180
          },
          "high": {
            "url": "https://i.ytimg.com/vi/AaXzO_9CU2g/hqdefault.jpg",
            "width": 480,
            "height": 360
          }
        },
        "channelTitle": "Can't Cast",
        "liveBroadcastContent": "none",
        "publishTime": "2022-07-17T07:15:00Z"
      }
    },
    {
      "kind": "youtube#searchResult",
      "etag": "AjFgY4FIImt8xhZY6CGmhSULPdA",
      "id": {
        "kind": "youtube#video",
        "videoId": "C4F7rlk5FzM"
      },
      "snippet": {
        "publishedAt": "2022-07-16T02:09:05Z",
        "channelId": "UClCyxLNDIuY5VIt9r0LrJow",
        "title": "Can&#39;t Cast Ep 1 (Beast Frogs, We Don&#39;t Talk about Lee, Mr Real)",
        "description": "Subscribe Please : https://www.youtube.com/channel/UClCyxLNDIuY5VIt9r0LrJow/ This podcast is about how frogs smoke and ...",
        "thumbnails": {
          "default": {
            "url": "https://i.ytimg.com/vi/C4F7rlk5FzM/default.jpg",
            "width": 120,
            "height": 90
          },
          "medium": {
            "url": "https://i.ytimg.com/vi/C4F7rlk5FzM/mqdefault.jpg",
            "width": 320,
            "height": 180
          },
          "high": {
            "url": "https://i.ytimg.com/vi/C4F7rlk5FzM/hqdefault.jpg",
            "width": 480,
            "height": 360
          }
        },
        "channelTitle": "Can't Cast",
        "liveBroadcastContent": "none",
        "publishTime": "2022-07-16T02:09:05Z"
      }
    },
    {
      "kind": "youtube#searchResult",
      "etag": "uPBvs97JhnLaYnbG4RqY1SP6pVM",
      "id": {
        "kind": "youtube#video",
        "videoId": "i7Yk8k6mn6c"
      },
      "snippet": {
        "publishedAt": "2022-07-13T10:15:03Z",
        "channelId": "UClCyxLNDIuY5VIt9r0LrJow",
        "title": "How To Make A Tomato Bomb. Lingerie is not required",
        "description": "Subscribe Please : https://www.youtube.com/channel/UClCyxLNDIuY5VIt9r0LrJow/ The Tomato bomb is a very simple improvised ...",
        "thumbnails": {
          "default": {
            "url": "https://i.ytimg.com/vi/i7Yk8k6mn6c/default.jpg",
            "width": 120,
            "height": 90
          },
          "medium": {
            "url": "https://i.ytimg.com/vi/i7Yk8k6mn6c/mqdefault.jpg",
            "width": 320,
            "height": 180
          },
          "high": {
            "url": "https://i.ytimg.com/vi/i7Yk8k6mn6c/hqdefault.jpg",
            "width": 480,
            "height": 360
          }
        },
        "channelTitle": "Can't Cast",
        "liveBroadcastContent": "none",
        "publishTime": "2022-07-13T10:15:03Z"
      }
    },
    {
      "kind": "youtube#searchResult",
      "etag": "18ZxhjAI5tfW3NnRpTfhahiykso",
      "id": {
        "kind": "youtube#video",
        "videoId": "wIONTxlOT20"
      },
      "snippet": {
        "publishedAt": "2022-07-12T10:00:39Z",
        "channelId": "UClCyxLNDIuY5VIt9r0LrJow",
        "title": "Tiny Snail vs Big Snails",
        "description": "Subscribe Please : https://www.youtube.com/channel/UClCyxLNDIuY5VIt9r0LrJow/ If you accidentally step on a snail while ...",
        "thumbnails": {
          "default": {
            "url": "https://i.ytimg.com/vi/wIONTxlOT20/default.jpg",
            "width": 120,
            "height": 90
          },
          "medium": {
            "url": "https://i.ytimg.com/vi/wIONTxlOT20/mqdefault.jpg",
            "width": 320,
            "height": 180
          },
          "high": {
            "url": "https://i.ytimg.com/vi/wIONTxlOT20/hqdefault.jpg",
            "width": 480,
            "height": 360
          }
        },
        "channelTitle": "Can't Cast",
        "liveBroadcastContent": "none",
        "publishTime": "2022-07-12T10:00:39Z"
      }
    },
    {
      "kind": "youtube#searchResult",
      "etag": "EzNWTO2RX2pVY5bpuHshg1zvA08",
      "id": {
        "kind": "youtube#video",
        "videoId": "zJPQPfwLi5M"
      },
      "snippet": {
        "publishedAt": "2022-07-10T11:18:28Z",
        "channelId": "UClCyxLNDIuY5VIt9r0LrJow",
        "title": "Ant can&#39;t spell this two letter word",
        "description": "Subscribe Please : https://www.youtube.com/channel/UClCyxLNDIuY5VIt9r0LrJow/ After some encouragement from a few close ...",
        "thumbnails": {
          "default": {
            "url": "https://i.ytimg.com/vi/zJPQPfwLi5M/default.jpg",
            "width": 120,
            "height": 90
          },
          "medium": {
            "url": "https://i.ytimg.com/vi/zJPQPfwLi5M/mqdefault.jpg",
            "width": 320,
            "height": 180
          },
          "high": {
            "url": "https://i.ytimg.com/vi/zJPQPfwLi5M/hqdefault.jpg",
            "width": 480,
            "height": 360
          }
        },
        "channelTitle": "Can't Cast",
        "liveBroadcastContent": "none",
        "publishTime": "2022-07-10T11:18:28Z"
      }
    },
    {
      "kind": "youtube#searchResult",
      "etag": "ts2fm0kpcU0eDLJ0ST4iqPardn0",
      "id": {
        "kind": "youtube#video",
        "videoId": "z3jJsEfFyHc"
      },
      "snippet": {
        "publishedAt": "2022-07-09T11:16:11Z",
        "channelId": "UClCyxLNDIuY5VIt9r0LrJow",
        "title": "Is Alice Cooper A Minion The Truth About Rock Icon",
        "description": "Subscribe Please : https://www.youtube.com/channel/UClCyxLNDIuY5VIt9r0LrJow/ Alice Cooper is a popular musician who has ...",
        "thumbnails": {
          "default": {
            "url": "https://i.ytimg.com/vi/z3jJsEfFyHc/default.jpg",
            "width": 120,
            "height": 90
          },
          "medium": {
            "url": "https://i.ytimg.com/vi/z3jJsEfFyHc/mqdefault.jpg",
            "width": 320,
            "height": 180
          },
          "high": {
            "url": "https://i.ytimg.com/vi/z3jJsEfFyHc/hqdefault.jpg",
            "width": 480,
            "height": 360
          }
        },
        "channelTitle": "Can't Cast",
        "liveBroadcastContent": "none",
        "publishTime": "2022-07-09T11:16:11Z"
      }
    },
    {
      "kind": "youtube#searchResult",
      "etag": "Vycmj0ieupanePAklaClhmlT03Y",
      "id": {
        "kind": "youtube#video",
        "videoId": "H-rwdnQFgLg"
      },
      "snippet": {
        "publishedAt": "2022-07-09T01:57:49Z",
        "channelId": "UClCyxLNDIuY5VIt9r0LrJow",
        "title": "Can&#39; t Cast Ep 0 (naming the podcast, and working out the shuff)",
        "description": "Subscribe Please : https://www.youtube.com/channel/UClCyxLNDIuY5VIt9r0LrJow/ Can't Cast Podcast People Talking Stuff is a ...",
        "thumbnails": {
          "default": {
            "url": "https://i.ytimg.com/vi/H-rwdnQFgLg/default.jpg",
            "width": 120,
            "height": 90
          },
          "medium": {
            "url": "https://i.ytimg.com/vi/H-rwdnQFgLg/mqdefault.jpg",
            "width": 320,
            "height": 180
          },
          "high": {
            "url": "https://i.ytimg.com/vi/H-rwdnQFgLg/hqdefault.jpg",
            "width": 480,
            "height": 360
          }
        },
        "channelTitle": "Can't Cast",
        "liveBroadcastContent": "none",
        "publishTime": "2022-07-09T01:57:49Z"
      }
    }
  ]
}

In [103]:
import requests
from io import BytesIO
# This is an unordered list of dictionaries containing video information - enough to generate markdown pages and stuff with videos embedded
# The dictionary keys are:
# ['id']['videoId']: The YouTube video
# ['snippet']['title']: The title of the video
# ['snippet']['description']: The description of the video
# ['snippet']['thumbnails']['medium']['url']: The URL of the thumbnail - can download this and use it for the thumbnail
# ['snippet']['publishTime']: The time the video was published - can extract date and also use this for the feed and stuff

content = []
feed = {
}

for video in vid_json['items']:
    title = video['snippet']['title'].replace('&#39;', "'")
    description = video['snippet']['description']
    thumbnail_url = video['snippet']['thumbnails']['medium']['url']
    fullsize_thumbnail_url = video['snippet']['thumbnails']['high']['url']
    publish_time = video['snippet']['publishTime']
    _dt = dt.strptime(publish_time, '%Y-%m-%dT%H:%M:%SZ')
    ymd = _dt.strftime('%Y-%m-%d')
    timestamp = _dt.strftime('%Y-%m-%d %H:%M:%S')
    res = requests.get(thumbnail_url)
    img = Image.open(BytesIO(res.content))
    img.thumbnail((256, 256))
    img = img.convert('RGB')
    img.save(f"STAGING/images/{strip_punctuation(title).replace(' ', '_')}_thumbnail.jpg", 'JPEG')

    print(f"Downloaded thumbnail")

    res = requests.get(fullsize_thumbnail_url)
    img = Image.open(BytesIO(res.content))
    img.save(f"STAGING/images/{strip_punctuation(title).replace(' ', '_')}.png", 'PNG')

    print(f"Downloaded fullsize thumbnail")

    md = f"""---
date: {ymd}
title: {title}
description: {description}
author: Ed
tags: ['Video', 'YouTube', 'Podcast']
type: video
thumbnail: /assets/images/{strip_punctuation(title).replace(' ', '_')}_thumbnail.jpg
og_title: {title}
og_description: {description}
og_image: /assets/images/{strip_punctuation(title).replace(' ', '_')}.png
og_type: video
collection: Can't Cast
---
{video['id']['videoId']}
"""

    with open(f"STAGING/videos/{strip_punctuation(title).replace(' ', '_')}.md", 'w') as f:
        f.write(md)

    print(f"Generated markdown for {title}")

    feed[timestamp] = {
        'location': f"videos/{strip_punctuation(title).replace(' ', '_')}.md",
    }

# Update content based on the feed


Downloaded thumbnail
Downloaded fullsize thumbnail
Generated markdown for Can't Cast Ep The LEE Thing
Downloaded thumbnail
Downloaded fullsize thumbnail
Generated markdown for Can't Cast Ep Different English Aussie Flag Day
Downloaded thumbnail
Downloaded fullsize thumbnail
Generated markdown for Can't Cast Ep 3 (Friday podcast ) also we don't talk about Lee
Downloaded thumbnail
Downloaded fullsize thumbnail
Generated markdown for Street Fashions USA but we didn't want Tommy Wiseau to sue us to the fullest extent of the law
Downloaded thumbnail
Downloaded fullsize thumbnail
Generated markdown for Can't Cast Ep 2 (Drive thru with Mr Beast ) also we don't talk about Lee
Downloaded thumbnail
Downloaded fullsize thumbnail
Generated markdown for Chat News Highlights, Fire On the bottom oval - Can't Cast
Downloaded thumbnail
Downloaded fullsize thumbnail
Generated markdown for Ep: 1 auto rap off against a fire train - Can't Cast Clip
Downloaded thumbnail
Downloaded fullsize thumbnail
Generat

In [104]:
for timestamp in sorted(feed.keys(), reverse=True):
    content.append(feed[timestamp]['location'].split("/")[-1])
print(feed)
print(content)

{'2022-08-13 02:10:27': {'location': 'videos/Cant_Cast_Ep_The_LEE_Thing.md'}, '2022-08-06 01:53:55': {'location': 'videos/Cant_Cast_Ep_Different_English_Aussie_Flag_Day.md'}, '2022-07-28 22:27:37': {'location': 'videos/Cant_Cast_Ep_3_Friday_podcast__also_we_dont_talk_about_Lee.md'}, '2022-07-25 00:40:54': {'location': 'videos/Street_Fashions_USA_but_we_didnt_want_Tommy_Wiseau_to_sue_us_to_the_fullest_extent_of_the_law.md'}, '2022-07-23 02:06:48': {'location': 'videos/Cant_Cast_Ep_2_Drive_thru_with_Mr_Beast__also_we_dont_talk_about_Lee.md'}, '2022-07-20 07:15:01': {'location': 'videos/Chat_News_Highlights_Fire_On_the_bottom_oval__Cant_Cast.md'}, '2022-07-19 07:15:00': {'location': 'videos/Ep_1_auto_rap_off_against_a_fire_train__Cant_Cast_Clip.md'}, '2022-07-18 07:15:03': {'location': 'videos/What_does_LOL_really_mean.md'}, '2022-07-17 07:15:00': {'location': 'videos/Ants_wife_threatens_to_divorce_him_but_it_doesnt_really_go_that_well.md'}, '2022-07-16 02:09:05': {'location': 'videos/Can

In [105]:
# Let's update the feed now
feed_ref = db.collection('feed').document('content-log')
old_feed = feed_ref.get().to_dict()
old_feed.update(feed)

In [106]:
old_feed

{'2004-08-16 00:00:07': {'location': 'comics/hewligg_urobokkle_8.md'},
 '2024-04-23 08:17:04': {'location': 'blogs/internet.md'},
 '2010-01-09 03:03:18': {'location': 'music/poopremixes.md'},
 '2004-08-16 00:00:01': {'location': 'comics/hewligg_urobokkle_2.md'},
 '2023-07-02 11:24:15': {'location': 'videos/Commander_Keen39s_Last_Adventure_Unveiling_the_Lipton_Tea_Connection.md'},
 '2024-02-01 17:08:24': {'location': 'blogs/hip.md'},
 '2022-06-08 20:08:33': {'location': 'videos/StumbleUpon_Internet_Discovery_Through_the_0039s.md'},
 '2022-06-29 16:07:45': {'location': 'videos/Dunnet__The_Secret_Terminal_Game.md'},
 '2023-07-18 08:52:17': {'location': 'blogs/onrss.md'},
 '2023-03-08 09:14:22': {'location': 'videos/In_the_Mind_of_Kimberly_Kubus_The_Game_Developer_Who_Saw_God.md'},
 '2024-01-04 11:35:56': {'location': 'blogs/books.md'},
 '2004-08-17 00:00:03': {'location': 'comics/hewligg_urobokkle_13.md'},
 '2023-07-21 18:02:45': {'location': 'comics/pp_comic11.md'},
 '2023-07-09 14:28:02

In [107]:
# Rewrite old_feed to the database
feed_ref.set(old_feed)

update_time {
  seconds: 1721502878
  nanos: 157739000
}

In [108]:
content = content[::-1]
content

['Can_t_Cast_Ep_0_naming_the_podcast_and_working_out_the_shuff.md',
 'Is_Alice_Cooper_A_Minion_The_Truth_About_Rock_Icon.md',
 'Ant_cant_spell_this_two_letter_word.md',
 'Tiny_Snail_vs_Big_Snails.md',
 'How_To_Make_A_Tomato_Bomb_Lingerie_is_not_required.md',
 'Cant_Cast_Ep_1_Beast_Frogs_We_Dont_Talk_about_Lee_Mr_Real.md',
 'Ants_wife_threatens_to_divorce_him_but_it_doesnt_really_go_that_well.md',
 'What_does_LOL_really_mean.md',
 'Ep_1_auto_rap_off_against_a_fire_train__Cant_Cast_Clip.md',
 'Chat_News_Highlights_Fire_On_the_bottom_oval__Cant_Cast.md',
 'Cant_Cast_Ep_2_Drive_thru_with_Mr_Beast__also_we_dont_talk_about_Lee.md',
 'Street_Fashions_USA_but_we_didnt_want_Tommy_Wiseau_to_sue_us_to_the_fullest_extent_of_the_law.md',
 'Cant_Cast_Ep_3_Friday_podcast__also_we_dont_talk_about_Lee.md',
 'Cant_Cast_Ep_Different_English_Aussie_Flag_Day.md',
 'Cant_Cast_Ep_The_LEE_Thing.md']

In [109]:
# Now let's put it in collections
ref = db.collection('collections').document('cant_cast')
dic = {
    'content': content
}

ref.set(dic)

update_time {
  seconds: 1721502887
  nanos: 330125000
}

In [74]:
img = Image.open(os.path.join("STAGING", "images", "keyboard_cat.png"))
img.thumbnail((256, 256))
img.convert('RGB').save(os.path.join("STAGING", "images", "keyboard_cat_thumbnail.jpg"), 'JPEG')

In [76]:
# Updaate the feed and eds_blog collection
feed_ref = db.collection('feed').document('content-log')

# Find the keyboard_cat.md file and get the modified date for feed
time = os.path.getmtime(os.path.join("STAGING", "blogs", "keyboard_cat.md"))
_dt = dt.strftime(dt.fromtimestamp(time), '%Y-%m-%d %H:%M:%S')

In [77]:
feed = feed_ref.get().to_dict()
feed[_dt] = {
    'location': 'blogs/keyboard_cat.md'
}

In [78]:
feed

{'2004-08-16 00:00:07': {'location': 'comics/hewligg_urobokkle_8.md'},
 '2024-04-23 08:17:04': {'location': 'blogs/internet.md'},
 '2004-08-16 00:00:01': {'location': 'comics/hewligg_urobokkle_2.md'},
 '2023-07-02 11:24:15': {'location': 'videos/Commander_Keen39s_Last_Adventure_Unveiling_the_Lipton_Tea_Connection.md'},
 '2024-02-01 17:08:24': {'location': 'blogs/hip.md'},
 '2022-06-08 20:08:33': {'location': 'videos/StumbleUpon_Internet_Discovery_Through_the_0039s.md'},
 '2022-06-29 16:07:45': {'location': 'videos/Dunnet__The_Secret_Terminal_Game.md'},
 '2023-07-18 08:52:17': {'location': 'blogs/onrss.md'},
 '2023-03-08 09:14:22': {'location': 'videos/In_the_Mind_of_Kimberly_Kubus_The_Game_Developer_Who_Saw_God.md'},
 '2024-01-04 11:35:56': {'location': 'blogs/books.md'},
 '2004-08-17 00:00:03': {'location': 'comics/hewligg_urobokkle_13.md'},
 '2023-07-21 18:02:45': {'location': 'comics/pp_comic11.md'},
 '2023-07-09 14:28:02': {'location': 'comics/pp_comic8.md'},
 '2022-02-02 15:00:07'

In [79]:
# Write it back
feed_ref.set(feed)

update_time {
  seconds: 1721319485
  nanos: 53557000
}

In [80]:
# Then add this to the content list
content_ref = db.collection('collections').document('eds_blog')
content = content_ref.get().to_dict()['content']
content.append('keyboard_cat.md')
content

['stumbleupon.md',
 'bananas.md',
 'alcoholism.md',
 'oldwebsites.md',
 'onrss.md',
 'burnout.md',
 'dustydrawers.md',
 'stress.md',
 'prank.md',
 'depressed.md',
 'charlmes.md',
 'books.md',
 'hip.md',
 'horses.md',
 'internet.md',
 'discord.md',
 'keyboard_cat.md']

In [81]:
# Update
content_ref.set({
    'content': content
})

update_time {
  seconds: 1721319514
  nanos: 825036000
}

In [78]:
from random import randint
# Create a thumbnail for the chao wars blog
filename = "leftover_remixes.jpg"
img = Image.open(os.path.join("STAGING", "images", filename))
img = img.convert('RGB')
# take a 256x256 part
# w, h = img.size
# x0, y0 = randint(0, w - 256), randint(0, h - 256)
# img = img.crop((x0, y0, x0+256, y0+256))
img.thumbnail((256, 256))
img.save(os.path.join("STAGING", "images", "leftover_remixes_thumbnail.jpg"), 'JPEG')

In [76]:
# Find the earliest date a file was editid in music folder
files = [os.path.join("STAGING", "music", f) for f in os.listdir(os.path.join("STAGING", "music"))]
earliest = min([os.path.getmtime(f) for f in files])
latest = max([os.path.getmtime(f) for f in files])
# Convert from epoch time
_dt = dt.strftime(dt.fromtimestamp(latest), '%Y-%m-%d %H:%M:%S')
_dt

'2012-12-24 21:31:24'

In [77]:
# Go through the music foolder and strip punctuation etc to rename our files
for file in files:
    no_ext = file.split(".")[0]
    new_name = strip_punctuation(no_ext.split("/")[-1].replace(" ", "_"))
    new_name = new_name + ".mp3"
    os.rename(file, os.path.join("STAGING", "music", new_name))

In [79]:
# Should be able to update feed now
feed_ref = db.collection('feed').document('content-log')
feed = feed_ref.get().to_dict()
if _dt in feed:
    raise ValueError("This date already exists in the feed")
else:
    feed[_dt] = {'location': 'music/leftover_remixes.md'}
feed

{'2004-08-16 00:00:07': {'location': 'comics/hewligg_urobokkle_8.md'},
 '2024-04-23 08:17:04': {'location': 'blogs/internet.md'},
 '2010-01-09 03:03:18': {'location': 'music/poopremixes.md'},
 '2004-08-16 00:00:01': {'location': 'comics/hewligg_urobokkle_2.md'},
 '2023-07-02 11:24:15': {'location': 'videos/Commander_Keen39s_Last_Adventure_Unveiling_the_Lipton_Tea_Connection.md'},
 '2024-02-01 17:08:24': {'location': 'blogs/hip.md'},
 '2022-06-08 20:08:33': {'location': 'videos/StumbleUpon_Internet_Discovery_Through_the_0039s.md'},
 '2022-06-29 16:07:45': {'location': 'videos/Dunnet__The_Secret_Terminal_Game.md'},
 '2023-07-18 08:52:17': {'location': 'blogs/onrss.md'},
 '2023-03-08 09:14:22': {'location': 'videos/In_the_Mind_of_Kimberly_Kubus_The_Game_Developer_Who_Saw_God.md'},
 '2024-01-04 11:35:56': {'location': 'blogs/books.md'},
 '2004-08-17 00:00:03': {'location': 'comics/hewligg_urobokkle_13.md'},
 '2023-07-21 18:02:45': {'location': 'comics/pp_comic11.md'},
 '2023-07-09 14:28:02

In [80]:
# Write it back
feed_ref.set(feed)

update_time {
  seconds: 1721469934
  nanos: 449327000
}

In [85]:
# Now append to the content feed
content_ref = db.collection('collections').document('eds_music')
content = content_ref.get().to_dict()
content

{'content': ['planet_ed.md',
  'banjo.md',
  'planet_ed_2.md',
  'background_music.md',
  'electronics.md',
  'storminacowbell.md',
  'spite_malice.md',
  'sonic.md',
  'othergameremixes.md',
  'technolulz.md',
  'guitars.md',
  'spartaremixes.md',
  'poopremixes.md',
  'freshprince.md',
  'crap.md',
  'metal.md',
  'leftovers.md',
  'poopremixes.md']}

In [87]:
# Write it back
content['content'].append('leftover_remixes.md')
content

{'content': ['planet_ed.md',
  'banjo.md',
  'planet_ed_2.md',
  'background_music.md',
  'electronics.md',
  'storminacowbell.md',
  'spite_malice.md',
  'sonic.md',
  'othergameremixes.md',
  'technolulz.md',
  'guitars.md',
  'spartaremixes.md',
  'poopremixes.md',
  'freshprince.md',
  'crap.md',
  'metal.md',
  'leftovers.md',
  'leftover_remixes.md']}

In [88]:
content_ref.set(content)

update_time {
  seconds: 1721470514
  nanos: 428484000
}

In [2]:
import os

In [6]:
# Let's start working on the website archive
blog_files = []

for root, dirs, files in os.walk(os.path.join("STAGING", "Website Archive")):
    if files:
        for file in files:
            blog_files.append(os.path.join(root, file))
blog_files

['STAGING/Website Archive/gbtk2/gbtk.txt',
 'STAGING/Website Archive/Digging Day/Digging Day.txt',
 'STAGING/Website Archive/Myth/myth.txt',
 'STAGING/Website Archive/cosmo/cosmo.txt',
 'STAGING/Website Archive/xbox/xbox.txt',
 'STAGING/Website Archive/fuchsia/fuschia.txt',
 'STAGING/Website Archive/Tombi/Tombi.txt',
 'STAGING/Website Archive/monkeyisland/monkey.txt',
 'STAGING/Website Archive/allyoucaneat/all.txt',
 'STAGING/Website Archive/crashteamracing/ctr.txt',
 'STAGING/Website Archive/keenmustdie/keen3.txt',
 'STAGING/Website Archive/themepark/theme.txt',
 'STAGING/Website Archive/kingsquest/kq.txt',
 'STAGING/Website Archive/The Keymaster/The Keymaster.txt',
 'STAGING/Website Archive/BuildingLinux/Linux.txt',
 'STAGING/Website Archive/malice/malice.txt',
 'STAGING/Website Archive/filmgrain/film.txt',
 'STAGING/Website Archive/MaroonedOnMars/keen1.txt',
 'STAGING/Website Archive/silentbill/silent.txt',
 'STAGING/Website Archive/AboutTenYears/AboutTenYears.txt',
 'STAGING/Websit

In [12]:
# These are going to be annoying cause no dates attached, these require retrieving from archive of the site (can also get images from there)
# https://web.archive.org/web/20220520141623/https://edwardatkin.co.uk/
# It's going to be very time consuming, I could probably just automate it with a script that goes through the archive and downloads the pages instead (images missing etc, probably not perfect but oh well)
url = "https://web.archive.org/web/20220520141623/https://edwardatkin.co.uk/"
page = 1
blogs = {}
while True:
    print(f"Getting page {page}")
    res = requests.get(url)
    if res.status_code != 200:
        print("Failed to get page")
        break

    soup = BeautifulSoup(res.content, 'html.parser')

    post_titles = soup.find_all('h2', class_='post-title')

    if not post_titles:
        print("No more posts")
        break

    for post_title in post_titles:
        title = post_title.get_text().strip()
        link = post_title.find('a')['href']
        blogs[title] = link

    # Check if this is the last page
    prev_page = soup.find('a', class_='post-nav-older')
    if prev_page:
        url = prev_page['href']
    else:
        break

    page +=1
    sleep(15)

blogs


Getting page 1
Getting page 2
Getting page 3
Getting page 4
Getting page 5
Failed to get page


{'Super Quick Website Update': 'https://web.archive.org/web/20220520141623/https://edwardatkin.co.uk/super-quick-website-update',
 'Lego City Undercover': 'https://web.archive.org/web/20220520141623/https://edwardatkin.co.uk/lego-city-undercover',
 'Endoparasitic, An Interesting Indie Game You Probably Missed.': 'https://web.archive.org/web/20220520141623/https://edwardatkin.co.uk/endoparasitic',
 'Lone Fungus: An Indie Metroidvania You Don‚Äôt Want to Miss!': 'https://web.archive.org/web/20220520141623/https://edwardatkin.co.uk/lone-fungus-an-indie-metroidvania-you-dont-want-to-miss',
 'Chip‚Äôs Challenge 2': 'https://web.archive.org/web/20220520141623/https://edwardatkin.co.uk/chips-challenge-2',
 'The Original Xbox is Slowly Dying Out': 'https://web.archive.org/web/20220520141623/https://edwardatkin.co.uk/the-original-xbox-is-slowly-dying-out',
 'King‚Äôs Quest I: Quest for the Crown': 'https://web.archive.org/web/20220520141623/https://edwardatkin.co.uk/kings-quest-i-quest-for-the-

In [17]:
print("Found", len(blogs), "blogs")
count = os.listdir(os.path.join("STAGING", "Website Archive"))
print("Expected", len(count), "blogs")

Found 40 blogs
Expected 57 blogs


In [24]:
blogs_completed = []

In [27]:
# I need to find what's missing at some point but for now I'll just go through and generate the markdown files
# Also need to get all the imgs from the blog frontpage and download them cause they aren't always in the blogs themselves
for k, v in blogs.items():
    if k in blogs_completed:
        print(f"Skipping {k} (already completed)")
    print(k, v)
    try:
        res = requests.get(v)
    except:
        print("Failed to get page")
        sleep(15)
        continue
    if res.status_code != 200:
        print("Failed to get page")
        sleep(15)
        continue

    soup = BeautifulSoup(res.content, 'html.parser')
    try:
        title = k
        clean_title = strip_punctuation(title).replace(" ", "_")
        date = soup.find('span', class_='post-date').text
        content = soup.find('div', class_='post-content')
        content_markdown = md(str(content))
        # Find a with rel='author'
        author = soup.find('a', rel='author').text

        # Description can be the first paragraph of the content up to 200 characters
        description = content.find('p').text
        if len(description) > 200:
            description = description[:200]
            description += '...'

        date_obj = dt.strptime(date, "%B %d, %Y")
        # Format the datetime object to the desired format
        formatted_date = date_obj.strftime("%Y-%m-%d")

        if author == 'Eatkin':
            author = 'Ed'
    except Exception as e:
        print("Content failed to parse")
        print(e)
        sleep(15)
        continue

    # We're going to have to manually edit these anyway so I'll just write them to the file
    # Also we need download any images and stuff
    imgs = content.find_all('img')
    for i, im in enumerate(imgs):
        try:
            src = im['src']
        except Exception as e:
            print("Failed to get image source")
            print(e)
            continue
        if src.startswith('http'):
            try:
                res = requests.get(src)
                img = Image.open(BytesIO(res.content))
                img.save(os.path.join("STAGING", "images", f"{clean_title}_{i}.{src.split('.')[-1]}"))
                print(f"Downloaded image {i} as {clean_title}_{i}.{src.split('.')[-1]}")
            except Exception as e:
                print("Failed to download image")
                print(e)
            sleep(15)


    markdowned_content = md(str(content))

    # Write markdown
    markdown_content = f"""---
date: {formatted_date}
title: {title}
description: {description}
author: {author}
tags: TAGS
type: blog
thumbnail: /assets/images/{clean_title}_thumbnail.jpg
og_title: {title}
og_description: {description}
og_image: /assets/images/{clean_title}.jpg
og_type: article
collection: WEBSITE ARCHIVE
---
{markdowned_content}"""

    # Write the markdown file
    with open(os.path.join("STAGING", "blogs", f"{clean_title}.md"), 'w') as f:
        f.write(markdown_content)

    # If we get this far we can add it to the list of completed blogs
    blogs_completed.append(k)
    print(f"Completed {k}")

    sleep(15)

Super Quick Website Update https://web.archive.org/web/20220520141623/https://edwardatkin.co.uk/super-quick-website-update
Completed Super Quick Website Update
Lego City Undercover https://web.archive.org/web/20220520141623/https://edwardatkin.co.uk/lego-city-undercover
Downloaded image 0 as Lego_City_Undercover_0.png
Downloaded image 1 as Lego_City_Undercover_1.png
Completed Lego City Undercover
Endoparasitic, An Interesting Indie Game You Probably Missed. https://web.archive.org/web/20220520141623/https://edwardatkin.co.uk/endoparasitic
Downloaded image 0 as Endoparasitic_An_Interesting_Indie_Game_You_Probably_Missed_0.png
Failed to download image
cannot identify image file <_io.BytesIO object at 0x7f4ad3cc2a20>
Failed to download image
cannot identify image file <_io.BytesIO object at 0x7f4ae5c00cc0>
Completed Endoparasitic, An Interesting Indie Game You Probably Missed.
Lone Fungus: An Indie Metroidvania You Don‚Äôt Want to Miss! https://web.archive.org/web/20220520141623/https://e

In [32]:
# Okay 2 didn't download so gotta find which ones, then try find which I couldn't even find. Have to do it manually.
# The actual feed page has a lot of the images so I can just download them from there for the article image + thumbnail
# Some will be missing, no matter, it's only archival stuff
# Getting the images
url = "https://web.archive.org/web/20220520141623/https://edwardatkin.co.uk/"
page = 1
blogs = {}
while True:
    print(f"Getting page {page}")
    res = requests.get(url)
    if res.status_code != 200:
        print("Failed to get page")
        break

    sleep(15)

    soup = BeautifulSoup(res.content, 'html.parser')

    articles = soup.find_all('article')

    if not articles:
        print("No more posts")
        break

    for article in articles:
        post_title = article.find('h2', class_='post-title').get_text()
        post_title = post_title.strip()
        print("Getting images for", post_title)
        clean_title = strip_punctuation(post_title).replace(' ', '_')
        figure = article.find('figure', class_='featured-media')
        if not figure:
            print("No figure for", post_title)
            continue
        src = figure.find('img')['src']
        if src.startswith('http'):
            try:
                res = requests.get(src)
                print("Img status code", res.status_code)
                img = Image.open(BytesIO(res.content))
                img.save(os.path.join("STAGING", "images", f"{clean_title}.{src.split('.')[-1]}"))
                print(f"Downloaded image as {clean_title}.{src.split('.')[-1]}")
                # Create a thumbnail
                img.thumbnail((256, 256))
                img = img.convert('RGB')
                img.save(os.path.join("STAGING", "images", f"{clean_title}.jpg"), 'JPEG')
            except Exception as e:
                print("Failed to download image")
                print(e)
            sleep(15)

    # Check if this is the last page
    prev_page = soup.find('a', class_='post-nav-older')
    if prev_page:
        url = prev_page['href']
    else:
        break

    page +=1
    sleep(15)

blogs


Getting page 1
Getting images for Super Quick Website Update
No figure for Super Quick Website Update
Getting images for Lego City Undercover
Img status code 200
Downloaded image as Lego_City_Undercover.png
Getting images for Endoparasitic, An Interesting Indie Game You Probably Missed.
No figure for Endoparasitic, An Interesting Indie Game You Probably Missed.
Getting images for Lone Fungus: An Indie Metroidvania You Don‚Äôt Want to Miss!
Img status code 404
Failed to download image
cannot identify image file <_io.BytesIO object at 0x7f4ae453fc90>
Getting images for Chip‚Äôs Challenge 2
Img status code 200
Downloaded image as Chip‚Äôs_Challenge_2.png
Getting images for The Original Xbox is Slowly Dying Out
Img status code 200
Downloaded image as The_Original_Xbox_is_Slowly_Dying_Out.jpg
Getting images for King‚Äôs Quest I: Quest for the Crown
Img status code 200
Downloaded image as King‚Äôs_Quest_I_Quest_for_the_Crown.png
Getting images for Spyro: Enter the Dragonfly
Img status code 2

{}

Whsat's missing:
* Binary grab
* CC episode 2 & 3
* Feb 2020 devlog
* Diggig Day
* Keymaster
* Lockdown Opportunities
* March 2020 devlog
* Myth Bearer
* June 2020 devlog
* Nonlinear acceleration (sine wave movement)
* Theme Park World
* There is nothing here
* Twinsanity
* Film grain
* My very good game
* Linux
* Another 1?

In [35]:
# A lot of the missing ones are here:
# My very good game here: https://web.archive.org/web/20201030095115/https://edwardatkin.co.uk/page/2
# Few more here: https://web.archive.org/web/20201124235748/https://edwardatkin.co.uk/page/3
# Feb 2020 devlog: https://web.archive.org/web/20210127190848/https://edwardatkin.co.uk/page/4
# I can go by month to get a few things: https://web.archive.org/web/20200716032715/https://edwardatkin.co.uk/2020/02
# March has 2 pages: https://web.archive.org/web/20200716235940/https://edwardatkin.co.uk/2020/03
# We can scrape these pages and use the clean name to determine if we've already got it
# We'll do that then work by month to see if we can get the remainder

urls = [
    'https://web.archive.org/web/20200831052715/https://edwardatkin.co.uk/',
    'https://web.archive.org/web/20201124235748/https://edwardatkin.co.uk/page/3',
    'https://web.archive.org/web/20210127190848/https://edwardatkin.co.uk/page/4',
    'https://web.archive.org/web/20200716032715/https://edwardatkin.co.uk/2020/02',
    'https://web.archive.org/web/20200716235940/https://edwardatkin.co.uk/2020/03'
]
blogs = {}
for page, url in enumerate(urls, 1):
    print(f"Getting page {page}")
    res = requests.get(url)
    if res.status_code != 200:
        print("Failed to get page")
        break

    soup = BeautifulSoup(res.content, 'html.parser')

    articles = soup.find_all('article')

    if not articles:
        print("No more posts")
        break

    for article in articles:
        title_elem = article.find('h2', class_='post-title')
        post_title = title_elem.get_text()
        post_title = post_title.strip()
        clean_title = strip_punctuation(post_title).replace(' ', '_')
        # Check if this exxists
        if os.path.exists(os.path.join('STAGING', 'blogs', f"{clean_title}.md")):
            print("Already scraped", {post_title})
            continue
        # Save link to the post
        link = title_elem.find('a')['href']
        blogs[post_title] = link
        print("Getting images for", post_title)
        figure = article.find('figure', class_='featured-media')
        if not figure:
            print("No figure for", post_title)
            continue
        src = figure.find('img')['src']
        if src.startswith('http'):
            try:
                res = requests.get(src)
                print("Img status code", res.status_code)
                img = Image.open(BytesIO(res.content))
                img.save(os.path.join("STAGING", "images", f"{clean_title}.{src.split('.')[-1]}"))
                print(f"Downloaded image as {clean_title}.{src.split('.')[-1]}")
                # Create a thumbnail
                img.thumbnail((256, 256))
                img = img.convert('RGB')
                img.save(os.path.join("STAGING", "images", f"{clean_title}.jpg"), 'JPEG')
            except Exception as e:
                print("Failed to download image")
                print(e)
            sleep(15)

    sleep(15)

blogs


Getting page 1
Already scraped {'The Moonstone Equation'}
Getting images for Super quick film grain effect in Gamemaker Studio 2
Getting images for Crash Twinsanity
Getting images for Devlog: Other Minds, June 2020
Getting images for Theme Park World | Old Theme Park Sims
Already scraped {'The Moonstone Equation'}
Already scraped {'The Moonstone Equation'}
Already scraped {'The Moonstone Equation'}
Already scraped {'The Moonstone Equation'}
Already scraped {'The Moonstone Equation'}
Getting page 2
Getting images for Commander Keen III: Keen Must Die!
Img status code 200
Downloaded image as Commander_Keen_III_Keen_Must_Die.png
Getting images for Other Minds April Demo Update
Img status code 200
Downloaded image as Other_Minds_April_Demo_Update.png
Getting images for There Is Nothing There / There Was Something Here
Img status code 200
Downloaded image as There_Is_Nothing_There__There_Was_Something_Here.png
Getting images for Other Minds April Update: Full Changelog
Img status code 200
D

{'Super quick film grain effect in Gamemaker Studio 2': 'https://web.archive.org/web/20200831052715/https://edwardatkin.co.uk/super-quick-film-grain-effect-in-gamemaker-studio-2',
 'Crash Twinsanity': 'https://web.archive.org/web/20200831052715/https://edwardatkin.co.uk/crash-twinsanity',
 'Devlog: Other Minds, June 2020': 'https://web.archive.org/web/20200831052715/https://edwardatkin.co.uk/devlog-other-minds-june-2020',
 'Theme Park World | Old Theme Park Sims': 'https://web.archive.org/web/20200831052715/https://edwardatkin.co.uk/theme-park-world',
 'Commander Keen III: Keen Must Die!': 'https://web.archive.org/web/20201124235748/https://edwardatkin.co.uk/commander-keen-iii-keen-must-die',
 'Other Minds April Demo Update': 'https://web.archive.org/web/20201124235748/https://edwardatkin.co.uk/other-minds-april-demo-update',
 'There Is Nothing There / There Was Something Here': 'https://web.archive.org/web/20201124235748/https://edwardatkin.co.uk/there-is-nothing-there-there-was-somet

In [36]:
blogs_completed = []

In [39]:
# Let's markdownify the websites and try get more blogs
# I need to find what's missing at some point but for now I'll just go through and generate the markdown files
# Also need to get all the imgs from the blog frontpage and download them cause they aren't always in the blogs themselves
for k, v in blogs.items():
    if k in blogs_completed:
        print(f"Skipping {k} (already completed)")
    print(k, v)
    try:
        res = requests.get(v)
    except:
        print("Failed to get page")
        sleep(15)
        continue
    if res.status_code != 200:
        print("Failed to get page")
        sleep(15)
        continue

    soup = BeautifulSoup(res.content, 'html.parser')
    try:
        title = k
        clean_title = strip_punctuation(title).replace(" ", "_")
        date = soup.find('span', class_='post-date').text
        content = soup.find('div', class_='post-content')
        content_markdown = md(str(content))
        # Find a with rel='author'
        author = soup.find('a', rel='author').text

        # Description can be the first paragraph of the content up to 200 characters
        description = content.find('p').text
        if len(description) > 200:
            description = description[:200]
            description += '...'

        date_obj = dt.strptime(date, "%B %d, %Y")
        # Format the datetime object to the desired format
        formatted_date = date_obj.strftime("%Y-%m-%d")

        if author == 'Eatkin':
            author = 'Ed'
    except Exception as e:
        print("Content failed to parse")
        print(e)
        sleep(15)
        continue

    # We're going to have to manually edit these anyway so I'll just write them to the file
    # Also we need download any images and stuff
    imgs = content.find_all('img')
    for i, im in enumerate(imgs):
        try:
            src = im['src']
        except Exception as e:
            print("Failed to get image source")
            print(e)
            continue
        if src.startswith('http'):
            try:
                res = requests.get(src)
                print("Status code for image", i+1, res.status_code)
                img = Image.open(BytesIO(res.content))
                img.save(os.path.join("STAGING", "images", f"{clean_title}_{i}.{src.split('.')[-1]}"))
                print(f"Downloaded image {i} as {clean_title}_{i}.{src.split('.')[-1]}")
            except Exception as e:
                print("Failed to download image")
                print(e)
            sleep(15)


    markdowned_content = md(str(content))

    # Write markdown
    markdown_content = f"""---
date: {formatted_date}
title: {title}
description: {description}
author: {author}
tags: TAGS
type: blog
thumbnail: /assets/images/{clean_title}_thumbnail.jpg
og_title: {title}
og_description: {description}
og_image: /assets/images/{clean_title}.jpg
og_type: article
collection: WEBSITE ARCHIVE
---
{markdowned_content}"""

    # Write the markdown file
    with open(os.path.join("STAGING", "blogs", f"{clean_title}.md"), 'w') as f:
        f.write(markdown_content)

    # If we get this far we can add it to the list of completed blogs
    blogs_completed.append(k)
    print(f"Completed {k}")

    sleep(15)

Skipping Super quick film grain effect in Gamemaker Studio 2 (already completed)
Super quick film grain effect in Gamemaker Studio 2 https://web.archive.org/web/20200831052715/https://edwardatkin.co.uk/super-quick-film-grain-effect-in-gamemaker-studio-2
Status code for image 1 404
Failed to download image
cannot identify image file <_io.BytesIO object at 0x7f4ad30d14e0>
Completed Super quick film grain effect in Gamemaker Studio 2
Crash Twinsanity https://web.archive.org/web/20200831052715/https://edwardatkin.co.uk/crash-twinsanity
Status code for image 1 404
Failed to download image
cannot identify image file <_io.BytesIO object at 0x7f4ae4fed080>
Status code for image 2 404
Failed to download image
cannot identify image file <_io.BytesIO object at 0x7f4ae4fef6a0>
Status code for image 3 404
Failed to download image
cannot identify image file <_io.BytesIO object at 0x7f4ad30d1b20>
Completed Crash Twinsanity
Devlog: Other Minds, June 2020 https://web.archive.org/web/20200831052715/http

In [47]:
len(os.listdir(os.path.join("STAGING", "blogs")))

60

In [41]:
archives_html = """<ul>
					<li><a href="https://web.archive.org/web/20211026095726/https://edwardatkin.co.uk/2021/06" aria-current="page">June 2021</a></li>
	<li><a href="https://web.archive.org/web/20211026095726/https://edwardatkin.co.uk/2021/05">May 2021</a></li>
	<li><a href="https://web.archive.org/web/20211026095726/https://edwardatkin.co.uk/2021/04">April 2021</a></li>
	<li><a href="https://web.archive.org/web/20211026095726/https://edwardatkin.co.uk/2021/03">March 2021</a></li>
	<li><a href="https://web.archive.org/web/20211026095726/https://edwardatkin.co.uk/2021/02">February 2021</a></li>
	<li><a href="https://web.archive.org/web/20211026095726/https://edwardatkin.co.uk/2021/01">January 2021</a></li>
	<li><a href="https://web.archive.org/web/20211026095726/https://edwardatkin.co.uk/2020/12">December 2020</a></li>
	<li><a href="https://web.archive.org/web/20211026095726/https://edwardatkin.co.uk/2020/10">October 2020</a></li>
	<li><a href="https://web.archive.org/web/20211026095726/https://edwardatkin.co.uk/2020/09">September 2020</a></li>
	<li><a href="https://web.archive.org/web/20211026095726/https://edwardatkin.co.uk/2020/08">August 2020</a></li>
	<li><a href="https://web.archive.org/web/20211026095726/https://edwardatkin.co.uk/2020/07">July 2020</a></li>
	<li><a href="https://web.archive.org/web/20211026095726/https://edwardatkin.co.uk/2020/06">June 2020</a></li>
	<li><a href="https://web.archive.org/web/20211026095726/https://edwardatkin.co.uk/2020/05">May 2020</a></li>
	<li><a href="https://web.archive.org/web/20211026095726/https://edwardatkin.co.uk/2020/04">April 2020</a></li>
	<li><a href="https://web.archive.org/web/20211026095726/https://edwardatkin.co.uk/2020/03">March 2020</a></li>
	<li><a href="https://web.archive.org/web/20211026095726/https://edwardatkin.co.uk/2020/02">February 2020</a></li>
			</ul>"""

In [44]:
# Alright I think I'm still missing maybe 3 blogs? Further investigation required. Right now:
list_soup = BeautifulSoup(archives_html, 'html.parser')
urls = [a['href'] for a in list_soup.find_all('a')]
blogs = {}
for page, url in enumerate(urls, 1):
    print(f"Getting page {page}")
    res = requests.get(url)
    if res.status_code != 200:
        print("Failed to get page")
        break

    soup = BeautifulSoup(res.content, 'html.parser')

    articles = soup.find_all('article')

    if not articles:
        print("No more posts")
        break

    for article in articles:
        title_elem = article.find('h2', class_='post-title')
        post_title = title_elem.get_text()
        post_title = post_title.strip()
        clean_title = strip_punctuation(post_title).replace(' ', '_')
        # Check if this exxists
        if os.path.exists(os.path.join('STAGING', 'blogs', f"{clean_title}.md")):
            print("Already scraped", {post_title})
            continue
        # Save link to the post
        link = title_elem.find('a')['href']
        blogs[post_title] = link
        print("Getting images for", post_title)
        figure = article.find('figure', class_='featured-media')
        if not figure:
            print("No figure for", post_title)
            continue
        src = figure.find('img')['src']
        if src.startswith('http'):
            try:
                res = requests.get(src)
                print("Img status code", res.status_code)
                img = Image.open(BytesIO(res.content))
                img.save(os.path.join("STAGING", "images", f"{clean_title}.{src.split('.')[-1]}"))
                print(f"Downloaded image as {clean_title}.{src.split('.')[-1]}")
                # Create a thumbnail
                img.thumbnail((256, 256))
                img = img.convert('RGB')
                img.save(os.path.join("STAGING", "images", f"{clean_title}.jpg"), 'JPEG')
            except Exception as e:
                print("Failed to download image")
                print(e)
            sleep(15)

    # Some of these pages also have another page so we'll check for that
    # We can modify the urls list in place and still have the loop work so we'll do that
    next_page = soup.find('a', class_='post-nav-older')
    if next_page:
        urls.append(next_page['href'])
        print("Found next page - adding to list")

    sleep(15)

blogs


Getting page 1
Already scraped {'Lego City Undercover'}
Getting page 2
Already scraped {'Endoparasitic, An Interesting Indie Game You Probably Missed.'}
Getting page 3
Already scraped {'Lone Fungus: An Indie Metroidvania You Don‚Äôt Want to Miss!'}
Getting images for Chip‚Äôs Challenge 2
Img status code 200
Downloaded image as Chip‚Äôs_Challenge_2.png
Getting page 4
Already scraped {'The Original Xbox is Slowly Dying Out'}
Already scraped {'King‚Äôs Quest I: Quest for the Crown'}
Already scraped {'Spyro: Enter the Dragonfly'}
Already scraped {'The Secret of Monkey Island: Special Edition'}
Already scraped {'Crash Team Racing: Nitro Fueled'}
Getting page 5
Already scraped {'Cosmo‚Äôs Cosmic Adventure'}
Getting page 6
Already scraped {'TelusFax ‚Äì A Teletext Simulator'}
Already scraped {'Medievil and the Medievil Remake'}
Already scraped {'Review of my 2020 Game Development'}
Getting page 7
Already scraped {'Malice'}
Getting page 8
Already scraped {'Devlog: Other Minds, Mid-October 2020

{'Chip‚Äôs Challenge 2': 'https://web.archive.org/web/20210928050723/https://edwardatkin.co.uk/chips-challenge-2',
 '‚Äúmy very good game‚Äù: Making a Horrible Game for Game Breaker‚Äôs Toolkit Jam #2': 'https://web.archive.org/web/20210928050649/https://edwardatkin.co.uk/my-very-good-game-making-a-horrible-game-for-game-breakers-toolkit-jam-2',
 'Tombi/Tomba': 'https://web.archive.org/web/20210928040348/https://edwardatkin.co.uk/tombi-tomba-playstation',
 'Devlog: Other Minds, February 2020': 'https://web.archive.org/web/20211026074050/https://edwardatkin.co.uk/devlog-other-minds-february-2020'}

In [46]:
# Last few blogs
for k, v in blogs.items():
    if k in blogs_completed:
        print(f"Skipping {k} (already completed)")
    print(k, v)
    try:
        res = requests.get(v)
    except:
        print("Failed to get page")
        sleep(15)
        continue
    if res.status_code != 200:
        print("Failed to get page")
        sleep(15)
        continue

    soup = BeautifulSoup(res.content, 'html.parser')
    try:
        title = k
        clean_title = strip_punctuation(title).replace(" ", "_")
        date = soup.find('span', class_='post-date').text
        content = soup.find('div', class_='post-content')
        content_markdown = md(str(content))
        # Find a with rel='author'
        author = soup.find('a', rel='author').text

        # Description can be the first paragraph of the content up to 200 characters
        description = content.find('p').text
        if len(description) > 200:
            description = description[:200]
            description += '...'

        date_obj = dt.strptime(date, "%B %d, %Y")
        # Format the datetime object to the desired format
        formatted_date = date_obj.strftime("%Y-%m-%d")

        if author == 'Eatkin':
            author = 'Ed'
    except Exception as e:
        print("Content failed to parse")
        print(e)
        sleep(15)
        continue

    # We're going to have to manually edit these anyway so I'll just write them to the file
    # Also we need download any images and stuff
    imgs = content.find_all('img')
    for i, im in enumerate(imgs):
        try:
            src = im['src']
        except Exception as e:
            print("Failed to get image source")
            print(e)
            continue
        if src.startswith('http'):
            try:
                res = requests.get(src)
                print("Status code for image", i+1, res.status_code)
                img = Image.open(BytesIO(res.content))
                img.save(os.path.join("STAGING", "images", f"{clean_title}_{i}.{src.split('.')[-1]}"))
                print(f"Downloaded image {i} as {clean_title}_{i}.{src.split('.')[-1]}")
            except Exception as e:
                print("Failed to download image")
                print(e)
            sleep(15)


    markdowned_content = md(str(content))

    # Write markdown
    markdown_content = f"""---
date: {formatted_date}
title: {title}
description: {description}
author: {author}
tags: TAGS
type: blog
thumbnail: /assets/images/{clean_title}_thumbnail.jpg
og_title: {title}
og_description: {description}
og_image: /assets/images/{clean_title}.jpg
og_type: article
collection: WEBSITE ARCHIVE
---
{markdowned_content}"""

    # Write the markdown file
    with open(os.path.join("STAGING", "blogs", f"{clean_title}.md"), 'w') as f:
        f.write(markdown_content)

    # If we get this far we can add it to the list of completed blogs
    blogs_completed.append(k)
    print(f"Completed {k}")

    sleep(15)


Chip‚Äôs Challenge 2 https://web.archive.org/web/20210928050723/https://edwardatkin.co.uk/chips-challenge-2
Status code for image 1 200
Downloaded image 0 as Chip‚Äôs_Challenge_2_0.png
Status code for image 2 200
Downloaded image 1 as Chip‚Äôs_Challenge_2_1.png
Completed Chip‚Äôs Challenge 2
‚Äúmy very good game‚Äù: Making a Horrible Game for Game Breaker‚Äôs Toolkit Jam #2 https://web.archive.org/web/20210928050649/https://edwardatkin.co.uk/my-very-good-game-making-a-horrible-game-for-game-breakers-toolkit-jam-2
Status code for image 1 200
Downloaded image 0 as ‚Äúmy_very_good_game‚Äù_Making_a_Horrible_Game_for_Game_Breaker‚Äôs_Toolkit_Jam_2_0.png
Status code for image 2 200
Downloaded image 1 as ‚Äúmy_very_good_game‚Äù_Making_a_Horrible_Game_for_Game_Breaker‚Äôs_Toolkit_Jam_2_1.png
Completed ‚Äúmy very good game‚Äù: Making a Horrible Game for Game Breaker‚Äôs Toolkit Jam #2
Tombi/Tomba https://web.archive.org/web/20210928040348/https://edwardatkin.co.uk/tombi-tomba-playstation
Failed

In [48]:
# Forgot to append _thumbnail to the end of the thumbnail image names
# Let's do that (it's the .jpg files)
files = [os.path.join("STAGING", "images", f) for f in os.listdir(os.path.join("STAGING", "images")) if f.endswith('.jpg')]
for f in files:
    os.rename(f, f"{f.split('.')[0]}_thumbnail.jpg")
    print(f"Renamed {f} to {f.split('.')[0]}_thumbnail.jpg")

Renamed STAGING/images/Game_Breaker‚Äôs_Tool_Kit_2_Jam_Games.jpg to STAGING/images/Game_Breaker‚Äôs_Tool_Kit_2_Jam_Games_thumbnail.jpg
Renamed STAGING/images/King‚Äôs_Quest_I_Quest_for_the_Crown.jpg to STAGING/images/King‚Äôs_Quest_I_Quest_for_the_Crown_thumbnail.jpg
Renamed STAGING/images/Commander_Keen_II_The_Earth_Explodes.jpg to STAGING/images/Commander_Keen_II_The_Earth_Explodes_thumbnail.jpg
Renamed STAGING/images/TombiTomba.jpg to STAGING/images/TombiTomba_thumbnail.jpg
Renamed STAGING/images/Prep_School_Horrors.jpg to STAGING/images/Prep_School_Horrors_thumbnail.jpg
Renamed STAGING/images/Psychonauts.jpg to STAGING/images/Psychonauts_thumbnail.jpg
Renamed STAGING/images/Silent_Bill_A_badly_drawn_escape_room.jpg to STAGING/images/Silent_Bill_A_badly_drawn_escape_room_thumbnail.jpg
Renamed STAGING/images/Spyro_Enter_the_Dragonfly.jpg to STAGING/images/Spyro_Enter_the_Dragonfly_thumbnail.jpg
Renamed STAGING/images/Cosmo‚Äôs_Cosmic_Adventure.jpg to STAGING/images/Cosmo‚Äôs_Cosmic_A

In [75]:
import re
# Alright now it's time for the awful job of going through every blog and updating metadata, adding tags etc, also updating images
# Also need to update the feed and collections
# As a first cleanup we'll make sure all the markdow files associated thumbnails exist - if not we'll create one
blogs = [os.path.join("STAGING", "unprocessed", f) for f in os.listdir(os.path.join("STAGING", "unprocessed"))]
illegal_punctuation = {'‚Äô': "'", '‚Äú': '"', '‚Äò': "'", "‚Äù": '"'}
imgs = [os.path.join("STAGING", "images", f) for f in os.listdir(os.path.join("STAGING", "images"))]
for b in blogs:
    with open(b, 'r') as f:
        content = f.read()

    # Remove the illegal punctuation
    for p, r in illegal_punctuation.items():
        content = content.replace(p, r)

    # Now start parsing the content
    first, last = content.split('thumbnail: ')
    thumbnail, last = last.split('\n', 1)
    middle, last = last.split('og_image: ')
    og_image, last = last.split('\n', 1)

    # Rejoining requires: first + 'thumbnail: ' + thumbnail + middle + 'og_image: ' + og_image + last
    # Remember to strip()

    # Check if the thumbnail exists
    thumb_fname = thumbnail.split('/')[-1]
    thumb_exists = os.path.exists(os.path.join("STAGING", "images", thumb_fname))
    if not thumb_exists:
        # Find if there's a file with og_image name + a number
        pattern = og_image.split('/')[-1].split('.')[0]
        # Find any files that match the pattern
        matches = [f for f in imgs if f.startswith(pattern)]
        # Get the first one
        if matches:
            match = matches[0]
            img = Image.open(match)
            img.thumbnail((256, 256))
            img.save(os.path.join("STAGING", "images", thumb_fname), 'JPEG')
            print(f"Generated thumbnail for {thumb_fname}")
            # Overwrite thumbnail and og_image
            thumbnail = f'\nthumbnail: /assets/images/{thumb_fname}\n'
            og_image = f'\nog_image: /assets/images/{match}\n'
        else:
            # We can just have a blank thumbnail and og_image
            thumbnail = '\n'
            og_image = '\n'
    else:
        thumbnail = f'\nthumbnail: /assets/images/{thumb_fname}\n'
        og_image = f'\nog_image: /assets/images/{og_image.split("/")[-1]}\n'

    # TAGS and COLLECTION is going to be a manual job - could make a util to do this so might do that not in a notebook tho
    # Write the content back
    new_content = first.strip() + thumbnail + middle.strip() + og_image + last.strip()

    # Now if there are images we need to update the images in the markdown file
    _, metadata, content = new_content.split('---', 2)

    # Find all the images in the content
    imgs = re.findall(r'!\[.*\]\((.*)\)', content)

    # Find how many images we have to distribute amongst the content by searching the images dir
    pattern = og_image.split('/')[-1].split('.')[0]
    matches = [f for f in os.listdir(os.path.join("STAGING", "images")) if f.startswith(pattern) and 'thumbnail' not in f]
    # If theres more matches than imgs we can pop any without a _number at the end
    if len(matches) > len(imgs):
        matches = [m for m in matches if m.startswith(pattern + '_')]

    # Replace the images in the content until we run out - then we'll just remove them
    for img in imgs:
        if matches:
            match = matches.pop(0)
            content = content.replace(img, f"/assets/images/{match}")
        else:
            content = content.replace(img, '')

    # Remove blank images
    content = content.replace('![]()', '')

    # Remove archive/ links
    # https://web.archive.org/web/20210127191557/https://twitter.com/JustWallGames
    pattern = r'(https://web\.archive\.org/web/[\d]+/)https://.*'

    # Find all matches of the entire prependment
    matches = re.findall(pattern, content)

    for match in matches:
        # Replace the full match in the content
        content = content.replace(match, '')

    # Regex replace these lines:
    patterns = [
        r'Talk about this post and suggest new blog topics in my \[Community Discord\]\(https://discord\.gg/ZkSwYtP\)!',
        r'\[If you like my work, consider supporting me on Ko\\-fi\]\(https://ko-fi\.com/edwardatkin\)!'
    ]
    for pattern in patterns:
        content = re.sub(pattern, '', content)

    # Finally the annoying thing is the markdownify uses ------------ to indicate headers instead of # so we'll replace those but it might be a bit annoying
    lines = content.split('\n')
    new_lines = []
    for line, next_line in zip(lines[:-1], lines[1:]):
        if next_line.startswith('----'):
            line = '## ' + line

        if line.startswith('----'):
            continue

        new_lines.append(line)

    content = '\n'.join(new_lines)

    # Rejoin with the metadata
    new_content = '---' + metadata + '---\n' + content.strip()

    # Finally replace the description = with description: as it should b
    new_content = new_content.replace('description =', 'description:')
    # Also author should be Ed not Edward Atkin
    new_content = new_content.replace('author: Edward Atkin', 'author: Ed')

    # Now write it to the blogs dir
    new_path = os.path.join("STAGING", "blogs", b.split("/")[-1])
    with open(new_path, 'w') as f:
        f.write(new_content)

    print(f"Updated {b}")


Updated STAGING/unprocessed/RydenWood__RydenWood_Deeper_Than_Before.md
Updated STAGING/unprocessed/Silent_Bill_A_badly_drawn_escape_room.md
Updated STAGING/unprocessed/Devlog_Other_Minds_May_2020.md
Updated STAGING/unprocessed/Lone_Fungus_An_Indie_Metroidvania_You_Don‚Äôt_Want_to_Miss.md
Updated STAGING/unprocessed/Remember_Mary_A_squigglevision_adventure_game.md
Updated STAGING/unprocessed/Retro_Space_Ball.md
Updated STAGING/unprocessed/Digging_Day.md
Updated STAGING/unprocessed/The_Moonstone_Equation.md
Updated STAGING/unprocessed/There_Is_Nothing_There__There_Was_Something_Here.md
Updated STAGING/unprocessed/Commander_Keen_I_Marooned_on_Mars.md
Updated STAGING/unprocessed/Ingame_Event_Management_with_Binary_Operators.md
Updated STAGING/unprocessed/Theme_Park_World__Old_Theme_Park_Sims.md
Updated STAGING/unprocessed/TombiTomba.md
Updated STAGING/unprocessed/Prep_School_Horrors.md
Updated STAGING/unprocessed/All_You_Can_Eat.md
Updated STAGING/unprocessed/Devlog_Other_Minds_March_2020.

In [20]:
# Alright all blogs ready to go - now we need to update the feed and collections and upload to gcp - will do that tomorrow
# For the purposes of feeds and collections we will creaate a dictionary of the form:
# {
#     '2022-05-20 14:16:23': {
#         'location': 'blogs/keyboard_cat.md'
#         'collection': 'WEBSITE ARCHIVE'
#     },
feed = {}
for b in [os.path.join("STAGING", "blogs", f) for f in os.listdir(os.path.join("STAGING", "blogs"))]:
    with open(b, 'r') as f:
        content = f.read()

    # Get the date
    date = content.split('\n')[1].split(': ')[1]

    # Get the location
    location = f'blogs/{b.split("/")[-1]}'

    # Get the collection
    collection = content.split('collection: ')[1].split('\n')[0]

    feed[date + ' 00:00:00'] = {
        'location': location,
        'collection': collection
    }

feed

{'2020-03-29 00:00:00': {'location': 'blogs/RydenWood__RydenWood_Deeper_Than_Before.md',
  'collection': 'Indie Game Reviews'},
 '2020-06-06 00:00:00': {'location': 'blogs/Silent_Bill_A_badly_drawn_escape_room.md',
  'collection': 'Devlogs'},
 '2020-05-28 00:00:00': {'location': 'blogs/Devlog_Other_Minds_May_2020.md',
  'collection': 'Devlogs'},
 '2021-04-24 00:00:00': {'location': 'blogs/Lone_Fungus_An_Indie_Metroidvania_You_Don‚Äôt_Want_to_Miss.md',
  'collection': 'Guest Blogs'},
 '2020-08-07 00:00:00': {'location': 'blogs/Remember_Mary_A_squigglevision_adventure_game.md',
  'collection': 'Devlogs'},
 '2020-03-26 00:00:00': {'location': 'blogs/Retro_Space_Ball.md',
  'collection': 'Indie Game Reviews'},
 '2020-02-04 00:00:00': {'location': 'blogs/The_Keymaster.md',
  'collection': 'Indie Game Reviews'},
 '2020-03-10 00:00:00': {'location': 'blogs/The_Moonstone_Equation.md',
  'collection': 'Indie Game Reviews'},
 '2020-04-10 00:00:00': {'location': 'blogs/There_Is_Nothing_There__The

In [8]:
# Cool now we need to update the feed and collections
feed_ref = db.collection('feed').document('content-log')

In [10]:
old_feed = feed_ref.get().to_dict()
old_feed

{'2022-03-18 11:20:30': {'location': 'videos/A_Closer_Look_at_the_Rare_666_Edition_of_Slime_Volleyball.md'},
 '2004-08-16 00:00:07': {'location': 'comics/hewligg_urobokkle_8.md'},
 '2024-04-23 08:17:04': {'location': 'blogs/internet.md'},
 '2010-01-09 03:03:18': {'location': 'music/poopremixes.md'},
 '2022-07-09 01:57:49': {'location': 'videos/Can_t_Cast_Ep_0_naming_the_podcast_and_working_out_the_shuff.md'},
 '2004-08-16 00:00:01': {'location': 'comics/hewligg_urobokkle_2.md'},
 '2023-07-02 11:24:15': {'location': 'videos/Commander_Keen39s_Last_Adventure_Unveiling_the_Lipton_Tea_Connection.md'},
 '2024-02-01 17:08:24': {'location': 'blogs/hip.md'},
 '2022-06-08 20:08:33': {'location': 'videos/StumbleUpon_Internet_Discovery_Through_the_0039s.md'},
 '2022-08-06 01:53:55': {'location': 'videos/Cant_Cast_Ep_Different_English_Aussie_Flag_Day.md'},
 '2023-07-18 08:52:17': {'location': 'blogs/onrss.md'},
 '2023-03-08 09:14:22': {'location': 'videos/In_the_Mind_of_Kimberly_Kubus_The_Game_Deve

In [12]:
new_feed = feed.copy()
for k, v in feed.items():
    new_feed[k].pop('collection', None)

new_feed

{'2020-03-29 00:00:00': {'location': 'blogs/RydenWood__RydenWood_Deeper_Than_Before.md'},
 '2020-06-06 00:00:00': {'location': 'blogs/Silent_Bill_A_badly_drawn_escape_room.md'},
 '2020-05-28 00:00:00': {'location': 'blogs/Devlog_Other_Minds_May_2020.md'},
 '2021-04-24 00:00:00': {'location': 'blogs/Lone_Fungus_An_Indie_Metroidvania_You_Don‚Äôt_Want_to_Miss.md'},
 '2020-08-07 00:00:00': {'location': 'blogs/Remember_Mary_A_squigglevision_adventure_game.md'},
 '2020-03-26 00:00:00': {'location': 'blogs/Retro_Space_Ball.md'},
 '2020-02-04 00:00:00': {'location': 'blogs/The_Keymaster.md'},
 '2020-03-10 00:00:00': {'location': 'blogs/The_Moonstone_Equation.md'},
 '2020-04-10 00:00:00': {'location': 'blogs/There_Is_Nothing_There__There_Was_Something_Here.md'},
 '2020-03-27 00:00:00': {'location': 'blogs/Commander_Keen_I_Marooned_on_Mars.md'},
 '2020-03-25 00:00:00': {'location': 'blogs/Ingame_Event_Management_with_Binary_Operators.md'},
 '2020-06-28 00:00:00': {'location': 'blogs/Theme_Park_W

In [13]:
# Now we can update old feed with new feed
old_feed.update(new_feed)
old_feed

{'2022-03-18 11:20:30': {'location': 'videos/A_Closer_Look_at_the_Rare_666_Edition_of_Slime_Volleyball.md'},
 '2004-08-16 00:00:07': {'location': 'comics/hewligg_urobokkle_8.md'},
 '2024-04-23 08:17:04': {'location': 'blogs/internet.md'},
 '2010-01-09 03:03:18': {'location': 'music/poopremixes.md'},
 '2022-07-09 01:57:49': {'location': 'videos/Can_t_Cast_Ep_0_naming_the_podcast_and_working_out_the_shuff.md'},
 '2004-08-16 00:00:01': {'location': 'comics/hewligg_urobokkle_2.md'},
 '2023-07-02 11:24:15': {'location': 'videos/Commander_Keen39s_Last_Adventure_Unveiling_the_Lipton_Tea_Connection.md'},
 '2024-02-01 17:08:24': {'location': 'blogs/hip.md'},
 '2022-06-08 20:08:33': {'location': 'videos/StumbleUpon_Internet_Discovery_Through_the_0039s.md'},
 '2022-08-06 01:53:55': {'location': 'videos/Cant_Cast_Ep_Different_English_Aussie_Flag_Day.md'},
 '2023-07-18 08:52:17': {'location': 'blogs/onrss.md'},
 '2023-03-08 09:14:22': {'location': 'videos/In_the_Mind_of_Kimberly_Kubus_The_Game_Deve

In [15]:
len(old_feed)

230

In [16]:
# Now we can write it back
feed_ref.set(old_feed)

update_time {
  seconds: 1721626586
  nanos: 992643000
}

In [22]:
# Now we need to deal with collections
# Sort feed by key
feed = dict(sorted(feed.items(), key=lambda x: x[0]))
collections = {}
for v in feed.values():
    collection = v['collection']
    collection = strip_punctuation(collection).replace(" ", "_").lower()
    if collection not in collections:
        collections[collection] = []

    collections[collection].append(v['location'].split('/')[-1])

collections

{'indie_game_reviews': ['The_Keymaster.md',
  'BinaryGrab.md',
  'The_Moonstone_Equation.md',
  'Seiklus.md',
  'About_Ten_Years.md',
  'Prep_School_Horrors.md',
  'Retro_Space_Ball.md',
  'RydenWood__RydenWood_Deeper_Than_Before.md',
  'Myth_Bearer.md',
  'There_Is_Nothing_There__There_Was_Something_Here.md',
  'ThinginItself.md',
  'All_You_Can_Eat.md',
  'Nepenthe.md',
  'Fuchsia.md',
  'TelusFax_‚Äì_A_Teletext_Simulator.md'],
 'devlogs': ['Smooth_acceleration_using_sinx.md',
  'Devlog_Other_Minds_February_2020.md',
  'Building_to_Linux_with_a_virtual_machine.md',
  'Ingame_Event_Management_with_Binary_Operators.md',
  'Devlog_Other_Minds_March_2020.md',
  'Other_Minds_April_Update_Full_Changelog.md',
  'Other_Minds_April_Demo_Update.md',
  'my_very_good_game_Making_a_Horrible_Game_for_Game_Breaker‚Äôs_Toolkit_Jam_2.md',
  'Devlog_Other_Minds_April_2020.md',
  'Game_Breakers_Tool_Kit_2_Jam_Games.md',
  'Devlog_Other_Minds_May_2020.md',
  'Silent_Bill_A_badly_drawn_escape_room.md',
 

In [23]:
# Now we can write this back
for k, v in collections.items():
    ref = db.collection('collections').document(k)
    ref.set({
        'content': v
    })

In [24]:
# Got some more shit to go up only two so let's just manually do it
new_feed = {
    '2023-02-20': {
        'location': 'blogs/regular_everyday_breakdown.md'
    },
    '2023-02-19': {
        'location': 'blogs/set_story.md'
    }
}

collection = {'content':
    [
    'set_story.md',
    'regular_everyday_breakdown.md'
]
}
collection_name = 'fiction_writing'

feed_ref = db.collection('feed').document('content-log')
old_feed = feed_ref.get().to_dict()
old_feed.update(new_feed)
feed_ref.set(old_feed)

collection_ref = db.collection('collections').document(collection_name)
collection_ref.set(collection)

update_time {
  seconds: 1721628586
  nanos: 33861000
}

In [26]:
import json
from markdownify import markdownify as md
# Now we're going to read the json from games.json and do some stuff with it
with open(os.path.join('STAGING', 'unprocessed', 'games.json'), 'r') as f:
    content = json.load(f)
content


[{'gameTitle': 'What the Pong',
  'shortDesc': 'Good pong game',
  'longDesc': '<p>An arcade style game inspired by Pong, Warioware and classic Flash games. 21 variations of Pong, each with their own unique twist.</p>',
  'img': 'whatThePong.png',
  'link': 'https://eatkin.itch.io/what-the-pong',
  'tags': ['2023', 'free', 'browser', 'casual', 'open-source']},
 {'gameTitle': 'Ert Wurm',
  'shortDesc': 'Ert Wurm lives in Shittown&#8482; and he and everyone else are all so happy!',
  'longDesc': '<p>Ert Wurm is a game of personal expression and exploration.</p><p>It contains extremely coarse language.</p><p>Made for Lame Jam 32</p>',
  'img': 'ertWurm.png',
  'link': 'https://eatkin.itch.io/ert-wurm',
  'tags': ['game-jam',
   'free',
   'browser',
   'story-rich',
   '2023',
   'best-of',
   'top-down',
   'casual']},
 {'gameTitle': 'A Relaxing Typing Game',
  'shortDesc': 'Type out public domain poetry and stories whilst listening to chill music',
  'longDesc': '<p>A game I made as a p

In [51]:
# Content is a list of dictionaries with game title, short description and long description - very useful
# We also have an image and a link
# So what I propose is to copy all images over, parse this stuff, create markdown files including a game_link property so I can decide what to do with it
# Then it's basically ready (subject to adding new endpoint)
# Long desc is html so markdownify it
# Also need thumbnails
# Also it doesn't have dates, I'll probably have to scrape itch io for that but fortunately we have the links to itch io to do that

feed_update = {}
failed_dates = []
for details in content:
    title = details['gameTitle']
    description = details['shortDesc']
    page_content = md(str(details['longDesc']))
    img_src = details['img']
    game_link = details['link']

    # Make the thumbnail
    img = Image.open(os.path.join('STAGING', 'images', img_src))
    thumbnail = img.copy()
    thumbnail.thumbnail((256, 256))
    thumbnail = thumbnail.convert('RGB')
    thumbnail.save(os.path.join('STAGING', 'images', f"{img_src.split('.')[0]}_thumbnail.jpg"), 'JPEG')
    print(f"Generated thumbnail for {img_src}")

    res = requests.get(game_link)
    print("Request made to", game_link)
    print("Status code", res.status_code)
    soup = BeautifulSoup(res.content, 'html.parser')
    game_info = soup.find('div', class_='game_info_panel_widget')
    trs = game_info.find_all('tr')
    date_info = None
    for tr in trs:
        tds = tr.find_all('td')
        for td in tds:
            if td.text == 'Published':
                print("Found date info")
                date_info = tds[1]
                break

    if not date_info:
        print("Failed to find date info for game", title, "You'll have to manually add it")
        failed_dates.append(game_link)
    else:
        # Extract the date
        abbr = date_info.find('abbr')
        date_time = abbr['title']

        # Parse it from 09 January 2021 @ 08:40 UTC to yyyy-mm-dd hh:mm:ss
        date_time = dt.strptime(date_time, "%d %B %Y @ %H:%M %Z")
        date = date_time.strftime("%Y-%m-%d")
        date_time = date_time.strftime("%Y-%m-%d %H:%M:%S")

        print("Got datetime published", date_time)

        clean_title = strip_punctuation(title).replace(' ', '_')

        feed_update[date_time] = {
            'location': f'games/{clean_title}.md',
        }

    # Now generate our markdown file
    markdown_content = f"""---
date: {date}
title: {title}
description: {description}
author: Ed
tags: ['Indie Game', 'Game Development']
type: game
thumbnail: /assets/images/{img_src.split('.')[0]}_thumbnail.jpg
og_title: {title}
og_description: {description}
og_image: /assets/images/{img_src}
og_type: article
game_link: {game_link}
collection: Ed's Games
---
{page_content}"""

    with open(os.path.join('STAGING', 'games', f"{clean_title}.md"), 'w') as f:
        f.write(markdown_content)

    print(f"Generated markdown for {title}")

    sleep(15)

Generated thumbnail for whatThePong.png
Request made to https://eatkin.itch.io/what-the-pong
Status code 200
Failed to find date info for game What the Pong You'll have to manually add it
Generated markdown for What the Pong
Generated thumbnail for ertWurm.png
Request made to https://eatkin.itch.io/ert-wurm
Status code 200
Failed to find date info for game Ert Wurm You'll have to manually add it
Generated markdown for Ert Wurm
Generated thumbnail for aRelaxingTypingGame.png
Request made to https://eatkin.itch.io/a-relaxing-typing-game
Status code 200
Failed to find date info for game A Relaxing Typing Game You'll have to manually add it
Generated markdown for A Relaxing Typing Game
Generated thumbnail for acidTrip.png
Request made to https://eatkin.itch.io/acid-trip
Status code 200
Failed to find date info for game Acid Trip You'll have to manually add it
Generated markdown for Acid Trip
Generated thumbnail for badControlGame.png
Request made to https://eatkin.itch.io/bad-control-game




Generated thumbnail for cucumberRacing.png
Request made to https://eatkin.itch.io/match-box-racing-gbtk6
Status code 200
Failed to find date info for game Cucumber Racing You'll have to manually add it
Generated markdown for Cucumber Racing
Generated thumbnail for fallGuys.png
Request made to https://eatkin.itch.io/fall-guys-but-someone-else-made-it-in-1-hour-and-i-just-changed-some-things
Status code 200
Failed to find date info for game Fall Guys But Someone Else Made it in 1 Hour and I Just Changed Some Things Which Probably Made it Worse You'll have to manually add it
Generated markdown for Fall Guys But Someone Else Made it in 1 Hour and I Just Changed Some Things Which Probably Made it Worse
Generated thumbnail for friends.png
Request made to https://eatkin.itch.io/friends
Status code 200
Failed to find date info for game Friends (Falling to Bitsy #4) You'll have to manually add it
Generated markdown for Friends (Falling to Bitsy #4)
Generated thumbnail for imposter.png
Request m

In [58]:
# Alrigh t so I couldn't get ANY dates what the hell I don't really uderstand why but whatever depends how much I want to automate this - could use seleium lets do that cause I'm lazy
# It's because you have to be logged in
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

driver = webdriver.Chrome()

# well I need to login
# I'll just do it manually
driver.get('https://itch.io')


In [60]:

feed_update = {}
failed_dates = []
for details in content:
    title = details['gameTitle']
    description = details['shortDesc']
    page_content = md(str(details['longDesc']))
    img_src = details['img']
    game_link = details['link']

    driver.get(game_link)
    try:
        date = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'game_info_panel_widget'))
        )
    except TimeoutException:
        print("Timed out")
        failed_dates.append(game_link)
        continue

    # Find this thing - toggle_info_btn
    button = driver.find_element(By.CLASS_NAME, 'toggle_info_btn')

    button.click()

    sleep(5)

    game_info = driver.find_element(By.CLASS_NAME, 'game_info_panel_widget')

    # Wait for the abbr to load
    try:
        date = WebDriverWait(game_info, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, 'abbr'))
        )
    except TimeoutException:
        print("Timed out")
        failed_dates.append(game_link)
        continue

    # Now we can extract the date in the same way as before
    trs = game_info.find_elements(By.TAG_NAME, 'tr')
    date_info = None
    for tr in trs:
        tds = tr.find_elements(By.TAG_NAME, 'td')
        for td in tds:
            if td.text == 'Published':
                date_info = tds[1]
                break

    if not date_info:
        print("Failed to find date info for game", title, "You'll have to manually add it")
        failed_dates.append(game_link)
        continue

    abbr = date_info.find_element(By.TAG_NAME, 'abbr')
    # Get the title
    date_time = abbr.get_attribute('title')

    # Parse it from 09 January 2021 @ 08:40 UTC to yyyy-mm-dd hh:mm:ss
    date_time = dt.strptime(date_time, "%d %B %Y @ %H:%M %Z")
    date = date_time.strftime("%Y-%m-%d")
    date_time = date_time.strftime("%Y-%m-%d %H:%M:%S")

    clean_title = strip_punctuation(title).replace(' ', '_')

    # We ca open and replace the date 2020-04-15 with the actual date
    with open(os.path.join('STAGING', 'games', f"{clean_title}.md"), 'r') as f:
        content = f.read()

    content = content.replace('2020-04-15', date)

    with open(os.path.join('STAGING', 'games', f"{clean_title}.md"), 'w') as f:
        f.write(content)

    feed_update[date_time] = {
        'location': f'games/{clean_title}.md',
    }

    print(f"Updated date for {title}")
    sleep(5)

Updated date for What the Pong
Updated date for Ert Wurm
Updated date for A Relaxing Typing Game
Updated date for Acid Trip
Updated date for Bad Control Game
Updated date for Ball Dude Adventures
Updated date for BalL DuDe AdVENtures TOo
Updated date for Beer (Falling to Bitsy #2)
Updated date for Ben Was Assimilated
Updated date for Board Hunting
Updated date for Carrot Hell
Updated date for Cat in Space
Updated date for Cucumber Racing
Updated date for Fall Guys But Someone Else Made it in 1 Hour and I Just Changed Some Things Which Probably Made it Worse
Updated date for Friends (Falling to Bitsy #4)
Updated date for Impostor (Falling to Bitsy #3)
Updated date for It's Not Easy Being Green
Updated date for Justin Wall's Cooking Simulator
Updated date for Life Simulator
Updated date for Monotony (Falling to Bitsy #1)
Updated date for my very good game
Updated date for Other Minds
Updated date for Remember Mary
Updated date for Silent Bill (Jam Version)
Updated date for Silent Bill (2

In [61]:
driver.quit()

# Also we should set Acid Trip's date to like 2004 cause that's when I made it
feed_update

{'2023-12-09 16:34:00': {'location': 'games/What_the_Pong.md'},
 '2023-09-22 14:57:00': {'location': 'games/Ert_Wurm.md'},
 '2021-01-09 08:40:00': {'location': 'games/A_Relaxing_Typing_Game.md'},
 '2022-02-07 19:15:00': {'location': 'games/Acid_Trip.md'},
 '2020-07-12 09:01:00': {'location': 'games/Bad_Control_Game.md'},
 '2022-07-29 20:02:00': {'location': 'games/BalL_DuDe_AdVENtures_TOo.md'},
 '2020-10-24 11:28:00': {'location': 'games/Beer_Falling_to_Bitsy_2.md'},
 '2020-06-27 11:03:00': {'location': 'games/Ben_Was_Assimilated.md'},
 '2021-01-06 09:00:00': {'location': 'games/Board_Hunting.md'},
 '2021-04-18 17:22:00': {'location': 'games/Carrot_Hell.md'},
 '2021-01-04 16:01:00': {'location': 'games/Cat_in_Space.md'},
 '2021-07-06 18:39:00': {'location': 'games/Cucumber_Racing.md'},
 '2020-09-12 15:35:00': {'location': 'games/Fall_Guys_But_Someone_Else_Made_it_in_1_Hour_and_I_Just_Changed_Some_Things_Which_Probably_Made_it_Worse.md'},
 '2021-02-08 08:27:00': {'location': 'games/Frie

In [62]:
acid_trip = feed_update.pop('2022-02-07 19:15:00')
feed_update['2006-09-26 15:05:00'] = acid_trip

In [63]:
feed_update

{'2023-12-09 16:34:00': {'location': 'games/What_the_Pong.md'},
 '2023-09-22 14:57:00': {'location': 'games/Ert_Wurm.md'},
 '2021-01-09 08:40:00': {'location': 'games/A_Relaxing_Typing_Game.md'},
 '2020-07-12 09:01:00': {'location': 'games/Bad_Control_Game.md'},
 '2022-07-29 20:02:00': {'location': 'games/BalL_DuDe_AdVENtures_TOo.md'},
 '2020-10-24 11:28:00': {'location': 'games/Beer_Falling_to_Bitsy_2.md'},
 '2020-06-27 11:03:00': {'location': 'games/Ben_Was_Assimilated.md'},
 '2021-01-06 09:00:00': {'location': 'games/Board_Hunting.md'},
 '2021-04-18 17:22:00': {'location': 'games/Carrot_Hell.md'},
 '2021-01-04 16:01:00': {'location': 'games/Cat_in_Space.md'},
 '2021-07-06 18:39:00': {'location': 'games/Cucumber_Racing.md'},
 '2020-09-12 15:35:00': {'location': 'games/Fall_Guys_But_Someone_Else_Made_it_in_1_Hour_and_I_Just_Changed_Some_Things_Which_Probably_Made_it_Worse.md'},
 '2021-02-08 08:27:00': {'location': 'games/Friends_Falling_to_Bitsy_4.md'},
 '2020-11-04 16:09:00': {'locat

In [64]:
# Now sort it in ascending order so we can write the collection
feed_update = dict(sorted(feed_update.items(), key=lambda x: x[0]))
feed_update

{'2006-09-26 15:05:00': {'location': 'games/Acid_Trip.md'},
 '2019-12-29 13:19:00': {'location': 'games/Other_Minds.md'},
 '2020-04-26 09:03:00': {'location': 'games/my_very_good_game.md'},
 '2020-06-05 18:17:00': {'location': 'games/Silent_Bill_Jam_Version.md'},
 '2020-06-27 11:03:00': {'location': 'games/Ben_Was_Assimilated.md'},
 '2020-07-12 09:01:00': {'location': 'games/Bad_Control_Game.md'},
 '2020-07-24 17:36:00': {'location': 'games/Stoner_Stevie_in_Asstown.md'},
 '2020-08-05 06:15:00': {'location': 'games/Remember_Mary.md'},
 '2020-09-12 15:35:00': {'location': 'games/Fall_Guys_But_Someone_Else_Made_it_in_1_Hour_and_I_Just_Changed_Some_Things_Which_Probably_Made_it_Worse.md'},
 '2020-10-18 12:26:00': {'location': 'games/Monotony_Falling_to_Bitsy_1.md'},
 '2020-10-24 11:28:00': {'location': 'games/Beer_Falling_to_Bitsy_2.md'},
 '2020-10-26 06:46:00': {'location': 'games/Its_Not_Easy_Being_Green.md'},
 '2020-11-04 16:09:00': {'location': 'games/Impostor_Falling_to_Bitsy_3.md'},


In [65]:
collection = [v['location'].split('/')[-1] for v in feed_update.values()]
collection

['Acid_Trip.md',
 'Other_Minds.md',
 'my_very_good_game.md',
 'Silent_Bill_Jam_Version.md',
 'Ben_Was_Assimilated.md',
 'Bad_Control_Game.md',
 'Stoner_Stevie_in_Asstown.md',
 'Remember_Mary.md',
 'Fall_Guys_But_Someone_Else_Made_it_in_1_Hour_and_I_Just_Changed_Some_Things_Which_Probably_Made_it_Worse.md',
 'Monotony_Falling_to_Bitsy_1.md',
 'Beer_Falling_to_Bitsy_2.md',
 'Its_Not_Easy_Being_Green.md',
 'Impostor_Falling_to_Bitsy_3.md',
 'Tennis_Tower.md',
 'Trapped_Inside_My_Own_Head.md',
 'x1F171roken_game_for_game_x1F171reakers_toolkit.md',
 'Justin_Walls_Cooking_Simulator.md',
 'Cat_in_Space.md',
 'Board_Hunting.md',
 'A_Relaxing_Typing_Game.md',
 'Friends_Falling_to_Bitsy_4.md',
 'Carrot_Hell.md',
 'Cucumber_Racing.md',
 'Trash_Cat.md',
 'Type_The_Entire_Of_Ulysses_Thats_it_Thats_the_game.md',
 'Life_Simulator.md',
 'Silent_Bill_2022.md',
 'BalL_DuDe_AdVENtures_TOo.md',
 'Ert_Wurm.md',
 'What_the_Pong.md']

In [66]:
# Cool now let's update the feed
feed_ref = db.collection('feed').document('content-log')
feed = feed_ref.get().to_dict()
feed.update(feed_update)
feed

{'2021-01-03 00:00:00': {'location': 'blogs/Review_of_my_2020_Game_Development.md'},
 '2010-01-09 03:03:18': {'location': 'music/poopremixes.md'},
 '2004-08-16 00:00:01': {'location': 'comics/hewligg_urobokkle_2.md'},
 '2023-07-02 11:24:15': {'location': 'videos/Commander_Keen39s_Last_Adventure_Unveiling_the_Lipton_Tea_Connection.md'},
 '2004-10-15 00:00:00': {'location': 'comics/hewligg_urobokkle_38.md'},
 '2022-06-29 16:07:45': {'location': 'videos/Dunnet__The_Secret_Terminal_Game.md'},
 '2004-10-21 00:00:00': {'location': 'comics/hewligg_urobokkle_44.md'},
 '2020-03-22 00:00:00': {'location': 'blogs/Seiklus.md'},
 '2004-08-17 00:00:03': {'location': 'comics/hewligg_urobokkle_13.md'},
 '2023-07-21 18:02:45': {'location': 'comics/pp_comic11.md'},
 '2023-07-09 14:28:02': {'location': 'comics/pp_comic8.md'},
 '2022-02-02 15:00:07': {'location': 'videos/The_Slime_Volley_Ball_Experience_shorts.md'},
 '2022-07-18 11:45:01': {'location': 'videos/Gedaria__Indie_Game_Review__An_Indie_Action_2

In [67]:
# Write it back
feed_ref.set(feed)

update_time {
  seconds: 1721647656
  nanos: 341694000
}

In [68]:
# Set the content (eds_games)
content_ref = db.collection('collections').document("eds_games")
content_ref.set({
    'content': collection
})

update_time {
  seconds: 1721647679
  nanos: 957922000
}

In [69]:
# We actually must update cucumber racing too in the feed
cucumber_racing_key = [k for k, v in feed.items() if v['location'] == 'games/Cucumber_Racing.md'][0]
cucumber_racing_key

'2021-07-06 18:39:00'

In [70]:
cucumber_racing = feed.pop(cucumber_racing_key)
feed['2006-07-13 18:29:00'] = cucumber_racing
# Update it
feed_ref.set(feed)

update_time {
  seconds: 1721647939
  nanos: 504804000
}

In [72]:
collection.remove('Cucumber_Racing.md')
collection.insert(0, 'Cucumber_Racing.md')
collection

['Cucumber_Racing.md',
 'Acid_Trip.md',
 'Other_Minds.md',
 'my_very_good_game.md',
 'Silent_Bill_Jam_Version.md',
 'Ben_Was_Assimilated.md',
 'Bad_Control_Game.md',
 'Stoner_Stevie_in_Asstown.md',
 'Remember_Mary.md',
 'Fall_Guys_But_Someone_Else_Made_it_in_1_Hour_and_I_Just_Changed_Some_Things_Which_Probably_Made_it_Worse.md',
 'Monotony_Falling_to_Bitsy_1.md',
 'Beer_Falling_to_Bitsy_2.md',
 'Its_Not_Easy_Being_Green.md',
 'Impostor_Falling_to_Bitsy_3.md',
 'Tennis_Tower.md',
 'Trapped_Inside_My_Own_Head.md',
 'x1F171roken_game_for_game_x1F171reakers_toolkit.md',
 'Justin_Walls_Cooking_Simulator.md',
 'Cat_in_Space.md',
 'Board_Hunting.md',
 'A_Relaxing_Typing_Game.md',
 'Friends_Falling_to_Bitsy_4.md',
 'Carrot_Hell.md',
 'Trash_Cat.md',
 'Type_The_Entire_Of_Ulysses_Thats_it_Thats_the_game.md',
 'Life_Simulator.md',
 'Silent_Bill_2022.md',
 'BalL_DuDe_AdVENtures_TOo.md',
 'Ert_Wurm.md',
 'What_the_Pong.md']

In [73]:
# Write it back
content_ref.set({
    'content': collection
})

update_time {
  seconds: 1721647983
  nanos: 371792000
}

In [79]:
# Update the global feed
feed_ref = db.collection('feed').document('content-log')
feed = feed_ref.get().to_dict()
feed

{'2021-01-03 00:00:00': {'location': 'blogs/Review_of_my_2020_Game_Development.md'},
 '2010-01-09 03:03:18': {'location': 'music/poopremixes.md'},
 '2023-12-09 16:34:00': {'location': 'games/What_the_Pong.md'},
 '2020-06-05 18:17:00': {'location': 'games/Silent_Bill_Jam_Version.md'},
 '2004-08-16 00:00:01': {'location': 'comics/hewligg_urobokkle_2.md'},
 '2023-07-02 11:24:15': {'location': 'videos/Commander_Keen39s_Last_Adventure_Unveiling_the_Lipton_Tea_Connection.md'},
 '2021-07-10 07:38:00': {'location': 'games/Trash_Cat.md'},
 '2019-12-29 13:19:00': {'location': 'games/Other_Minds.md'},
 '2004-10-15 00:00:00': {'location': 'comics/hewligg_urobokkle_38.md'},
 '2022-06-29 16:07:45': {'location': 'videos/Dunnet__The_Secret_Terminal_Game.md'},
 '2004-10-21 00:00:00': {'location': 'comics/hewligg_urobokkle_44.md'},
 '2020-03-22 00:00:00': {'location': 'blogs/Seiklus.md'},
 '2004-08-17 00:00:03': {'location': 'comics/hewligg_urobokkle_13.md'},
 '2023-07-21 18:02:45': {'location': 'comics

In [80]:
# Get the edited date of binary_search.md
file = os.path.join('STAGING', 'blogs', 'binary_search.md')
edited_time = os.path.getmtime(file)
# Format it
edited_time = dt.fromtimestamp(edited_time).strftime("%Y-%m-%d %H:%M:%S")
edited_time

'2024-07-23 08:46:06'

In [81]:
feed[edited_time] = {
    'location': 'blogs/binary_search.md'
}

In [82]:
# Update the feed
feed_ref.set(feed)

update_time {
  seconds: 1721720904
  nanos: 687748000
}

In [83]:
# Update coding heaven content list
coding_heaven = db.collection('collections').document('coding_heaven')
content_list = coding_heaven.get().to_dict()
content_list

{'content': ['hello_world.md',
  'off_by_one.md',
  'fizz_buzz.md',
  'bash_math.md',
  'image_crappifier.md',
  'sql_recursive.md']}

In [84]:
content_list['content'].append('binary_search.md')

In [85]:
# Write it back
coding_heaven.set(content_list)

update_time {
  seconds: 1721720933
  nanos: 503700000
}

In [86]:
# Potentially the last thing is projects - which is annoying
# I have projects.json which has names of the associated html pages used to build the homepage
# I need to copy all the images over and then generate markdown files from them, updating images, generating thumbnails, etc, it'll be a pain
# I love my life
with open(os.path.join('STAGING', 'unprocessed', 'json', 'projects.json'), 'r') as f:
    projects = json.load(f)

projects

{'Data Science Projects': [{'name': 'Horse Racing Predictor',
   'html': 'assets/data/html/horse.html'},
  {'name': 'Reddit Sentiment Analysis',
   'html': 'assets/data/html/redditsentiment.html'},
  {'name': 'Codewars Analysis', 'html': 'assets/data/html/codewars.html'}],
 'Game Development Projects': [{'name': 'Other Minds',
   'html': 'assets/data/html/otherminds.html'},
  {'name': 'Date and Game', 'html': 'assets/data/html/dateandgame.html'}],
 'Web Development Projects': [{'name': 'David Social Command Line Interface',
   'html': 'assets/data/html/david.html'},
  {'name': 'Personal Homepage', 'html': 'assets/data/html/homepage.html'},
  {'name': 'Tombstone Tapes', 'html': 'assets/data/html/tombstone.html'}],
 'Blogs': [{'name': 'Coding Heaven',
   'html': 'assets/data/html/codingheaven.html'},
  {'name': "Edward Atkin's Blog", 'html': 'assets/data/html/edwardatkin.html'},
  {'name': 'Weird Indie Shit',
   'html': 'assets/data/html/weirdindieshit.html'}],
 'Certifications': [{'name

In [87]:
# Pop anything that doesn't have 'Projects' in the key
for k in list(projects.keys()):
    if 'Projects' not in k:
        projects.pop(k)

projects

{'Data Science Projects': [{'name': 'Horse Racing Predictor',
   'html': 'assets/data/html/horse.html'},
  {'name': 'Reddit Sentiment Analysis',
   'html': 'assets/data/html/redditsentiment.html'},
  {'name': 'Codewars Analysis', 'html': 'assets/data/html/codewars.html'}],
 'Game Development Projects': [{'name': 'Other Minds',
   'html': 'assets/data/html/otherminds.html'},
  {'name': 'Date and Game', 'html': 'assets/data/html/dateandgame.html'}],
 'Web Development Projects': [{'name': 'David Social Command Line Interface',
   'html': 'assets/data/html/david.html'},
  {'name': 'Personal Homepage', 'html': 'assets/data/html/homepage.html'},
  {'name': 'Tombstone Tapes', 'html': 'assets/data/html/tombstone.html'}]}

In [96]:
import shutil
from markdownify import markdownify as md
# Cool now we go through and generate markdown files lol gonna have to use bs4 for this üò≠üò≠ I don't wanna copilot ddo it all for me
for v in projects.values():
    for project in v:
        name = project['name']
        file = os.path.join('STAGING', 'unprocessed', 'html', project['html'].split('/')[-1])

        with open(file, 'r') as f:
            content = f.read()

        soup = BeautifulSoup(content, 'html.parser')

        # Div class project-image is the og image so we'll get that and thumbnail it too
        img_src = soup.find('div', class_='project-image').find('img')['src']
        img_filename = img_src.split('/')[-1]
        img_location = os.path.join('STAGING', 'unprocessed', 'images', img_filename)
        img = Image.open(img_location)
        img.thumbnail((256, 256))
        img = img.convert('RGB')
        img.save(os.path.join('STAGING', 'images', f"{img_filename.split('.')[0]}_thumbnail.jpg"), 'JPEG')

        # Also shutil the image over
        shutil.copy(img_location, os.path.join('STAGING', 'images', img_filename))

        # Now the content is in div project-content
        content = soup.find('div', class_='project-content')

        # Markdownify the content
        markdown_content = md(str(content))

        # Get a description
        description = content.find('p').text

        # Generate the markdown file
        clean_name = strip_punctuation(name).replace(' ', '_')

        # Get rid of the stupid close overlay thing
        markdown_content = markdown_content.replace('![Close overlay](assets/images/cross.png "Close overlay")', '')
        while '\n\n\n' in markdown_content:
            markdown_content = markdown_content.replace('\n\n\n', '\n\n')

        markdown = f"""---
date: INSERT DATE
title: {name}
description: {description}
author: Ed
tags: ['Projects', 'Portfolio', 'Programming']
type: project
thumbnail: /assets/images/{img_filename.split('.')[0]}_thumbnail.jpg
og_title: {name}
og_description: {description}
og_image: /assets/images/{img_filename}
og_type: article
collection: Projects
---
![{name}](/assets/images/{img_filename})

{markdown_content}"""

        with open(os.path.join('STAGING', 'projects', f"{clean_name}.md"), 'w') as f:
            f.write(markdown)

        print(f"Generated markdown for {name}")


Generated markdown for Horse Racing Predictor
Generated markdown for Reddit Sentiment Analysis
Generated markdown for Codewars Analysis
Generated markdown for Other Minds
Generated markdown for Date and Game
Generated markdown for David Social Command Line Interface
Generated markdown for Personal Homepage
Generated markdown for Tombstone Tapes


In [None]:
# Have to manually set the datetimes through github commits cause cba to scrape the dates or whatever
feed = {
    '2023-04-09 12:08:00': {
        'location': 'projects/Tombstone_Tapes.md'
    },
    '2023-04-22 12:22:00': {
        'location': 'projects/Codewars_Analysis.md'
    },
    '2022-04-01 00:00:00': {
        'location': 'projects/Date_and_Game.md'
    },
    '2023-12-16 12:45:00': {
        'location': 'projects/David_Social_Command_Line_Interface.md'
    },
    '2023-05-22 00:00:00': {
        'location': 'projects/Horse_Racing_Predictor.md'
    },
    '2019-12-29 00:00:01': {
        'location': 'projects/Other_Minds.md'
    },
    '2023-05-17 15:02:00': {
        'location': 'projects/Personal_Homepage.md'
    },
    '2023-09-08 17:38:00': {
        'location': 'projects/Reddit_Sentiment_Analysis.md'
    }
}