In [1]:
# Load data

import json

# Path to the scraped JSON for a single meme template.
FILENAME = './scraped_json/drake_hotline_bling.json'

# Names populated by this cell and read by later cells.
meme_name: str
template_url: str
meme_data: list  # list of (url, text) tuples, one per scraped meme

# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale encoding, which can fail or mis-decode non-ASCII meme text.
with open(FILENAME, encoding="utf-8") as f:
    data = json.load(f)

# The file handle is no longer needed once the JSON is parsed, so the
# field extraction happens outside the `with` block.
meme_name = data['name']
template_url = data["template"]
meme_data = [(meme['url'], meme['text']) for meme in data['memes']]

print("Total memes:", len(meme_data))
print(meme_data[:5])


Total memes: 695
[('i.imgflip.com/5amwr6.jpg', 'HOW SUPERMARKETS THINKS WE ARE; $10; $9.99'), ('i.imgflip.com/5ybql2.jpg', 'Elon Musk; Eel on Musk'), ('i.imgflip.com/587wks.jpg', 'happy May 4th! May the 4th be with you'), ('i.imgflip.com/5b0yw1.jpg', 'making a meme with words and a meaning'), ('i.imgflip.com/6bczfl.jpg', 'when you realize downvoting does nothing; when you immediately realize upvoting images gets you points and also makes the creator happy')]


In [2]:
# Clean data

# Number of ";"-separated text blocks a meme must have to be kept.
NUM_TEXT_BLOCKS = 2
cleaned_data = []
for url, text in meme_data:
    # Memes scraped without any caption carry a placeholder text of the form
    # "image tagged in [categories]"; those are not usable samples.
    is_placeholder = "image tagged in" in text
    # k separators always split a string into k + 1 blocks.
    block_count = text.count(";") + 1
    if not is_placeholder and block_count == NUM_TEXT_BLOCKS:
        cleaned_data.append((url, text))

removed_count = len(meme_data) - len(cleaned_data)
print("Removed", removed_count, "memes")
print("Memes after cleaning:", len(cleaned_data))

Removed 154 memes
Memes after cleaning: 541


In [3]:
# Download and scale template

import requests
import io
from PIL import Image

# Download template
# Edge length (width, height) the template is resized to.
TEMPLATE_SIZE = (100, 100)
# Seconds to wait on the server before giving up instead of hanging the cell.
REQUEST_TIMEOUT_S = 30

print("Getting template from", template_url)
r = requests.get(template_url, timeout=REQUEST_TIMEOUT_S)
# Stop on a 4xx/5xx response rather than handing an error page to PIL,
# which would otherwise surface as an unrelated "cannot identify image" error.
r.raise_for_status()
# `.content` is the complete body with any Content-Encoding already decoded
# and the connection released; reading `r.raw` directly skips both.
img_bytes = r.content

# Scale to size
image = Image.open(io.BytesIO(img_bytes))
print("Image size before:", image.size)
# resize() returns a new image; `image` keeps the original dimensions.
scaled = image.resize(TEMPLATE_SIZE)
print("Image size after:", scaled.size)

Getting template from https://imgflip.com/s/meme/Drake-Hotline-Bling.jpg
Image size before: (1200, 1200)
Image size after: (100, 100)


In [7]:
# Export all data to folder

import os

# Folder name is the meme name with each run of whitespace replaced by "_".
output_folder = "_".join(meme_name.split())
# exist_ok=True makes re-running this cell safe and avoids the gap between
# checking for the directory and creating it.
os.makedirs(output_folder, exist_ok=True)

# JPEG cannot store alpha or palette images, so anything outside the modes it
# accepts is converted to RGB first; otherwise save() raises for such templates.
jpeg_ready = scaled if scaled.mode in ("L", "RGB", "CMYK") else scaled.convert("RGB")
jpeg_ready.save(os.path.join(output_folder, "image.jpg"))

# cleaned_data is a list of (url, text) tuples; json writes each tuple as an array.
with open(os.path.join(output_folder, "data.json"), "w", encoding="utf-8") as f:
    json.dump(cleaned_data, f)
