In [None]:
#@title Tag Scraper
#@markdown ### ⚙️ Core Settings
#@markdown Enter the desired settings below. The script will use these values when you run the cell.
# ".csv" is appended further down when it is missing from the name.
Output_Filename = "danbooru2026.csv" #@param {type:"string"}
# Paging stops at the first tag whose post_count is below this value.
Minimum_Tag_Count = 20 #@param {type:"integer"}
# Any value other than the first two options selects both boards.
Boards_to_Scrape = "Danbooru only" #@param ["Danbooru only", "e621 only", "Both (merged)"]
# Parsed as YYYY-MM-DD; if parsing fails the script falls back to the current
# date/time. Danbooru tags created after this date are renamed to an older
# alias when one exists (see backdate()).
Cutoff_Date = "2025-09-27" #@param {type:"date"}

#@markdown ---
#@markdown ### 📝 Formatting Options
# When enabled, '_' becomes '-' in tag names and aliases, except for tags
# listed in `kaomojis`.
Replace_Underscore_with_Dash = False #@param {type:"boolean"}

#@markdown ---
#@markdown ### 🚫 Category Exclusions
#@markdown Check the boxes for any tag categories you wish to exclude from the final list.
# Each flag below is later translated into a numeric category code
# (general=0, artist=1, copyright=3, character=4, post=5).
Exclude_General = False #@param {type:"boolean"}
Exclude_Artist = False #@param {type:"boolean"}
Exclude_Copyright = False #@param {type:"boolean"}
Exclude_Character = False #@param {type:"boolean"}
Exclude_Post = False #@param {type:"boolean"}

# Step 1: Import necessary libraries
import os
import requests
import collections
import csv
import time
import datetime
from google.colab import files

# Step 2: Translate the Colab form values into the scraper's internal settings
csv_filename = Output_Filename
minimum_count = Minimum_Tag_Count
date = Cutoff_Date

# The rest of the script expects a 'y'/'n' flag rather than a boolean.
dashes = 'n'
if Replace_Underscore_with_Dash:
    dashes = 'y'

# One letter per board: 'd' = Danbooru, 'e' = e621; anything else means both.
boards = {"Danbooru only": 'd', "e621 only": 'e'}.get(Boards_to_Scrape, 'de')

# Names of the categories whose checkbox is ticked, in form order.
exclude = [
    category
    for category, ticked in (
        ('general', Exclude_General),
        ('artist', Exclude_Artist),
        ('copyright', Exclude_Copyright),
        ('character', Exclude_Character),
        ('post', Exclude_Post),
    )
    if ticked
]


# Step 3: The scraper logic
print("🚀 Starting scraper with the specified settings...")


class Complete(Exception):
    """Raised inside a paging loop to stop once post counts drop below the minimum."""


# Parse the cutoff date from the form. Only the failures that parsing itself can
# produce are caught (bad format, or a non-string value), so an interrupt or an
# unrelated bug is no longer silently turned into "use today's date".
try:
    max_date = datetime.datetime.strptime(date.strip()[:10], "%Y-%m-%d")
    print(f"Using cutoff date: {max_date}")
except (ValueError, AttributeError, TypeError):
    max_date = datetime.datetime.now()
    print(f"Using today's date: {max_date}")

# Concatenate the numeric category codes of every excluded category into one
# string; the scraping loops test `str(item['category']) in excluded`.
# NOTE(review): these codes look like Danbooru's numbering; e621 appears to use
# different meanings for some codes (e.g. 5) - confirm before relying on
# exclusions for e621.
_category_codes = (
    ("general", "0"),
    ("artist", "1"),
    ("copyright", "3"),
    ("character", "4"),
    ("post", "5"),
)
excluded = "".join(code for name, code in _category_codes if name in exclude)

# Emoticon-style tag names; their underscores are kept as-is when the
# "replace '_' with '-'" option rewrites the CSV.
kaomojis = [
    "0_0", "(o)_(o)", "+_+", "+_-", "._.",
    "<o>_<o>", "<|>_<|>", "=_=", ">_<", "3_3",
    "6_9", ">_o", "@_@", "^_^", "o_o",
    "u_u", "x_x", "|_|", "||_||",
]

# Ensure the output name carries a .csv extension. This checks the suffix
# (case-insensitively) rather than searching for '.csv' anywhere in the name,
# so names such as "data.csv_old" still receive the extension.
if not csv_filename.lower().endswith('.csv'):
    csv_filename += '.csv'

# When dashes are requested the raw data goes to a temporary file first; the
# rewritten copy is saved under the real name afterwards.
temp_csv_filename = csv_filename
if dashes == 'y':
    temp_csv_filename += '-temp'

# Both boards expose the same query strings; only the host differs.
_DANBOORU_HOST = 'https://danbooru.donmai.us'
_E621_HOST = 'https://e621.net'
# Tag listing: 1000 per page, non-empty and non-deprecated, ordered by count.
_TAG_QUERY = 'tags.json?limit=1000&search[hide_empty]=yes&search[is_deprecated]=no&search[order]=count'
# Alias listing: 1000 per page, ordered by tag count.
_ALIAS_QUERY = 'tag_aliases.json?commit=Search&limit=1000&search[order]=tag_count'

base_url = f'{_DANBOORU_HOST}/{_TAG_QUERY}'
alias_url = f'{_DANBOORU_HOST}/{_ALIAS_QUERY}'
e6_base_url = f'{_E621_HOST}/{_TAG_QUERY}'
e6_alias_url = f'{_E621_HOST}/{_ALIAS_QUERY}'

# One shared HTTP session for every request in this cell, so each call sends
# the same custom User-Agent header.
session = requests.Session()
session.headers.update({"User-Agent": "Colab-Tag-Scraper/1.0"})

def backdate(tags, aliases, date):
    """Rewind *tags* in place to the names they had on the cutoff *date*.

    Args:
        tags: dict mapping tag name -> ``[category, post_count, created_at]``.
            Mutated in place: entries may be re-keyed and each entry that has
            an alias record gains a fourth element, a comma-joined alias string.
        aliases: dict mapping tag name -> list of ``[alias_name, created_at]``
            pairs. Not modified.
        date: ``datetime.datetime`` cutoff. Only the first 10 characters
            (``YYYY-MM-DD``) of each ``created_at`` string are compared to it.

    Returns:
        None.

    Raises:
        ValueError: if a ``created_at`` value does not start with YYYY-MM-DD.
    """
    print(f"Clearing older aliases...")

    def parse_day(stamp):
        # Timestamps are compared at day resolution only.
        return datetime.datetime.strptime(stamp[:10], "%Y-%m-%d")

    # Keep only the alias names that already existed on the cutoff date.
    filtered_aliases = {
        key: [entry[0] for entry in entries if parse_day(entry[1]) <= date]
        for key, entries in aliases.items()
    }

    # A tag created after the cutoff is renamed to its first surviving alias.
    # Tags without such an alias are left under their current name.
    for key in list(tags):
        if parse_day(tags[key][2]) <= date:
            continue
        older_names = filtered_aliases.get(key)
        if not older_names:
            continue
        new_key = older_names.pop(0)
        tags[new_key] = tags.pop(key)

    # Attach the remaining alias names to every tag still present under the
    # alias record's key. NOTE: a tag renamed above is no longer found under
    # its old key, so its leftover aliases are not attached (unchanged from
    # the previous behaviour).
    for key, names in filtered_aliases.items():
        if key in tags:
            tags[key].append(",".join(names))

def get_aliases(url, type, max_retries=5, timeout=30):
    """Download tag aliases page by page and group them by their target tag.

    Args:
        url: Alias endpoint URL; ``&page=N`` is appended for pages 1..1000.
        type: Board code. For ``"e"`` paging stops at the first alias whose
            ``post_count`` is below the module-level ``minimum_count``.
            (The name shadows the builtin but is kept for existing callers.)
        max_retries: Attempts per page before giving up. Previously a page was
            retried forever, so a persistent error status hung the cell.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        ``collections.defaultdict(list)`` mapping ``consequent_name`` to a list
        of ``[antecedent_name, created_at]`` pairs. If a page keeps failing,
        the aliases collected up to that page are returned.
    """
    aliases = collections.defaultdict(list)
    try:
        for page in range(1, 1001):
            full_url = f'{url}&page={page}'
            for _ in range(max_retries):
                try:
                    response = session.get(full_url, timeout=timeout)
                    if response.status_code == 200:
                        data = response.json()
                        break
                    print(f"Couldn't reach server, Status: {response.status_code}. Retrying in 5s")
                    wait = 5
                except (requests.exceptions.RequestException, ValueError) as e:
                    # ValueError covers a 200 response whose body is not valid JSON.
                    print(f"Request failed: {e}. Retrying in 10s")
                    wait = 10
                time.sleep(wait)
            else:
                # Every attempt failed: stop paging instead of looping forever.
                print(f"Giving up on alias page {page} after {max_retries} attempts; "
                      "continuing with the aliases collected so far.")
                break
            if not data:
                print(f'No more alias data found at page {page}.')
                break
            for item in data:
                # Only the "e" board applies the post-count threshold here.
                if type == "e" and int(item['post_count']) < int(minimum_count):
                    raise Complete
                aliases[item['consequent_name']].append(
                    [item['antecedent_name'], item['created_at']]
                )
            print(f'Page {page} aliases processed.', flush=True)
            time.sleep(0.1)
    except Complete:
        print("Reached the post threshold for aliases.")
    return aliases

def scrape_tags(url, label, delay, max_retries=3, timeout=30):
    """Collect tags from one board, page by page, until counts drop too low.

    Args:
        url: Tag endpoint URL; ``&page=N`` is appended for pages 1..1000.
        label: Board name used in progress messages ("Danbooru" or "e621").
        delay: Seconds to sleep after each successfully processed page.
        max_retries: Attempts per page. A failing page is retried instead of
            being skipped after a single failure, which silently dropped the
            whole page of tags.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        dict mapping tag name -> ``[category, post_count, created_at]`` for
        every tag whose category code is not in the module-level ``excluded``
        string. Collection ends at the first tag whose ``post_count`` is below
        the module-level ``minimum_count``.
    """
    collected = {}
    threshold = int(minimum_count)
    for page in range(1, 1001):
        full_url = f'{url}&page={page}'
        data = None
        for _ in range(max_retries):
            try:
                response = session.get(full_url, timeout=timeout)
                if response.status_code == 200:
                    data = response.json()
                    break
                print(f'Failed to fetch {label} page {page}. Status: {response.status_code}')
            except (requests.exceptions.RequestException, ValueError) as e:
                # ValueError covers a 200 response whose body is not valid JSON.
                print(f'Request for {label} page {page} failed: {e}')
            time.sleep(5)
        if data is None:
            # Best effort: move on, but say which page is missing from the output.
            print(f'Skipping {label} page {page} after {max_retries} failed attempts.')
            continue
        if not data:
            print(f'No more {label} data found at page {page}.')
            break
        for item in data:
            if int(item['post_count']) < threshold:
                print(f'All {label} tags with {minimum_count} posts or more have been scraped.')
                return collected
            if str(item['category']) not in excluded:
                collected[item['name']] = [item['category'], item['post_count'], item['created_at']]
        print(f'{label} page {page} processed.', flush=True)
        time.sleep(delay)
    return collected


dan_tags = {}
if "d" in boards:
    print("\n--- Scraping Danbooru ---")
    dan_tags = scrape_tags(base_url, "Danbooru", 0.1)

    # Attach aliases and roll tag names back to the cutoff date.
    dan_aliases = get_aliases(alias_url, "d")
    backdate(dan_tags, dan_aliases, max_date)

e6_tags = {}
if "e" in boards:
    print("\n--- Scraping e621 ---")
    e6_tags = scrape_tags(e6_base_url, "e621", 1)  # e621 requires a 1-second delay
    # NOTE(review): no alias download or backdate step is run for e621, although
    # e6_alias_url and the "e" branch of get_aliases exist - confirm whether
    # that omission is intentional.

# Pick the final tag table: a merge when both boards were scraped, otherwise
# whichever single board was requested.
want_danbooru = "d" in boards
want_e621 = "e" in boards

if want_danbooru and want_e621:
    print("\nMerging Danbooru and e621 lists...")
    for name, dan_entry in dan_tags.items():
        e6_entry = e6_tags.get(name)
        if e6_entry is None:
            # Tag exists only on Danbooru: carry its entry over unchanged.
            e6_tags[name] = dan_entry
        else:
            # Tag exists on both boards: keep the e621 entry and add the
            # Danbooru post count. NOTE(review): any alias string on the
            # Danbooru entry is not copied across in this case.
            e6_entry[1] += dan_entry[1]
    full_tags = e6_tags
elif want_danbooru:
    full_tags = dan_tags
else:
    full_tags = e6_tags

# Dump every collected tag as one CSV row: name, category, count, aliases.
print(f"\nWriting data to {temp_csv_filename}...")
with open(temp_csv_filename, mode='w', newline='', encoding='utf-8') as out_file:
    csv_out = csv.writer(out_file)
    csv_out.writerow(['tag', 'category', 'post_count', 'aliases'])
    for tag_name, fields in full_tags.items():
        # Entries only have a fourth element when an alias string was attached.
        alias_column = fields[3] if len(fields) > 3 else ''
        csv_out.writerow([tag_name, fields[0], fields[1], alias_column])

# Optional second pass: copy the temporary CSV to the real file name with
# underscores turned into dashes, then hand the result to the browser.
if dashes == 'y':
    final_filename = csv_filename
    print(f"Replacing '_' with '-' and saving to {final_filename}...")
    with open(temp_csv_filename, 'r', encoding='utf-8') as source, \
            open(final_filename, 'w', encoding='utf-8', newline='') as target:
        rows_in = csv.reader(source)
        rows_out = csv.writer(target)
        next(rows_in)  # drop the header that was read; a fresh one is written
        rows_out.writerow(['tag', 'category', 'post_count', 'aliases'])
        for row in rows_in:
            # Kaomoji tags are copied untouched, including their alias column.
            if row[0] not in kaomojis:
                row[0] = row[0].replace("_", "-")
                if len(row) > 3:
                    row[3] = row[3].replace("_", "-")
            rows_out.writerow(row)
    os.remove(temp_csv_filename)
else:
    final_filename = temp_csv_filename

print(f"\n✅ Success! Data has been written to {final_filename}")
print("Starting download...")
files.download(final_filename)

🚀 Starting scraper with the specified settings...
Using cutoff date: 2025-09-27 00:00:00

--- Scraping Danbooru ---
Danbooru page 1 processed.
Danbooru page 2 processed.
Danbooru page 3 processed.
Danbooru page 4 processed.
Danbooru page 5 processed.
Danbooru page 6 processed.
Danbooru page 7 processed.
Danbooru page 8 processed.
Danbooru page 9 processed.
Danbooru page 10 processed.
Danbooru page 11 processed.
Danbooru page 12 processed.
Danbooru page 13 processed.
Danbooru page 14 processed.
Danbooru page 15 processed.
Danbooru page 16 processed.
Danbooru page 17 processed.
Danbooru page 18 processed.
Danbooru page 19 processed.
Danbooru page 20 processed.
Danbooru page 21 processed.
Danbooru page 22 processed.
Danbooru page 23 processed.
Danbooru page 24 processed.
Danbooru page 25 processed.
Danbooru page 26 processed.
Danbooru page 27 processed.
Danbooru page 28 processed.
Danbooru page 29 processed.
Danbooru page 30 processed.
Danbooru page 31 processed.
Danbooru page 32 processe