In [5]:
import os
import re
from tqdm import tqdm
import pandas as pd
import json

In [6]:
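# One-time download of the catalogue; -O saves it under the remote file name,
# so no extra output-name argument is needed (curl would treat it as a second URL).
# !curl -O https://hugovk.github.io/gutenberg-metadata/gutenberg-metadata.json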

In [8]:
# Load the whole catalogue into memory. The context manager closes the file
# handle as soon as parsing finishes, and the explicit encoding keeps the
# result independent of the platform's default text encoding.
with open('gutenberg-metadata.json', encoding='utf-8') as fh:
    metadata = json.load(fh)

In [9]:
# Number of entries in the loaded catalogue.
len(metadata)

68502

In [10]:
# Dict view over every catalogue key (keys are strings such as '1').
# NOTE(review): `keys` is not read again in the visible cells — confirm it is still needed.
keys = metadata.keys()

In [11]:
# Inspect one raw entry to see the schema; in this entry every field maps to a list of strings.
metadata['1']

{'author': ['Jefferson, Thomas'],
 'formaturi': ['https://www.gutenberg.org/files/1/1-0.txt',
  'https://www.gutenberg.org/ebooks/1.kindle.noimages',
  'https://www.gutenberg.org/cache/epub/1/pg1.cover.small.jpg',
  'https://www.gutenberg.org/ebooks/1.kindle.images',
  'https://www.gutenberg.org/files/1/1-h/1-h.htm',
  'https://www.gutenberg.org/ebooks/1.epub.images',
  'https://www.gutenberg.org/cache/epub/1/pg1.cover.medium.jpg',
  'https://www.gutenberg.org/files/1/1-0.zip',
  'https://www.gutenberg.org/ebooks/1.epub.noimages',
  'https://www.gutenberg.org/ebooks/1.rdf',
  'https://www.gutenberg.org/ebooks/1.html.images',
  'https://www.gutenberg.org/files/1/1-h.zip',
  'https://www.gutenberg.org/ebooks/1.txt.utf-8'],
 'language': ['en'],
 'rights': ['Public domain in the USA.'],
 'subject': ['United States -- History -- Revolution, 1775-1783 -- Sources',
  'United States. Declaration of Independence',
  'JK',
  'E201'],
 'title': ['The Declaration of Independence of the United Stat

In [12]:
# Flatten each catalogue entry into one flat record for pandas: the entry's key
# becomes "id" and every list-valued field is collapsed into a single
# ", "-separated string (a missing field becomes an empty string).
# Note: individual values can themselves contain ", " (e.g. 'Jefferson, Thomas'),
# so the joined strings are meant for display, not for splitting back apart.
_LIST_FIELDS = ("author", "formaturi", "language", "rights", "subject", "title")

metadata_records = [
    {
        "id": book_id,
        **{field: ", ".join(entry.get(field, [])) for field in _LIST_FIELDS},
    }
    for book_id, entry in metadata.items()
]

In [13]:
# Create a Pandas DataFrame in one step from the list of per-entry records
# (one row per record, one column per record key).
metadata_df = pd.DataFrame(metadata_records)

In [14]:
# Show the frame; the notebook renders a truncated first/last-rows preview for a frame this size.
metadata_df

Unnamed: 0,id,author,formaturi,language,rights,subject,title
0,1,"Jefferson, Thomas","https://www.gutenberg.org/files/1/1-0.txt, htt...",en,Public domain in the USA.,"United States -- History -- Revolution, 1775-1...",The Declaration of Independence of the United ...
1,2,United States,https://www.gutenberg.org/cache/epub/2/pg2.cov...,en,Public domain in the USA.,"JK, KF, Civil rights -- United States -- Sourc...",The United States Bill of Rights\r\nThe Ten Or...
2,3,"Kennedy, John F. (John Fitzgerald)",https://www.gutenberg.org/ebooks/3.html.images...,en,Public domain in the USA.,Presidents -- United States -- Inaugural addre...,John F. Kennedy's Inaugural Address
3,4,"Lincoln, Abraham",https://www.gutenberg.org/cache/epub/4/pg4.cov...,en,Public domain in the USA.,"Soldiers' National Cemetery (Gettysburg, Pa.),...",Lincoln's Gettysburg Address\r\nGiven November...
4,5,United States,https://www.gutenberg.org/cache/epub/5/pg5.cov...,en,Public domain in the USA.,United States -- Politics and government -- 17...,The United States Constitution
...,...,...,...,...,...,...,...
68497,68498,"Burtis, Thomas",https://www.gutenberg.org/files/68498/68498-h/...,en,Public domain in the USA.,,The sky sheriff: The pioneer spirit lives agai...
68498,68499,"Carter, Nicholas (House name)",https://www.gutenberg.org/files/68499/68499-0....,en,Public domain in the USA.,,"Nick Carter Stories No. 143, The sultan's pear..."
68499,68500,,,,,,
68500,68501,,,,,,


# Download a sample of the books

In [17]:
import os
import requests
from pathlib import Path
import json

In [18]:
# Load the catalogue for the download section. The context manager closes the
# file handle once parsing is done, and the explicit encoding avoids relying on
# the platform's default text encoding.
with open('gutenberg-metadata.json', encoding='utf-8') as fh:
    metadata = json.load(fh)

In [19]:
# Function to download a file
def download_file(url, output_dir, timeout=30):
    """Stream ``url`` into ``output_dir`` and return the path of the saved file.

    The file is named after the text following the last ``/`` in ``url``.
    Data is written to a sibling ``<name>.part`` file first and only renamed to
    the final name once the whole body has arrived, so an interrupted transfer
    never leaves a truncated file that looks complete.

    Args:
        url: Address to fetch.
        output_dir: Existing directory to write into (``str`` or ``Path``).
        timeout: Seconds passed to ``requests.get`` as its ``timeout``; without
            it a server that stops responding would block the call forever.

    Returns:
        ``Path`` of the downloaded file inside ``output_dir``.

    Raises:
        ValueError: if ``url`` ends with ``/`` and therefore has no file name.
        Exception: whatever ``requests`` raises for connection problems or
            error statuses (``raise_for_status``), and ``OSError`` for write
            failures; these propagate unchanged.
    """
    output_dir = Path(output_dir)

    file_name = url.split('/')[-1]
    if not file_name:
        # Without this guard the target would be the directory itself.
        raise ValueError(f"Cannot derive a file name from URL: {url!r}")

    local_filename = output_dir / file_name
    partial_path = output_dir / (file_name + '.part')

    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Check the status before creating any file on disk.
            response.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        # Only a fully received body is published under the final name.
        partial_path.replace(local_filename)
    finally:
        # After a successful rename this is a no-op; after a failure it removes
        # the incomplete download.
        if partial_path.exists():
            partial_path.unlink()

    return local_filename

In [21]:
# Root folder for everything that gets downloaded; it is created if missing
# and left as-is when it already exists.
output_dir = Path("gutenberg_dataset")
output_dir.mkdir(exist_ok=True, parents=True)

In [25]:
# Walk through a single book by hand before looping over many.
key = 1
book_data = metadata.get(str(key), {})

# Get metadata. A field may be present but hold an empty list, in which case
# `.get(name, default)[0]` would raise IndexError; fall back to the placeholder
# whenever there is no first element (same rule download_subset applies).
title = (book_data.get("title") or ["Unknown Title"])[0]
author = (book_data.get("author") or ["Unknown Author"])[0]
formaturi = book_data.get("formaturi", [])

# One folder per book: "<key>_<title with spaces replaced by underscores>".
book_dir = output_dir / f"{key}_{title.replace(' ', '_')}"
book_dir.mkdir(parents=True, exist_ok=True)

In [34]:
# Use the last listed format for this book and preview where it will be saved:
# the file name is whatever follows the final "/" of the URL.
# NOTE(review): this depends on the order of `formaturi`; for key 1 the last
# item is the plain-text link, but the order differs between entries — confirm
# before reusing this for other keys.
url = formaturi[-1]
local_filename = book_dir / url.rsplit('/', 1)[-1]
local_filename

PosixPath('gutenberg_dataset/1_The_Declaration_of_Independence_of_the_United_States_of_America/1.txt.utf-8')

In [35]:
# Fetch the selected URL into the book's folder; the displayed value is the local path returned by download_file.
download_file(url, book_dir)

PosixPath('gutenberg_dataset/1_The_Declaration_of_Independence_of_the_United_States_of_America/1.txt.utf-8')

# Run multiple downloads

In [38]:
import tqdm

In [44]:
# Download every listed format for each requested catalogue key.
def _safe_dir_name(key, title, max_bytes=255):
    """Return ``<key>_<title>`` reduced to a single, valid directory name."""
    # Spaces become underscores as before; path separators and control
    # characters (titles can contain "\r\n") are neutralised the same way so
    # the title can never spill into nested or malformed paths.
    cleaned = "".join(
        "_" if (ch.isspace() or ch in "/\\" or ord(ch) < 32) else ch
        for ch in title
    )
    name = f"{key}_{cleaned}"
    # Cut by encoded size so very long titles stay within common file-name
    # limits; "ignore" drops a multi-byte character split by the cut.
    return name.encode("utf-8")[:max_bytes].decode("utf-8", "ignore")


def download_subset(metadata, output_dir, keys):
    """Download all ``formaturi`` files for the given catalogue keys.

    For each key a folder ``<key>_<sanitised title>`` is created under
    ``output_dir`` and every URL listed for that entry is fetched into it with
    ``download_file``. A failure on one URL is printed and skipped so the
    remaining downloads still run.

    Args:
        metadata: Mapping from string key to an entry dict whose "title",
            "author" and "formaturi" fields are lists of strings.
        output_dir: Root directory for the downloads (``str`` or ``Path``);
            created if it does not exist.
        keys: Iterable of keys to process; each is looked up as ``str(key)``.
            Unknown keys get placeholder title/author and no downloads.

    Returns:
        None.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    for key in tqdm.tqdm(keys):
        # `or {}` also covers a key that is present but maps to nothing.
        book_data = metadata.get(str(key)) or {}

        # Fields can be missing or empty lists; both fall back to a placeholder.
        title = (book_data.get("title") or ["Unknown Title"])[0]
        author = (book_data.get("author") or ["Unknown Author"])[0]
        formaturi = book_data.get("formaturi") or []

        print(f"Downloading key {key}: {title} by {author}")

        book_dir = output_dir / _safe_dir_name(key, title)
        book_dir.mkdir(parents=True, exist_ok=True)

        for url in formaturi:
            try:
                print(f"  Downloading {url}")
                download_file(url, book_dir)
            except Exception as e:
                # Best effort: some listed URLs no longer exist (404), and one
                # bad link should not abort the rest of the batch.
                print(f"  Failed to download {url}: {e}")

In [45]:
# Define keys and output directory
keys_to_download = range(7, 11)  # Keys 7 through 10 (the range end is exclusive)
output_directory = "gutenberg_dataset"

In [46]:
# Run the download for the selected keys; URLs that fail are reported in the
# printed log and skipped rather than stopping the run.
download_subset(metadata, output_directory, keys_to_download)

  0%|          | 0/4 [00:00<?, ?it/s]

Downloading key 7: The Mayflower Compact by Unknown Author
  Downloading https://www.gutenberg.org/ebooks/7.rdf
  Downloading https://www.gutenberg.org/ebooks/7.epub.noimages
  Downloading https://www.gutenberg.org/ebooks/7.epub.images
  Downloading https://www.gutenberg.org/ebooks/7.kindle.noimages
  Downloading https://www.gutenberg.org/ebooks/7.kindle.images
  Downloading https://www.gutenberg.org/files/7/7-h.zip
  Failed to download https://www.gutenberg.org/files/7/7-h.zip: 404 Client Error: Not Found for url: https://www.gutenberg.org/files/7/7-h.zip
  Downloading https://www.gutenberg.org/cache/epub/7/pg7.cover.medium.jpg
  Downloading https://www.gutenberg.org/files/7/7.txt
  Failed to download https://www.gutenberg.org/files/7/7.txt: 404 Client Error: Not Found for url: https://www.gutenberg.org/files/7/7.txt
  Downloading https://www.gutenberg.org/files/7/7-h/7-h.htm
  Downloading https://www.gutenberg.org/cache/epub/7/pg7.cover.small.jpg
  Downloading https://www.gutenberg.o

 25%|██▌       | 1/4 [00:07<00:22,  7.43s/it]

Downloading key 8: Abraham Lincoln's Second Inaugural Address by Lincoln, Abraham
  Downloading https://www.gutenberg.org/files/8/8.zip
  Downloading https://www.gutenberg.org/ebooks/8.rdf
  Downloading https://www.gutenberg.org/ebooks/8.html.images
  Downloading https://www.gutenberg.org/files/8/8-h/8-h.htm
  Downloading https://www.gutenberg.org/ebooks/8.kindle.images
  Downloading https://www.gutenberg.org/files/8/8-h.zip
  Downloading https://www.gutenberg.org/ebooks/8.epub.noimages
  Downloading https://www.gutenberg.org/cache/epub/8/pg8.cover.small.jpg
  Downloading https://www.gutenberg.org/ebooks/8.kindle.noimages
  Downloading https://www.gutenberg.org/ebooks/8.epub.images
  Downloading https://www.gutenberg.org/files/8/8.txt
  Downloading https://www.gutenberg.org/ebooks/8.txt.utf-8
  Downloading https://www.gutenberg.org/cache/epub/8/pg8.cover.medium.jpg


 50%|█████     | 2/4 [00:15<00:15,  7.61s/it]

Downloading key 9: Abraham Lincoln's First Inaugural Address by Lincoln, Abraham
  Downloading https://www.gutenberg.org/files/9/9-h/9-h.htm
  Downloading https://www.gutenberg.org/cache/epub/9/pg9.cover.small.jpg
  Downloading https://www.gutenberg.org/files/9/9-h.zip
  Failed to download https://www.gutenberg.org/files/9/9-h.zip: 404 Client Error: Not Found for url: https://www.gutenberg.org/files/9/9-h.zip
  Downloading https://www.gutenberg.org/ebooks/9.rdf
  Downloading https://www.gutenberg.org/cache/epub/9/pg9.cover.medium.jpg
  Downloading https://www.gutenberg.org/ebooks/9.kindle.images
  Downloading https://www.gutenberg.org/files/9/9.txt
  Failed to download https://www.gutenberg.org/files/9/9.txt: 404 Client Error: Not Found for url: https://www.gutenberg.org/files/9/9.txt
  Downloading https://www.gutenberg.org/ebooks/9.epub.images
  Downloading https://www.gutenberg.org/ebooks/9.txt.utf-8
  Downloading https://www.gutenberg.org/ebooks/9.kindle.noimages
  Downloading https

 75%|███████▌  | 3/4 [00:22<00:07,  7.68s/it]

Downloading key 10: The King James Version of the Bible by Unknown Author
  Downloading https://www.gutenberg.org/files/10/10-h/10-h.htm
  Downloading https://www.gutenberg.org/ebooks/10.html.images
  Downloading https://www.gutenberg.org/ebooks/10.epub.noimages
  Downloading https://www.gutenberg.org/files/10/10-0.zip
  Failed to download https://www.gutenberg.org/files/10/10-0.zip: 404 Client Error: Not Found for url: https://www.gutenberg.org/files/10/10-0.zip
  Downloading https://www.gutenberg.org/ebooks/10.epub.images
  Downloading https://www.gutenberg.org/cache/epub/10/pg10.cover.medium.jpg
  Downloading https://www.gutenberg.org/ebooks/10.rdf
  Downloading https://www.gutenberg.org/files/10/10-0.txt
  Downloading https://www.gutenberg.org/cache/epub/10/pg10.cover.small.jpg
  Downloading https://www.gutenberg.org/ebooks/10.kindle.noimages
  Downloading https://www.gutenberg.org/ebooks/10.txt.utf-8
  Downloading https://www.gutenberg.org/ebooks/10.kindle.images
  Downloading htt

100%|██████████| 4/4 [00:35<00:00,  8.78s/it]

  Failed to download https://www.gutenberg.org/files/10/10-h.zip: 404 Client Error: Not Found for url: https://www.gutenberg.org/files/10/10-h.zip





# Move to local folder

In [None]:
# mv gutenberg_dataset/* /home/david/Documents/data_science/datasets