In [10]:
import http.client
import requests
from datetime import datetime
import json
import os
import re
from collections import defaultdict
import string
import nltk
from nltk.corpus import stopwords
import sqlite3

In [None]:
def extract_book_info(data):
    """
    Parse the API response data (assumed to be JSON) and extract book metadata.

    Args:
        data (str): JSON string from the API response.

    Returns:
        list: List of dictionaries containing book metadata.
    """
    books = json.loads(data)

    if isinstance(books, dict) and 'results' in books:
        books = books['results']

    book_info = []
    for book in books: 
        book_id = book.get('id')
        title = book.get('title')
        authors = [author.get('name') for author in book.get('authors', [])]
        subjects = book.get('subjects', [])
        bookshelves = book.get('bookshelves', [])
        book_info.append({
            'id': book_id,
            'title': title,
            'authors': authors,
            'subjects': subjects,
            'bookshelves': bookshelves
        })
    return book_info


In [12]:
def inverted_index_creation(book_info, headers, stop_words):
    """
    For each book in book_info, fetch the text, print a sample, and build an inverted index (excluding stop words).

    Args:
        book_info (list): List of book metadata dictionaries.
        headers (dict): Headers for the API request.
        stop_words (set): Set of stop words to exclude from the index.

    Returns:
        dict: Inverted index mapping words to lists of book IDs.
    """
    inverted_index = defaultdict(lambda: defaultdict(int))

    for book in book_info:
        book_id = book['id']
        response = requests.get(
            f"https://project-gutenberg-free-books-api1.p.rapidapi.com/books/{book_id}/text?cleaning_mode=simple", 
            headers=headers
        )
        text = response.text
        books = json.loads(text)

        if isinstance(books, dict) and 'text' in books:
            text = books['text']

        for word in text.split():
            word = word.lower().strip('.,!?;"()[]{}')
            if word and word not in stop_words:
                inverted_index[word][book_id] += 1

    return inverted_index


In [None]:
def extract_fetch_and_store_books(data, headers, datalake_dir='datalake'):
    """
    Extract book metadata, fetch book texts, and store metadata in a structured datalake directory.

    Args:
        data (str): JSON string from the API response.
        headers (dict): Headers for the API request.
        datalake_dir (str): Root directory for storing data.
    Returns:
        list: List of book metadata dictionaries.
    """
    book_info = extract_book_info(data)

    now = datetime.now()
    date_dir = now.strftime('%Y-%m-%d')
    time_dir = now.strftime('%H')
    full_path = os.path.join(datalake_dir, date_dir, time_dir)
    os.makedirs(full_path, exist_ok=True)

    control_dir = 'control'
    downloaded_books_path = os.path.join(control_dir, 'downloaded_books.txt')
    indexed_books_path = os.path.join(control_dir, 'indexed_books.txt')

    downloaded_books = set()
    if os.path.exists(downloaded_books_path):
        with open(downloaded_books_path, 'r', encoding='utf-8') as f:
            downloaded_books = set(line.strip() for line in f if line.strip())

    indexed_books = set()
    if os.path.exists(indexed_books_path):
        with open(indexed_books_path, 'r', encoding='utf-8') as f:
            indexed_books = set(line.strip() for line in f if line.strip())

    for book in book_info:
        book_id = str(book['id'])
        if book_id in downloaded_books and book_id in indexed_books:
            continue 

        header_path = os.path.join(full_path, f"{book_id}.header.txt")
        with open(header_path, 'w', encoding='utf-8') as header_file:
            json.dump(book, header_file, ensure_ascii=False, indent=4)
        response = requests.get(f"https://project-gutenberg-free-books-api1.p.rapidapi.com/books/{book_id}/text?cleaning_mode=simple", headers=headers)
        text = response.text
        book_text = json.loads(text).get('text', '')
        body_path = os.path.join(full_path, f"{book_id}.body.txt")
        with open(body_path, 'w', encoding='utf-8') as body_file:
            body_file.write(book_text)

    print("Books downloaded and stored in datalake.")
    return book_info


In [14]:
def datamart_fill(book_info, headers, stop_words):
    """
    This function creates a directory named 'datamart', if it doesn't already exist, 
    and processes the data stored in the 'datalake' directory. Inside datamart create two files,
    unless they already exist: metadata.sql and inverted_index.json

    Args:
        book_info (list): List of book metadata dictionaries.
        headers (dict): Headers for the API request.
        stop_words (set): Set of stop words to exclude from the index.
    """
    datamart_dir = 'datamart'
    os.makedirs(datamart_dir, exist_ok=True)

    metadata_path = os.path.join(datamart_dir, 'metadata.sql')
    inverted_index_path = os.path.join(datamart_dir, 'inverted_index.json')
    inverted_index = inverted_index_creation(book_info, headers, stop_words)
    
    datalake_dir = 'datalake'
    metadata = []

    for root, _, files in os.walk(datalake_dir):
        for file in files:
            if file.endswith('.header.txt'):
                with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                    book_metadata = json.load(f)
                    metadata.append(book_metadata)

    if not os.path.exists(inverted_index_path):
        with open(inverted_index_path, 'w', encoding='utf-8') as index_file:
            json.dump(inverted_index, index_file, ensure_ascii=False, indent=4)
    else:
        with open(inverted_index_path, 'r+', encoding='utf-8') as index_file:
            existing_index = json.load(index_file)
            existing_index.update(inverted_index) 
            index_file.seek(0)
            json.dump(existing_index, index_file, ensure_ascii=False, indent=4)
            index_file.truncate()
    
    if not os.path.exists(metadata_path):
        db_path = os.path.join(datamart_dir, 'metadata.db')
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        cursor.execute(
            """
            CREATE TABLE IF NOT EXISTS books (
            id INTEGER PRIMARY KEY,
            title TEXT,
            authors TEXT,
            subjects TEXT,
            bookshelves TEXT
            );
            """
        )
        for book in metadata:
            authors = ', '.join(book.get('authors', []))
            subjects = ', '.join(book.get('subjects', []))
            bookshelves = ', '.join(book.get('bookshelves', []))
            title = book.get('title', '')
            cursor.execute(
            """
            INSERT OR IGNORE INTO books (id, title, authors, subjects, bookshelves)
            VALUES (?, ?, ?, ?, ?)
            """,
            (book.get('id'), title, authors, subjects, bookshelves)
            )
        conn.commit()
        conn.close()
    print("Datamart filled with metadata and inverted index.")


In [15]:
def controller(data):
    """
    Controller function to control which books are downloaded 
    and which ones are indexed, creating in the directory control
    the files downloaded_books.txt and indexed_books.txt.
    """

    control_dir = 'control'
    os.makedirs(control_dir, exist_ok=True)
    downloaded_books_path = os.path.join(control_dir, 'downloaded_books.txt')
    indexed_books_path = os.path.join(control_dir, 'indexed_books.txt')

    headers = {
        'x-rapidapi-key': "29ab1edf9dmshb37d07ffbb17e29p1ce99ejsn7592f187c027",
        'x-rapidapi-host': "project-gutenberg-free-books-api1.p.rapidapi.com"
    }
        
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

    book_info = extract_fetch_and_store_books(data.decode("utf-8"), headers)
    
    with open(downloaded_books_path, 'w', encoding='utf-8') as f:
        for book in book_info:
            f.write(f"{book['id']}\n")

    datamart_fill(book_info, headers, stop_words)

    with open(indexed_books_path, 'w', encoding='utf-8') as f:
        for book in book_info:
            f.write(f"{book['id']}\n")

    print("Controller finished processing books.")


In [None]:
def main(url = "https://project-gutenberg-free-books-api1.p.rapidapi.com/books"):
    headers = {
        "x-rapidapi-key": "29ab1edf9dmshb37d07ffbb17e29p1ce99ejsn7592f187c027",
	    "x-rapidapi-host": "project-gutenberg-free-books-api1.p.rapidapi.com"
        }

    response = requests.get(url, headers=headers)

    controller(response.content)

    print("Page processed, moving to next page if available...")
    url = response.json().get('next')
    if url == "https://project-gutenberg-free-books-api1.p.rapidapi.com/books?page=21":
        return "First 20 pages done"
    else:
        main(url)


In [17]:
main()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jcubt\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Books downloaded and stored in datalake.
Datamart filled with metadata and inverted index.
Controller finished processing books.
Page processed, moving to next page if available...


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jcubt\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


KeyboardInterrupt: 