In [None]:
import http.client
import requests
from datetime import datetime
import json
import os
import re
from collections import defaultdict
import string
import nltk
from nltk.corpus import stopwords

In [6]:
def extract_book_info(data):
    """
    Decode a JSON API response and keep only the metadata fields of each book.

    The decoded payload may be either a list of book records or a dict that
    wraps that list under the 'results' key.

    Args:
        data (str): JSON string from the API response.

    Returns:
        list: One dict per book with the keys 'id', 'title', 'authors'
            (list of author names), 'subjects' and 'bookshelves'.
    """
    payload = json.loads(data)

    # Unwrap the record list when the payload is a {'results': [...]} envelope.
    is_envelope = isinstance(payload, dict) and 'results' in payload
    records = payload['results'] if is_envelope else payload

    def summarize(record):
        # Missing list fields default to an empty list; missing scalars to None.
        return {
            'id': record.get('id'),
            'title': record.get('title'),
            'authors': [person.get('name') for person in record.get('authors', [])],
            'subjects': record.get('subjects', []),
            'bookshelves': record.get('bookshelves', []),
        }

    return [summarize(record) for record in records]


In [None]:
def fetch_and_index_books(book_info, conn, headers, stop_words):
    """
    Fetch the text of each book in book_info and build an inverted index of word counts.

    Words are lower-cased, stripped of surrounding punctuation and skipped when
    they are empty or present in stop_words.

    Args:
        book_info (list): List of book metadata dictionaries (each needs an 'id').
        conn (http.client.HTTPSConnection): Unused; the text is fetched with
            `requests`. Kept so existing callers keep working.
        headers (dict): Headers for the API request.
        stop_words (set): Set of stop words to exclude from the index.

    Returns:
        dict: Inverted index mapping each word to a dict of
            {book_id: number of occurrences of the word in that book}.
    """
    # word -> book_id -> occurrence count
    inverted_index = defaultdict(lambda: defaultdict(int))
    for book in book_info:
        book_id = book['id']
        response = requests.get(f"https://project-gutenberg-free-books-api1.p.rapidapi.com/books/{book_id}/text?cleaning_mode=simple", headers=headers)
        text = _book_text_from_response(response.text)

        for token in text.split():
            word = token.lower().strip('.,!?;"()[]{}')
            if word and word not in stop_words:
                inverted_index[word][book_id] += 1

    # Return plain dicts so later lookups cannot silently create entries.
    return {word: dict(counts) for word, counts in inverted_index.items()}


def _book_text_from_response(raw):
    """Return the book text from a raw response body (the 'text' field of a JSON object, else the body itself)."""
    try:
        payload = json.loads(raw)
    except ValueError:
        # Not JSON: index the body as it came.
        return raw
    if isinstance(payload, dict):
        text = payload.get('text')
        return text if isinstance(text, str) else ''
    return raw


In [None]:
def extract_fetch_and_store_books(data, conn, headers, datalake_dir='datalake'):
    """
    Persist the metadata and text of every book listed in an API response.

    Files are written under <datalake_dir>/<YYYY-MM-DD>/<HH>/ as
    <book_id>.header.txt (metadata as JSON) and <book_id>.body.txt (book text).

    Args:
        data (str): JSON string from the API response.
        conn (http.client.HTTPSConnection): HTTP connection object (not used by this function).
        headers (dict): Headers for the API request.
        datalake_dir (str): Root directory for storing data.
    Returns:
        list: List of book metadata dictionaries.
    """
    book_info = extract_book_info(data)

    # One folder per calendar day and hour of the current local time.
    timestamp = datetime.now()
    target_dir = os.path.join(
        datalake_dir,
        timestamp.strftime('%Y-%m-%d'),
        timestamp.strftime('%H'),
    )
    os.makedirs(target_dir, exist_ok=True)

    for entry in book_info:
        identifier = entry['id']

        # Metadata goes to [book_id].header.txt
        with open(os.path.join(target_dir, f"{identifier}.header.txt"), 'w', encoding='utf-8') as meta_out:
            json.dump(entry, meta_out, ensure_ascii=False, indent=4)

        # The text endpoint answers with JSON; the book itself is under 'text'.
        reply = requests.get(f"https://project-gutenberg-free-books-api1.p.rapidapi.com/books/{identifier}/text?cleaning_mode=simple", headers=headers)
        content = json.loads(reply.text).get('text', '')

        # Text goes to [book_id].body.txt
        with open(os.path.join(target_dir, f"{identifier}.body.txt"), 'w', encoding='utf-8') as text_out:
            text_out.write(content)

    return book_info

In [None]:
def datamart_creation(book_info, conn, headers, stop_words):
    """
    Create the datamart from the stored book metadata and the fetched book texts.

    Creates the 'datamart' directory if needed. When either
    datamart/metadata.sql or datamart/inverted_index.json is missing, both are
    (re)generated: metadata.sql from every *.header.txt found under 'datalake',
    inverted_index.json from the texts of the books in book_info. When both
    files already exist nothing is fetched or written.

    NOTE(review): because existing files are left untouched, books passed in
    on later calls are not merged into an existing datamart.

    Args:
        book_info (list): List of book metadata dictionaries.
        conn (http.client.HTTPSConnection): HTTP connection object.
        headers (dict): Headers for the API request.
        stop_words (set): Set of stop words to exclude from the index.
    """
    datamart_dir = 'datamart'
    os.makedirs(datamart_dir, exist_ok=True)

    metadata_path = os.path.join(datamart_dir, 'metadata.sql')
    inverted_index_path = os.path.join(datamart_dir, 'inverted_index.json')

    # Nothing to do: skip the expensive download/indexing step entirely.
    if os.path.exists(metadata_path) and os.path.exists(inverted_index_path):
        return

    inverted_index = fetch_and_index_books(book_info, conn, headers, stop_words)
    metadata = _load_datalake_metadata('datalake')

    # Save metadata to SQL file
    with open(metadata_path, 'w', encoding='utf-8') as sql_file:
        sql_file.write(
            "CREATE TABLE IF NOT EXISTS books (\n"
            "    id INTEGER PRIMARY KEY,\n"
            "    title TEXT,\n"
            "    authors TEXT,\n"
            "    subjects TEXT,\n"
            "    bookshelves TEXT\n"
            ");\n\n"
        )
        for book in metadata:
            sql_file.write(_book_insert_statement(book))

    # Save inverted index to JSON (non-string book ids become string keys)
    with open(inverted_index_path, 'w', encoding='utf-8') as index_file:
        json.dump(inverted_index, index_file, ensure_ascii=False, indent=4)


def _load_datalake_metadata(datalake_dir):
    """Load every *.header.txt below datalake_dir, keeping the first record seen for each book id."""
    records = []
    seen_ids = set()
    # os.walk yields (dirpath, dirnames, filenames) triples.
    for root, _dirs, files in os.walk(datalake_dir):
        for file_name in sorted(files):
            if not file_name.endswith('.header.txt'):
                continue
            with open(os.path.join(root, file_name), 'r', encoding='utf-8') as f:
                record = json.load(f)
            book_id = record.get('id')
            # The same book can sit in several date/hour folders; a second
            # INSERT with the same id would violate the PRIMARY KEY.
            if book_id is not None and book_id in seen_ids:
                continue
            seen_ids.add(book_id)
            records.append(record)
    return records


def _sql_text(value):
    """Render a metadata value (string, list of strings or None) as a quoted SQL string literal."""
    if value is None:
        value = ''
    elif isinstance(value, (list, tuple)):
        value = ', '.join(str(item) for item in value if item is not None)
    # Escape single quotes by doubling them.
    return "'" + str(value).replace("'", "''") + "'"


def _book_insert_statement(book):
    """Build the INSERT statement line for one book metadata dict."""
    book_id = book.get('id')
    if book_id is None:
        id_sql = 'NULL'
    elif isinstance(book_id, int):
        id_sql = str(book_id)
    else:
        # Never splice a non-numeric id into the statement unquoted.
        id_sql = _sql_text(book_id)
    return (
        f"INSERT INTO books (id, title, authors, subjects, bookshelves) VALUES "
        f"({id_sql}, {_sql_text(book.get('title'))}, {_sql_text(book.get('authors'))}, "
        f"{_sql_text(book.get('subjects'))}, {_sql_text(book.get('bookshelves'))});\n"
    )

In [None]:
def controller(data):
    """
    Download, store and index the books listed in one API response page.

    Keeps track of processed books in the 'control' directory:
    downloaded_books.txt and indexed_books.txt hold one book id per line and
    accumulate ids across calls without duplicates.

    The RapidAPI key is read from the RAPIDAPI_KEY environment variable when
    it is set.

    Args:
        data (bytes): Raw UTF-8 encoded JSON body of a book listing response.
    """
    control_dir = 'control'
    os.makedirs(control_dir, exist_ok=True)
    downloaded_books_path = os.path.join(control_dir, 'downloaded_books.txt')
    indexed_books_path = os.path.join(control_dir, 'indexed_books.txt')

    headers = {
        # FIXME(security): the fallback key is committed in source; rotate it
        # and rely on the RAPIDAPI_KEY environment variable only.
        'x-rapidapi-key': os.environ.get('RAPIDAPI_KEY', "29ab1edf9dmshb37d07ffbb17e29p1ce99ejsn7592f187c027"),
        'x-rapidapi-host': "project-gutenberg-free-books-api1.p.rapidapi.com"
    }

    # Only download the stop word corpus when it is not available locally.
    try:
        stop_words = set(stopwords.words('english'))
    except LookupError:
        nltk.download('stopwords')
        stop_words = set(stopwords.words('english'))

    conn = http.client.HTTPSConnection("project-gutenberg-free-books-api1.p.rapidapi.com")
    try:
        # Download the books of this page into the datalake.
        book_info = extract_fetch_and_store_books(data.decode("utf-8"), conn, headers)
        _record_book_ids(downloaded_books_path, book_info)

        # NOTE(review): datamart_creation returns nothing, so every book of the
        # page is recorded as indexed once it completes without raising.
        datamart_creation(book_info, conn, headers, stop_words)
        _record_book_ids(indexed_books_path, book_info)
    finally:
        conn.close()


def _record_book_ids(path, book_info):
    """Append the ids of book_info to the control file at path, skipping ids it already lists."""
    known_ids = set()
    if os.path.exists(path):
        with open(path, 'r', encoding='utf-8') as f:
            known_ids = {line.strip() for line in f if line.strip()}

    with open(path, 'a', encoding='utf-8') as f:
        for book in book_info:
            book_id = str(book['id'])
            if book_id not in known_ids:
                known_ids.add(book_id)
                f.write(f"{book_id}\n")


In [None]:
def main(url = "https://project-gutenberg-free-books-api1.p.rapidapi.com/books",
         stop_url = "https://project-gutenberg-free-books-api1.p.rapidapi.com/books?page=21"):
    """
    Crawl the paginated book listing, handing every page to `controller`.

    Starting at url, each page is fetched and processed, then the crawl
    follows the 'next' link of the response. It stops when that link equals
    stop_url or when the response has no 'next' link.

    The RapidAPI key is read from the RAPIDAPI_KEY environment variable when
    it is set.

    Args:
        url (str): URL of the first listing page to process.
        stop_url (str): 'next' link at which the crawl ends (that page is not fetched).

    Returns:
        str or None: A completion message when the crawl ended at stop_url,
            None when the listing ran out of pages first.

    Raises:
        requests.HTTPError: If a listing request returns an error status.
    """
    headers = {
        # FIXME(security): the fallback key is committed in source; rotate it
        # and rely on the RAPIDAPI_KEY environment variable only.
        "x-rapidapi-key": os.environ.get("RAPIDAPI_KEY", "29ab1edf9dmshb37d07ffbb17e29p1ce99ejsn7592f187c027"),
        "x-rapidapi-host": "project-gutenberg-free-books-api1.p.rapidapi.com"
    }

    # Iterate instead of recursing: one request per page, no growing call stack.
    while url:
        response = requests.get(url, headers=headers)
        # Fail with a clear HTTP error instead of parsing an error body as a book list.
        response.raise_for_status()

        controller(response.content)

        # from response use next to get next page
        url = response.json().get('next')
        if url == stop_url:
            return "Frist 20 pages done"

    return None

In [None]:
# Run the crawl from the default first listing page. This performs network
# requests and writes into the datalake/, datamart/ and control/ directories.
main()