In [1]:
# Install Semantic Kernel; its text_chunker module is used further down to split each book into chunks.
# NOTE(review): the version is not pinned, so a re-run may install a newer release than the
# 0.5.1.dev0 build shown in the recorded output below -- consider pinning for reproducibility.
%pip install --upgrade semantic-kernel

StatementMeta(, 2d270a25-df40-44a5-8fa8-5b0bd2f735f5, 8, Finished, Available)

Collecting semantic-kernel
  Downloading semantic_kernel-0.5.1.dev0-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.5/244.5 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0.0,>=23.1.0 (from semantic-kernel)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting motor<4.0.0,>=3.3.1 (from semantic-kernel)
  Downloading motor-3.3.2-py3-none-any.whl (70 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.6/70.6 kB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
Collecting openai>=1.0 (from semantic-kernel)
  Downloading openai-1.13.3-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.4/227.4 kB[0m [31m88.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openapi_core<0.19.0,>=0.18.0 (from semantic-kernel)
  Downloading openapi_core-0.18.2-py3-none-any.whl (82 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.4/82.4 kB[0m [31m40.5

# Create Lakehouse Folders

In [2]:
import os

# Root of the default lakehouse file area and the top-level folders this notebook writes into.
root_dir = '/lakehouse/default/Files'
main_folders = ['keys','book_texts','book_enriched']

for folder in main_folders:
    # exist_ok=True keeps the cell re-runnable: folders that already exist are left untouched.
    target_path = os.path.join(root_dir, folder)
    os.makedirs(target_path, exist_ok=True)

print("Directory structure created successfully!")

StatementMeta(, 2d270a25-df40-44a5-8fa8-5b0bd2f735f5, 10, Finished, Available)

Directory structure created successfully!


# Ingest Data From Source

While I recommend using the provided defaults, two parameters are available to modify:

- **books_to_ingest** - the number of books that you would like to store and analyze. Increasing this will sharply increase the cost of the t-SNE analysis later
- **max_byte_size** - the maximum allowed size of each book, in bytes

In [3]:
import requests

books_ingested = 0
book_content = []  # list of (book_id, book_text) tuples consumed by the chunking cells below

books_to_ingest = 32  # modify based on your needs
max_byte_size = 100000  # modify based on your needs

# Seconds to wait on a single request before giving up, so one stalled
# connection cannot hang the whole cell.
request_timeout = 30

# Walk Project Gutenberg book ids in order and keep the first `books_to_ingest`
# books that download successfully and are no larger than `max_byte_size`.
for book_id in range(1, 1000):
    url = f'https://www.gutenberg.org/cache/epub/{book_id}/pg{book_id}.txt'

    try:
        # Check the advertised file size before downloading the body.
        # requests.head() does not follow redirects unless asked to; without
        # allow_redirects=True a redirect response would be inspected instead
        # of the book itself.
        response = requests.head(url, allow_redirects=True, timeout=request_timeout)
        content_length = int(response.headers.get('Content-Length', 0))

        if content_length > max_byte_size:
            print(f"Skipping book {book_id}: Content size exceeds limit")
            continue

        full_response = requests.get(url, timeout=request_timeout)
    except requests.RequestException as error:
        # A transient network failure on one book should not abort the whole run.
        print(f"Error fetching book {book_id}: {error}")
        continue

    if full_response.status_code != 200:
        print(f"Error fetching book {book_id}: Status code {full_response.status_code}")
        continue

    # The Content-Length header may be absent (treated as 0 above) or describe a
    # compressed payload, so enforce the limit on the bytes actually received.
    if len(full_response.content) > max_byte_size:
        print(f"Skipping book {book_id}: Content size exceeds limit")
        continue

    book_text = full_response.text

    # Save file to default lakehouse. The encoding is explicit so that non-ASCII
    # characters are written the same way regardless of the platform default.
    with open(f'/lakehouse/default/Files/book_texts/{book_id}.txt', 'w', encoding='utf-8') as file:
        file.write(book_text)

    book_content.append((book_id, book_text))
    books_ingested += 1
    print(f'Success book {book_id}: {books_ingested}/{books_to_ingest}')

    if books_ingested >= books_to_ingest:
        break

StatementMeta(, 2d270a25-df40-44a5-8fa8-5b0bd2f735f5, 11, Finished, Available)

Skipping book 1: Content size exceeds limit
Success book 2: 1/30
Success book 3: 2/30
Success book 4: 3/30
Success book 5: 4/30
Success book 6: 5/30
Success book 7: 6/30
Success book 8: 7/30
Success book 9: 8/30
Skipping book 10: Content size exceeds limit
Skipping book 11: Content size exceeds limit
Skipping book 12: Content size exceeds limit
Success book 13: 9/30
Skipping book 14: Content size exceeds limit
Skipping book 15: Content size exceeds limit
Skipping book 16: Content size exceeds limit
Skipping book 17: Content size exceeds limit
Skipping book 18: Content size exceeds limit
Skipping book 19: Content size exceeds limit
Skipping book 20: Content size exceeds limit
Skipping book 21: Content size exceeds limit
Skipping book 22: Content size exceeds limit
Skipping book 23: Content size exceeds limit
Skipping book 24: Content size exceeds limit
Skipping book 25: Content size exceeds limit
Skipping book 26: Content size exceeds limit
Skipping book 27: Content size exceeds limit
S

# Text Chunking with Semantic Kernel

In [4]:
from semantic_kernel.text import text_chunker

StatementMeta(, 2d270a25-df40-44a5-8fa8-5b0bd2f735f5, 12, Finished, Available)

In [5]:
import json

def _split_gutenberg_text(book_text):
    """Split a Project Gutenberg file into (header_text, main_text) at its START/END markers."""
    start_index = book_text.find('*** START')
    if start_index == -1:
        # No START marker: there is no identifiable header, so treat the whole file as main text.
        start_index = 0

    # Search for the END marker only after the START marker so the slice can never be inverted.
    end_index = book_text.find('*** END', start_index)
    if end_index == -1:
        # No END marker: keep everything up to the end of the file.
        end_index = len(book_text)

    return book_text[:start_index], book_text[start_index:end_index]


def chunk_and_save_text(x, max_token_per_line=2000):
    """Chunk one book's main text and write the result as JSON to the lakehouse.

    Args:
        x: A ``(book_id, book_text)`` tuple, as stored in ``book_content``.
        max_token_per_line: Token limit passed to
            ``text_chunker.split_plaintext_lines`` for each chunk.

    Side effects:
        Prints the number of chunks produced and writes
        ``/lakehouse/default/Files/book_enriched/{book_id}.json`` containing the
        book id, the header text that precedes the START marker, and the list
        of chunks (each with a ``chunk_id`` and its ``content``).

    Books that lack the ``*** START`` / ``*** END`` markers are still processed:
    a missing START marker yields an empty header, and a missing END marker
    keeps the text through the end of the file.
    """
    book_id, book_text = x

    # Separate the header from the main text; everything from the END marker
    # onwards (the footer) is discarded. The main text still begins with the
    # START marker line itself.
    header_text, main_text = _split_gutenberg_text(book_text)

    # Use Semantic Kernel to chunk main text by tokens
    chunks = text_chunker.split_plaintext_lines(text=main_text, max_token_per_line=max_token_per_line)
    print(f'Chunked book {book_id} into {len(chunks)} pieces')

    chunk_list = [{'chunk_id': i, 'content': chunk} for i, chunk in enumerate(chunks)]

    json_object = {
        'book_id': book_id,
        'header_text': header_text,
        'chunks': chunk_list
    }

    with open(f'/lakehouse/default/Files/book_enriched/{book_id}.json','w') as f:
        json.dump(json_object, f, indent=4)

StatementMeta(, 2d270a25-df40-44a5-8fa8-5b0bd2f735f5, 13, Finished, Available)

In [6]:
# Chunk every ingested book and persist its enriched JSON; each entry is a (book_id, book_text) tuple.
for book_entry in book_content:
    chunk_and_save_text(book_entry)

StatementMeta(, 2d270a25-df40-44a5-8fa8-5b0bd2f735f5, 14, Finished, Available)

Chunked book 2 into 1 pieces
Chunked book 3 into 1 pieces
Chunked book 4 into 1 pieces
Chunked book 5 into 4 pieces
Chunked book 6 into 1 pieces
Chunked book 7 into 1 pieces
Chunked book 8 into 1 pieces
Chunked book 9 into 4 pieces
Chunked book 13 into 8 pieces
Chunked book 39 into 8 pieces
Chunked book 41 into 16 pieces
Chunked book 49 into 8 pieces
Chunked book 56 into 4 pieces
Chunked book 57 into 4 pieces
Chunked book 61 into 16 pieces
Chunked book 71 into 8 pieces
Chunked book 99 into 8 pieces
Chunked book 104 into 2 pieces
Chunked book 109 into 8 pieces
Chunked book 117 into 1 pieces
Chunked book 136 into 8 pieces
Chunked book 151 into 4 pieces
Chunked book 156 into 1 pieces
Chunked book 181 into 8 pieces
Chunked book 207 into 16 pieces
Chunked book 216 into 8 pieces
Chunked book 229 into 8 pieces
Chunked book 230 into 8 pieces
Chunked book 235 into 2 pieces
Chunked book 237 into 6 pieces
