# Load JSON Data from Lakehouse

In [1]:
# Load and enrich JSON
import os
import json

json_docs = []
header_path = '/lakehouse/default/Files/book_enriched'

for file_name in os.listdir(header_path):
    file_path = os.path.join(header_path,file_name)

    # Open current file
    with open(file_path,'r') as f:
        json_content = f.read()
        json_doc = json.loads(json_content)
 
    json_docs.append(json_doc)

StatementMeta(, 01a69189-c9f4-46c3-8e05-2297feb1fb54, 3, Finished, Available)

# Entity Extraction with Azure OpenAI

Notice how when using Microsoft Fabric F64 or higher sku that you don't need an API key. All resource usage gets charged against your capacity units

https://learn.microsoft.com/en-us/fabric/data-science/ai-services/how-to-use-openai-sdk-synapse?tabs=python

In [2]:
import openai

StatementMeta(, 01a69189-c9f4-46c3-8e05-2297feb1fb54, 4, Finished, Available)

In [3]:
# Recieves a piece of text and desired entities to extract and returns extracted entities in JSON format
def extract_entities_and_enhance(text, entities):

    prompt = f"""
        Extract these entities {entities} from the following text
        If a field is not available use 'unknown'
        Any dates should be formatted as yyyy-mm-dd

        Text:
            {text}
    """
    
    completion = openai.ChatCompletion.create(
        deployment_id='gpt-35-turbo',
        messages=[
            {'role': 'system', 'content': 'Youre an AI assistant that extracts information from text. Provide the requested fields in JSON format.'},
            {'role': 'user', 'content': prompt}
        ],
        temperature=0
    )

    response = completion.choices[0].message.content
    json_response = json.loads(response)
    return json_response

StatementMeta(, 01a69189-c9f4-46c3-8e05-2297feb1fb54, 5, Finished, Available)

In [35]:
entities = ['title', 'author','language','release_date','last_modified_date']
total = len(json_docs)

for json_doc in json_docs:
    header_text = json_doc['header_text']
    extracted_json = extract_entities_and_enhance(header_text, entities)

    # Update the current json doc with new extracted fields
    json_doc.update(extracted_json)

    i = json_docs.index(json_doc) + 1
    book_id = json_doc['book_id']
    print(f'({i}/{total}) extracted data for book {book_id}')

StatementMeta(, 01a69189-c9f4-46c3-8e05-2297feb1fb54, 37, Finished, Available)

(1/32) extracted data for book 104
(2/32) extracted data for book 109
(3/32) extracted data for book 117
(4/32) extracted data for book 13
(5/32) extracted data for book 136
(6/32) extracted data for book 151
(7/32) extracted data for book 156
(8/32) extracted data for book 181
(9/32) extracted data for book 2
(10/32) extracted data for book 207
(11/32) extracted data for book 216
(12/32) extracted data for book 229
(13/32) extracted data for book 230
(14/32) extracted data for book 235
(15/32) extracted data for book 237
(16/32) extracted data for book 246
(17/32) extracted data for book 249
(18/32) extracted data for book 3
(19/32) extracted data for book 39
(20/32) extracted data for book 4
(21/32) extracted data for book 41
(22/32) extracted data for book 49
(23/32) extracted data for book 5
(24/32) extracted data for book 56
(25/32) extracted data for book 57
(26/32) extracted data for book 6
(27/32) extracted data for book 61
(28/32) extracted data for book 7
(29/32) extracted da

In [36]:
# Here is an example output
json_doc = json_docs[0]
header_text = json_doc['header_text']
extracted_json_example = extract_entities_and_enhance(header_text, entities)

print(json.dumps(extracted_json_example, indent=4))

StatementMeta(, 01a69189-c9f4-46c3-8e05-2297feb1fb54, 38, Finished, Available)

{
    "title": "Inaugural Address of Franklin Delano Roosevelt",
    "author": "Franklin D. Roosevelt",
    "language": "English",
    "release_date": "1994-02-01",
    "last_modified_date": "2023-07-12"
}


# Text Summarization with Azure OpenAI

In [13]:
#Generates a short summary based on text input
def generate_short_summary(text):

    prompt = f"""
        Summarize the following text in 2-3 sentences

        Text:
            {text}
    """

    try:
        completion = openai.ChatCompletion.create(
        deployment_id='gpt-35-turbo',
            messages=[
                {'role': 'system', 'content': 'Youre an AI assistant that summarizes text.'},
                {'role': 'user', 'content': prompt}
            ],
            temperature=0
        )

        summary = completion.choices[0].message.content
        return summary

    except:
        return ''

StatementMeta(, 01a69189-c9f4-46c3-8e05-2297feb1fb54, 15, Finished, Available)

In [7]:
# Generates a short summary from a list of chunks
def summarize_chunks(chunks):
    combined_summary = ''

    for chunk in chunks:
        summary = generate_short_summary(chunk)
        combined_summary += summary
    
    overall_summary = generate_short_summary(combined_summary)
    return overall_summary

StatementMeta(, 01a69189-c9f4-46c3-8e05-2297feb1fb54, 9, Finished, Available)

In [14]:
total = len(json_docs)

for json_doc in json_docs:
    chunks = json_doc['chunks']
    text_chunks = [item['content'] for item in chunks]

    summary = summarize_chunks(text_chunks)

    # Update the current json doc with new summary
    json_doc['summary'] = summary

    book_title = json_doc['title']
    i = json_docs.index(json_doc) + 1
    print(f'({i}/{total}) summarized {book_title}')

StatementMeta(, 01a69189-c9f4-46c3-8e05-2297feb1fb54, 16, Finished, Available)

(1/32) summarized Inaugural Address of Franklin Delano Roosevelt
(2/32) summarized Renascence, and Other Poems
(3/32) summarized Symphony No. 5 in C minor Opus 67
(4/32) summarized The Hunting of the Snark: An Agony in Eight Fits
(5/32) summarized A Child's Garden of Verses
(6/32) summarized The Rime of the Ancient Mariner
(7/32) summarized Symphony No. 5 in C minor, Opus 67
(8/32) summarized The Project Gutenberg RST Manual
(9/32) summarized The United States Bill of Rights
(10/32) summarized The Spell of the Yukon and Other Verses
(11/32) summarized The Tao Teh King, or the Tao and its Characteristics
(12/32) summarized The Bucolics and Eclogues
(13/32) summarized The Bucolics and Eclogues
(14/32) summarized William Gibson Interviewed
(15/32) summarized Sexti Properti Elegiarvm: Liber Primvs
(16/32) summarized The Rubaiyat of Omar Khayyam
(17/32) summarized French Cave Paintings
(18/32) summarized John F. Kennedy's Inaugural Address
(19/32) summarized Hitchhiker's Guide to the Intern

In [15]:
json_doc = json_docs[0]
chunks = json_doc['chunks']
text_chunks = [item['content'] for item in chunks]

summary = summarize_chunks(text_chunks)

book_title = json_doc['title']
print(f'Summary for {book_title}:')
print()
print(summary)

StatementMeta(, 01a69189-c9f4-46c3-8e05-2297feb1fb54, 17, Finished, Available)

Summary for Inaugural Address of Franklin Delano Roosevelt:

In his inaugural address, Franklin Delano Roosevelt acknowledges the economic challenges facing the United States in 1933, including high unemployment and the failure of the financial system. He emphasizes the need for action, such as putting people to work and reorganizing the use of natural resources, as well as the importance of unified relief activities and strict supervision of banking and investments. Roosevelt pledges to lead the nation in addressing these issues and expresses hope for a balanced executive and legislative authority to meet the challenges ahead.


# Text Classification with Azure OpenAI

In [16]:
# Classifies an input of text based upon a list of categories 
def classify_text(text, categories):

    prompt = f"""
        Classify the following text into one of the categories

        Your response should look like:
            category
            
        Categories:
            {categories}

        Text:
            {text}
    """
    
    try:
        completion = openai.ChatCompletion.create(
        deployment_id='gpt-35-turbo',
            messages=[
                {'role': 'system', 'content': 'Youre an AI assistant that classifies text.'},
                {'role': 'user', 'content': prompt}
            ],
            temperature=0,
            max_tokens = 60
        )

        classification = completion.choices[0].message.content
        return classification

    except:
        return ''

StatementMeta(, 01a69189-c9f4-46c3-8e05-2297feb1fb54, 18, Finished, Available)

In [17]:
book_categories = ['Fantasy Science Fiction','History','Biography','Poetry','Music','Technology','Philosophy']

# Use the newly generated summary to categorize each book into one of the above categories
for json_doc in json_docs:
    summary = json_doc['summary']
    category = classify_text(summary, book_categories)

    # Update the current json doc with category
    json_doc['category'] = category

    i = json_docs.index(json_doc) + 1
    book_title = json_doc['title']
    print(f'({i}/{total}) categorized {book_title}')

StatementMeta(, 01a69189-c9f4-46c3-8e05-2297feb1fb54, 19, Finished, Available)

(1/32) categorized Inaugural Address of Franklin Delano Roosevelt
(2/32) categorized Renascence, and Other Poems
(3/32) categorized Symphony No. 5 in C minor Opus 67
(4/32) categorized The Hunting of the Snark: An Agony in Eight Fits
(5/32) categorized A Child's Garden of Verses
(6/32) categorized The Rime of the Ancient Mariner
(7/32) categorized Symphony No. 5 in C minor, Opus 67
(8/32) categorized The Project Gutenberg RST Manual
(9/32) categorized The United States Bill of Rights
(10/32) categorized The Spell of the Yukon and Other Verses
(11/32) categorized The Tao Teh King, or the Tao and its Characteristics
(12/32) categorized The Bucolics and Eclogues
(13/32) categorized The Bucolics and Eclogues
(14/32) categorized William Gibson Interviewed
(15/32) categorized Sexti Properti Elegiarvm: Liber Primvs
(16/32) categorized The Rubaiyat of Omar Khayyam
(17/32) categorized French Cave Paintings
(18/32) categorized John F. Kennedy's Inaugural Address
(19/32) categorized Hitchhiker's 

In [18]:
json_doc = json_docs[0]
summary = json_doc['summary']
category = classify_text(summary, book_categories)

# Update the current json doc with category
json_doc['category'] = category

book_title = json_doc['title']
book_summary = json_doc['summary']

print(book_title)
print()
print(summary)
print()
print(f'Category: {category}')

StatementMeta(, 01a69189-c9f4-46c3-8e05-2297feb1fb54, 20, Finished, Available)

Inaugural Address of Franklin Delano Roosevelt

In his inaugural address, Franklin Delano Roosevelt acknowledges the economic challenges facing the United States in 1933, including high unemployment and the failure of the financial system. He emphasizes the need for action, such as putting people to work and reorganizing the use of natural resources, as well as the importance of unified relief activities and strict supervision of banking and investments. Roosevelt pledges to lead the nation in addressing these issues and expresses hope for a balanced executive and legislative authority to meet the challenges ahead.

Category: History


# Generate Embeddings for Semantic Similarities
We will use these for semantic similarity analysis later

In [19]:
# Get embeddings from text
def get_embedding_from_text(text):
    
    query_embeddings = openai.Embedding.create(
        deployment_id='text-embedding-ada-002',
        input=text
    )

    embedding = query_embeddings.data[0].embedding
    return embedding

StatementMeta(, 01a69189-c9f4-46c3-8e05-2297feb1fb54, 21, Finished, Available)

In [29]:
import numpy as np
import time

# Generates average embedding from a list of chunks
def get_average_embedding_from_chunks(chunks):
    embeddings = []

    for chunk in chunks:
        embedding = get_embedding_from_text(chunk)
        embeddings.append(embedding)

    embeddings_array = np.array(embeddings)

    # Calculate the mean
    avg_embedding = np.mean(embeddings_array, axis=0).tolist()

    return avg_embedding

StatementMeta(, 01a69189-c9f4-46c3-8e05-2297feb1fb54, 31, Finished, Available)

In [30]:
for json_doc in json_docs:
    chunks = json_doc['chunks']
    text_chunks = [item['content'] for item in chunks]

    embedding = get_average_embedding_from_chunks(text_chunks)

    # Update the current json doc with embeddings
    json_doc['embedding'] = embedding

    i = json_docs.index(json_doc) + 1
    book_title = json_doc['title']
    print(f'({i}/{total}) embedded {book_title}')

    # To avoid rate limit
    time.sleep(1)

StatementMeta(, 01a69189-c9f4-46c3-8e05-2297feb1fb54, 32, Finished, Available)

(1/32) embedded Inaugural Address of Franklin Delano Roosevelt
(2/32) embedded Renascence, and Other Poems
(3/32) embedded Symphony No. 5 in C minor Opus 67
(4/32) embedded The Hunting of the Snark: An Agony in Eight Fits
(5/32) embedded A Child's Garden of Verses
(6/32) embedded The Rime of the Ancient Mariner
(7/32) embedded Symphony No. 5 in C minor, Opus 67
(8/32) embedded The Project Gutenberg RST Manual
(9/32) embedded The United States Bill of Rights
(10/32) embedded The Spell of the Yukon and Other Verses
(11/32) embedded The Tao Teh King, or the Tao and its Characteristics
(12/32) embedded The Bucolics and Eclogues
(13/32) embedded The Bucolics and Eclogues
(14/32) embedded William Gibson Interviewed
(15/32) embedded Sexti Properti Elegiarvm: Liber Primvs
(16/32) embedded The Rubaiyat of Omar Khayyam
(17/32) embedded French Cave Paintings
(18/32) embedded John F. Kennedy's Inaugural Address
(19/32) embedded Hitchhiker's Guide to the Internet
(20/32) embedded Lincoln's Gettysbu

In [31]:
print(json_doc)

StatementMeta(, 01a69189-c9f4-46c3-8e05-2297feb1fb54, 33, Finished, Available)

{'book_id': 99, 'header_text': '\ufeffThe Project Gutenberg eBook of Collected Articles of Frederick Douglass\r\n    \r\nThis ebook is for the use of anyone anywhere in the United States and\r\nmost other parts of the world at no cost and with almost no restrictions\r\nwhatsoever. You may copy it, give it away or re-use it under the terms\r\nof the Project Gutenberg License included with this ebook or online\r\nat www.gutenberg.org. If you are not located in the United States,\r\nyou will have to check the laws of the country where you are located\r\nbefore using this eBook.\r\n\r\nTitle: Collected Articles of Frederick Douglass\r\n\r\n\r\nAuthor: Frederick Douglass\r\n\r\nRelease date: January 1, 1994 [eBook #99]\r\n                Most recently updated: June 27, 2022\r\n\r\nLanguage: English\r\n\r\n\r\n\r\n', 'chunks': [{'chunk_id': 0, 'content': '*** START OF THE PROJECT GUTENBERG EBOOK COLLECTED ARTICLES OF FREDERICK DOUGLASS ***\n\n\n\n\nCollected Articles of Frederick Douglass\n\

# Save Enriched Data

We will be saving the data back into JSON for future use as well as into Fabric Lakehouse tables for analysis

In [32]:
for json_doc in json_docs:
    book_id = json_doc['book_id']
    file_name = f'{book_id}.json'
    file_path = os.path.join(header_path,file_name)

    with open(file_path,'w') as f:
        json.dump(json_doc,f, indent=4)

StatementMeta(, 01a69189-c9f4-46c3-8e05-2297feb1fb54, 34, Finished, Available)

In [38]:
df = spark.createDataFrame(json_docs)

display(df.limit(5))
df.createOrReplaceTempView('books')

StatementMeta(, 01a69189-c9f4-46c3-8e05-2297feb1fb54, 40, Finished, Available)

SynapseWidget(Synapse.DataFrame, fc46be84-c496-414b-806d-efb24a9f48cb)

In [39]:
updated_df = spark.sql("""
    SELECT
        book_id,
        title,
        category,
        author,
        language,
        CAST(release_date AS DATE) release_date,
        CAST(last_modified_date AS DATE) last_modified_date,
        summary
    FROM books
""")

display(updated_df.limit(5))

StatementMeta(, 01a69189-c9f4-46c3-8e05-2297feb1fb54, 41, Finished, Available)

SynapseWidget(Synapse.DataFrame, caed799e-f277-4de6-bb18-5d62d1323a40)

In [40]:
# Saves updated_df to the default Lakehouse
updated_df.write.format('delta').mode('overwrite').option('overwriteSchema','true').saveAsTable('books')

StatementMeta(, 01a69189-c9f4-46c3-8e05-2297feb1fb54, 42, Finished, Available)