# Build Azure OpenAI Client

In [1]:
# Pin the `openai` Python SDK to 1.12.0; the AzureOpenAI client imported in the
# next cell comes from this package.
%pip install openai==1.12.0

StatementMeta(, 57306c3e-cac7-48a1-b99d-e16c672afd7d, 8, Finished, Available)

Collecting openai==1.12.0
  Downloading openai-1.12.0-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.7/226.7 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
Collecting distro<2,>=1.7.0 (from openai==1.12.0)
  Downloading distro-1.9.0-py3-none-any.whl (20 kB)
Collecting httpx<1,>=0.23.0 (from openai==1.12.0)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
Collecting typing-extensions<5,>=4.7 (from openai==1.12.0)
  Downloading typing_extensions-4.10.0-py3-none-any.whl (33 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai==1.12.0)
  Downloading httpcore-1.0.4-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.8/77.8 kB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: typing-extensions, httpcore, distro, httpx, openai
  Attempting uninstall: typin

In [2]:
from openai import AzureOpenAI
import json

# API version and model name passed to the Azure OpenAI calls in this notebook
API_VERSION = '2023-09-01-preview'
MODEL_NAME = 'gpt-35-turbo'

# Read the endpoint and key from the JSON secrets file stored in the lakehouse
with open('/lakehouse/default/Files/keys/aoai_key.json','r') as f:
    content = json.loads(f.read())

# .get() leaves a missing entry as None instead of raising at this point
ENDPOINT_URL, API_KEY = content.get('API_ENDPOINT'), content.get('API_KEY')

# Client object used by the entity-extraction function defined further down
client = AzureOpenAI(azure_endpoint=ENDPOINT_URL, api_key=API_KEY, api_version=API_VERSION)

StatementMeta(, 57306c3e-cac7-48a1-b99d-e16c672afd7d, 10, Finished, Available)

# Build Metadata with Entity Extraction

Extract metadata about each book from its header text: title, author, language, release date, and last modified date.

In [3]:
def _parse_json_object(response):
    """Parse a chat reply into a dict, tolerating a surrounding Markdown code fence."""
    if response is None:
        raise ValueError('Model returned no content; cannot extract entities')

    cleaned = response.strip()
    if cleaned.startswith('```'):
        # Chat models often wrap JSON in ```json ... ```; drop the fence and
        # the optional language label before parsing.
        cleaned = cleaned.strip('`').strip()
        if cleaned.lower().startswith('json'):
            cleaned = cleaned[4:]

    parsed = json.loads(cleaned)
    if not isinstance(parsed, dict):
        raise ValueError(
            f'Expected a JSON object from the model, got {type(parsed).__name__}'
        )
    return parsed


def extract_entities_and_enhance_json(json_doc):
    """Extract metadata from a document's header text and merge it into the document.

    Sends ``json_doc['header_text']`` to the chat model named by ``MODEL_NAME``
    through the module-level ``client`` and parses the reply as a JSON object.

    Args:
        json_doc: dict that must contain a ``'header_text'`` entry holding the
            text to analyse. It is not modified.

    Returns:
        A new dict holding the fields parsed from the model reply, updated with
        every key of ``json_doc`` (values from ``json_doc`` win on collisions).

    Raises:
        KeyError: if ``json_doc`` has no ``'header_text'`` key.
        ValueError: if the reply is empty or is not a JSON object. Invalid JSON
            raises ``json.JSONDecodeError``, which is a ``ValueError`` subclass.
    """

    text = json_doc['header_text']

    prompt = f"""
        Extract title, author, language, release date, and last modified date fromt the following text
        If a field is not available use 'unknown'
        Any dates should be formatted as yyyy-mm-dd

        Text:
            {text}
    """

    completion = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {'role': 'system', 'content': 'Youre an AI assistant that extracts information from text. Provide the requested fields in JSON format.'},
            {'role': 'user', 'content': prompt}
        ],
        temperature=0
    )

    response = completion.choices[0].message.content
    json_response = _parse_json_object(response)
    # Apply the source document last so its existing fields are never
    # overwritten by model output.
    json_response.update(json_doc)

    return json_response

StatementMeta(, 57306c3e-cac7-48a1-b99d-e16c672afd7d, 11, Finished, Available)

In [4]:
# Load and enrich JSON
import os

enrich_json_docs = []
header_path = '/lakehouse/default/Files/book_enriched'

# os.listdir returns every entry of the folder in arbitrary order, so sort for
# a reproducible run and skip anything that is not a JSON document.
for file_name in sorted(os.listdir(header_path)):
    if not file_name.endswith('.json'):
        continue
    file_path = os.path.join(header_path, file_name)

    # Load the current document
    with open(file_path, 'r') as f:
        json_doc = json.load(f)

    # Ask the model for the metadata and merge it into the document
    enriched_json_doc = extract_entities_and_enhance_json(json_doc)

    # Write the AI-enriched document back over the source file
    with open(file_path, 'w') as f:
        json.dump(enriched_json_doc, f, indent=4)

    print(f'Enriched {file_name}')
    enrich_json_docs.append(enriched_json_doc)

StatementMeta(, 57306c3e-cac7-48a1-b99d-e16c672afd7d, 12, Finished, Available)

Enriched 104.json
Enriched 109.json
Enriched 117.json
Enriched 13.json
Enriched 136.json
Enriched 151.json
Enriched 156.json
Enriched 181.json
Enriched 2.json
Enriched 207.json
Enriched 216.json
Enriched 229.json
Enriched 230.json
Enriched 235.json
Enriched 237.json
Enriched 3.json
Enriched 39.json
Enriched 4.json
Enriched 41.json
Enriched 49.json
Enriched 5.json
Enriched 56.json
Enriched 57.json
Enriched 6.json
Enriched 61.json
Enriched 7.json
Enriched 71.json
Enriched 8.json
Enriched 9.json
Enriched 99.json


# Save AI Enriched Data to Lakehouse Table

In [5]:
# Build a Spark DataFrame from the list of enriched dicts collected above.
# NOTE(review): `spark` and `display` are not defined in this notebook; they are
# presumably injected by the notebook runtime — confirm.
df = spark.createDataFrame(enrich_json_docs)
# Preview only the first five rows
display(df.limit(5))

StatementMeta(, 57306c3e-cac7-48a1-b99d-e16c672afd7d, 13, Finished, Available)

SynapseWidget(Synapse.DataFrame, 2db31490-2435-4db5-aef8-3644b5b15d5b)

In [6]:
import pyspark.sql.functions as F
from pyspark.sql.types import DateType

# Keep only the data-model columns, parsing both date strings (yyyy-MM-dd)
# into date values within the same projection.
df = df.select(
    'book_id',
    'title',
    'author',
    'language',
    F.to_date('release_date', 'yyyy-MM-dd').alias('release_date'),
    F.to_date('last_modified_date', 'yyyy-MM-dd').alias('last_modified_date'),
)

display(df.limit(5))

StatementMeta(, 57306c3e-cac7-48a1-b99d-e16c672afd7d, 14, Finished, Available)

SynapseWidget(Synapse.DataFrame, e71198b2-10b1-453b-8d02-fd43b0c98ffe)

In [7]:
# Save the DataFrame as the Delta table 'books', overwriting existing data and schema
(
    df.write
    .format('delta')
    .mode('overwrite')
    .option('overwriteSchema', 'true')
    .saveAsTable('books')
)

StatementMeta(, 57306c3e-cac7-48a1-b99d-e16c672afd7d, 15, Finished, Available)