In [31]:
%%pyspark

# Read the raw Library Collection Inventory CSV from the capture container.
# The 'header' option makes the reader take column names from the first row;
# no schema is supplied here.
df = (
    spark.read
    .format('csv')
    .option('header', True)
    .load('abfss://capture@spkaccelerjqvse6bhhchxi.dfs.core.windows.net/SeattlePublicLibrary/Library_Collection_Inventory.csv')
)

# Preview the first 10 rows.
display(df.limit(10))

StatementMeta(synasp1, 6, 25, Finished, Available)

SynapseWidget(Synapse.DataFrame, 5720fb1b-8e15-48a7-be91-4676bc4fde8b)

In [32]:
%%pyspark

# Show the schema Spark assigned when reading the CSV. No explicit schema was
# passed to the reader, so this is what the CSV source produced on its own.
df.printSchema()

StatementMeta(synasp1, 6, 26, Finished, Available)

root
 |-- BibNum: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Author: string (nullable = true)
 |-- ISBN: string (nullable = true)
 |-- PublicationYear: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- Subjects: string (nullable = true)
 |-- ItemType: string (nullable = true)
 |-- ItemCollection: string (nullable = true)
 |-- FloatingItem: string (nullable = true)
 |-- ItemLocation: string (nullable = true)
 |-- ReportDate: string (nullable = true)
 |-- ItemCount: string (nullable = true)

In [33]:
%%pyspark

from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Primary storage info: where the source CSV lives (capture container).
capture_account_name = 'spkaccelerjqvse6bhhchxi'  # fill in your primary account name
capture_container_name = 'capture'  # fill in your container name
capture_relative_path = 'SeattlePublicLibrary/Library_Collection_Inventory.csv'  # fill in your relative folder path

# Assemble the ABFSS URI: abfss://<container>@<account>.dfs.core.windows.net/<relative path>
capture_adls_path = 'abfss://{container}@{account}.dfs.core.windows.net/{path}'.format(
    container=capture_container_name,
    account=capture_account_name,
    path=capture_relative_path
)
print('Primary storage account path: {}'.format(capture_adls_path))

StatementMeta(synasp1, 6, 27, Finished, Available)

Primary storage account path: abfss://capture@spkaccelerjqvse6bhhchxi.dfs.core.windows.net/SeattlePublicLibrary/Library_Collection_Inventory.csv

In [34]:
%%pyspark

from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, DateType, TimestampType
# Explicit schema for the inventory CSV. Every field is nullable; bibnum and
# item_count are read as integers, everything else (including reportDate,
# which is converted to a date in a later cell) is read as a string.
csvSchema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ('bibnum', IntegerType()),
        ('title', StringType()),
        ('author', StringType()),
        ('isbn', StringType()),
        ('publication_year', StringType()),
        ('publisher', StringType()),
        ('subjects', StringType()),
        ('item_type', StringType()),
        ('item_collection', StringType()),
        ('floating_item', StringType()),
        ('item_location', StringType()),
        ('reportDate', StringType()),
        ('item_count', IntegerType()),
    ]
])

# Re-read the CSV from the capture path, this time applying the schema above.
CheckByTPI_capture_df = (
    spark.read
    .format('csv')
    .option('header', 'True')
    .schema(csvSchema)
    .load(capture_adls_path)
)

# Preview the first 10 typed rows.
display(CheckByTPI_capture_df.limit(10))



StatementMeta(synasp1, 6, 28, Finished, Available)

SynapseWidget(Synapse.DataFrame, 409a795b-ae2e-4a2c-aace-d30ac47ae5bb)

In [35]:
%%pyspark

from pyspark.sql.functions import to_date, to_timestamp, col, date_format, current_timestamp, date_trunc

# Build the final DataFrame from the typed capture data:
#   * report_date - the string column reportDate parsed as a date using the
#                   pattern M/d/yyyy; the original string column is dropped.
#   * load_date   - timestamp of this load, truncated to whole seconds.
#
# load_date is taken directly from current_timestamp() rather than being
# formatted to a string and parsed back. The previous round trip used the
# pattern 'M/d/y H:m:s a', which mixes a 24-hour field (H) with an AM/PM
# marker (a) and adds nothing beyond dropping the fractional seconds;
# date_trunc("second", ...) keeps that precision without the string detour.
df_final = (
    CheckByTPI_capture_df
    .withColumn("report_date", to_date(col("reportDate"), "M/d/yyyy"))
    .drop("reportDate")
    .withColumn("load_date", date_trunc("second", current_timestamp()))
)

StatementMeta(synasp1, 6, 29, Finished, Available)



In [36]:
%%pyspark

# Show the schema after the transformation cell: the reportDate string column
# has been replaced by report_date, and a load_date column has been added.
df_final.printSchema()

# Preview the first 10 transformed rows.
display(df_final.limit(10))


StatementMeta(synasp1, 6, 30, Finished, Available)

root
 |-- bibnum: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- author: string (nullable = true)
 |-- isbn: string (nullable = true)
 |-- publication_year: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- subjects: string (nullable = true)
 |-- item_type: string (nullable = true)
 |-- item_collection: string (nullable = true)
 |-- floating_item: string (nullable = true)
 |-- item_location: string (nullable = true)
 |-- item_count: integer (nullable = true)
 |-- report_date: date (nullable = true)
 |-- load_date: timestamp (nullable = true)



SynapseWidget(Synapse.DataFrame, 2253a7f7-f42e-4681-8a0f-3b3e78badef7)

In [37]:
%%pyspark

from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Primary storage info: target folder for the output (compose container).
compose_account_name = 'spkaccelerjqvse6bhhchxi'  # fill in your primary account name
compose_container_name = 'compose'  # fill in your container name
compose_relative_path = 'SeattlePublicLibrary/LibraryCollectionInventory/'  # fill in your relative folder path

# Assemble the ABFSS URI: abfss://<container>@<account>.dfs.core.windows.net/<relative path>
compose_adls_path = 'abfss://{container}@{account}.dfs.core.windows.net/{path}'.format(
    container=compose_container_name,
    account=compose_account_name,
    path=compose_relative_path
)
print('Primary storage account path: {}'.format(compose_adls_path))

StatementMeta(synasp1, 6, 31, Finished, Available)

Primary storage account path: abfss://compose@spkaccelerjqvse6bhhchxi.dfs.core.windows.net/SeattlePublicLibrary/LibraryCollectionInventory/

In [38]:
%%pyspark

# Append the Parquet output name to the compose folder path. The folder path
# already ends with '/', so the two parts are joined without a separator.
compose_parquet_path = ''.join([compose_adls_path, 'CollectionInventory.parquet'])

print('parquet file path: {}'.format(compose_parquet_path))

StatementMeta(synasp1, 6, 32, Finished, Available)

parquet file path: abfss://compose@spkaccelerjqvse6bhhchxi.dfs.core.windows.net/SeattlePublicLibrary/LibraryCollectionInventory/CollectionInventory.parquet

In [39]:
%%pyspark

# Write the transformed data as Parquet to the compose path, replacing any
# output already present there.
df_final.write.mode('overwrite').parquet(compose_parquet_path)

StatementMeta(synasp1, 6, 33, Finished, Available)



In [40]:
%%sql

-- Create the SeattlePublicLibrary database only if a database with the same name
-- does not already exist; IF NOT EXISTS makes this statement safe to re-run.
CREATE DATABASE IF NOT EXISTS SeattlePublicLibrary

StatementMeta(synasp1, 6, 34, Finished, Available)

<Spark SQL result set with 0 rows and 0 fields>

In [41]:
%%sql

-- Create table library_collection_inventory only if a table with the same name does not exist.
-- The table is defined over the Parquet data at the path below, which is the
-- location this notebook writes df_final to. The column list includes bibnum so
-- that the item identifier present in that data is queryable through the table.
CREATE TABLE IF NOT EXISTS SeattlePublicLibrary.library_collection_inventory
 (bibnum INT
 ,title STRING
 ,author STRING
 ,isbn STRING
 ,publication_year STRING
 ,publisher STRING
 ,subjects STRING
 ,item_type STRING
 ,item_collection STRING
 ,floating_item STRING
 ,item_location STRING
 ,report_date DATE
 ,item_count INTEGER
 ,load_date TIMESTAMP
)
USING PARQUET OPTIONS (path 'abfss://compose@spkaccelerjqvse6bhhchxi.dfs.core.windows.net/SeattlePublicLibrary/LibraryCollectionInventory/CollectionInventory.parquet')

StatementMeta(synasp1, 6, 35, Finished, Available)

<Spark SQL result set with 0 rows and 0 fields>