In [1]:
%%pyspark

# Exploratory read of the raw data-dictionary CSV straight from the capture
# container. The first line of the file is treated as the header row; no
# explicit schema is supplied here.
df = spark.read.load(
    'abfss://capture@splacceler5lmevhdeon4ym.dfs.core.windows.net/SeattlePublicLibrary/Integrated_Library_System__ILS__Data_Dictionary.csv',
    format='csv',
    header=True,
)

# Preview only the first 10 rows rather than the whole file.
display(df.limit(10))

In [2]:
%%pyspark

# Show Schema
# Prints the column names and types of `df` as a tree, for inspection only.
df.printSchema()

In [3]:
%%pyspark

from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Source location: the raw data-dictionary CSV in the "capture" container.
capture_account_name = 'splacceler5lmevhdeon4ym'  # storage account name
capture_container_name = 'capture'  # container (file system) name
capture_relative_path = 'SeattlePublicLibrary/Integrated_Library_System__ILS__Data_Dictionary.csv'  # path inside the container

# Assemble the abfss:// URI as <container>@<account>.dfs.core.windows.net/<path>.
capture_adls_path = 'abfss://%s@%s.dfs.core.windows.net/%s' % (
    capture_container_name,
    capture_account_name,
    capture_relative_path,
)
print('Primary storage account path: ', capture_adls_path, sep='')

In [4]:
%%pyspark

from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, DateType, TimestampType

# Explicit schema for the data-dictionary CSV: every column is a nullable
# string, listed in the order the fields are applied to the file.
csvSchema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        'code',
        'description',
        'code_type',
        'format_group',
        'format_subgroup',
        'category_group',
        'category_subgroup',
        'age_group',
    )
])

# Read the capture-zone CSV with the header row skipped and the schema above
# applied instead of whatever the reader would otherwise assign.
CheckByTPI_capture_df = (
    spark.read
    .format('csv')
    .option('header', 'True')
    .schema(csvSchema)
    .load(capture_adls_path)
)

# Preview the first 10 typed rows.
display(CheckByTPI_capture_df.limit(10))



In [5]:
%%pyspark

from pyspark.sql.functions import to_date, to_timestamp, col, date_format, current_timestamp, date_trunc

# Append a `load_date` TimestampType column holding the time of this load,
# truncated to whole seconds.
#
# The previous version formatted current_timestamp() to a string with the
# pattern 'MM/dd/yyyy hh:mm:ss aa' and parsed it straight back. Spark 3's
# datetime formatter only accepts a single 'a' (am/pm) pattern letter, so that
# pattern is rejected there, and the string round trip added nothing because
# current_timestamp() is already a timestamp. date_trunc keeps the same
# second-level precision without going through a string.
df_final = CheckByTPI_capture_df.withColumn(
    'load_date',
    date_trunc('second', current_timestamp()),
)

In [6]:
%%pyspark

# Show Schema
# Prints the column names and types of `df_final` as a tree.
df_final.printSchema()

# Preview the first 10 rows of `df_final`.
display(df_final.limit(10))


In [7]:
%%pyspark

from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Target location: the output folder in the "compose" container.
compose_account_name = 'splacceler5lmevhdeon4ym'  # storage account name
compose_container_name = 'compose'  # container (file system) name
compose_relative_path = 'SeattlePublicLibrary/IntegratedLibrarySystemILSDataDictionary/'  # folder inside the container (trailing slash kept)

# Assemble the abfss:// URI as <container>@<account>.dfs.core.windows.net/<path>.
compose_adls_path = 'abfss://%s@%s.dfs.core.windows.net/%s' % (
    compose_container_name,
    compose_account_name,
    compose_relative_path,
)
print('Primary storage account path: ', compose_adls_path, sep='')

In [8]:
%%pyspark

# Full output path: the compose folder URI followed by the parquet name.
# compose_adls_path already ends with '/', so plain concatenation is enough.
compose_parquet_path = ''.join([compose_adls_path, 'datadictionary.parquet'])

print('parquet file path: ', compose_parquet_path, sep='')

In [9]:
%%pyspark

# Persist the final frame as parquet at the compose path, replacing whatever
# an earlier run left there.
df_final.write.mode('overwrite').parquet(compose_parquet_path)

In [10]:
%%sql

-- Create database SeattlePublicLibrary only if database with same name does not exist
-- (IF NOT EXISTS keeps this statement from failing when the database is already there).
CREATE DATABASE IF NOT EXISTS SeattlePublicLibrary

In [11]:
%%sql

-- Create table integrated_library_system_ils_data_dictionary only if a table
-- with the same name does not already exist. The table is defined over the
-- parquet output at the path below: eight string columns plus load_date.
CREATE TABLE IF NOT EXISTS SeattlePublicLibrary.integrated_library_system_ils_data_dictionary (
    code              STRING,
    description       STRING,
    code_type         STRING,
    format_group      STRING,
    format_subgroup   STRING,
    category_group    STRING,
    category_subgroup STRING,
    age_group         STRING,
    load_date         TIMESTAMP
)
USING PARQUET
OPTIONS (path 'abfss://compose@splacceler5lmevhdeon4ym.dfs.core.windows.net/SeattlePublicLibrary/IntegratedLibrarySystemILSDataDictionary/datadictionary.parquet')

In [23]:
%%sql

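-- Cleanup helper, intentionally left commented out so that running the whole
-- notebook never drops the table. Uncomment the next line to drop it manually.
--DROP TABLE SeattlePublicLibrary.integrated_library_system_ils_data_dictionary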