##Create table for OLTP "trips" data

In [0]:
table_name = "riders"

Remove loaded delta files from silver storage to satisfy overwrite requirements

In [0]:
%fs rm -r lakehouse/silver/oltp_riders

In [0]:
raw_riders_df = spark.read.format('delta').load(F"/lakehouse/bronze/{table_name}")

In [0]:
raw_riders_df.count()

75000

In [0]:
raw_riders_df.createOrReplaceTempView(F'oltp_{table_name}')

In [0]:
# to satisfy overwrite requirements
spark.sql(F'''
           DROP TABLE IF EXISTS silver_{table_name}  
          ''')


DataFrame[]

In [0]:
#Creating empty table
spark.sql(F''' CREATE TABLE silver_{table_name}
                (
                    rider_id INTEGER,
                    first STRING,
                    last STRING, 
                    address STRING, 
                    birthday DATE,
                    account_start_date DATE,
                    account_end_date DATE,
                    is_member BOOLEAN 
                )
                USING delta
                LOCATION "/lakehouse/silver/oltp_{table_name}" 
        ''')


DataFrame[]

In [0]:
spark.sql(F'''
          INSERT INTO silver_{table_name} SELECT * FROM oltp_{table_name}
          ''')

DataFrame[num_affected_rows: bigint, num_inserted_rows: bigint]

In [0]:
display(spark.sql(F'''
          SELECT count(*) FROM silver_{table_name}
          '''))

count(1)
150000


In [0]:
# Checking for Null values
# "account_end_date" is null when the account is active
# Not under the WHERE statement, because null is 
# significant is its case
display(spark.sql('''
          SELECT 
            rider_id, first, last, address, birthday, account_start_date, account_end_date, is_member
          FROM 
            silver_riders
          WHERE
            first IS NULL
            OR
            last IS NULL
            OR
            birthday IS NULL
            OR
            account_start_date IS NULL
            OR
            is_member IS NULL  

        '''))

rider_id,first,last,address,birthday,account_start_date,account_end_date,is_member


In [0]:
#checking for duplicates
display(spark.sql(''' 
                  SELECT 
                    rider_id 
                  FROM 
                    silver_riders 
                  GROUP BY rider_id
                  HAVING count(rider_id) > 1  
                  '''
                ))

rider_id


In [0]:
# Min Max of account_start_date
display(spark.sql(''' 
                  SELECT 
                    min(account_start_date), max(account_start_date)
                  FROM 
                    silver_riders 
                  '''
                ))

min(account_start_date),max(account_start_date)
2013-01-31,2022-02-12


In [0]:
# Min Max of account_end_date
display(spark.sql(''' 
                  SELECT 
                    min(account_end_date), max(account_end_date)
                  FROM 
                    silver_riders 
                  '''
                ))

min(account_end_date),max(account_end_date)
2013-03-01,2022-02-01
