In [0]:
import urllib
import urllib.parse

from pyspark.sql.functions import *
from pyspark.sql.types import StringType, StructField, StructType, IntegerType, DateType, FloatType

In [0]:
# Location of the Delta table that holds the AWS credentials
s3_creds_path = "dbfs:/user/hive/warehouse/authentication_credentials"

# Load the credentials table as a dataframe
creds_df = spark.read.format("delta").load(s3_creds_path)

# Fetch both columns from ONE row in a single Spark job, so the access key and
# the secret key are guaranteed to belong together (two separate collect()
# calls give no such guarantee and run the query twice).
creds_row = creds_df.select("ACCESS key ID", "Secret access key").first()
if creds_row is None:
    # Fail with a clear message instead of an IndexError on an empty table
    raise ValueError(f"No credentials found in {s3_creds_path}")

ACCESS_KEY = creds_row["ACCESS key ID"]
SECRET_KEY = creds_row["Secret access key"]

# URL-encoded copy of the secret key; safe="" means "/" is escaped as well
ENCODED_SECRET_KEY = urllib.parse.quote(string = SECRET_KEY, safe = "")

In [0]:
%sql
-- Sets the Delta format-check flag to false via a Spark SQL SET statement.
-- NOTE(review): presumably needed so the non-Delta reads/writes below are not
-- rejected by Databricks' Delta path check -- confirm it is still required.
SET spark.databricks.delta.formatCheck.enabled=false

key,value
spark.databricks.delta.formatCheck.enabled,False


In [0]:
%sql
-- Sets the streaming schema-inference flag to true via a Spark SQL SET statement.
-- NOTE(review): every stream in this notebook is parsed with an explicit
-- schema via from_json, so this flag may be unnecessary -- confirm.
SET spark.sql.streaming.schemaInference.enabled = true

key,value
spark.sql.streaming.schemaInference.enabled,True


In [0]:
# Open a streaming read on the Kinesis "pin" stream, starting at the earliest
# position and authenticating with the keys loaded from the credentials table.
df_pin = (
    spark.readStream
    .format('kinesis')
    .option('streamName', 'streaming-0affe94cc7d3-pin')
    .option('initialPosition', 'earliest')
    .option('region', 'us-east-1')
    .option('awsAccessKey', ACCESS_KEY)
    .option('awsSecretKey', SECRET_KEY)
    .load()
)

# Keep only the record payload, cast to a string so it can be parsed as JSON
df_pin = df_pin.selectExpr("CAST(data as STRING)")

In [0]:
# Field name -> Spark type for the JSON payload of a pin record, in output order
pin_field_types = [
    ('index', IntegerType()),
    ('unique_id', StringType()),
    ('title', StringType()),
    ('description', StringType()),
    ('poster_name', StringType()),
    ('follower_count', StringType()),
    ('tag_list', StringType()),
    ('is_image_or_video', StringType()),
    ('image_src', StringType()),
    ('downloaded', IntegerType()),
    ('save_location', StringType()),
    ('category', StringType()),
]

# Every field is nullable
pin_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in pin_field_types
])

# Parse the JSON string in "data" into a struct that follows pin_schema
df_pin = df_pin.withColumn("data", from_json(col("data"), schema = pin_schema))

# Flatten the struct into top-level columns; only "index" is renamed (to "ind")
pin_renames = {"index": "ind"}
df_pin = df_pin.select(*[
    col(f"data.{field_name}").alias(pin_renames.get(field_name, field_name))
    for field_name, _ in pin_field_types
])

display(df_pin)

ind,unique_id,title,description,poster_name,follower_count,tag_list,is_image_or_video,image_src,downloaded,save_location,category
8610,1f1db6e4-45a4-46a8-a52b-38adea2646f7,"Sabrina Carpenter Got Her Very First Tattoo, and It's a Reference to Her ""Lucky"" Life","The ""Work It"" star just got her first tiny tattoo by celebrity tattoo artist Dr. Woo, and the design is a reference to her ""lucky""",POPSUGAR,5M,"Petite Tattoos,Dainty Tattoos,Cute Tattoos,Small Tattoos,Tatoos,Small Feminine Tattoos,Grace Tattoos,Hip Tattoo Small,Hidden Tattoos",image,https://i.pinimg.com/originals/5a/45/ad/5a45adc8361768eb312c4d16d4db45f9.jpg,1,Local save in /data/tattoos,tattoos
9014,45c2e92a-5daf-40f3-9732-771c186c0757,75 More Small Tattoo Ideas from Playground Tattoo - Crestfox,"Hi everyone! My last small tattoo ideas post was really popular on Pinterest, so I decided to put together this post with even more tiny tattoo ideas. Just like the other post,…",Sarah Wahl | Crestfox,17k,"Little Tattoos,Mini Tattoos,Body Art Tattoos,Sleeve Tattoos,Tatoos,Flower Tattoos,White Tattoos,Arrow Tattoos,Word Tattoos",image,https://i.pinimg.com/originals/b5/72/b5/b572b5641d4efd2e9a13de2506b9e721.png,1,Local save in /data/tattoos,tattoos
8433,61ffdc64-3bfa-4c53-a9f0-9bc761514da5,Relationship Rules,Relationship Rules is a modern-age lifestyle/love blog that discusses everything from breakups to being amazing parents.,Kim Hefner,8,"Now Quotes,Real Quotes,Words Quotes,Life Quotes,I Trust You Quotes,I Needed You Quotes,Sayings,The Lucky One Quotes,Losing Trust Quotes",image,https://i.pinimg.com/originals/da/d2/c4/dad2c43f1a4d7693a746758932139db0.webp,1,Local save in /data/quotes,quotes
10567,cf1e3bef-8a03-4b66-b19a-50d3c7d13f9a,Yessir,No description available Story format,Louisiana_Redneck,233,"Dually Trucks,Lifted Chevy Trucks,Diesel Trucks,Pickup Trucks,Chevy Trucks Older,Old Ford Trucks,Truck Drivers,Jeep Pickup,Chevrolet Silverado",multi-video(story page format),https://i.pinimg.com/videos/thumbnails/originals/d4/dd/de/d4ddde53e277c155dc72b336ec73a40e.0000001.jpg,1,Local save in /data/vehicles,vehicles
6892,ebf6b77f-0656-4297-92a4-f37f48f6cc44,Giuseppe Leonardi (@giuse_leonardi) • Instagram photos and videos,"8 Followers, 11 Following, 0 Posts - See Instagram photos and videos from Giuseppe Leonardi (@giuse_leonardi)",Roberto Rafael Ovalle Echeverria,19,"Der Gentleman,Gentleman Style,Lace Converse Shoes,Glitter Shoes,Swag Shoes,Fashion Shoes,Mens Fashion,Fashion Tips,Fashion Menswear",image,https://i.pinimg.com/originals/bb/18/6b/bb186bd58c4bc65b9036f874938a351b.jpg,1,Local save in /data/mens-fashion,mens-fashion
2016,ea82682c-5186-44fa-8b4d-9b4f6b74739e,Christmas Home Tour,"Hi friends, It’s been a little while since I’ve published a blog post and a lot has happened in the last year for me and my family! My husband and I welcomed boy/girl twins into",Live Oak Nest,12k,"French Country Christmas,Country Christmas Decorations,Farmhouse Christmas Decor,Xmas Decorations,Christmas Home,Christmas Ideas,Christmas Mantle Decorations,Cottage Christmas Decorating,Christmas Fireplace Mantels",image,https://i.pinimg.com/originals/a5/48/c6/a548c6175e16ff33dc25c983c8e3514c.jpg,1,Local save in /data/christmas,christmas
1128,c839f3ff-0e34-40be-9209-63f0b2c285fc,8 innovative drugstore beauty products to try in 2018,Celebrity beauty experts love these innovative drugstore beauty products.,TODAY Show,426k,"Face Care,Skin Care,Facial Therapy,Facial Steamer,Spa Day At Home,Beauty Skin,Clean Beauty,Beauty Care,Natural Beauty",image,https://i.pinimg.com/originals/32/fc/92/32fc925c7fda3259e337c14aa8fdfc4d.jpg,1,Local save in /data/beauty,beauty
9608,7eb7a087-8889-4eae-9ad6-6d71ad7b09ad,"Highlights of Scandinavia: 10 Days Itinerary in Denmark, Sweden and Norway","Follow our 10 days itinerary that takes you from Copenhagen to Oslo to Stockholm. From cosmopolitan Sweden to Denmark, the Wild West of Scandinavia to the fjords & valleys of No…",Anywhr,78,"Travel To Sweden,Denmark Travel,Norway Travel,Beautiful Places To Travel,Cool Places To Visit,Places To Go,Cruise Travel,Summer Travel,Sweden Holidays",image,https://i.pinimg.com/originals/d0/e1/c7/d0e1c775d25ba92489aa134628be7611.png,1,Local save in /data/travel,travel
10678,e1ca5651-2047-4711-9665-3fd853aac88b,This Guy Transformed A 1961 VW Beetle Deluxe Into A Black Matte Roadster,"It should come as no surprise that people love the VW Beetle and many honor it by converting the car into an even cooler version, just like designer Danni Koldal did with his bl…",Bored Panda,2M,"Custom Trucks,Custom Cars,Vw Coccinelle Cabriolet,Mclaren P1 Black,Bmw R65,Jetta Vw,Carros Vw,Vw Beetle Convertible,Kdf Wagen",image,https://i.pinimg.com/originals/42/3b/6a/423b6a354062b0ee9c844ee4854656a4.jpg,1,Local save in /data/vehicles,vehicles
7954,e99cc6c5-0f3d-401e-9d20-e0d625b30038,150+ Best Love Quotes That'll Make Anyone Believe In Love,"Finding new, different ways to say, 'I love you' can be tough. Here are 150+ of the best love quotes for him or her to express what loving and being loved means.",YourTango,942k,"Love Quotes For Him Cute,Life Quotes Love,Best Love Quotes,Cute Quotes,Quotes To Live By,Favorite Quotes,Top Quotes,Hope Love Quotes,Cowboy Love Quotes",image,https://i.pinimg.com/originals/79/44/28/7944280986ed62559c8b8d72b94083c6.jpg,1,Local save in /data/quotes,quotes


In [0]:
# Open a streaming read on the Kinesis "geo" stream, starting at the earliest
# position and authenticating with the keys loaded from the credentials table.
# (The assignment must start at column 0: an indented first statement makes
# the whole cell fail with "IndentationError: unexpected indent".)
df_geo = (
    spark.readStream
    .format('kinesis')
    .option('streamName', 'streaming-0affe94cc7d3-geo')
    .option('initialPosition', 'earliest')
    .option('region', 'us-east-1')
    .option('awsAccessKey', ACCESS_KEY)
    .option('awsSecretKey', SECRET_KEY)
    .load()
)

# Keep only the record payload, cast to a string so it can be parsed as JSON
df_geo = df_geo.selectExpr("CAST(data as STRING)")

In [0]:
# Field name -> Spark type for the JSON payload of a geo record, in output order
# NOTE(review): "timestamp" is parsed as DateType, so any time-of-day part in
# the feed would be dropped -- confirm a date is really all that is wanted.
geo_field_types = [
    ('ind', IntegerType()),
    ('timestamp', DateType()),
    ('latitude', FloatType()),
    ('longitude', FloatType()),
    ('country', StringType()),
]

# Every field is nullable
geo_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in geo_field_types
])

# Parse the JSON string in "data" into a struct that follows geo_schema
df_geo = df_geo.withColumn("data", from_json(col("data"), schema = geo_schema))

# Flatten the struct into top-level columns with the same names
df_geo = df_geo.select(*[
    col(f"data.{field_name}").alias(field_name)
    for field_name, _ in geo_field_types
])

display(df_geo)

ind,timestamp,latitude,longitude,country
8610,2021-11-20,-84.3984,-144.933,Bouvet Island (Bouvetoya)
9014,2021-04-20,-37.2495,-118.101,Ethiopia
8433,2019-05-11,-0.709639,173.034,Lesotho
10567,2021-07-06,6.50767,-36.2076,Djibouti
6892,2022-04-11,-56.3743,60.8553,Montenegro
2016,2018-04-08,-27.1161,110.753,Estonia
1128,2019-06-09,-86.0614,-86.5849,Chile
9608,2019-07-02,16.858,59.3183,Chad
10678,2020-09-11,-88.8298,-170.188,Albania
7954,2022-09-17,-89.5173,-179.689,Algeria


In [0]:
# Open a streaming read on the Kinesis "user" stream, starting at the earliest
# position and authenticating with the keys loaded from the credentials table.
df_user = (
    spark.readStream
    .format('kinesis')
    .option('streamName', 'streaming-0affe94cc7d3-user')
    .option('initialPosition', 'earliest')
    .option('region', 'us-east-1')
    .option('awsAccessKey', ACCESS_KEY)
    .option('awsSecretKey', SECRET_KEY)
    .load()
)

# Keep only the record payload, cast to a string so it can be parsed as JSON
df_user = df_user.selectExpr("CAST(data as STRING)")

In [0]:
# Field name -> Spark type for the JSON payload of a user record, in output order
user_field_types = [
    ('ind', IntegerType()),
    ('first_name', StringType()),
    ('last_name', StringType()),
    ('age', IntegerType()),
    ('date_joined', DateType()),
]

# Every field is nullable
user_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in user_field_types
])

# Parse the JSON string in "data" into a struct that follows user_schema
df_user = df_user.withColumn("data", from_json(col("data"), schema = user_schema))

# Flatten the struct into top-level columns with the same names
df_user = df_user.select(*[
    col(f"data.{field_name}").alias(field_name)
    for field_name, _ in user_field_types
])

display(df_user)

ind,first_name,last_name,age,date_joined
8610,Amy,Brown,21,2015-11-08
9014,Luke,Carter,37,2016-01-13
8433,Christopher,Andrews,33,2016-08-31
10567,Brandon,Valdez,22,2017-05-29
6892,Mark,Adams,46,2015-11-19
2016,Lindsey,Brown,21,2017-04-03
1128,Gregory,Barnett,20,2016-03-29
9608,Melanie,Stuart,49,2016-03-30
10678,Adam,Acosta,20,2015-10-21
7954,Aaron,Abbott,20,2015-10-23


In [0]:
# Placeholder strings that stand for "no real data" in the pin dataframe.
# "User Info Error" is listed once (it was duplicated before, which had no effect).
# NOTE(review): the image_src placeholder was spelled "Image scr error."; the
# feed presumably emits "Image src error." -- both spellings are listed so that
# neither variant survives. Confirm against the data and drop the unused one.
value_to_replace = [
    "No description available Story format",
    "User Info Error",
    "Image scr error.",
    "Image src error.",
    "N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e",
    "No Title Data Available",
]

# Columns in which the placeholders above are looked for
replace_in_column = ["description", "follower_count", "image_src", "poster_name", "tag_list", "title"]

# Null out any placeholder found in any of the listed columns
df_pin = df_pin.replace(value_to_replace, None, subset = replace_in_column)

df_pin = (
    df_pin
    # Expand the "M" / "k" suffixes into zeros ("5M" -> "5000000", "17k" -> "17000").
    # NOTE(review): a fractional value such as "1.5k" would not be expanded
    # correctly by this substitution -- confirm the feed never sends one.
    .withColumn("follower_count", regexp_replace("follower_count", "M", "000000"))
    .withColumn("follower_count", regexp_replace("follower_count", "k", "000"))
    # The expanded string can now be cast to an integer
    .withColumn("follower_count", col("follower_count").cast("int"))
    # Strip the "Local save in " prefix so only the path remains
    .withColumn("save_location", regexp_replace("save_location", "Local save in ", ""))
)

# Reorder columns; "downloaded" is not selected and is therefore dropped here
df_pin = df_pin.select(
    "ind", "unique_id", "title", "description", "follower_count", "poster_name",
    "tag_list", "is_image_or_video", "image_src", "save_location", "category",
)

ind,unique_id,title,description,follower_count,poster_name,tag_list,is_image_or_video,image_src,save_location,category
8610,1f1db6e4-45a4-46a8-a52b-38adea2646f7,"Sabrina Carpenter Got Her Very First Tattoo, and It's a Reference to Her ""Lucky"" Life","The ""Work It"" star just got her first tiny tattoo by celebrity tattoo artist Dr. Woo, and the design is a reference to her ""lucky""",5000000.0,POPSUGAR,"Petite Tattoos,Dainty Tattoos,Cute Tattoos,Small Tattoos,Tatoos,Small Feminine Tattoos,Grace Tattoos,Hip Tattoo Small,Hidden Tattoos",image,https://i.pinimg.com/originals/5a/45/ad/5a45adc8361768eb312c4d16d4db45f9.jpg,/data/tattoos,tattoos
9014,45c2e92a-5daf-40f3-9732-771c186c0757,75 More Small Tattoo Ideas from Playground Tattoo - Crestfox,"Hi everyone! My last small tattoo ideas post was really popular on Pinterest, so I decided to put together this post with even more tiny tattoo ideas. Just like the other post,…",17000.0,Sarah Wahl | Crestfox,"Little Tattoos,Mini Tattoos,Body Art Tattoos,Sleeve Tattoos,Tatoos,Flower Tattoos,White Tattoos,Arrow Tattoos,Word Tattoos",image,https://i.pinimg.com/originals/b5/72/b5/b572b5641d4efd2e9a13de2506b9e721.png,/data/tattoos,tattoos
8433,61ffdc64-3bfa-4c53-a9f0-9bc761514da5,Relationship Rules,Relationship Rules is a modern-age lifestyle/love blog that discusses everything from breakups to being amazing parents.,8.0,Kim Hefner,"Now Quotes,Real Quotes,Words Quotes,Life Quotes,I Trust You Quotes,I Needed You Quotes,Sayings,The Lucky One Quotes,Losing Trust Quotes",image,https://i.pinimg.com/originals/da/d2/c4/dad2c43f1a4d7693a746758932139db0.webp,/data/quotes,quotes
10567,cf1e3bef-8a03-4b66-b19a-50d3c7d13f9a,Yessir,,233.0,Louisiana_Redneck,"Dually Trucks,Lifted Chevy Trucks,Diesel Trucks,Pickup Trucks,Chevy Trucks Older,Old Ford Trucks,Truck Drivers,Jeep Pickup,Chevrolet Silverado",multi-video(story page format),https://i.pinimg.com/videos/thumbnails/originals/d4/dd/de/d4ddde53e277c155dc72b336ec73a40e.0000001.jpg,/data/vehicles,vehicles
6892,ebf6b77f-0656-4297-92a4-f37f48f6cc44,Giuseppe Leonardi (@giuse_leonardi) • Instagram photos and videos,"8 Followers, 11 Following, 0 Posts - See Instagram photos and videos from Giuseppe Leonardi (@giuse_leonardi)",19.0,Roberto Rafael Ovalle Echeverria,"Der Gentleman,Gentleman Style,Lace Converse Shoes,Glitter Shoes,Swag Shoes,Fashion Shoes,Mens Fashion,Fashion Tips,Fashion Menswear",image,https://i.pinimg.com/originals/bb/18/6b/bb186bd58c4bc65b9036f874938a351b.jpg,/data/mens-fashion,mens-fashion
2016,ea82682c-5186-44fa-8b4d-9b4f6b74739e,Christmas Home Tour,"Hi friends, It’s been a little while since I’ve published a blog post and a lot has happened in the last year for me and my family! My husband and I welcomed boy/girl twins into",12000.0,Live Oak Nest,"French Country Christmas,Country Christmas Decorations,Farmhouse Christmas Decor,Xmas Decorations,Christmas Home,Christmas Ideas,Christmas Mantle Decorations,Cottage Christmas Decorating,Christmas Fireplace Mantels",image,https://i.pinimg.com/originals/a5/48/c6/a548c6175e16ff33dc25c983c8e3514c.jpg,/data/christmas,christmas
1128,c839f3ff-0e34-40be-9209-63f0b2c285fc,8 innovative drugstore beauty products to try in 2018,Celebrity beauty experts love these innovative drugstore beauty products.,426000.0,TODAY Show,"Face Care,Skin Care,Facial Therapy,Facial Steamer,Spa Day At Home,Beauty Skin,Clean Beauty,Beauty Care,Natural Beauty",image,https://i.pinimg.com/originals/32/fc/92/32fc925c7fda3259e337c14aa8fdfc4d.jpg,/data/beauty,beauty
9608,7eb7a087-8889-4eae-9ad6-6d71ad7b09ad,"Highlights of Scandinavia: 10 Days Itinerary in Denmark, Sweden and Norway","Follow our 10 days itinerary that takes you from Copenhagen to Oslo to Stockholm. From cosmopolitan Sweden to Denmark, the Wild West of Scandinavia to the fjords & valleys of No…",78.0,Anywhr,"Travel To Sweden,Denmark Travel,Norway Travel,Beautiful Places To Travel,Cool Places To Visit,Places To Go,Cruise Travel,Summer Travel,Sweden Holidays",image,https://i.pinimg.com/originals/d0/e1/c7/d0e1c775d25ba92489aa134628be7611.png,/data/travel,travel
10678,e1ca5651-2047-4711-9665-3fd853aac88b,This Guy Transformed A 1961 VW Beetle Deluxe Into A Black Matte Roadster,"It should come as no surprise that people love the VW Beetle and many honor it by converting the car into an even cooler version, just like designer Danni Koldal did with his bl…",2000000.0,Bored Panda,"Custom Trucks,Custom Cars,Vw Coccinelle Cabriolet,Mclaren P1 Black,Bmw R65,Jetta Vw,Carros Vw,Vw Beetle Convertible,Kdf Wagen",image,https://i.pinimg.com/originals/42/3b/6a/423b6a354062b0ee9c844ee4854656a4.jpg,/data/vehicles,vehicles
7954,e99cc6c5-0f3d-401e-9d20-e0d625b30038,150+ Best Love Quotes That'll Make Anyone Believe In Love,"Finding new, different ways to say, 'I love you' can be tough. Here are 150+ of the best love quotes for him or her to express what loving and being loved means.",942000.0,YourTango,"Love Quotes For Him Cute,Life Quotes Love,Best Love Quotes,Cute Quotes,Quotes To Live By,Favorite Quotes,Top Quotes,Hope Love Quotes,Cowboy Love Quotes",image,https://i.pinimg.com/originals/79/44/28/7944280986ed62559c8b8d72b94083c6.jpg,/data/quotes,quotes


In [0]:
df_geo = (
    df_geo
    # Pack latitude and longitude into one array column
    .withColumn("coordinates", array("latitude", "longitude"))
    # The separate latitude/longitude columns are no longer needed
    .drop("latitude", "longitude")
    # Final column order
    .select("ind", "country", "coordinates", "timestamp")
)

In [0]:
df_user = (
    df_user
    # Join first and last name with a single space into user_name
    .withColumn("user_name", concat("first_name", lit(" "), "last_name"))
    # The separate name columns are no longer needed
    .drop("first_name", "last_name")
    # Final column order
    .select("ind", "user_name", "age", "date_joined")
)

In [0]:
# Root directory under which each streaming query keeps its own checkpoint
checkpoint_root = "/tmp/kinesis/_checkpoints"

# Clear checkpoints left by a previous run BEFORE starting the queries.
# Deleting the directory after the queries have started removes the state
# they are actively writing. The second argument makes the delete recursive.
dbutils.fs.rm(checkpoint_root, True)

# Each query gets its OWN checkpoint directory: a checkpoint records the
# progress of a single query, so three queries must not share one location.
streams_to_save = [
    (df_pin, "pin"),
    (df_geo, "geo"),
    (df_user, "user"),
]

# Append every stream to its Delta table (0affe94cc7d3_<name>_table)
for stream_df, stream_name in streams_to_save:
    stream_df.writeStream\
      .format("delta")\
      .outputMode("append")\
      .option("checkpointLocation", f"{checkpoint_root}/{stream_name}")\
      .table(f"0affe94cc7d3_{stream_name}_table")

True

In [0]:
%sql
-- Sanity check: read back every row of the user Delta table.
-- NOTE(review): if this runs right after the streaming writes are started,
-- the table may not yet hold data -- re-run once the stream has processed a batch.
SELECT * FROM `0affe94cc7d3_user_table`

ind,user_name,age,date_joined
8610,Amy Brown,21,2015-11-08
9014,Luke Carter,37,2016-01-13
8433,Christopher Andrews,33,2016-08-31
10567,Brandon Valdez,22,2017-05-29
6892,Mark Adams,46,2015-11-19
2016,Lindsey Brown,21,2017-04-03
1128,Gregory Barnett,20,2016-03-29
9608,Melanie Stuart,49,2016-03-30
10678,Adam Acosta,20,2015-10-21
7954,Aaron Abbott,20,2015-10-23
