In [None]:
# Identifier that prefixes the names of this notebook's output tables.
username = "12885f560a0b"

# Start from a clean slate: drop each of the three output tables if present.
for table_kind in ("pin", "geo", "user"):
    spark.sql(f"DROP TABLE IF EXISTS {username}_{table_kind}_table")

In [None]:
def clean_pin_df(df):
  """Clean the pin DataFrame.

  Steps, in order:
    1. Replace known placeholder strings with null, column by column.
    2. Normalise ``follower_count``: the 'User Info Error' placeholder becomes
       '0', the letters k/M/B are expanded to zeros, and the column is cast
       to int.
    3. Strip the 'Local save in ' prefix from ``save_location``.
    4. Rename ``index`` to ``ind`` and select the output columns in a fixed
       order.

  Args:
    df: DataFrame of pin records. It must provide the columns referenced
      below (description, image_src, poster_name, tag_list, title,
      follower_count, save_location, index, unique_id, is_image_or_video,
      category).

  Returns:
    A new DataFrame with the columns ind, unique_id, title, description,
    follower_count, poster_name, tag_list, is_image_or_video, image_src,
    save_location, category.
  """
  # Placeholder values that stand for "missing"; each is replaced by null,
  # and only in the column it is listed under.
  to_replace_with_none = {
    'description': ['No description available Story format', 'Untitled'],
    'image_src': ['Image src error.'],
    'poster_name': ['User Info Error'],
    'tag_list': ['N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e'],
    'title': ['No Title Data Available'],
  }

  # Always chain from the running result. Restarting from `df` for each value
  # would discard every replacement made before the last one.
  cleaned_df = df
  for column, placeholders in to_replace_with_none.items():
    for placeholder in placeholders:
      cleaned_df = cleaned_df.replace(placeholder, None, subset=[column])

  # Using '0' instead of None, as follower_count becomes a numeric column.
  cleaned_df = cleaned_df.replace({'User Info Error': '0'}, subset=['follower_count'])

  # Expand magnitude letters: k -> 000, M -> 000000, B -> 000000000.
  for magnitude_letter, zeros in (("k", "000"), ("M", "000000"), ("B", "000000000")):
    cleaned_df = cleaned_df.withColumn(
      "follower_count", regexp_replace("follower_count", magnitude_letter, zeros)
    )

  # Make follower_count an int now that every entry should be digits only.
  # NOTE(review): counts of several billion would not fit a 32-bit int -
  # confirm whether such values can occur in the data.
  cleaned_df = cleaned_df.withColumn("follower_count", cleaned_df["follower_count"].cast("int"))

  # Keep only the save location path in save_location.
  cleaned_df = cleaned_df.withColumn("save_location", regexp_replace("save_location", "Local save in ", ""))

  # Rename the index column to ind.
  cleaned_df = cleaned_df.withColumnRenamed("index", "ind")

  # Final column order.
  cleaned_df = cleaned_df.select(
    "ind", "unique_id", "title", "description", "follower_count", "poster_name",
    "tag_list", "is_image_or_video", "image_src", "save_location", "category"
  )
  return cleaned_df
  
def clean_geo_df(df):
  """Clean the geolocation DataFrame.

  Builds a ``coordinates`` array column from ``latitude`` and ``longitude``,
  drops the two source columns, converts ``timestamp`` with ``to_timestamp``
  and returns the columns ind, country, coordinates, timestamp in that order.
  """
  lat_lon_pair = array("latitude", "longitude")
  return (
    df.withColumn("coordinates", lat_lon_pair)
    .drop("latitude", "longitude")
    .withColumn("timestamp", to_timestamp("timestamp"))
    .select("ind", "country", "coordinates", "timestamp")
  )

def clean_user_df(df):
  """Clean the user DataFrame.

  Joins ``first_name`` and ``last_name`` with a single space into
  ``user_name``, drops the two source columns, converts ``date_joined`` with
  ``to_timestamp`` and returns the columns ind, user_name, age, date_joined
  in that order.
  """
  full_name = concat("first_name", lit(" "), "last_name")
  return (
    df.withColumn("user_name", full_name)
    .drop("first_name", "last_name")
    .withColumn("date_joined", to_timestamp("date_joined"))
    .select("ind", "user_name", "age", "date_joined")
  )

In [None]:
%sql
-- Sets the session config spark.databricks.delta.formatCheck.enabled to false.
-- NOTE(review): presumably needed so the credentials path can be read with
-- format("delta") in the next cell - confirm, and remove if not required.
SET spark.databricks.delta.formatCheck.enabled=false

In [None]:
import urllib
import urllib.parse

from pyspark.sql.types import *
from pyspark.sql.functions import *
# Location of the Delta table that stores the AWS credentials.
delta_table_path = "dbfs:/user/hive/warehouse/authentication_credentials"
aws_keys_df = spark.read.format("delta").load(delta_table_path)

# Fetch both keys with a single Spark action so they are guaranteed to come
# from the same row (two separate collect() calls run two independent jobs).
aws_keys_row = aws_keys_df.select('Access key ID', 'Secret access key').collect()[0]
ACCESS_KEY = aws_keys_row['Access key ID']
SECRET_KEY = aws_keys_row['Secret access key']

# URL-encode the secret key; safe="" means no character is left unescaped.
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")

# Schemas used to parse the JSON payload of each Kinesis stream.
# Every field is declared nullable.

# Pin records.
pin_streaming_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("category", StringType()),
        ("description", StringType()),
        ("downloaded", IntegerType()),
        ("follower_count", StringType()),
        ("image_src", StringType()),
        ("index", IntegerType()),
        ("is_image_or_video", StringType()),
        ("poster_name", StringType()),
        ("save_location", StringType()),
        ("tag_list", StringType()),
        ("title", StringType()),
        ("unique_id", StringType()),
    )
])

# Geolocation records; timestamp arrives as a string.
geo_streaming_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("country", StringType()),
        ("ind", LongType()),
        ("latitude", DoubleType()),
        ("longitude", DoubleType()),
        ("timestamp", StringType()),
    )
])

# User records; date_joined arrives as a string.
user_streaming_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("age", LongType()),
        ("date_joined", StringType()),
        ("first_name", StringType()),
        ("ind", LongType()),
        ("last_name", StringType()),
    )
])

# Clear the streaming checkpoints so the freshly dropped tables are not paired
# with stale offsets. Two layouts appear in this notebook: the per-user
# directory /tmp/kinesis/_checkpoints/{username}/<stream> and the un-namespaced
# /tmp/kinesis/_checkpoints/<stream>. Both are removed here.
for stream_name in ("pin", "geo", "user"):
    dbutils.fs.rm(f"/tmp/kinesis/_checkpoints/{username}/{stream_name}", True)
    dbutils.fs.rm(f"/tmp/kinesis/_checkpoints/{stream_name}", True)

In [None]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
import urllib
# ACCESS_KEY / SECRET_KEY and pin_streaming_schema come from the setup cell
# above, and `username` from the first cell; they are not re-created here.

# Read the pin Kinesis stream from its earliest available record.
pin_df = (
    spark.readStream
    .format("kinesis")
    .option("streamName", f"streaming-{username}-pin")
    .option("region", "us-east-1")
    .option("initialPosition", "earliest")
    .option("awsAccessKey", ACCESS_KEY)
    .option("awsSecretKey", SECRET_KEY)
    .load()
)

# Cast the `data` column to a string, parse it as JSON with the pin schema,
# then flatten the parsed struct into top-level columns.
pin_df = (
    pin_df
    .selectExpr("CAST(data as STRING)")
    .withColumn("data", from_json(col("data"), schema=pin_streaming_schema))
    .select("data.*")
)

cleaned_pin_df = clean_pin_df(pin_df)

# Append the cleaned stream to the Delta table. The checkpoint lives under the
# per-user directory that the setup cell clears before each run.
(
    cleaned_pin_df.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", f"/tmp/kinesis/_checkpoints/{username}/pin")
    .table(f"{username}_pin_table")
)

In [None]:
# ACCESS_KEY / SECRET_KEY and geo_streaming_schema come from the setup cell
# above, and `username` from the first cell; they are not re-created here.

# Read the geo Kinesis stream from its earliest available record.
geo_df = (
    spark.readStream
    .format("kinesis")
    .option("streamName", f"streaming-{username}-geo")
    .option("region", "us-east-1")
    .option("initialPosition", "earliest")
    .option("awsAccessKey", ACCESS_KEY)
    .option("awsSecretKey", SECRET_KEY)
    .load()
)

# Cast the `data` column to a string, parse it as JSON with the geo schema,
# then flatten the parsed struct into top-level columns.
geo_df = (
    geo_df
    .selectExpr("CAST(data as STRING)")
    .withColumn("data", from_json(col("data"), schema=geo_streaming_schema))
    .select("data.*")
)

cleaned_geo_df = clean_geo_df(geo_df)

# Append the cleaned stream to the Delta table. The checkpoint lives under the
# per-user directory that the setup cell clears before each run.
(
    cleaned_geo_df.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", f"/tmp/kinesis/_checkpoints/{username}/geo")
    .table(f"{username}_geo_table")
)


In [None]:
# NOTE(review): this cell repeats the geo stream cell directly above it.
# Running it starts the same geo query a second time - confirm whether it is
# still needed, and delete it if not.
# ACCESS_KEY / SECRET_KEY and geo_streaming_schema come from the setup cell,
# and `username` from the first cell; they are not re-created here.

# Read the geo Kinesis stream from its earliest available record.
geo_df = (
    spark.readStream
    .format("kinesis")
    .option("streamName", f"streaming-{username}-geo")
    .option("region", "us-east-1")
    .option("initialPosition", "earliest")
    .option("awsAccessKey", ACCESS_KEY)
    .option("awsSecretKey", SECRET_KEY)
    .load()
)

# Cast the `data` column to a string, parse it as JSON with the geo schema,
# then flatten the parsed struct into top-level columns.
geo_df = (
    geo_df
    .selectExpr("CAST(data as STRING)")
    .withColumn("data", from_json(col("data"), schema=geo_streaming_schema))
    .select("data.*")
)

cleaned_geo_df = clean_geo_df(geo_df)

# Append the cleaned stream to the Delta table. The checkpoint lives under the
# per-user directory that the setup cell clears before each run.
(
    cleaned_geo_df.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", f"/tmp/kinesis/_checkpoints/{username}/geo")
    .table(f"{username}_geo_table")
)


In [None]:
# ACCESS_KEY / SECRET_KEY and user_streaming_schema come from the setup cell
# above, and `username` from the first cell; they are not re-created here.

# Read the user Kinesis stream from its earliest available record.
user_df = (
    spark.readStream
    .format("kinesis")
    .option("streamName", f"streaming-{username}-user")
    .option("region", "us-east-1")
    .option("initialPosition", "earliest")
    .option("awsAccessKey", ACCESS_KEY)
    .option("awsSecretKey", SECRET_KEY)
    .load()
)

# Cast the `data` column to a string, parse it as JSON with the user schema,
# then flatten the parsed struct into top-level columns.
user_df = (
    user_df
    .selectExpr("CAST(data as STRING)")
    .withColumn("data", from_json(col("data"), schema=user_streaming_schema))
    .select("data.*")
)

cleaned_user_df = clean_user_df(user_df)

# Debug preview, disabled to match the geo cells: display() on a streaming
# DataFrame runs an extra query next to the table writer below.
#display(cleaned_user_df)

# Append the cleaned stream to the Delta table. The checkpoint lives under the
# per-user directory that the setup cell clears before each run.
(
    cleaned_user_df.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", f"/tmp/kinesis/_checkpoints/{username}/user")
    .table(f"{username}_user_table")
)


In [None]:
# Remove the streaming checkpoint directories. Two layouts appear in this
# notebook: the un-namespaced /tmp/kinesis/_checkpoints/<stream> and the
# per-user /tmp/kinesis/_checkpoints/{username}/<stream> cleared by the setup
# cell. Both are removed here.
# NOTE(review): presumably the streaming queries should be stopped before this
# cell is run - confirm.
for stream_name in ("pin", "geo", "user"):
    dbutils.fs.rm(f"/tmp/kinesis/_checkpoints/{stream_name}", True)
    dbutils.fs.rm(f"/tmp/kinesis/_checkpoints/{username}/{stream_name}", True)

In [None]:
# For getting rid of unwanted objects
#dbutils.fs.rm('.../12885f560a0b_user_table_new',recurse=True)