## Milestone 6 Batch Processing: Spark on Databricks ##

In [0]:
# Load the raw pin records from their Delta table
df_pin = (
    spark.read
    .format("delta")
    .table("57e94de2a910_pin")
)


In [0]:
# Clean pin data

from pyspark.sql.functions import when, col, lit, regexp_replace

# Placeholder strings that stand in for "no data" in the pin table.
# The empty string is included, so no separate == "" test is needed.
replace_values = ["", "N/A", "null", "No description available", "No description available Story format", "User Info Error", "Image src error.", "N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e", "No Title Data Available"]

# Only string columns can contain one of the placeholders. Limiting the check to
# them avoids comparing numeric columns against text, which depends on implicit
# string/number casts (and may raise under ANSI SQL mode).
string_columns = {name for name, dtype in df_pin.dtypes if dtype == "string"}

# Replace placeholder entries with None in string columns; pass other columns through unchanged
df_pin = df_pin.select([
    when(col(c).isin(replace_values), lit(None)).otherwise(col(c)).alias(c)
    if c in string_columns
    else col(c)
    for c in df_pin.columns
])

# Filter rows where 'downloaded' is 1 or 0
df_pin = df_pin.filter((col("downloaded") == 1) | (col("downloaded") == 0))

df_pin.show()

+-----+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+
|  ind|           unique_id|               title|         description|follower_count|         poster_name|            tag_list|   is_image_or_video|           image_src|       save_location|      category|
+-----+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+
| 3824|03e68424-8062-432...|Special Education...|Special education...|          4000|Caffeine Queen Te...|Teaching Special ...|               image|https://i.pinimg....|Local save in /da...|     education|
| 7116|7eecd591-39d4-451...|           style guy|Fashion is what y...|          3000|      Vanessa Ferran|White Shirt Outfi...|               image|https://i.pinimg....|Local s

In [0]:
# List the distinct values that remain in `downloaded` after the filter
(
    df_pin
    .select("downloaded")
    .distinct()
    .show()
)

+----------+
|downloaded|
+----------+
|         0|
|         1|
+----------+



In [0]:

from pyspark.sql import functions as F

# Step 1: Standardize follower_count values
# A trailing 'k' means thousands and a trailing 'M' means millions. Strip the
# suffix (anchored with $ so only the final character is removed) and multiply,
# instead of substituting zeros as text: textual substitution turns "1.5k" into
# "1.5000", which parses as 1.5 rather than 1500.
follower_text = col("follower_count")
df_pin = df_pin.withColumn(
    "follower_count",
    when(
        follower_text.endswith("k"),
        # round() removes floating-point noise from the multiplication (e.g. 1100.0000000000002)
        F.round(regexp_replace(follower_text, "k$", "").cast("double") * 1000),
    )
    .when(
        follower_text.endswith("M"),
        F.round(regexp_replace(follower_text, "M$", "").cast("double") * 1000000),
    )
    .otherwise(follower_text.cast("double"))  # Handle plain numeric values like '25'
)

# Step 2: Drop rows where follower_count is null or could not be parsed as a number
df_pin = df_pin.filter(col("follower_count").isNotNull())

# Show the cleaned DataFrame
df_pin.show()

+--------------+--------------------+----------+--------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|      category|         description|downloaded|follower_count|           image_src|index|   is_image_or_video|         poster_name|       save_location|            tag_list|               title|           unique_id|
+--------------+--------------------+----------+--------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|     education|Special education...|         1|        4000.0|https://i.pinimg....| 3824|               image|Caffeine Queen Te...|Local save in /da...|Teaching Special ...|Special Education...|03e68424-8062-432...|
|  mens-fashion|Fashion is what y...|         1|        3000.0|https://i.pinimg....| 7116|               image|      Vanessa Ferran|

In [0]:
from pyspark.sql.types import IntegerType

# Narrow follower_count from double to an integer column
follower_count_as_int = col("follower_count").cast(IntegerType())
df_pin = df_pin.withColumn("follower_count", follower_count_as_int)

df_pin.show()

+--------------+--------------------+----------+--------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|      category|         description|downloaded|follower_count|           image_src|index|   is_image_or_video|         poster_name|       save_location|            tag_list|               title|           unique_id|
+--------------+--------------------+----------+--------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|     education|Special education...|         1|          4000|https://i.pinimg....| 3824|               image|Caffeine Queen Te...|Local save in /da...|Teaching Special ...|Special Education...|03e68424-8062-432...|
|  mens-fashion|Fashion is what y...|         1|          3000|https://i.pinimg....| 7116|               image|      Vanessa Ferran|

In [0]:
# Rename `index` to `ind`, the column name the geo and user tables also use
df_pin = df_pin.withColumnRenamed(existing="index", new="ind")

df_pin.show()

+--------------+--------------------+----------+--------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|      category|         description|downloaded|follower_count|           image_src|  ind|   is_image_or_video|         poster_name|       save_location|            tag_list|               title|           unique_id|
+--------------+--------------------+----------+--------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|     education|Special education...|         1|          4000|https://i.pinimg....| 3824|               image|Caffeine Queen Te...|Local save in /da...|Teaching Special ...|Special Education...|03e68424-8062-432...|
|  mens-fashion|Fashion is what y...|         1|          3000|https://i.pinimg....| 7116|               image|      Vanessa Ferran|

In [0]:
# Column layout of the cleaned pin table.
# Columns not listed here (such as `downloaded`) are dropped by the select below.
column_order = [
    "ind", "unique_id", "title", "description", "follower_count", "poster_name",
    "tag_list", "is_image_or_video", "image_src", "save_location", "category",
]

# Keep only the listed columns, in the listed order
df_pin = df_pin.select(*column_order)

# Preview the result
df_pin.show()

+-----+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+
|  ind|           unique_id|               title|         description|follower_count|         poster_name|            tag_list|   is_image_or_video|           image_src|       save_location|      category|
+-----+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+
| 3824|03e68424-8062-432...|Special Education...|Special education...|          4000|Caffeine Queen Te...|Teaching Special ...|               image|https://i.pinimg....|Local save in /da...|     education|
| 7116|7eecd591-39d4-451...|           style guy|Fashion is what y...|          3000|      Vanessa Ferran|White Shirt Outfi...|               image|https://i.pinimg....|Local s

In [0]:
# Persist the cleaned pin data as a Delta table, replacing any existing contents
table_name = "57e94de2a910_pin_cleaned"
(
    df_pin.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(table_name)
)

In [0]:
# Clean geolocation data
# Load the raw geolocation records from their Delta table and preview them
df_geo = (
    spark.read
    .format("delta")
    .table("57e94de2a910_geo")
)
df_geo.show()

+--------------------+-----+---------+---------+-------------------+
|             country|  ind| latitude|longitude|          timestamp|
+--------------------+-----+---------+---------+-------------------+
|             Albania| 7528| -89.9787| -173.293|2020-08-28T03:52:47|
|             Armenia| 2863| -5.34445| -177.924|2020-04-27T13:34:16|
|            Colombia| 5730|  -77.015| -101.437|2021-04-19T17:37:03|
|       French Guiana| 8304| -28.8852|  -164.87|2019-09-13T04:50:29|
|               Aruba| 8731|  -83.104| -171.302|2020-07-17T04:39:09|
|            Maldives| 1313|  77.0447|  61.9119|2018-06-26T02:39:25|
|       Cote d'Ivoire| 4315| -45.8508|  66.1003|2019-12-15T03:51:28|
|Cocos (Keeling) I...|10794| -89.5236| -154.567|2022-01-01T02:26:50|
|            Bulgaria| 5494| -82.6768| -129.202|2021-07-21T02:02:35|
|          Azerbaijan| 5069| -63.0063| -157.474|2021-03-20T09:32:44|
|       Cote d'Ivoire| 2923| -84.6302| -164.507|2019-09-08T22:53:09|
|             Albania| 3089| -89.9

In [0]:
# Inspect the column types before transforming, then preview the rows
df_geo.printSchema()
df_geo.show(n=20, truncate=True)

root
 |-- country: string (nullable = true)
 |-- ind: long (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- timestamp: string (nullable = true)

+--------------------+-----+---------+---------+-------------------+
|             country|  ind| latitude|longitude|          timestamp|
+--------------------+-----+---------+---------+-------------------+
|             Albania| 7528| -89.9787| -173.293|2020-08-28T03:52:47|
|             Armenia| 2863| -5.34445| -177.924|2020-04-27T13:34:16|
|            Colombia| 5730|  -77.015| -101.437|2021-04-19T17:37:03|
|       French Guiana| 8304| -28.8852|  -164.87|2019-09-13T04:50:29|
|               Aruba| 8731|  -83.104| -171.302|2020-07-17T04:39:09|
|            Maldives| 1313|  77.0447|  61.9119|2018-06-26T02:39:25|
|       Cote d'Ivoire| 4315| -45.8508|  66.1003|2019-12-15T03:51:28|
|Cocos (Keeling) I...|10794| -89.5236| -154.567|2022-01-01T02:26:50|
|            Bulgaria| 5494| -82.6768| -1

In [0]:
from pyspark.sql.functions import array, col, to_timestamp

# Column layout of the cleaned geolocation table
column_order = ["ind", "country", "coordinates", "timestamp"]

df_geo = (
    df_geo
    # Combine latitude and longitude into one array column, then drop the originals
    .withColumn("coordinates", array(col("latitude"), col("longitude")))
    .drop("latitude", "longitude")
    # Parse the ISO-style timestamp string into a timestamp type
    .withColumn("timestamp", to_timestamp(col("timestamp"), "yyyy-MM-dd'T'HH:mm:ss"))
    # Discard rows whose timestamp is null after parsing
    .filter(col("timestamp").isNotNull())
    # Apply the final column order
    .select(column_order)
)

# Preview the cleaned DataFrame
df_geo.show()

+-----+--------------------+--------------------+-------------------+
|  ind|             country|         coordinates|          timestamp|
+-----+--------------------+--------------------+-------------------+
| 3824|British Virgin Is...|[-82.4276, -170.019]|2018-10-28 19:22:34|
| 7116|               Macao| [-18.3686, 72.4109]|2019-09-16 18:45:41|
|   74| Antigua and Barbuda|[-81.0108, -165.206]|2020-01-29 14:03:35|
| 1335|Antarctica (the t...|[-77.9931, -175.682]|2022-03-19 17:29:42|
| 4513|             Namibia|  [22.2314, 144.214]|2022-04-11 11:05:13|
| 7850|             Vietnam|  [26.5208, 160.699]|2022-04-09 15:06:01|
|10248|   Equatorial Guinea| [-31.9615, 161.151]|2019-11-03 15:11:00|
| 8123|             Morocco| [80.2593, -134.292]|2021-08-23 07:31:46|
| 2987|            Dominica| [83.1466, -169.136]|2019-12-24 22:33:06|
| 1487|             Denmark| [6.23029, -75.5939]|2021-05-15 18:23:45|
|10052|Central African R...| [14.7195, -130.921]|2020-01-03 01:10:57|
| 8677|Bouvet Island

In [0]:
# Remove the table `57e94de2a910_geo_cleaned_new` if it exists.
# NOTE(review): this name differs from the `57e94de2a910_geo_cleaned` table written
# by the save cell, and nothing visible in this notebook creates it. Presumably a
# leftover from an earlier experiment; confirm whether this cell is still needed.
spark.sql("DROP TABLE IF EXISTS 57e94de2a910_geo_cleaned_new")

DataFrame[]

In [0]:
# Persist the cleaned geolocation data as a Delta table, replacing any existing contents
table_name = "57e94de2a910_geo_cleaned"
(
    df_geo.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(table_name)
)

In [0]:
# Load the raw user records from their Delta table and preview them
df_user = (
    spark.read
    .format("delta")
    .table("57e94de2a910_user")
)
df_user.show()

+---+-------------------+----------+-----+---------+
|age|        date_joined|first_name|  ind|last_name|
+---+-------------------+----------+-----+---------+
| 20|2015-10-24T11:23:51|   Abigail| 7528|      Ali|
| 32|2016-10-23T14:06:51|     Dylan| 2863|   Holmes|
| 36|2015-12-08T20:02:43|    Rachel| 5730|    Davis|
| 25|2015-12-28T04:21:39|   Charles| 8304|    Berry|
| 21|2015-11-10T09:27:42|    Andrea| 8731|Alexander|
| 32|2016-04-02T03:51:23|  Brittany| 1313|    Jones|
| 36|2015-12-20T16:38:13|  Michelle| 4315|   Prince|
| 34|2016-12-22T00:02:02|    Thomas|10794|   Turner|
| 27|2015-12-16T15:20:05|      Anne| 5494|    Allen|
| 25|2016-01-13T17:36:30|    Amanda| 5069|     Ball|
| 26|2015-11-11T03:20:57|     Brian| 2923|   Nelson|
| 20|2015-10-24T11:23:51|   Abigail| 3089|      Ali|
| 23|2015-11-25T13:36:22|     Corey| 6063|  Andrews|
| 48|2017-09-26T16:31:56|    Robert| 3454|   Murphy|
| 20|2017-04-11T16:35:33|    Cheryl| 7554|   Huerta|
| 30|2015-12-25T20:24:37|     Emily| 6145|  Ha

In [0]:
# to_timestamp is imported here as well, so this cell does not depend on the
# geolocation cell having been run first.
from pyspark.sql.functions import concat, lit, col, to_timestamp

# Create a new column username that concatenates first_name and last_name, separated by a space
df_user = df_user.withColumn("username", concat(col("first_name"), lit(" "), col("last_name")))

# Drop the first_name and last_name columns now that they are merged into username
df_user = df_user.drop("first_name", "last_name")

# Convert the date_joined column from a string to a timestamp data type
df_user = df_user.withColumn("date_joined", to_timestamp(col("date_joined"), "yyyy-MM-dd'T'HH:mm:ss"))

# Filter out rows where date_joined is null after conversion
df_user = df_user.filter(df_user["date_joined"].isNotNull())

# New column order
column_order = [
    "ind",
    "username",
    "age",
    "date_joined"
]

# Reorder the DataFrame columns
df_user = df_user.select(column_order)

# Show the updated DataFrame
df_user.show()

+----+----------------+---+-------------------+
| ind|        username|age|        date_joined|
+----+----------------+---+-------------------+
|9502|    Brian Barnes| 29|2015-11-01 23:09:37|
|2048|       Ann Chung| 22|2015-11-18 23:11:15|
|1341|     Abigail Ali| 20|2015-10-24 11:23:51|
|2553|  Bonnie Estrada| 30|2015-10-21 07:31:00|
|2810|Brandi Cervantes| 35|2016-05-31 01:40:44|
|9155| Aaron Alexander| 21|2015-10-25 07:36:08|
|7448| Brenda Gonzalez| 56|2015-12-07 11:53:35|
|1173|    Dana Jackson| 57|2016-06-05 21:37:09|
|1631|   Lauren Powell| 30|2016-05-22 08:40:37|
|8090|    Aaron Abbott| 20|2015-10-23 16:08:41|
|7511|Sarah Valenzuela| 54|2016-12-08 15:53:57|
|8996| Aaron Alexander| 21|2015-10-25 07:36:08|
|9335|    Amanda Adams| 20|2015-10-21 08:27:36|
|1139|   Jeffrey Smith| 30|2016-10-26 06:53:14|
|7437|     Alvin Adams| 20|2016-01-01 13:50:40|
|5198|     James Sharp| 23|2016-10-21 08:22:24|
|9407|     Amy Andrews| 23|2015-10-29 19:12:55|
|8926|     Tyler Davis| 27|2016-06-14 18

In [0]:
# Persist the cleaned user data as a Delta table, replacing any existing contents
table_name = "57e94de2a910_user_cleaned"
(
    df_user.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(table_name)
)