Each markdown block indicates a new section which is a prerequisite for the next.

First we check the file system for the access keys, then load the keys and mount the S3 bucket.


In [0]:
print(spark.conf.get("spark.databricks.clusterUsageTags.clusterId")) # we'll need this to configure the DAG in AWS

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
import urllib

# Define the path to the Delta table
delta_table_path = "dbfs:/user/hive/warehouse/authentication_credentials"

# check we can access where the access keys are stored
dbutils.fs.ls(delta_table_path)

# Read the Delta table to a Spark DataFrame
aws_keys_df = spark.read.format("delta").load(delta_table_path)

from pyspark.sql.types import *
from pyspark.sql.functions import *
import urllib

# Define the path to the Delta table
delta_table_path = "dbfs:/user/hive/warehouse/authentication_credentials"

# Read the Delta table to a Spark DataFrame
aws_keys_df = spark.read.format("delta").load(delta_table_path)

# Get the AWS access key and secret key from the spark dataframe
ACCESS_KEY = aws_keys_df.select('Access key ID').collect()[0]['Access key ID']
SECRET_KEY = aws_keys_df.select('Secret access key').collect()[0]['Secret access key']
# Encode the secrete key
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")

# AWS S3 bucket name
AWS_S3_BUCKET = "user-0a1153066525-bucket"
# Mount name for the bucket
MOUNT_NAME = "/mnt/incoming"
# MOUNT_NAME = "/mnt/dan_bucket"
# Source url
SOURCE_URL = "s3n://{0}:{1}@{2}".format(ACCESS_KEY, ENCODED_SECRET_KEY, AWS_S3_BUCKET)

In [0]:
# Mount the drive
dbutils.fs.mount(SOURCE_URL, MOUNT_NAME)

# test it mounted
# display(dbutils.fs.ls(MOUNT_NAME))
display(dbutils.fs.ls(MOUNT_NAME + "/../.."))

path,name,size,modificationTime
dbfs:/FileStore/,FileStore/,0,1704383990317
dbfs:/Volume/,Volume/,0,0
dbfs:/Volumes/,Volumes/,0,0
dbfs:/_delta_log/,_delta_log/,0,1704383990317
dbfs:/checkpoint/,checkpoint/,0,1704383990317
dbfs:/databricks-datasets/,databricks-datasets/,0,0
dbfs:/databricks-results/,databricks-results/,0,0
dbfs:/delta/,delta/,0,1704383990317
dbfs:/df_pin.csv/,df_pin.csv/,0,1704383990317
dbfs:/df_pin.parquet/,df_pin.parquet/,0,1704383990317


Now we load the files from the 3 streams each into a Spark data frame.

In [0]:
# File location and type
file_type = "json"
# Ask Spark to infer the schema
infer_schema = "true"

# Asterisk(*) indicates reading all the content of the specified file that have .json extension
file_location = MOUNT_NAME + "/topics/0a1153066525.pin/partition=0/*.json" 
# Read in JSONs from mounted S3 bucket
df_pin = spark.read.format(file_type) \
.option("inferSchema", infer_schema) \
.load(file_location)
# Display Spark dataframe to check its content
# display works best for many columns as it presents a scrollbar and also the type of each column is indicated
display(df_pin.head(10))


category,description,downloaded,follower_count,image_src,index,is_image_or_video,poster_name,save_location,tag_list,title,unique_id
mens-fashion,スポーツミックススタイルやアスレジャースタイルの台頭によって、すっかりおなじみとなった「ジョガーパンツ」。スニーカーと相性抜群なアイテムであり、フーディを合わせたカジュアルなスタイルからジャケットを羽織ったドレスライクなスタイルまで幅広くフィットする。今回はそんな「ジョガーパンツ」にフォーカスして注目の着こなし&アイテムを紹介！,1,122k,https://i.pinimg.com/originals/5d/31/e4/5d31e49fada653798f7c8f4c47f65d14.jpg,7491,image,OTOKOMAE/男前研究所,Local save in /data/mens-fashion,"Streetwear,Mens Casual Hats,Fashion Week Hommes,Herren Style,Moda Blog,Look Man,La Mode Masculine,Outfits With Converse,Black Converse",ジョガーパンツでメンズコーデの足元を軽快にこなす！ | メンズファッションメディア OTOKOMAE,0bfd8ee2-8bc8-4d43-8a43-3f9e5f9678bf
vehicles,"!Закрыто! Хотите эстетику или кое-какие элементы для фанфиков? Если это так, то вам, определённо, стоит заглянуть сюда. 𝐅𝐚𝐧𝐜𝐢 𝐅 𝐨𝐫 𝐱𝐱𝐬𝐚𝐣𝐧𝐱",1,422,https://i.pinimg.com/originals/ed/45/25/ed452567d0affd9329d33cc6fb14b5d6.jpg,11100,image,Tia,Local save in /data/vehicles,"Cool Sports Cars,Sport Cars,Cool Cars,Top Luxury Cars,Lamborghini Cars,Lamborghini Urus Interior,Lux Cars,Street Racing Cars,Pretty Cars",𝒜𝑒𝓈𝓉𝒽𝑒𝓉𝒾𝒸 / 𝒻𝑜𝓇 𝒻𝒶𝓃𝒻𝒾𝒸𝓈,c5474c35-4711-416b-b627-764d51498916
tattoos,"Elmira Kruger shared a photo on Instagram: “Больше всего жалко девчонок, которые экономят и разукрашивают свои красивые тела ляпистыми не…” • See 1,214 photos and videos on thei…",1,848,https://i.pinimg.com/originals/9e/4e/a8/9e4ea85a446f779c6d0a1fdc785e9d4b.jpg,8822,image,Alicia Keller,Local save in /data/tattoos,"Arm Sleeve Tattoos For Women,Chicano Tattoos Sleeve,Full Sleeve Tattoos,Body Art Tattoos,Girl Tattoos,Arabic Tattoos,Badass Sleeve Tattoos,Portrait Tattoo Sleeve,Tattoos Pics","Elmira Kruger on Instagram: “Больше всего жалко девчонок, которые экономят и разукрашивают свои красивые тела ляпистыми не стильными рисунками☹️ Я в женском рукаве…”",db80a4e4-293a-45cf-b60c-a67ba9053246
home-decor,"Вместе с Русланом Кирничанским рассказываем, как выжать максимум из маленькой площади и продумать системы хранения так, чтобы ими было удобно пользоваться. Свежие идеи дизайна и…",1,35k,https://i.pinimg.com/originals/0b/5c/0f/0b5c0fdd3ea40beff4a91ddcdaf98852.jpg,5996,image,INMYROOM.RU,Local save in /data/home-decor,"Scandinavian Interior Design,Home Interior Design,Scandinavian Style,Swedish Decor,Swedish Style,Interior Door,Interior Modern,Minimalist Interior,Scandi Chic",Где найти и как организовать дополнительные места для хранения? — INMYROOM,4081ef06-637d-4cfb-9195-5e06b56fbd8c
event-planning,Το όνομα που επέλεξε η μαμά Ανδριανή για τη γλυκιά Τιτίκα δεν είναι καθόλου τυχαίο. Και φυσικά δεν άφησε τίποτα στην τύχη ούτε την ημέρα της βάπτισης. Ανέθεσε την οργάνωση στην…,1,4,https://i.pinimg.com/originals/db/aa/d2/dbaad28fa85012a4ea6958540d98a8e5.jpg,4387,image,Manosbojana Katsareas,Local save in /data/event-planning,"Diy Flowers,Flower Diy,Baptism Decorations,Christening,Event Planning,Wedding Planner,Baptism Ideas,Birthday,Party",Βάπτιση: H παραμυθένια βάπτιση της Τιτίκας με θέμα το μονόκερο από την e.m. for you,ae5e7377-f1bd-4ac5-94de-bee317f51a43
home-decor,"Так повелось в последнее время, что по субботам мы делимся фотографиями какой-нибудь милой скандинавской дачи (надеюсь никто не против такой нашей ✌PUFIK. Beautiful Interiors. O…",1,136k,https://i.pinimg.com/originals/25/82/6b/25826bbe3789faa1c1c70d78ad93a33c.jpg,5953,image,PUFIK Interiors & Inspirations,Local save in /data/home-decor,"Cottage Living Rooms,Cottage Interiors,Home Living Room,Cottage Style Living Room,Cottage House,Cozy Living,Cottage Style Decor,Country Decor,Country Cottage Decorating",〚 Vintage summer cottage filled with love and charm in Sweden 〛◾ Photos ◾ Ideas ◾ Design,0c15cec8-5e8d-42e7-a2ad-9bd1267e0679
diy-and-crafts,"Make this adorable (and easy) DIY doll crib from an old cardboard box, your little one will love to rock their dolls to sleep in this fun upcycled project!",1,66k,https://i.pinimg.com/originals/62/ee/0d/62ee0d79f92248b2d5819b8bfdd94551.jpg,3020,image,"Cassie May - Little Red Window Crafts, DIY, crafts, tutorials and knitting patterns! Cassie May - Little Red Window Crafts, DIY, crafts, tutorials and knitting patterns! Edit settings Cassie May - Little Red Window Crafts, DIY, crafts, tutorials and knitting patterns!",Local save in /data/diy-and-crafts,"Easy Sewing Projects,Sewing Projects For Beginners,Sewing Hacks,Diy Dolls Crib,Doll Beds,Diy Baby Headbands,Diy Cardboard,Minky Fabric,Sewing Basics",Cardboard DIY Doll Crib,186a847a-6778-48c0-888c-4578550ce7c7
tattoos,"Image uploaded by 🦢🥀𝓘𝓼𝓸 𝑹𝒐𝒔𝒊𝒆🌸🦋. Find images and videos about flowers, tattoo and colors on We Heart It - the app to get lost in what you love.",1,15M,https://i.pinimg.com/originals/47/a0/34/47a034a356959e22d8ad367a5e6d62d1.jpg,8586,image,We Heart It,Local save in /data/tattoos,"Mens Body Tattoos,Body Art Tattoos,Tatoos,Collar Bone Tattoos,Pretty Tattoos,Beautiful Tattoos,Awesome Tattoos,Mini Tattoos,Small Tattoos",Pretty ♥ discovered by 🦢🥀𝓘𝓼𝓸 𝑹𝒐𝒔𝒊𝒆🌸🦋 on We Heart It,c338b1c8-7c6a-4a1a-8fba-cdb5a423c0ca
home-decor,"Традиционные шведские коттеджи, обычно с красным фасадом — это настоящее воплощением идеального зимнего уюта. Они обычно оформлены очень просто и ✌PUFIK. Beautiful Interiors. On…",1,136k,https://i.pinimg.com/originals/32/eb/72/32eb72e4fd8654c115a64528bd1f34b4.png,6717,image,PUFIK Interiors & Inspirations,Local save in /data/home-decor,"Scandinavian Cottage,Swedish Cottage,Swedish Home Decor,Swedish Farmhouse,Swedish Style,Swedish Kitchen,Kitchen Black,Swedish House,Cozy Cottage",〚 Уютные шведские коттеджи от Carina Olander 〛 ◾ Фото ◾ Идеи ◾ Дизайн,bc5ab9ee-505e-44f6-92ba-677fe4fdf3e3
event-planning,"Wow your guests! Our backdrops are a great option for providing a personalized, stylish and fun addition to your party .It will be the focal point in any event! They are great a…",1,1k,https://i.pinimg.com/originals/15/1f/93/151f93d662dc158ca2c9bbfed198f556.jpg,4608,image,"Iconica Design | Personalized Event Decor, Stationery & Gifts",Local save in /data/event-planning,"Christmas Party Backdrop,Holiday Banner,Birthday Backdrop,Circus First Birthday,First Birthday Banners,Dinasour Birthday,Birthday Bash,Banner Backdrop,Photo Booth Backdrop","Virtual Baby Shower Little Man Baby Shower Banner, Mustache Baby Shower Backdrop, Oh Boy, Any Color, Printed Or Printable File BBS0035 - 10x8 ft / Top Pole Pocket",d234e56f-5b18-4ef3-905b-44103f7719d9


In [0]:
# Asterisk(*) indicates reading all the content of the specified file that have .json extension
file_location = MOUNT_NAME + "/topics/0a1153066525.geo/partition=0/*.json" 
# Read in JSONs from mounted S3 bucket
df_geo = spark.read.format(file_type) \
.option("inferSchema", infer_schema) \
.load(file_location)
# Display Spark dataframe to check its content
df_geo.show(10)


In [0]:
# Asterisk(*) indicates reading all the content of the specified file that have .json extension
file_location = MOUNT_NAME + "/topics/0a1153066525.user/partition=0/*.json" 
# Read in JSONs from mounted S3 bucket
df_user = spark.read.format(file_type) \
.option("inferSchema", infer_schema) \
.load(file_location)
# Display Spark dataframe to check its content
df_user.show(10)


Clean the data in each data frame as per specification. Note that these cells transform the data so it may be required to rerun the dataframe loading if you wish to rerun a cleanup transformation cell. 
Cleaning functions have been moved to a Github repository for version control and sharing with other data connections such as in the Kinesis assignment.

In [0]:
import PinterestTransformations

# clean the df_pin DataFrame
df_pin = PinterestTransformations.clean_pin(df_pin)
display(df_pin.head(10))


ind,unique_id,title,description,follower_count,poster_name,tag_list,is_image_or_video,image_src,save_location,category
7491,0bfd8ee2-8bc8-4d43-8a43-3f9e5f9678bf,ジョガーパンツでメンズコーデの足元を軽快にこなす！ | メンズファッションメディア OTOKOMAE,スポーツミックススタイルやアスレジャースタイルの台頭によって、すっかりおなじみとなった「ジョガーパンツ」。スニーカーと相性抜群なアイテムであり、フーディを合わせたカジュアルなスタイルからジャケットを羽織ったドレスライクなスタイルまで幅広くフィットする。今回はそんな「ジョガーパンツ」にフォーカスして注目の着こなし&アイテムを紹介！,122000,OTOKOMAE/男前研究所,"Streetwear,Mens Casual Hats,Fashion Week Hommes,Herren Style,Moda Blog,Look Man,La Mode Masculine,Outfits With Converse,Black Converse",image,https://i.pinimg.com/originals/5d/31/e4/5d31e49fada653798f7c8f4c47f65d14.jpg,/data/mens-fashion,mens-fashion
11100,c5474c35-4711-416b-b627-764d51498916,𝒜𝑒𝓈𝓉𝒽𝑒𝓉𝒾𝒸 / 𝒻𝑜𝓇 𝒻𝒶𝓃𝒻𝒾𝒸𝓈,"!Закрыто! Хотите эстетику или кое-какие элементы для фанфиков? Если это так, то вам, определённо, стоит заглянуть сюда. 𝐅𝐚𝐧𝐜𝐢 𝐅 𝐨𝐫 𝐱𝐱𝐬𝐚𝐣𝐧𝐱",422,Tia,"Cool Sports Cars,Sport Cars,Cool Cars,Top Luxury Cars,Lamborghini Cars,Lamborghini Urus Interior,Lux Cars,Street Racing Cars,Pretty Cars",image,https://i.pinimg.com/originals/ed/45/25/ed452567d0affd9329d33cc6fb14b5d6.jpg,/data/vehicles,vehicles
8822,db80a4e4-293a-45cf-b60c-a67ba9053246,"Elmira Kruger on Instagram: “Больше всего жалко девчонок, которые экономят и разукрашивают свои красивые тела ляпистыми не стильными рисунками☹️ Я в женском рукаве…”","Elmira Kruger shared a photo on Instagram: “Больше всего жалко девчонок, которые экономят и разукрашивают свои красивые тела ляпистыми не…” • See 1,214 photos and videos on thei…",848,Alicia Keller,"Arm Sleeve Tattoos For Women,Chicano Tattoos Sleeve,Full Sleeve Tattoos,Body Art Tattoos,Girl Tattoos,Arabic Tattoos,Badass Sleeve Tattoos,Portrait Tattoo Sleeve,Tattoos Pics",image,https://i.pinimg.com/originals/9e/4e/a8/9e4ea85a446f779c6d0a1fdc785e9d4b.jpg,/data/tattoos,tattoos
5996,4081ef06-637d-4cfb-9195-5e06b56fbd8c,Где найти и как организовать дополнительные места для хранения? — INMYROOM,"Вместе с Русланом Кирничанским рассказываем, как выжать максимум из маленькой площади и продумать системы хранения так, чтобы ими было удобно пользоваться. Свежие идеи дизайна и…",35000,INMYROOM.RU,"Scandinavian Interior Design,Home Interior Design,Scandinavian Style,Swedish Decor,Swedish Style,Interior Door,Interior Modern,Minimalist Interior,Scandi Chic",image,https://i.pinimg.com/originals/0b/5c/0f/0b5c0fdd3ea40beff4a91ddcdaf98852.jpg,/data/home-decor,home-decor
4387,ae5e7377-f1bd-4ac5-94de-bee317f51a43,Βάπτιση: H παραμυθένια βάπτιση της Τιτίκας με θέμα το μονόκερο από την e.m. for you,Το όνομα που επέλεξε η μαμά Ανδριανή για τη γλυκιά Τιτίκα δεν είναι καθόλου τυχαίο. Και φυσικά δεν άφησε τίποτα στην τύχη ούτε την ημέρα της βάπτισης. Ανέθεσε την οργάνωση στην…,4,Manosbojana Katsareas,"Diy Flowers,Flower Diy,Baptism Decorations,Christening,Event Planning,Wedding Planner,Baptism Ideas,Birthday,Party",image,https://i.pinimg.com/originals/db/aa/d2/dbaad28fa85012a4ea6958540d98a8e5.jpg,/data/event-planning,event-planning
5953,0c15cec8-5e8d-42e7-a2ad-9bd1267e0679,〚 Vintage summer cottage filled with love and charm in Sweden 〛◾ Photos ◾ Ideas ◾ Design,"Так повелось в последнее время, что по субботам мы делимся фотографиями какой-нибудь милой скандинавской дачи (надеюсь никто не против такой нашей ✌PUFIK. Beautiful Interiors. O…",136000,PUFIK Interiors & Inspirations,"Cottage Living Rooms,Cottage Interiors,Home Living Room,Cottage Style Living Room,Cottage House,Cozy Living,Cottage Style Decor,Country Decor,Country Cottage Decorating",image,https://i.pinimg.com/originals/25/82/6b/25826bbe3789faa1c1c70d78ad93a33c.jpg,/data/home-decor,home-decor
3020,186a847a-6778-48c0-888c-4578550ce7c7,Cardboard DIY Doll Crib,"Make this adorable (and easy) DIY doll crib from an old cardboard box, your little one will love to rock their dolls to sleep in this fun upcycled project!",66000,"Cassie May - Little Red Window Crafts, DIY, crafts, tutorials and knitting patterns! Cassie May - Little Red Window Crafts, DIY, crafts, tutorials and knitting patterns! Edit settings Cassie May - Little Red Window Crafts, DIY, crafts, tutorials and knitting patterns!","Easy Sewing Projects,Sewing Projects For Beginners,Sewing Hacks,Diy Dolls Crib,Doll Beds,Diy Baby Headbands,Diy Cardboard,Minky Fabric,Sewing Basics",image,https://i.pinimg.com/originals/62/ee/0d/62ee0d79f92248b2d5819b8bfdd94551.jpg,/data/diy-and-crafts,diy-and-crafts
8586,c338b1c8-7c6a-4a1a-8fba-cdb5a423c0ca,Pretty ♥ discovered by 🦢🥀𝓘𝓼𝓸 𝑹𝒐𝒔𝒊𝒆🌸🦋 on We Heart It,"Image uploaded by 🦢🥀𝓘𝓼𝓸 𝑹𝒐𝒔𝒊𝒆🌸🦋. Find images and videos about flowers, tattoo and colors on We Heart It - the app to get lost in what you love.",15000000,We Heart It,"Mens Body Tattoos,Body Art Tattoos,Tatoos,Collar Bone Tattoos,Pretty Tattoos,Beautiful Tattoos,Awesome Tattoos,Mini Tattoos,Small Tattoos",image,https://i.pinimg.com/originals/47/a0/34/47a034a356959e22d8ad367a5e6d62d1.jpg,/data/tattoos,tattoos
6717,bc5ab9ee-505e-44f6-92ba-677fe4fdf3e3,〚 Уютные шведские коттеджи от Carina Olander 〛 ◾ Фото ◾ Идеи ◾ Дизайн,"Традиционные шведские коттеджи, обычно с красным фасадом — это настоящее воплощением идеального зимнего уюта. Они обычно оформлены очень просто и ✌PUFIK. Beautiful Interiors. On…",136000,PUFIK Interiors & Inspirations,"Scandinavian Cottage,Swedish Cottage,Swedish Home Decor,Swedish Farmhouse,Swedish Style,Swedish Kitchen,Kitchen Black,Swedish House,Cozy Cottage",image,https://i.pinimg.com/originals/32/eb/72/32eb72e4fd8654c115a64528bd1f34b4.png,/data/home-decor,home-decor
4608,d234e56f-5b18-4ef3-905b-44103f7719d9,"Virtual Baby Shower Little Man Baby Shower Banner, Mustache Baby Shower Backdrop, Oh Boy, Any Color, Printed Or Printable File BBS0035 - 10x8 ft / Top Pole Pocket","Wow your guests! Our backdrops are a great option for providing a personalized, stylish and fun addition to your party .It will be the focal point in any event! They are great a…",1000,"Iconica Design | Personalized Event Decor, Stationery & Gifts","Christmas Party Backdrop,Holiday Banner,Birthday Backdrop,Circus First Birthday,First Birthday Banners,Dinasour Birthday,Birthday Bash,Banner Backdrop,Photo Booth Backdrop",image,https://i.pinimg.com/originals/15/1f/93/151f93d662dc158ca2c9bbfed198f556.jpg,/data/event-planning,event-planning


In [0]:
# clean the df_geo DataFrame
df_geo = PinterestTransformations.clean_geo(df_geo)
df_geo.show(10)


In [0]:
# clean the df_user DataFrame
df_user = PinterestTransformations.clean_user(df_user)
df_user.show(10)


Below are report requests. The begining of each code block will have the reporting requirements stated.

In [0]:
# Find the most popular Pinterest category people post to based on their country.
# return a DataFrame that contains the following columns:
#     country
#     category
#     category_count, a new column containing the desired query output

# join pin and geo tables so we have country and category together
combined_geo_df = df_pin.join(df_geo, df_geo["ind"] == df_pin["ind"], how="inner")
# display(combined_geo_df)
# count category use per country
grouped_df = combined_geo_df.groupBy("country","category").agg(count("category")).withColumnRenamed("count(category)", "category_count")
# display(grouped_df)
# get the maximum value for each country
max_df = grouped_df.groupBy("country").agg(max("category_count")).withColumnRenamed("country", "max_country")
# display(max_df)
# filter to only the maximum category per country
most_popular_category_per_country_df = max_df.join(grouped_df, ((grouped_df["country"] == max_df["max_country"]) & (grouped_df["category_count"] == max_df["max(category_count)"])), how="inner")
# display(most_popular_category_per_country)
# Selecting only the relevant columns
most_popular_category_per_country_df = most_popular_category_per_country_df.select("country", "category", "category_count")
display(most_popular_category_per_country_df.orderBy("country"))


country,category,category_count
Afghanistan,beauty,11
Albania,art,193
Algeria,beauty,13
American Samoa,beauty,49
Andorra,art,11
Angola,beauty,10
Anguilla,beauty,19
Antarctica (the territory South of 60 deg S),christmas,25
Antigua and Barbuda,art,20
Argentina,art,14


In [0]:
# Find how many posts each category had between 2018 and 2022.
# return a DataFrame that contains the following columns:
#     post_year, a new column that contains only the year from the timestamp column
#     category
#     category_count, a new column containing the desired query output

# combined_geo_df already has timestamp and category together so we'll use it again here
# create post_year column which will be of type in
grouped_df = combined_geo_df.withColumn('post_year', year(combined_geo_df["timestamp"]))
# filter out requested years and count categories per country/year after filter
grouped_df = grouped_df.where("post_year >= 2018 and post_year <= 2022").groupBy("post_year","category").agg(count("category")).withColumnRenamed("count(category)", "category_count")
display(grouped_df.orderBy("post_year", "category"))


post_year,category,category_count
2018,art,176
2018,beauty,152
2018,christmas,148
2019,art,176
2019,beauty,131
2019,christmas,127
2020,art,188
2020,beauty,158
2020,christmas,126
2020,mens-fashion,1


In [0]:
# Step 1: For each country find the user with the most followers.
# return a DataFrame that contains the following columns:
#     country
#     poster_name
#     follower_count

# combined_geo_df already has country and poster_name together so we'll use it again here
df_pin_sum_followers = combined_geo_df.groupBy("country", "poster_name").agg(sum("follower_count")).withColumnRenamed("sum(follower_count)", "follower_count")
#display(df_pin_sum_followers)
df_pin_max_followers = df_pin_sum_followers.groupBy("country").agg(max("follower_count")).withColumnRenamed("country", "max_country")
#display(df_pin_max_followers)
# filter to only the most follower user per country
most_followed_per_country_df = df_pin_max_followers.join(df_pin_sum_followers, ((df_pin_sum_followers["country"] == df_pin_max_followers["max_country"]) & (df_pin_sum_followers["follower_count"] == df_pin_max_followers["max(follower_count)"])), how="inner")
#display(most_followed_per_country_df)
# filter to relevant columns
most_followed_per_country_df = most_followed_per_country_df.select("country", "poster_name", "follower_count")
# display sorted by country - confirm one poster_name per country
display(most_followed_per_country_df.orderBy("country"))

# Step 2: Based on the above query, find the country with the user with most followers.
# return a DataFrame that contains the following columns:
#     country
#     follower_count
# This DataFrame should have only one entry.
country_with_user_with_most_followers_df = most_followed_per_country_df.select("country", "follower_count").orderBy("follower_count").tail(1)
display(country_with_user_with_most_followers_df)


country,poster_name,follower_count
Afghanistan,Blossom,6000000
Albania,Bored Panda,220000000
Algeria,Apartment Therapy,20000000
American Samoa,BuzzFeed,255000000
Andorra,Glaminati,4794000
Angola,Tastemade,16000000
Anguilla,We Heart It,45000000
Antarctica (the territory South of 60 deg S),HikenDip,9500000
Antigua and Barbuda,Country Living Magazine,2000000
Argentina,Cheezburger,10000000


country,follower_count
American Samoa,255000000


In [0]:
# find the most popular category people post to based on the following age groups:
#     18-24
#     25-35
#     36-50
#     +50
# return a DataFrame that contains the following columns:
#     age_group, a new column based on the original age column
#     category
#     category_count, a new column containing the desired query output

# join user with age field to piin with category field
combined_user_df = df_pin.join(df_user, df_user["ind"] == df_pin["ind"], how="inner")
# create age_group - note this will be used by other report requests in cells below
combined_user_df = combined_user_df.withColumn("age_group",when(combined_user_df.age > 50, '+50').when(combined_user_df.age > 35, '36-50').when(combined_user_df.age > 24, '25-35').when(combined_user_df.age > 17, '18-24').otherwise('other'))

# back to specifics to this reporting request
user_category_count_df = combined_user_df.groupBy("age_group","category").agg(count("category")).withColumnRenamed("count(category)", "category_count")
# display(user_category_count_df)

# get the maximum category count per age group
user_category_max_df = user_category_count_df.groupBy("age_group").agg(max("category_count")).withColumnRenamed("age_group","max_age_group")
# display(user_category_max_df)
# filter to keep only the maximum category count per age group
user_category_max_df = user_category_count_df.join(user_category_max_df, (user_category_max_df['max_age_group'] == user_category_count_df['age_group']) & (user_category_max_df['max(category_count)'] == user_category_count_df['category_count']), how="inner")
# display(user_category_max_df)
# reduce to the columns as per specification
user_category_max_df = user_category_max_df.select("age_group", "category", "category_count")
display(user_category_max_df)


age_group,category,category_count
+50,vehicles,114
18-24,tattoos,615
25-35,christmas,321
36-50,vehicles,215


In [0]:
# Find the median follower count for users in the following age groups:
#     18-24
#     25-35
#     36-50
#     +50
# return a DataFrame that contains the following columns:
#     age_group, a new column based on the original age column
#     median_follower_count, a new column containing the desired query output

# we have our age group in our combined_user_df, so we'll use that here
# Databricks is using an older version of Spark (3.2.1). With version 3.4.0 and later, the following should work and avoid the "NameError: name 'median' is not defined" error
# user_median_follower_count = combined_user_df.groupBy("age_group","follower_count").agg(median("follower_count")).withColumnRenamed("median(follower_count)", "median_follower_count")
# Originally tried counting the number of users per age_group and then selecting the middle index; however, using the window function appears to run faster
from pyspark.sql import Window
median_window = Window.partitionBy("age_group")
first_window = median_window.orderBy("follower_count")                                  # first, order by column we want to compute the median for
user_median_follower_count = combined_user_df.withColumn("percent_rank", percent_rank().over(first_window))  # add percent_rank column, percent_rank = 0.5 coressponds to median
second_window = median_window.orderBy(pow(user_median_follower_count.percent_rank-0.5, 2))                 # order by (percent_rank - 0.5)^2 ascending
user_median_follower_count = user_median_follower_count.withColumn("median_follower_count", first("follower_count").over(second_window))     # the first row of the window corresponds to median
# display(user_median_follower_count)
user_median_follower_count = user_median_follower_count.select("age_group","median_follower_count").distinct()
display(user_median_follower_count)


age_group,median_follower_count
+50,4000
18-24,119000
25-35,22000
36-50,8000


In [0]:
# Find how many users have joined between 2015 and 2020.
# return a DataFrame that contains the following columns:
#     post_year, a new column that contains only the year from the timestamp column
#     number_users_joined, a new column containing the desired query output
# note that users could appear in multiple years since they may have joined and posted in one year and continued to post in other years

# reduce to only the rows we want first
df_user_2015to2020 = df_user.withColumn('joined_year', year(df_user["date_joined"])).where("joined_year >= 2015 and joined_year <= 2020").withColumnRenamed("ind", "ind_user")
# join to the other tables - note this will be used by other reporting requests in the cells below
users_joined_by_year_2015to2020_df = df_geo.join(df_user_2015to2020, df_user_2015to2020["ind_user"] == df_geo["ind"], how="inner") # for timestamp

# back to specifics to this reporting request
# create the post_year column
users_joined_by_year_2015to2020_df = users_joined_by_year_2015to2020_df.withColumn('post_year', year(users_joined_by_year_2015to2020_df["timestamp"]))
# reduce to the columns we want which make a unique user and keep the distinct list
users_joined_by_year_2015to2020_df_summary = users_joined_by_year_2015to2020_df.select("post_year","age","date_joined","user_name").distinct()
# display(users_joined_by_year_2015to2020_df_summary)
users_joined_by_year_2015to2020_df_summary = users_joined_by_year_2015to2020_df_summary.groupBy("post_year").agg(count("date_joined")).withColumnRenamed("count(date_joined)", "number_users_joined")
display(users_joined_by_year_2015to2020_df_summary.orderBy("post_year"))


post_year,number_users_joined
2017,81
2018,358
2019,328
2020,368
2021,322
2022,304


In [0]:
# Find the median follower count of users have joined between 2015 and 2020.
# return a DataFrame that contains the following columns:
#     post_year, a new column that contains only the year from the timestamp column
#     median_follower_count, a new column containing the desired query output

# Databricks is using an older version of Spark (3.2.1). With version 3.4.0 and later, the median function can be used. The current version of Spark is 3.5.

# we already have the data filtered, tables joined and post_year in users_joined_by_year_2015to2020_df so we'll re-use that
records_combined_year_2015to2020_df = df_pin.join(users_joined_by_year_2015to2020_df, users_joined_by_year_2015to2020_df["ind_user"] == df_pin["ind"], how="inner") # for follower_count

median_window = Window.partitionBy("post_year")
first_window = median_window.orderBy("follower_count")                                  # first, order by column we want to compute the median for
median_user_follow_count_by_year_2015to2020_df = records_combined_year_2015to2020_df.withColumn("percent_rank", percent_rank().over(first_window))  # add percent_rank column, percent_rank = 0.5 coressponds to median
second_window = median_window.orderBy(pow(median_user_follow_count_by_year_2015to2020_df.percent_rank-0.5, 2))                 # order by (percent_rank - 0.5)^2 ascending
median_user_follow_count_by_year_2015to2020_df = median_user_follow_count_by_year_2015to2020_df.withColumn("median_follower_count", first("follower_count").over(second_window))     # the first row of the window corresponds to median
# display(median_user_follow_count_by_year_2015to2020_df)
median_user_follow_count_by_year_2015to2020_df = median_user_follow_count_by_year_2015to2020_df.select("post_year","median_follower_count").distinct()
display(median_user_follow_count_by_year_2015to2020_df.orderBy("post_year"))



post_year,median_follower_count
2017,65000
2018,53000
2019,52000
2020,42000
2021,64000
2022,54000


In [0]:
# Find the median follower count of users that have joined between 2015 and 2020, based on which age group they are part of.
# return a DataFrame that contains the following columns:
#     age_group, a new column based on the original age column
#     post_year, a new column that contains only the year from the timestamp column
#     median_follower_count, a new column containing the desired query output

# Databricks is using an older version of Spark (3.2.1). With version 3.4.0 and later, the median function can be used. The current version of Spark is 3.5.

# we already have the data filtered, tables joined and post_year in records_combined_year_2015to2020_df so we'll re-use that and just add the age_group to it
records_combined_age_group_2015to2020_df = records_combined_year_2015to2020_df.withColumn("age_group",when(records_combined_year_2015to2020_df.age > 50, '+50').when(records_combined_year_2015to2020_df.age > 35, '36-50').when(records_combined_year_2015to2020_df.age > 24, '25-35').when(records_combined_year_2015to2020_df.age > 17, '18-24').otherwise('other'))

median_window = Window.partitionBy("post_year","age_group")
first_window = median_window.orderBy("follower_count")                                  # first, order by column we want to compute the median for
median_user_follow_count_by_year_2015to2020_df = records_combined_age_group_2015to2020_df.withColumn("percent_rank", percent_rank().over(first_window))  # add percent_rank column, percent_rank = 0.5 coressponds to median
second_window = median_window.orderBy(pow(median_user_follow_count_by_year_2015to2020_df.percent_rank-0.5, 2))                 # order by (percent_rank - 0.5)^2 ascending
median_user_follow_count_by_year_2015to2020_df = median_user_follow_count_by_year_2015to2020_df.withColumn("median_follower_count", first("follower_count").over(second_window))     # the first row of the window corresponds to median
# display(median_user_follow_count_by_year_2015to2020_df)
median_user_follow_count_by_year_2015to2020_df = median_user_follow_count_by_year_2015to2020_df.select("post_year", "age_group","median_follower_count").distinct()
display(median_user_follow_count_by_year_2015to2020_df.orderBy("post_year", "age_group"))


post_year,age_group,median_follower_count
2017,+50,5000
2017,18-24,315000
2017,25-35,45000
2017,36-50,34000
2018,+50,2000
2018,18-24,229000
2018,25-35,33000
2018,36-50,14000
2019,+50,8000
2019,18-24,275000


Unmount the drive if we are finished with it. While working with the notebook and running selective cells, the filesystem should remain mounted.

In [0]:
dbutils.fs.unmount(MOUNT_NAME)