---
## Setup

### Mount Bucket

In [None]:
from pyspark.sql.functions import *
import urllib


file_type = "csv"
first_row_is_header = "true"
delimiter = ","
aws_keys_df = spark.read.format(file_type)\
.option("header", first_row_is_header)\
.option("sep", delimiter)\
.load("/FileStore/tables/authentication_credentials.csv")

ACCESS_KEY = aws_keys_df.where(col('User name')=='databricks-user').select('Access key ID').collect()[0]['Access key ID']
SECRET_KEY = aws_keys_df.where(col('User name')=='databricks-user').select('Secret access key').collect()[0]['Secret access key']
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")

AWS_S3_BUCKET = "user-0ea903d23769-bucket"
MOUNT_NAME = "/mnt/0ea903d23769-bucket"
SOURCE_URL = "s3n://{0}:{1}@{2}".format(ACCESS_KEY, ENCODED_SECRET_KEY, AWS_S3_BUCKET)
dbutils.fs.mount(SOURCE_URL, MOUNT_NAME)

### Read Topic Contents Into Dataframes

In [None]:
def read_data(spark, file_type, infer_schema, location):
    return spark.read.format(file_type) \
                    .option("inferSchema", infer_schema) \
                    .load(location)

file_type = "json"
infer_schema = "true"

location_pin = "/mnt/0ea903d23769-bucket/topics/0ea903d23769.pin/partition=0/*.json"
location_geo = "/mnt/0ea903d23769-bucket/topics/0ea903d23769.geo/partition=0/*.json"
location_user = "/mnt/0ea903d23769-bucket/topics/0ea903d23769.user/partition=0/*.json"

df_pin_dirty = read_data(spark, file_type, infer_schema, location_pin)
df_geo_dirty = read_data(spark, file_type, infer_schema, location_geo)
df_user_dirty = read_data(spark, file_type, infer_schema, location_user)


### Download Dirty CSVs

In [None]:
# df_pin.write.csv('FileStore/tables/pin_dirty.csv')
# df_geo.write.csv('FileStore/tables/geo_dirty.csv')
# df_user.write.csv('FileStore/tables/user_dirty.csv')

display(df_pin_dirty.select("*"))
display(df_geo_dirty.select("*"))
display(df_use_dirty.select("*"))

age,date_joined,first_name,ind,last_name
28,2015-11-24T22:47:19,Christopher,6427,Montgomery
28,2015-11-24T22:47:19,Christopher,6427,Montgomery
38,2015-11-12T11:07:24,Christopher,2814,Hernandez
50,2017-02-07T08:09:03,Benjamin,8075,Fitzpatrick
49,2016-07-07T09:22:33,Elizabeth,1968,Strickland
35,2015-10-22T22:42:23,Christopher,2041,Campbell
33,2016-05-13T03:57:06,Christopher,1452,Castillo
27,2016-03-08T13:38:37,Christopher,2015,Bradshaw
39,2016-06-29T20:43:59,Christina,6398,Davenport
20,2015-10-23T04:13:23,Alexandria,4140,Alvarado


### Create Spark Session

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.types import IntegerType, TimestampType, DateType

spark = SparkSession.builder.appName("DF Cleaning").getOrCreate()

---
## Cleaning

### Cleaning df_pin

In [None]:
df_pin = df_pin_dirty

# Replace empty entries and entries with no relevant data in each column with Nones
df_pin = df_pin.replace(['', ' ', 'NULL', 'null'], [None] * 4)
df_pin = df_pin.withColumn("description", when(col("description") == "No description available Story format", None).otherwise(col("description")))
df_pin = df_pin.withColumn("follower_count", when(col("follower_count") == "User Info Error", None).otherwise(col("follower_count")))
df_pin = df_pin.withColumn("image_src", when(col("image_src") == "Image src error.", None).otherwise(col("image_src")))
df_pin = df_pin.withColumn("poster_name", when(col("poster_name") == "User Info Error", None).otherwise(col("poster_name")))
df_pin = df_pin.withColumn("tag_list", when(col("tag_list") == "N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e", None).otherwise(col("tag_list")))
df_pin = df_pin.withColumn("title", when(col("title") == "No Title Data Available", None).otherwise(col("title")))

# Transform follower_count to ensure every entry is a number and data type is an int
# Remove any non-numeric characters (like 'k' in '136k') and then convert to integer
df_pin = df_pin.withColumn("follower_count", regexp_replace(col("follower_count"), "[^0-9]", ""))
df_pin = df_pin.withColumn("follower_count", col("follower_count").cast(IntegerType()))

# Ensure that each column containing numeric data has a numeric data type
df_pin = df_pin.withColumn("downloaded", col("downloaded").cast(IntegerType()))

# Clean the data in the save_location column to include only the save location path
df_pin = df_pin.withColumn("save_location", regexp_replace(col("save_location"), "Local save in ", ""))

# Rename the index column to ind
df_pin = df_pin.withColumnRenamed("index", "ind")

# Reorder the DataFrame columns
df_pin = df_pin.select(["ind", "unique_id", "title", "description", "follower_count", "poster_name", "tag_list", "is_image_or_video", "image_src", "save_location", "category"])


display(df_pin.select("*"))

ind,unique_id,title,description,follower_count,poster_name,tag_list,is_image_or_video,image_src,save_location,category
6447,d3039535-5767-426a-ba80-cbda6e0e008a,〚 Warm natural tones and vintage decor: cozy cottage in Sweden 〛◾ Photos ◾ Ideas ◾ Design,"И хоть у шведов любимые цвета - серый и белый, интерьеры этого конкретного загородного коттеджа в Швеции наполнены теплыми натуральными оттенками. В такой ✌PUFIK. Beautiful Inte…",136.0,PUFIK Interiors & Inspirations,"Cheap Home Decor,Diy Home Decor,Room Decor,Cozy Cottage,Cozy House,Shabby Cottage,Shabby Chic,Decoration Hall,House Decorations",image,https://i.pinimg.com/originals/3a/7d/8e/3a7d8ecdf8b82f06885dc9763f9ba3d0.png,/data/home-decor,home-decor
4372,0fa170c9-c9b7-4928-9dfc-42792fc01811,Blush and Gold Outdoor Graduation Party | Simply Charming Socials | Atlanta Event Planner — Simply Charming Socials | Atlanta Wedding Planner,"Blush, white, and gold backyard graduation party with balloon installation, dessert bar, coffee cart, and outdoor lounge area. Planned and designed by Simply Charming Socials, A…",1.0,Simply Charming Socials | Event Planning & Design,"Outdoor Graduation Parties,Graduation Party Planning,Graduation Party Decor,Grad Parties,College Graduation,Grad Party Decorations,Prom Decor,Grown Up Parties,Wedding Ceremony Arch",image,https://i.pinimg.com/originals/27/1f/87/271f87bbf6f8f43f9774f756968f5f4c.jpg,/data/event-planning,event-planning
6068,ea35fad4-1b89-4a2e-b65f-215afc52bd80,HOME DECOR INSPIRATION’s Instagram photo: “Stunning design 😍 please comment and tag someone who loves to see this ♥️🥰☺️ . Credit @fagerhoi_hjemmet”,"5,996 Likes, 29 Comments - HOME DECOR INSPIRATION (@modern_homestyle) on Instagram: “Stunning design 😍 please comment and tag someone who loves to see this ♥️🥰☺️ . Credit…”",1.0,Saramertt,"Decoration Design,Deco Design,Design Design,Design Moderne,Home Interior,Interior Decorating,Decorating Your Home,Best Online Furniture Stores,Furniture Shopping",image,https://i.pinimg.com/originals/ed/10/d2/ed10d231ac9cd7ac349b600875f8d065.jpg,/data/home-decor,home-decor
2878,d2369cf7-7ed5-4080-abc0-fd5a1932e796,Pipe Cleaner Spider Craft For Kids,"Grab your toilet paper rolls and make this pipe cleaner spider craft for kids! It's a fun Halloween project that is great for making with preschool, kindergarten, and elementary…",267.0,Easy Kids Crafts & Activities | Preschool & Kindergarten Ideas,"Halloween Arts And Crafts,Halloween Crafts For Toddlers,Fall Crafts For Kids,Toddler Crafts,Holiday Crafts,Halloween Activities For Preschoolers,Halloween Crafts For Kindergarten,Christmas Crafts For Kindergarteners,Arts And Crafts For Kids Easy",video,https://i.pinimg.com/videos/thumbnails/originals/7d/70/2f/7d702f8214aaee92415cab35d8847432.0000001.jpg,/data/diy-and-crafts,diy-and-crafts
7436,1e3faf01-d315-42d3-ad29-a588246717f4,5 Τρόποι να συνδυάσεις ένα τζιν μπουφάν!,"Το τζιν μπουφάν είναι ένα διαχρονικό πανωφόρι που φοριέται πολύ εύκολα την άνοιξη. Πράγματι, είναι το ιδανικό πανωφόρι για τις casual, καθημερινές εμφανίσεις αφού είναι πολύ άνε…",2.0,The-Man.gr,"Suit Fashion,Boy Fashion,Fashion Outfits,Urban Fashion Girls,Jeans Fashion,Fashion 2018,Fashion Boots,Fashion Fashion,Fashion Trends",image,https://i.pinimg.com/originals/cc/0a/2c/cc0a2c8b9bfd569178af4aa150793ec9.jpg,/data/mens-fashion,mens-fashion
683,a0215254-abdd-4fbf-8d79-927aa319587e,Framed Canvas Home Artwork Decoration Abstract Mountain Nature Scenery Canvas Wall Art for Living Room Bedroom Canvas Wall Art Ready to Hang - Unframed 1P 24x18 / Gallery Wrap,"Framed Canvas Home Artwork Decoration Abstract Mountain Nature Scenery Canvas Wall Art for Living Room, Bedroom Canvas Wall Art Ready to Hang Each canvas is professionally print…",305.0,Wall Canvas Mall,"Bedroom Canvas,Canvas Home,Canvas For Living Room,Paintings For Living Room,Living Room Gallery Wall,Living Room Wall Art,Wall Art Bedroom,Yellow Walls Living Room,Modern Gallery Wall",image,https://i.pinimg.com/originals/ae/fa/69/aefa69dc49f836f15a6a1d0322c9b012.jpg,/data/art,art
7436,1e3faf01-d315-42d3-ad29-a588246717f4,5 Τρόποι να συνδυάσεις ένα τζιν μπουφάν!,"Το τζιν μπουφάν είναι ένα διαχρονικό πανωφόρι που φοριέται πολύ εύκολα την άνοιξη. Πράγματι, είναι το ιδανικό πανωφόρι για τις casual, καθημερινές εμφανίσεις αφού είναι πολύ άνε…",2.0,The-Man.gr,"Suit Fashion,Boy Fashion,Fashion Outfits,Urban Fashion Girls,Jeans Fashion,Fashion 2018,Fashion Boots,Fashion Fashion,Fashion Trends",image,https://i.pinimg.com/originals/cc/0a/2c/cc0a2c8b9bfd569178af4aa150793ec9.jpg,/data/mens-fashion,mens-fashion
8161,da45e81e-3768-4e7c-862f-a1aae4289cf4,15 Inspirational quotes to start your day off feeling motivated and positive. Inspiring words can he,Here are 15 inspirational quotes to start your day feeling motivated and positive. Inspiring words are a great tool to combat anxiety and fear. They can offer hope and spark you…,19.0,Dream Dash Journal,"Good Vibes Quotes Positivity,Positive Quotes For Life Encouragement,Positive Morning Quotes,Funny Positive Quotes,Positive Uplifting Quotes,Feeling Positive Quotes,Motivational Quotes For Success Positivity,Positive Quotes About Love,Morning Qoutes",image,https://i.pinimg.com/originals/db/ce/e4/dbcee4c357b42cb18fcefa044aacd55a.jpg,/data/quotes,quotes
2087,5169a175-812b-4253-ae93-1b9672e21d57,Christmas Countdown Calendar with Fun Family Activities,"Countdown to Christmas with 25 fun activities for kids! Including ""give to less fortunate,"" ""read a Christmas book,"" and ""cut out paper snowflakes,"" these activities will keep y…",8.0,"Mombrite | Activities, STEM Projects, and Crafts for Kids","Christmas Activities For Families,Fun Activities For Kids,Family Activities,Kids Printable Activities,Christmas Crafts For Preschoolers,Preschool Christmas Activities,Fun Crafts For Kids,Advent Calendar Activities,Advent Calendars For Kids",image,https://i.pinimg.com/originals/5f/6f/c4/5f6fc400d8a5b966aa43a582f0ef39f3.png,/data/christmas,christmas
1777,866d3485-0934-4707-a77b-dc0ee5618445,"Merry Christmas Banners,New Year Outdoor Indoor Christmas Decorations Welcome Bright Red Xmas Porch Sign Hanging for Home Wall Door Holiday Party Decor - C","Product Description: Product Name:Christmas Hanging Banner Quantity:2pcs Color:As Pictures Show Size:71x12 in Material:oxford Features: 1.Package list: 2 banners 2. ""Merry Chris…",5.0,Wear24-7,"Unique Christmas Door Decorations,Christmas Decorations Clearance,Holiday Decor,Christmas Front Doors,Christmas Porch,Outdoor Christmas,Christmas Sale,Christmas Wreaths,Merry Christmas Banner",image,https://i.pinimg.com/originals/47/86/35/47863563fdd2e1b330b7f7c743247b88.jpg,/data/christmas,christmas


### Cleaning df_geo

In [None]:
df_geo = df_geo_dirty

# Create a new column 'coordinates' containing an array of latitude and longitude
df_geo = df_geo.withColumn("coordinates", array(col("latitude"), col("longitude")))

# Drop the latitude and longitude columns
df_geo = df_geo.drop("latitude", "longitude")

# Convert the timestamp column from a string to a timestamp data type
df_geo = df_geo.withColumn("timestamp", col("timestamp").cast(TimestampType()))

# Reorder the DataFrame columns
df_geo = df_geo.select(["ind", "country", "coordinates", "timestamp"])

display(df_geo.select("*"))

ind,country,coordinates,timestamp
8826,British Indian Ocean Territory (Chagos Archipelago),"List(-42.0773, -163.698)",2018-01-10T04:28:20.000+0000
3889,British Indian Ocean Territory (Chagos Archipelago),"List(-85.4776, -130.258)",2019-09-12T00:19:17.000+0000
4103,British Indian Ocean Territory (Chagos Archipelago),"List(-85.4776, -130.258)",2022-06-16T15:41:10.000+0000
8221,British Indian Ocean Territory (Chagos Archipelago),"List(-20.5574, -54.4834)",2021-12-29T06:33:46.000+0000
6863,British Indian Ocean Territory (Chagos Archipelago),"List(-86.5675, -149.565)",2022-06-17T08:50:59.000+0000
9090,British Indian Ocean Territory (Chagos Archipelago),"List(-42.0773, -163.698)",2020-11-07T20:57:46.000+0000
7216,British Indian Ocean Territory (Chagos Archipelago),"List(-86.5675, -149.565)",2020-08-19T00:32:59.000+0000
6863,British Indian Ocean Territory (Chagos Archipelago),"List(-86.5675, -149.565)",2022-06-17T08:50:59.000+0000
3337,British Indian Ocean Territory (Chagos Archipelago),"List(-65.2363, 21.9622)",2020-06-29T05:02:22.000+0000
8703,British Indian Ocean Territory (Chagos Archipelago),"List(37.0529, 5.8032)",2018-08-28T18:51:28.000+0000


### Cleaning df_user

In [None]:
df_user = df_user_dirty

# Create a new column 'user_name' by concatenating 'first_name' and 'last_name'
df_user = df_user.withColumn("user_name", concat_ws(" ", col("first_name"), col("last_name")))

# Drop the 'first_name' and 'last_name' columns
df_user = df_user.drop("first_name", "last_name")

# Convert the 'date_joined' column from a string to a timestamp data type
df_user = df_user.withColumn("date_joined", col("date_joined").cast(TimestampType()))

# Reorder the DataFrame columns
df_user = df_user.select(["ind", "user_name", "age", "date_joined"])

display(df_user.select("*"))

ind,user_name,age,date_joined
6427,Christopher Montgomery,28,2015-11-24T22:47:19.000+0000
6427,Christopher Montgomery,28,2015-11-24T22:47:19.000+0000
2814,Christopher Hernandez,38,2015-11-12T11:07:24.000+0000
8075,Benjamin Fitzpatrick,50,2017-02-07T08:09:03.000+0000
1968,Elizabeth Strickland,49,2016-07-07T09:22:33.000+0000
2041,Christopher Campbell,35,2015-10-22T22:42:23.000+0000
1452,Christopher Castillo,33,2016-05-13T03:57:06.000+0000
2015,Christopher Bradshaw,27,2016-03-08T13:38:37.000+0000
6398,Christina Davenport,39,2016-06-29T20:43:59.000+0000
4140,Alexandria Alvarado,20,2015-10-23T04:13:23.000+0000


---
## Tests

### Find Most Popular Category per Country

In [None]:
# Join the geo and pin DataFrames on the 'ind' column
df_joined = df_geo.join(df_pin, "ind")

# Group by country and category and count the occurrences
df_category_count = df_joined.groupBy("country", "category").agg(count("*").alias("category_count"))

# Find the most popular category for each country by sorting within each group
window = Window.partitionBy("country").orderBy(col("category_count").desc())

df_most_popular = df_category_count.withColumn("rank", rank().over(window)) \
                                   .filter(col("rank") == 1) \
                                   .drop("rank")

# Select the desired columns for the final DataFrame
df_final = df_most_popular.select("country", "category", "category_count")

display(df_final.select("*"))

country,category,category_count
Afghanistan,education,35
Albania,art,30
Algeria,quotes,43
American Samoa,tattoos,20
Andorra,tattoos,15
Angola,education,5
Anguilla,tattoos,10
Antarctica (the territory South of 60 deg S),christmas,15
Antigua and Barbuda,travel,7
Argentina,tattoos,15


### Find Post Count per Category Between 2018 & 2022

In [None]:
df_joined = df_pin.join(df_geo, 'ind', 'inner')

# Convert the timestamp column from string to timestamp type if it's not already
df_joined = df_joined.withColumn("timestamp", col("timestamp").cast("timestamp"))

# Filter the DataFrame for posts between 2018 and 2022
df_filtered = df_joined.filter((year("timestamp") >= 2018) & (year("timestamp") <= 2022))

# Create a new column with just the year from the timestamp
df_with_year = df_filtered.withColumn("post_year", year("timestamp"))

# Group by post_year and category and count the occurrences
df_category_count = df_with_year.groupBy("post_year", "category").agg(count("*").alias("category_count"))

# Order the result for better readability
df_result = df_category_count.orderBy("post_year", "category")

display(df_result.select("*"))

post_year,category,category_count
2018,art,32
2018,beauty,23
2018,christmas,44
2018,diy-and-crafts,37
2018,education,28
2018,event-planning,25
2018,finance,39
2018,home-decor,46
2018,mens-fashion,20
2018,quotes,36


### Find Most Followed User per Country

In [None]:
df_joined = df_pin.join(df_geo, 'ind', 'inner')

# Define a window spec partitioned by country
windowSpec = Window.partitionBy("country").orderBy(col("follower_count").desc())

# Use the window spec to add a row number for each user within each country partition
df_ranked = df_joined.withColumn("row_number", row_number().over(windowSpec))

# Filter for the top user (row_number 1) in each country
df_top_user_per_country = df_ranked.filter(col("row_number") == 1) \
                                   .select("country", "poster_name", "follower_count")

display(df_top_user_per_country.select("*"))

country,poster_name,follower_count
Afghanistan,"DIY Joy - Crafts, Home Improvement, Decor & Recipes",985
Albania,WeAreTeachers,500
Algeria,YourTango,942
American Samoa,Byrdie,538
Andorra,The Best Ideas for Kids,903
Angola,CraftGossip.com,502
Anguilla,dresslily,760
Antarctica (the territory South of 60 deg S),StayGlam,829
Antigua and Barbuda,A Cultivated Nest,578
Argentina,Next Luxury,800


### Find The Country With Most Followed User

In [None]:
# Find the maximum follower count across all countries
max_global_follower_count = df_top_followers_per_country.agg(max("follower_count")).collect()[0][0]

# Find the country or countries with the user that has the maximum global follower count
df_country_with_top_follower = df_top_followers_per_country.filter(col("follower_count") == max_global_follower_count) \
                                                            .select("country", "follower_count")

# Display the results
display(df_country_with_top_follower.select("*"))

country,follower_count
Palestinian Territory,997
Western Sahara,997


### Find Most Popular Category per Age Group

In [None]:
df_joined = df_pin.join(df_user, 'ind', 'inner')

# Create the age_group column
df_with_age_group = df_joined.withColumn(
    "age_group",
    when(col("age").between(18, 24), "18-24")
    .when(col("age").between(25, 35), "25-35")
    .when(col("age").between(36, 50), "36-50")
    .otherwise("50+")
)

# Group by age_group and category and count the occurrences
df_category_count = df_with_age_group.groupBy("age_group", "category").agg(count("*").alias("category_count"))

# Define a window spec partitioned by age_group and ordered by category_count descending
windowSpec = Window.partitionBy("age_group").orderBy(col("category_count").desc())

# Use the window spec to add a rank for each category within each age group partition
df_ranked = df_category_count.withColumn("rank", rank().over(windowSpec))

# Filter for the top-ranked category within each age group
df_top_category_per_age_group = df_ranked.filter(col("rank") == 1).select("age_group", "category", "category_count")

display(df_top_category_per_age_group)

age_group,category,category_count
18-24,tattoos,133
25-35,christmas,76
36-50,christmas,52
50+,christmas,30
50+,travel,30


### Find Median Follower Count per Age Group

In [None]:
df_joined = df_pin.join(df_user, 'ind', 'inner')

# Age groups
df_with_age_group = df_joined.withColumn(
    "age_group",
    when(col("age").between(18, 24), "18-24")
    .when(col("age").between(25, 35), "25-35")
    .when(col("age").between(36, 50), "36-50")
    .otherwise("50+")
)

# Group by age_group and calculate the median follower count
df_median_follower_count = df_with_age_group.groupBy("age_group")\
                                            .agg(percentile_approx("follower_count", 0.5).alias("median_follower_count"))

# Display the result
display(df_median_follower_count)


age_group,median_follower_count
50+,24
36-50,28
18-24,55
25-35,30


### Find New User Count Between 2015 & 2020

In [None]:
# Convert the date_joined column from string to date type
df_user = df_user.withColumn("date_joined", col("date_joined").cast(DateType()))

# Extract the year from the date_joined column
df_with_year = df_user.withColumn("post_year", year(col("date_joined")))

# Filter the DataFrame for years between 2015 and 2020
df_filtered = df_with_year.filter((col("post_year") >= 2015) & (col("post_year") <= 2020))

# Group by post_year and count the number of users
df_number_users_joined = df_filtered.groupBy("post_year").agg(count("*").alias("number_users_joined"))

display(df_number_users_joined)

post_year,number_users_joined
2015,825
2016,791
2017,340


### Find Median Follower Count of Users Joined Between 2015 & 2020

In [None]:
# Filter users who joined between 2015 and 2020
df_filtered_users = df_user.withColumn("date_joined", col("date_joined").cast("timestamp")) \
                           .withColumn("post_year", year("date_joined")) \
                           .filter((col("post_year") >= 2015) & (col("post_year") <= 2020))

df_joined = df_filtered_users.join(df_pin, 'ind', 'inner')

# Calculate the median follower count per post year
df_median_follower_count = df_joined.groupBy("post_year") \
                                    .agg(expr("percentile_approx(follower_count, 0.5)").alias("median_follower_count"))

display(df_median_follower_count)

post_year,median_follower_count
2015,60
2016,32
2017,24


### Find Median Follower Count per Joining Year & Age Group

In [None]:
# Filter users who joined between 2015 and 2020
df_users_filtered = df_user.withColumn("date_joined", col("date_joined").cast("timestamp")) \
                           .withColumn("post_year", year(col("date_joined"))) \
                           .filter((col("post_year") >= 2015) & (col("post_year") <= 2020))

# Create the age_group column
df_users_age_grouped = df_users_filtered.withColumn(
    "age_group",
    when(col("age").between(18, 24), "18-24")
    .when(col("age").between(25, 35), "25-35")
    .when(col("age").between(36, 50), "36-50")
    .otherwise("50+")
)

# Join df_users_age_grouped with df_pin on the user identifier to get follower counts
df_joined = df_users_age_grouped.join(df_pin, 'ind', 'inner')

# Group by age_group and post_year, and calculate the median follower count
df_median_followers = df_joined.groupBy("age_group", "post_year") \
                               .agg(expr("percentile_approx(follower_count, 0.5)").alias("median_follower_count"))

display(df_median_followers)

age_group,post_year,median_follower_count
25-35,2017,25
50+,2017,28
18-24,2016,42
18-24,2015,79
25-35,2016,26
36-50,2017,24
36-50,2016,27
50+,2016,23
36-50,2015,45
18-24,2017,21
