### Find Most Popular Category per Country

In [None]:
# Pair each geolocation row with its pin row; both frames share the 'ind' key.
df_joined = df_geo.join(df_pin, on="ind")

# Tally the number of rows for every (country, category) pair.
df_category_count = (
    df_joined
    .groupBy("country", "category")
    .agg(count("*").alias("category_count"))
)

# Rank categories inside each country, largest count first. rank() gives tied
# categories the same rank, so a country can keep more than one row.
window = Window.partitionBy("country").orderBy(col("category_count").desc())

df_most_popular = (
    df_category_count
    .withColumn("rank", rank().over(window))
    .where(col("rank") == 1)
    .drop("rank")
)

# Pin down the column order of the reported result.
df_final = df_most_popular.select("country", "category", "category_count")

display(df_final)

### Find Post Count per Category Between 2018 & 2022

In [None]:
# Inner-join pins with geolocation rows on 'ind'; the steps below read the
# 'timestamp' and 'category' columns from the joined frame.
df_joined = df_pin.join(df_geo, on="ind", how="inner")

# Cast 'timestamp' to a real timestamp type (a no-op if it already is one).
df_joined = df_joined.withColumn("timestamp", col("timestamp").cast("timestamp"))

# Keep only posts whose year lies in the inclusive range 2018-2022.
df_filtered = df_joined.where(year("timestamp").between(2018, 2022))

# Expose the year of the post as its own column so it can be grouped on.
df_with_year = df_filtered.withColumn("post_year", year("timestamp"))

# Number of posts for every (post_year, category) pair.
df_category_count = (
    df_with_year
    .groupBy("post_year", "category")
    .agg(count("*").alias("category_count"))
)

# Sort by year, then by category, purely for readability.
df_result = df_category_count.sort("post_year", "category")

display(df_result)

### Find Most Followed User per Country

In [None]:
df_joined = df_pin.join(df_geo, on="ind", how="inner")

# Inside each country, order rows from the highest follower_count downwards.
windowSpec = Window.partitionBy("country").orderBy(col("follower_count").desc())

# row_number() numbers the rows 1, 2, 3, ... within each country, so exactly
# one row per country receives 1 even when several rows tie on follower_count.
df_ranked = df_joined.withColumn("row_number", row_number().over(windowSpec))

# The row numbered 1 is the most followed poster of its country.
df_top_user_per_country = (
    df_ranked
    .where(col("row_number") == 1)
    .select("country", "poster_name", "follower_count")
)

display(df_top_user_per_country)

### Find the Country with the Most Followed User

In [None]:
# Input: df_top_user_per_country, built by the "most followed user per country"
# cell (columns: country, poster_name, follower_count). The original code read
# df_top_followers_per_country, a name that cell never defines.

# Highest follower count among the per-country leaders. The dict form of agg()
# is used so the result does not depend on `max` being pyspark's aggregate
# rather than Python's builtin. The value is None if the input frame is empty.
max_global_follower_count = (
    df_top_user_per_country.agg({"follower_count": "max"}).collect()[0][0]
)

# Keep every country whose top user reaches that maximum (ties are retained).
df_country_with_top_follower = (
    df_top_user_per_country
    .filter(col("follower_count") == max_global_follower_count)
    .select("country", "follower_count")
)

# Display the results
display(df_country_with_top_follower)

### Find Most Popular Category per Age Group

In [None]:
df_joined = df_pin.join(df_user, on="ind", how="inner")

# Bucket each row by 'age'. The last bucket is an explicit `> 50` test rather
# than a catch-all otherwise(): a catch-all would also label rows whose age is
# null or below 18 as "50+". Rows that match no bucket get a null age_group,
# which keeps them visible as their own group instead of inflating "50+".
df_with_age_group = df_joined.withColumn(
    "age_group",
    when(col("age").between(18, 24), "18-24")
    .when(col("age").between(25, 35), "25-35")
    .when(col("age").between(36, 50), "36-50")
    .when(col("age") > 50, "50+"),
)

# Number of rows for every (age_group, category) pair.
df_category_count = (
    df_with_age_group
    .groupBy("age_group", "category")
    .agg(count("*").alias("category_count"))
)

# Rank categories inside each age group, largest count first.
windowSpec = Window.partitionBy("age_group").orderBy(col("category_count").desc())

# rank() gives tied categories the same rank, so ties are all kept below.
df_ranked = df_category_count.withColumn("rank", rank().over(windowSpec))

# Keep only the top-ranked category (or categories) of each age group.
df_top_category_per_age_group = (
    df_ranked
    .filter(col("rank") == 1)
    .select("age_group", "category", "category_count")
)

display(df_top_category_per_age_group)

### Find Median Follower Count per Age Group

In [None]:
df_joined = df_pin.join(df_user, on="ind", how="inner")

# Bucket each row by 'age'. The last bucket is an explicit `> 50` test rather
# than a catch-all otherwise(): a catch-all would also label rows whose age is
# null or below 18 as "50+" and skew that group's median. Rows that match no
# bucket get a null age_group.
df_with_age_group = df_joined.withColumn(
    "age_group",
    when(col("age").between(18, 24), "18-24")
    .when(col("age").between(25, 35), "25-35")
    .when(col("age").between(36, 50), "36-50")
    .when(col("age") > 50, "50+"),
)

# Approximate median (50th percentile) of follower_count per age group.
df_median_follower_count = (
    df_with_age_group
    .groupBy("age_group")
    .agg(percentile_approx("follower_count", 0.5).alias("median_follower_count"))
)

# Display the result
display(df_median_follower_count)


### Find New User Count Between 2015 & 2020

In [None]:
# Convert the date_joined column from string to date type
df_user = df_user.withColumn("date_joined", col("date_joined").cast(DateType()))

# Extract the year from the date_joined column
df_with_year = df_user.withColumn("post_year", year(col("date_joined")))

# Filter the DataFrame for years between 2015 and 2020
df_filtered = df_with_year.filter((col("post_year") >= 2015) & (col("post_year") <= 2020))

# Group by post_year and count the number of users
df_number_users_joined = df_filtered.groupBy("post_year").agg(count("*").alias("number_users_joined"))

display(df_number_users_joined)

### Find Median Follower Count of Users Joined Between 2015 & 2020

In [None]:
# Filter users who joined between 2015 and 2020
df_filtered_users = df_user.withColumn("date_joined", col("date_joined").cast("timestamp")) \
                           .withColumn("post_year", year("date_joined")) \
                           .filter((col("post_year") >= 2015) & (col("post_year") <= 2020))

df_joined = df_filtered_users.join(df_pin, 'ind', 'inner')

# Calculate the median follower count per post year
df_median_follower_count = df_joined.groupBy("post_year") \
                                    .agg(expr("percentile_approx(follower_count, 0.5)").alias("median_follower_count"))

display(df_median_follower_count)

### Find Median Follower Count per Joining Year & Age Group

In [None]:
# Filter users who joined between 2015 and 2020
df_users_filtered = df_user.withColumn("date_joined", col("date_joined").cast("timestamp")) \
                           .withColumn("post_year", year(col("date_joined"))) \
                           .filter((col("post_year") >= 2015) & (col("post_year") <= 2020))

# Create the age_group column
df_users_age_grouped = df_users_filtered.withColumn(
    "age_group",
    when(col("age").between(18, 24), "18-24")
    .when(col("age").between(25, 35), "25-35")
    .when(col("age").between(36, 50), "36-50")
    .otherwise("50+")
)

# Join df_users_age_grouped with df_pin on the user identifier to get follower counts
df_joined = df_users_age_grouped.join(df_pin, 'ind', 'inner')

# Group by age_group and post_year, and calculate the median follower count
df_median_followers = df_joined.groupBy("age_group", "post_year") \
                               .agg(expr("percentile_approx(follower_count, 0.5)").alias("median_follower_count"))

display(df_median_followers)