In [0]:
# MILE 6. MOUNT THE S3 BUCKET TO THE DATABRICKS ACCOUNT.

In [0]:
# Import libraries

# URL processing (import the submodule explicitly: `import urllib` alone does not guarantee that `urllib.parse` is loaded)
import urllib
import urllib.parse

# pyspark functions
from pyspark.sql.functions import *

# Location of the Delta table that holds the AWS keys in Databricks
delta_table_path = "dbfs:/user/hive/warehouse/authentication_credentials"

# Read the Delta table to a Spark DataFrame
aws_keys_df = spark.read.format("delta").load(delta_table_path)

# Fetch both keys with a single Spark action so they are guaranteed to come from the same row
# (two separate collect() calls launch two jobs and give no such guarantee).
aws_keys_row = aws_keys_df.select("Access key ID", "Secret access key").first()
if aws_keys_row is None:
    # first() returns None on an empty table; fail with a clear message instead of an IndexError
    raise ValueError(f"No AWS credentials found in Delta table: {delta_table_path}")

# Get the AWS access key and secret key from the fetched row
ACCESS_KEY = aws_keys_row["Access key ID"]
SECRET_KEY = aws_keys_row["Secret access key"]
# URL-encode the secret key; safe="" also escapes "/" so the key can be embedded in a URL
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")

# With the keys available, the S3 bucket can be mounted by handing dbutils.fs.mount()
# the S3 source URL and the desired mount name.

# Name of the AWS S3 bucket
AWS_S3_BUCKET = "user-0e172e8c4bc3-bucket"
# Mount name to use for the bucket
MOUNT_NAME = "/mnt/mount_S3_PDP_mile6"
# Source URL built from the access key, the URL-encoded secret key and the bucket name
SOURCE_URL = f"s3n://{ACCESS_KEY}:{ENCODED_SECRET_KEY}@{AWS_S3_BUCKET}"
# Mount call, left commented out (presumably because the bucket is already mounted)
# dbutils.fs.mount(SOURCE_URL, MOUNT_NAME)

# To check that the S3 bucket was mounted successfully, this command was run:
# display(dbutils.fs.ls("/mnt/mount_S3_PDP_mile6/topics/0e172e8c4bc3.pin/partition=0/"))

# The output is not displayed here, to save the evaluator from scrolling.

In [0]:
# Use code below to unmount
# dbutils.fs.unmount("/mnt/mount_S3_PDP_mile6")

In [0]:
%sql
-- Disable format checks during the reading of Delta tables
SET spark.databricks.delta.formatCheck.enabled=false

key,value
spark.databricks.delta.formatCheck.enabled,False


In [0]:
# Define a function that reads in JSONs from mounted S3 bucket topic 0e172e8c4bc3.<your-topic-ending>. Replace the topic ending with 'pin', 'geo' or 'user' when calling this function. Function returns a DataFrame.

def spark_read_from_s3(topic_ending, mount_name="/mnt/mount_S3_PDP_mile6", user_id="0e172e8c4bc3", partition=0):
    """Read all JSON files of one topic from the mounted S3 bucket into a DataFrame.

    Args:
        topic_ending: Suffix of the topic name, e.g. 'pin', 'geo' or 'user'.
        mount_name: Mount point of the S3 bucket; a trailing "/" is ignored.
        user_id: Prefix of the topic name (the part before the dot).
        partition: Partition number whose directory is read.

    Returns:
        The DataFrame produced by the JSON reader for
        <mount_name>/topics/<user_id>.<topic_ending>/partition=<partition>/*.json.
    """
    # The trailing *.json glob selects every JSON file inside the partition directory
    file_location = (
        f"{mount_name.rstrip('/')}/topics/{user_id}.{topic_ending}"
        f"/partition={partition}/*.json"
    )
    # Ask Spark to infer the schema while reading the JSON files
    return (
        spark.read.format("json")
        .option("inferSchema", "true")
        .load(file_location)
    )

# Load the three topics (pin, geo, user) into their own DataFrames, in that order.
df_pin, df_geo, df_user = (
    spark_read_from_s3(topic) for topic in ("pin", "geo", "user")
)

# The data can be checked with display(); the calls are commented out here so they do not overload the notebook for the reader.
# display(df_pin)
# display(df_geo)
# display(df_user)

In [0]:
# Remove fully duplicated rows from each of the three DataFrames
df_pin, df_geo, df_user = (
    df_pin.dropDuplicates(),
    df_geo.dropDuplicates(),
    df_user.dropDuplicates(),
)

# Row count after de-duplication; the original DataFrame had 2094 rows, for comparison.
df_pin.count()

In [0]:
%run Users/kasalotas@yahoo.com/transformation_functions 

In [0]:
# MILE 7. TASK 1. Data transformation on df_pin dataframe.

# Run the cleaning tasks through clean_pin() from transformation_functions
df_pin = clean_pin(df_pin)
df_pin.printSchema()
# Preview only the first 10 rows via .limit() rather than the whole DataFrame
df_p = df_pin.limit(10)
display(df_p)

ind,unique_id,title,description,follower_count,poster_name,tag_list,is_image_or_video,image_src,save_location,category
2793,65b4a439-3ada-49f3-9aa9-a572d5c311ba,Pink Glitter Jar Instructions,Learn how to make 4 unique DIY pink glitter jars with our helpful step-by-step instructions and video tutorial. They are perfect for practicing mindfulness with kids of all ages…,119000,Crafts and Activities for Kids with Fireflies and Mud Pies,"Diy Crafts Videos,Diy Crafts To Sell,Diy Crafts For Kids,Fun Crafts,Kids Diy,Sell Diy,Decor Crafts,Glitter Projects For Kids,Christmas Decorations Diy Crafts",video,https://i.pinimg.com/videos/thumbnails/originals/68/1d/f5/681df5c917cd0ab7da8c9064bdcbbbb4.0000001.jpg,/data/diy-and-crafts,diy-and-crafts
4922,af8f10c0-9444-4b17-bf76-ae38449467fe,Reusable Led Bobo Balloon Flower Bouquet Party Decorations - Blue ROSE BOUQUET / 1 Pack,"LED Luminous Balloon Rose Bouquet, Light Transparent Balloons with Flower, Ball Fake Roses for DIY Bouquets Wedding Party Gift Home Decoration Valentine's Day 2021 Gift Surprise…",5000,ifyousayido,"Plastic Balloons,Bubble Balloons,Balloon Flowers,Balloon Bouquet,Birthday Balloon Decorations,Birthday Balloons,Yellow Rose Bouquet,Transparent Balloons,Valentines Balloons",image,https://i.pinimg.com/originals/64/56/54/6456541f16077250c63596dfe4c011dc.jpg,/data/event-planning,event-planning
2267,54de78c0-8ab3-4e96-bfc8-584a2046e5fc,20 DIY Christmas Gift Baskets for Your Loved Ones,These DIY baskets for Christmas. These DIY gift ideas for Christmas will help you get a better gift for your loved ones and transform the way you celebrate Christmas. #christmas…,76000,Craftsy Hacks,"Diy Christmas Presents,Inexpensive Christmas Gifts,Neighbor Christmas Gifts,Christmas Gifts For Couples,Decoration Christmas,Handmade Christmas Gifts,Christmas Christmas,Christmas Gift Ideas,Thoughtful Christmas Gifts",image,https://i.pinimg.com/originals/c8/70/af/c870af77dc7ef58ffa07f57ae55a7a43.png,/data/christmas,christmas
4482,8489479d-b91b-46b8-8088-b29958cd2503,4 Tips for Booking your First Client without a Portfolio (2021) - EVENT PLANNING CERTIFICATE,Today I'm sharing four simple strategies event planners can use right now to book your first paying client without a portfolio.,884,Jody-Ann Rowe | Marketing Coach for Wedding Professionals,"Event Planning Template,Event Planning Quotes,Event Planning Checklist,Event Planning Business,Event Planning Design,Party Planning,Wedding Planning,Wedding Ideas,Business Ideas",image,https://i.pinimg.com/originals/6d/ec/c5/6decc533f545a95eed6536640f419c1c.png,/data/event-planning,event-planning
5503,b50c58c0-5b43-4f31-9975-75d4ee7c6979,The Richest Man In Babylon Summary In 10 minutes Or Less {2019},"The richest man in Babylon summary, here are an in-depth review and summary of the best finance book. here is the fastest step by step proven method to become filthy rich.",21000,Dr Breathe Easy Finance | Personal Finance Tips | Making Money | Saving tips | Budgeting | Investing | Retirement,"Finance Books,Finance Tips,Finance Quotes,Money Tips,Money Saving Tips,Money Budget,Money Hacks,Ways To Get Rich,Thing 1",image,https://i.pinimg.com/originals/04/aa/1f/04aa1f2f863d05ce7983fbf1420f7823.png,/data/finance,finance
3831,61649785-f7d5-4548-b621-93a5a145f35f,"Struggling Learners and Language Immersion Education : Research-Based, Practitioner-Informed Responses to Educators' Top Questions","This handbook provides dual language and immersion educators with rich information and practical resources that address common concerns with children who struggle with language,…",2000000,Walmart,"Language Immersion,Spanish Immersion,Language Acquisition,Bilingual Education,Dual Language,Background Information,Research,Literacy,No Response",image,https://i.pinimg.com/originals/a7/e2/72/a7e272ef258aaf2a9abe99781edd1c6b.jpg,/data/education,education
4837,06eabdd7-d008-42c8-858a-0737e94a5187,28 Bridal Shower Gifts That Aren't on the Couple's Registry,"The role of maid of honor comes with its fair share of duties, one of which includes the giving of some seriously one-of-a-kind gifts. While the engagement gifts are all fun and…",10000,Modern MOH | How to be a Head Bridesmaid in Charge,"Wedding Gifts For Bride And Groom,Diy Wedding Gifts,Wedding Shower Gifts,Bride Gifts,Wedding Ideas,Wedding Inspiration,Wedding Cards,Wedding Stuff,Dream Wedding",image,https://i.pinimg.com/originals/3f/01/b9/3f01b95c57e326885500cee84d6da3bd.jpg,/data/event-planning,event-planning
485,cf669122-4305-4807-a0dc-97a0f4481125,"Collagraph Printmaking with Kids Using Wooden Blocks - Kids Art Classes, Camps, Parties and Events - Small Hands Big Art","This collagraph printmaking project had all the makings of a the perfect art experience! Big heavy blocks, movement, color theory, texture, & pattern! Not to mention the end res…",1000,Sharon Wojcik,"Kids Printmaking,Collagraph Printmaking,Kids Art Class,Art For Kids,Art 2nd Grade,Atelier D Art,Art Lessons Elementary,Kindergarten Art Lessons,Collaborative Art",image,https://i.pinimg.com/originals/c3/71/03/c371035dd83164a6324d066bb011c669.jpg,/data/art,art
3419,d0b80187-0171-49b2-8ee4-572984244f65,Easy Christmas Tree Crafts Ideas for toddlers and preschoolers | Sharing Our Experiences,Easy Christmas tree Craft Ideas for toddlers and preschoolers. Engage your kids in these DIY,3000,Kids Crafts & Free Preschool Printables- Sharing Our Experiences,"Christmas Crafts For Kids To Make,Christmas Tree Painting,Christmas Activities For Kids,Easy Christmas Crafts For Toddlers,Kid Activities,Christmas Handprint Crafts,Christmas Tree Crafts,Christmas Baby,Xmas Tree",image,https://i.pinimg.com/originals/69/f0/75/69f075939d4449dffa69519756c30e26.png,/data/diy-and-crafts,diy-and-crafts
2126,adc18caf-a7ed-49c8-b328-87a9fc92551f,JOLLY INSPIRATION TO STYLING TIERED TRAYS FOR HAPPY HOLIDAYS,"Styling tiered trays to help your home say happy holidays is what we have in store for you today. We have a fun little surprise in this round up. Mini Christmas trees, peppermin…",46000,"Life on Summerhill | Home, Holiday Decor & DIY Website","Dollar Tree Christmas,Mini Christmas Tree,Christmas Holidays,Christmas Ornaments,Christmas Mugs,Purple Christmas,Magical Christmas,Father Christmas,Christmas Wreaths",image,https://i.pinimg.com/originals/58/70/95/5870956a1dc6a43e8091afbeba71a427.jpg,/data/christmas,christmas


In [0]:
# MILE 7. TASK 2. Data transformation on df_geo dataframe. 

# Run the cleaning tasks through clean_geo() from transformation_functions
df_geo = clean_geo(df_geo)
df_geo.printSchema()
# Preview only the first 10 rows rather than the whole DataFrame
df_g = df_geo.limit(10)
display(df_g)


ind,country,coordinates,timestamp
9935,Bouvet Island (Bouvetoya),"-88.516, -178.811",2022-02-04T05:27:15.000+0000
6906,Central African Republic,"-88.5425, -157.374",2019-07-12T06:22:04.000+0000
10794,Cocos (Keeling) Islands,"-89.5236, -154.567",2022-01-01T02:26:50.000+0000
10052,Central African Republic,"14.7195, -130.921",2020-01-03T01:10:57.000+0000
6660,Bouvet Island (Bouvetoya),"-54.5264, 73.4883",2019-02-01T11:00:19.000+0000
5084,Slovakia (Slovak Republic),"87.8011, 53.0249",2022-01-06T22:16:07.000+0000
10020,Cocos (Keeling) Islands,"-88.6883, -86.0607",2021-08-22T19:54:37.000+0000
7209,Central African Republic,"-88.5425, -157.374",2020-02-12T11:09:50.000+0000
6740,Saint Pierre and Miquelon,"-19.6583, 90.4449",2022-09-29T22:58:19.000+0000
7370,Saint Pierre and Miquelon,"-24.2107, 138.463",2018-04-05T08:02:42.000+0000


In [0]:
# MILE 7. TASK 3. Data transformation on df_user dataframe.

# Run the cleaning tasks through clean_user() from transformation_functions
df_user = clean_user(df_user)
df_user.printSchema()
# Preview only the first 10 rows rather than the whole DataFrame
df_u = df_user.limit(10)
display(df_u)

ind,user_name,age,date_joined
9426,Nicholas Mcdaniel,45,2017-05-24T01:41:26.000+0000
9332,Gregory Carpenter,23,2016-07-14T11:45:49.000+0000
4765,Danielle Gonzalez,36,2017-02-16T16:01:34.000+0000
7268,Alejandra Acevedo,20,2015-11-24T21:01:23.000+0000
3716,Catherine Ferrell,21,2017-01-02T03:01:09.000+0000
7116,Michael Mcconnell,59,2017-06-01T10:55:10.000+0000
1335,Benjamin Campbell,20,2015-11-16T13:25:08.000+0000
9355,Elizabeth Johnson,33,2016-12-09T23:34:15.000+0000
10204,Heather Gonzalez,41,2016-06-14T11:16:18.000+0000
10119,Chelsea Gonzalez,43,2016-07-21T15:25:08.000+0000


In [0]:
# MILE 7. TASK 4. Category with the most pins by country.

In [0]:
from pyspark.sql.functions import col, count, row_number
from pyspark.sql.window import Window

# Join dataframes df_pin and df_geo on "ind" to obtain the necessary columns in one table
df_pin_geo = df_pin.join(df_geo, df_pin["ind"] == df_geo["ind"], how="inner")

# Count the pins of every (country, category) pair. One aggregation is enough: grouping the
# counts again by the same keys and taking max() returns exactly the same rows.
result = df_pin_geo.groupBy("country", "category").agg(count("category").alias("category_count"))

# Use a window function to number the categories within each partition (country), highest
# count first. The category name is a tie-breaker so the chosen row is the same on every run.
windowSpec = Window.partitionBy("country").orderBy(col("category_count").desc(), col("category").asc())
final_result = (
    result
    .withColumn("rank", row_number().over(windowSpec))
    .filter(col("rank") == 1)
    .drop("rank")
)

# Show the final result
display(final_result)

country,category,category_count
Afghanistan,education,14
Albania,art,29
Algeria,quotes,38
American Samoa,tattoos,12
Andorra,tattoos,11
Angola,diy-and-crafts,4
Anguilla,diy-and-crafts,6
Antarctica (the territory South of 60 deg S),tattoos,6
Antigua and Barbuda,art,4
Argentina,tattoos,11


In [0]:
# MILE 7. TASK 5. The most popular category each year.

In [0]:
from pyspark.sql.functions import year

# Combine the pin and user rows that share the same "ind" value
df_pin_user = df_pin.join(df_user, df_pin["ind"] == df_user["ind"], how="inner")

# Tag every row with the year of "date_joined" and keep 2016-2018 only
# (per the original note, the data only reaches 2018).
# NOTE(review): "post_year" is derived from the user's date_joined, not from a post timestamp - confirm this is intended.
filtered_df = (
    df_pin_user
    .withColumn("post_year", year(col("date_joined")))
    .filter(col("post_year").between(2016, 2018))
)

# Number of posts per (category, post_year), listed newest year first and,
# within a year, from the largest count to the smallest.
result = (
    filtered_df
    .groupBy("category", "post_year")
    .count()
    .withColumnRenamed("count", "category_count")
    .orderBy(col("post_year").desc(), col("category_count").desc())
)

# Render the resulting DataFrame
display(result)


category,post_year,category_count
vehicles,2017,31
event-planning,2017,30
finance,2017,28
christmas,2017,27
mens-fashion,2017,26
beauty,2017,21
home-decor,2017,20
education,2017,18
travel,2017,17
art,2017,16


In [0]:
# MILE 7. TASK 6. User with the most followers in each country.

In [0]:
from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window

# STEP 1.
# For each country, keep the poster with the highest follower count.
follower_df = df_pin_geo.select("country", "poster_name", "follower_count")
# Number the posters of every country from most to fewest followers; poster_name breaks ties
# so the selected row is the same on every run.
country_window = Window.partitionBy("country").orderBy(col("follower_count").desc(), col("poster_name").asc())
max_followers = (
    follower_df
    .withColumn("rank", row_number().over(country_window))
    .filter(col("rank") == 1)
    .drop("rank")
)

# STEP 2.
# From the per-country winners, find the country with the greatest follower count.
# limit() returns a new one-row DataFrame; country breaks ties deterministically.
result = (
    max_followers
    .drop("poster_name")
    .orderBy(col("follower_count").desc(), col("country").asc())
    .limit(1)
)
display(result)

country,follower_count
Anguilla,15000000


In [0]:
# MILE 7. TASK 7. The most popular category for different age groups.

In [0]:
from pyspark.sql.functions import col, count, row_number, when
from pyspark.sql.window import Window


def age_group(age):
    """Map an age column to its age-range label using native Spark expressions.

    Args:
        age: The age Column, or the name of the age column as a string.

    Returns:
        A string Column holding '18-24', '25-35', '36-50' or '50+'. Any age that matches
        none of the ranges (including a null age) yields '' instead of raising inside a
        Python UDF.
    """
    if isinstance(age, str):
        age = col(age)
    return (
        when((age >= 18) & (age <= 24), "18-24")              # No users <18
        .when((age >= 25) & (age <= 35), "25-35")
        .when((age >= 36) & (age <= 50), "36-50")
        .when(age > 50, "50+")
        .otherwise("")
    )


# Use withColumn to create a new column with age ranges based on the "age" column.
df_age_group = df_pin_user.withColumn("age_group", age_group(df_pin_user.age))

# Count the pins of every (age_group, category) pair. One aggregation is enough: grouping the
# counts again by the same keys and taking max() returns exactly the same rows.
result = df_age_group.groupBy("age_group", "category").agg(count("category").alias("category_count"))

# Use a window function to number the categories within each partition (age_group), highest
# count first. The category name is a tie-breaker so the chosen row is the same on every run.
windowSpec = Window.partitionBy("age_group").orderBy(col("category_count").desc(), col("category").asc())
final_result = (
    result
    .withColumn("rank", row_number().over(windowSpec))
    .filter(col("rank") == 1)
    .drop("rank")
)

display(final_result)

age_group,category,category_count
18-24,tattoos,88
25-35,christmas,55
36-50,finance,40
50+,vehicles,21


In [0]:
# MILE 7. TASK 8. The median follower count for users based on age group.

In [0]:
# Use previously defined df_age_group to select the relevant columns
followers_by_age = df_age_group.select("age_group", "follower_count")

# Group by age_group and take the approximate median follower count of each group.
# The sort is applied after the aggregation: ordering the input before groupBy() does not
# guarantee the order of the aggregated rows.
median_follower_count = (
    followers_by_age
    .groupBy("age_group")
    .agg(expr("percentile_approx(follower_count, 0.5) as median"))
    .orderBy(col("age_group").asc())
)
display(median_follower_count)


age_group,median
18-24,127000
25-35,22000
36-50,7000
50+,1000


In [0]:
# MILE 7. TASK 9.

In [0]:
# Keep only the rows whose "date_joined" year lies in 2015-2020 (both ends included)
filtered_df = df_pin_user.filter(year(col("date_joined")).between(2015, 2020))

# Add the year of "date_joined" as the "post_year" column
post_year_df = filtered_df.withColumn("post_year", year(col("date_joined")))

# Per post_year, count the distinct poster_name values; this treats poster_name as unique to each account
result = (
    post_year_df
    .groupBy("post_year")
    .agg(countDistinct(col("poster_name")).alias("number_users_joined"))
)

# Render the resulting DataFrame
display(result)


post_year,number_users_joined
2015,300
2016,675
2017,275


In [0]:
# MILE 7. TASK 10.

In [0]:
# Take the two needed columns from post_year_df (df_pin_user already filtered by year)
follower_df = post_year_df.select(col("post_year"), col("follower_count"))

# Approximate median follower count for each post_year
median_follower_count = (
    follower_df
    .groupBy("post_year")
    .agg(expr("percentile_approx(follower_count, 0.5) as median_follower_count"))
)
display(median_follower_count)

post_year,median_follower_count
2015,150000
2016,19000
2017,3000


In [0]:
# MILE 7. TASK 11.

In [0]:
# Add an age_group column to the previously defined post_year_df (joined pin and user data with a post_year column)
df_post_year_age_group = post_year_df.withColumn("age_group", age_group(post_year_df.age))

# Select the columns that should be displayed
follower_df = df_post_year_age_group.select("post_year", "age_group", "follower_count")

# Group by post_year and age_group and take the approximate median follower count of each group.
# The sort is applied after the aggregation: ordering the input before groupBy() does not
# guarantee the order of the aggregated rows.
median_follower_count = (
    follower_df
    .groupBy("post_year", "age_group")
    .agg(expr("percentile_approx(follower_count, 0.5) as median_follower_count"))
    .orderBy(col("post_year").desc(), col("age_group").asc())
)

display(median_follower_count)

post_year,age_group,median_follower_count
2017,18-24,12000
2017,25-35,2000
2017,36-50,3000
2017,50+,1000
2016,18-24,46000
2016,25-35,21000
2016,36-50,8000
2016,50+,1000
2015,18-24,267000
2015,25-35,42000
