In [None]:
# Import URL processing (the urllib.parse submodule is imported explicitly
# because `import urllib` alone does not guarantee it is loaded)
import urllib
import urllib.parse

# Import pyspark functions
from pyspark.sql.functions import *

In [None]:
# List the FileStore tables directory to find the name of the credentials file
credentials_dir = "/FileStore/tables"
dbutils.fs.ls(credentials_dir)

In [None]:
## Load the CSV file holding the AWS keys into a Spark DataFrame
# Reader settings: the source is a CSV file ...
file_type = "csv"
# ... whose first row carries the column names ...
first_row_is_header = "true"
# ... and whose fields are separated by commas
delimiter = ","
# Configure the reader with the settings above and load the credentials file
aws_keys_df = (
    spark.read.format(file_type)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load("/FileStore/tables/authentication_credentials.csv")
)

In [None]:
"""Extract the access key and secret access key from the spark dataframe created. The secret access key will be encoded using urllib.parse.quote for security purposes. safe="" means that every character will be encoded."""
# Get the AWS access key and secret key for 'databricks-user' from the Spark DataFrame.
# Both columns are collected in a single pass instead of running one Spark job per key.
credential_rows = aws_keys_df.where(col('User name') == 'databricks-user') \
    .select('Access key ID', 'Secret access key') \
    .collect()
if not credential_rows:
    # Fail with an explicit message rather than an IndexError on an empty result
    raise ValueError("No row with User name 'databricks-user' found in the credentials file")
ACCESS_KEY = credential_rows[0]['Access key ID']
SECRET_KEY = credential_rows[0]['Secret access key']
# Percent-encode the secret key so that it can be embedded in a URL;
# safe="" leaves no reserved character (such as "/") unencoded
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")

In [None]:
"""Mount the S3 bucket by passing in the S3 URL and the desired mount name to dbutils.fs.mount()."""
# AWS S3 bucket name
AWS_S3_BUCKET = "user-0e35b2767ae1-bucket"
# Mount name for the bucket
MOUNT_NAME = "/mnt/user-0e35b2767ae1-bucket"
# Source url (access key and URL-encoded secret key are embedded in the URL)
SOURCE_URL = "s3n://{0}:{1}@{2}".format(ACCESS_KEY, ENCODED_SECRET_KEY, AWS_S3_BUCKET)
# Mount the drive only if the mount point does not exist yet, so that the cell
# can be re-run (e.g. Run All on an existing cluster) without touching an existing mount
existing_mount_points = [mount.mountPoint for mount in dbutils.fs.mounts()]
if MOUNT_NAME in existing_mount_points:
    print(f"{MOUNT_NAME} is already mounted; skipping dbutils.fs.mount")
else:
    dbutils.fs.mount(SOURCE_URL, MOUNT_NAME)

In [None]:
## Inspect the mounted S3 bucket
# If the bucket was mounted successfully, this lists the contents of its topics folder
topics_listing = dbutils.fs.ls("/mnt/user-0e35b2767ae1-bucket/topics")
display(topics_listing)


path,name,size,modificationTime
dbfs:/mnt/user-0e35b2767ae1-bucket/topics/0e35b2767ae1.geo/,0e35b2767ae1.geo/,0,1699034596388
dbfs:/mnt/user-0e35b2767ae1-bucket/topics/0e35b2767ae1.pin/,0e35b2767ae1.pin/,0,1699034596388
dbfs:/mnt/user-0e35b2767ae1-bucket/topics/0e35b2767ae1.user/,0e35b2767ae1.user/,0,1699034596388


In [None]:
# Load the user topic from the mounted S3 bucket into a Spark DataFrame
# Source format of the files
file_type = "json"
# Ask Spark to infer the schema
infer_schema = "true"
# The *.json glob picks up every JSON file inside the partition folder
file_location = "/mnt/user-0e35b2767ae1-bucket/topics/0e35b2767ae1.user/partition=0/*.json"
# Read the JSON files with the settings above
df_user = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .load(file_location)
)
# Show the DataFrame to check its content
display(df_user)

age,date_joined,first_name,ind,last_name
36,2016-06-01 09:41:09,Christopher,2539,Williams
20,2015-10-23 04:13:23,Alexandria,3964,Alvarado
20,2015-10-23 04:13:23,Alexandria,3539,Alvarado
20,2015-10-23 04:13:23,Alexandria,4087,Alvarado
20,2015-10-23 04:13:23,Alexandria,3790,Alvarado
38,2017-06-18 19:26:47,Christopher,5910,Hopkins
20,2015-10-23 04:13:23,Alexandria,4093,Alvarado
60,2015-12-22 01:35:36,Candice,4207,Christensen
39,2017-09-13 11:21:05,Michelle,8301,Hernandez
40,2017-03-07 21:33:30,Christopher,4761,Norman


In [None]:
# Load the pin topic from the mounted S3 bucket into a Spark DataFrame
# Source format of the files
file_type = "json"
# Ask Spark to infer the schema
infer_schema = "true"
# The *.json glob picks up every JSON file inside the partition folder
file_location = "/mnt/user-0e35b2767ae1-bucket/topics/0e35b2767ae1.pin/partition=0/*.json"
# Read the JSON files with the settings above
df_pin = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .load(file_location)
)
# Show the DataFrame to check its content
display(df_pin)

category,description,downloaded,follower_count,image_src,index,is_image_or_video,poster_name,save_location,tag_list,title,unique_id
christmas,Make your own gingerbread person with our free Christmas craft. 4 pages of accessories to mix & match! #gingerbreadchristmasdecor #gingerbreadcraftspreschool #gingerbreadcraftfo…,1,7k,https://i.pinimg.com/originals/ca/59/b1/ca59b1055ca52521b9ebd01799513b8c.jpg,2539,image,"Mrs. Merry | Free Printables for Kids, Holiday Printables & Party",Local save in /data/christmas,"Christmas Projects For Kids,Christmas To Do List,Christmas Decorations For Kids,Christmas Activities For Kids,Preschool Christmas,Free Christmas Printables,Christmas Books,Christmas Themes,Gingerbread Christmas Decor",Free Kids Printable - Build a Gingerbread Person Craft - Christmas Activities for Kids | Mrs. Merry,cd2c667e-da47-4818-8f94-3def20b90864
education,"Imagine a study guide actually designed for teachers! Because we know you've got a busy life, we've developed a study guide that isn't like other certification materials out the…",1,2M,https://i.pinimg.com/originals/cd/32/e4/cd32e4d70a22d4d010e3220e184ce62f.jpg,3790,image,Walmart,Local save in /data/education,"Early Childhood Education Online,Test Taking Strategies,Teacher Certification,Guided Practice,Test Day,Exam Study,Test Prep,Study Materials,Professional Development","Gace Early Childhood Education (001, 002; 501) Exam Study Guide 2019-2020 : Gace Early Childhood Test Prep and Practice Questions for the Georgia Assessments for the Certification of Educators…",08ff1a00-2be4-487e-8264-825479fa14e3
christmas,"For the coming Christmas, what comes to your mind first? A Christmas tree? Gifts for your children? Maybe your house also needs a gift. Look at these stickers. Small as they are…",1,5k,https://i.pinimg.com/originals/25/aa/e5/25aae534714d3c1f585e24e9fbd0b215.jpg,2036,image,Wear24-7,Local save in /data/christmas,"Christmas Wreaths With Lights,Christmas Balloons,Christmas Party Decorations,Halloween Ornaments,Xmas Ornaments,Christmas Elf,Holiday Decor,Halloween Fairy,Christmas Clipart",Andoer Christmas Decoration Christmas Decoration Stickers Christmas Decorations Indoors Wall Decorations Shop Window Decorations for Christmas - Type 10,e100fdbf-d1a1-4de0-a176-3d2b8f109709
vehicles,"Биткоин колеблется каждый день от 60000 до 65000$, начинай зарабатывать: https://bit.ly/36J0qVC★★★ Подписаться на канал: https://bit.ly/343C0Sr ★★★ Полицейск...",1,41,https://i.pinimg.com/originals/2c/2f/67/2c2f67f988c76514ecf611f6c725c304.jpg,10943,image,Masfa Dkxmrvd,Local save in /data/vehicles,"Lamborghini Veneno,Lamborghini Logo,Carros Lamborghini,Ferrari Laferrari,Luxury Sports Cars,Best Luxury Cars,Sport Cars,Exotic Sports Cars,Dream Cars",Полицейская погоня #1. Не разминулись со встречкой.,c8f5619c-c267-4d50-8231-3994d32fc0de
christmas,IntroductionThis crochet pattern is an adaptation of the 6-Day Star Blanket by Betty McKnit.This pattern creates a slightly more than circular star-shaped skirt with an opening…,1,2k,https://i.pinimg.com/originals/43/d0/96/43d096768765a0ca40c4313ff74a6d06.jpg,1835,image,Betty McKnit Knit and Crochet Pattern Designer & Instructor,Local save in /data/christmas,"Christmas Crochet Blanket,Crochet Christmas Trees,Christmas Crochet Patterns,Holiday Crochet,Christmas Sweaters,Christmas Tree Skirts Patterns,Christmas Ideas,Christmas Decorations,Crochet Ripple",6-Day Star Holiday Tree Skirt - Crochet Pattern by Betty McKnit,2f72cf98-6649-4271-b4a0-e13f15c81dce
christmas,This adorable Family of Christmas Gonks is perfect for spreading holiday spirit. They would make the perfect festive addition to your Christmas Tree. Gnomes are perfect for spre…,1,12,https://i.pinimg.com/originals/31/31/e0/3131e0f708bc562e208f559534718946.jpg,2590,image,PersonalPrintStudio | Personalised Gift Shop | Small UK Business,Local save in /data/christmas,"Led Christmas Tree,Christmas Snowflakes,Christmas Gnome,Cheap Christmas,Christmas Ideas,Christmas Ornaments,Handmade Christmas Decorations,Handmade Home Decor,Festival Decorations",Family Of Gonks LED Christmas tree Decorations - Family Of Gonks,ea46d3ae-c41d-4fcf-ac0c-d04b2d25be91
event-planning,"Purchase this lovely wedding table centerpieces! This product has more than one function – you can use as a welcome & thank you sign, table number, menu card and beautiful decor…",1,2k,https://i.pinimg.com/originals/b9/67/88/b967889dd0fc9b7c782c68cf5cb521a5.jpg,4604,image,Mr Beam Lasers GmbH,Local save in /data/event-planning,"Wood Wedding Decorations,Candle Wedding Centerpieces,Aisle Decorations,Centerpiece Ideas,Decor Wedding,Welcome To Our Wedding,Wedding In The Woods,Wedding Menu Cards,Wedding Events","Custom Laser Cut Wood Wedding Table Decoration | Welcome Sign, Table Number, Menu & Led Candle |",28d8ee8e-f20f-4edc-bfd8-64f3f9e79c01
home-decor,"13 Unique Ways To Reuse and Repurpose Junk, Thrift Store Finds and Other Household Items in Your Home – Let’s Upcycle! Ever looked at your old junk and thought, “That’s …",1,4k,https://i.pinimg.com/originals/e1/70/4b/e1704b4351bba43c92a506d55da7f675.jpg,5891,image,Jen's Clever DIY,Local save in /data/home-decor,"Country Kitchen Farmhouse,Farmhouse Kitchen Decor,Modern Farmhouse,Diy Kitchen Decor,Farmhouse Ideas,Farm House Kitchen Ideas,Kitchen Design,Country Kitchens,Small Kitchens","Upcycle Projects and Ideas - DIY Upcycled Household Items and Junk Into Furniture, Decor and More - Clever DIY Ideas",fab47a27-afdd-4f3c-9aac-dc9fd9fb425c
finance,"How to start investing in 4 easy steps with 3 simple investments. Set it up and basically forget it. Investing money is the way to make money & passive income, build wealth, get…",1,28k,https://i.pinimg.com/originals/bf/75/9a/bf759aaa891c595b0aa4339a5c9e9aa0.jpg,5727,image,Dividends Diversify: Money Matters So Build Wealth & Be Rich,Local save in /data/finance,"Investment Tips,Investment Portfolio,Investing In Stocks,Investing Money,Money Tips,Money Saving Tips,Dividend Investing,Budget Planer,How To Become Rich",How To Build a Vanguard 3 Fund Portfolio Paying Dividends (4 Steps) - Dividends Diversify,e7e82118-5d9b-426e-9454-76a2392a84ed
travel,"Discover a new adventure on the Washington Coast - the Kalaloch Tree Root Cave. Just minutes away from the Kalaloch Lodge this amazing phenomenon is a Pacific Northwest classic,…",1,32k,https://i.pinimg.com/originals/69/17/ca/6917ca48a0857908c2f11b40346871ef.jpg,10031,image,The Mandagies (Pacific Northwest Travel + Photo),Local save in /data/travel,"Arizona Road Trip,Oregon Road Trip,Road Trip Usa,Cool Places To Visit,Places To Travel,Travel Photographie,New Hampshire,Tree Roots,To Infinity And Beyond",The Perfect 24 Hour Itinerary To The Kalaloch Lodge and Tree Root Cave In Washington - The Mandagies,78254898-8f7c-4b4c-bf9c-b483fe77c326


In [None]:
# Load the geo topic from the mounted S3 bucket into a Spark DataFrame
# Source format of the files
file_type = "json"
# Ask Spark to infer the schema
infer_schema = "true"
# The *.json glob picks up every JSON file inside the partition folder
file_location = "/mnt/user-0e35b2767ae1-bucket/topics/0e35b2767ae1.geo/partition=0/*.json"
# Read the JSON files with the settings above
df_geo = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .load(file_location)
)
# Show the DataFrame to check its content
display(df_geo)

country,ind,latitude,longitude,timestamp
British Indian Ocean Territory (Chagos Archipelago),7569,-86.5675,-149.565,2018-10-16 08:40:26
Antarctica (the territory South of 60 deg S),2201,-57.9438,-148.044,2020-02-01 11:55:43
Antarctica (the territory South of 60 deg S),5891,-68.2538,-171.706,2021-03-22 00:33:27
Antarctica (the territory South of 60 deg S),2181,-88.4642,-171.061,2022-05-27 10:07:57
South Georgia and the South Sandwich Islands,2979,29.0623,23.1675,2019-09-05 09:30:36
Lao People's Democratic Republic,7248,67.3156,-151.069,2020-04-05 17:35:15
Lao People's Democratic Republic,5556,-83.7645,19.8741,2022-09-01 01:01:30
Lao People's Democratic Republic,1374,-78.0163,-64.578,2022-06-07 19:13:39
Saint Vincent and the Grenadines,3056,-36.208,59.5877,2020-06-30 01:23:34
Saint Vincent and the Grenadines,2594,16.1987,14.7998,2020-03-23 09:35:00


In [None]:
# Load all three topics in one loop and bind each result to its DataFrame name.
# Maps the DataFrame name to the topic folder it is read from.
topics =  {'df_pin':'0e35b2767ae1.pin', 'df_geo':'0e35b2767ae1.geo', 'df_user':'0e35b2767ae1.user'}
# Reader settings are the same for every topic, so they are set once outside the loop
file_type = "json"
infer_schema = "true"
# Collect the loaded DataFrames by name; assigning to a single loop-local variable
# would silently discard all but the last one
loaded_dataframes = {}
for df_name, topic in topics.items():
    file_location = f"/mnt/user-0e35b2767ae1-bucket/topics/{topic}/partition=0/*.json"
    loaded_dataframes[df_name] = spark.read.format(file_type) \
        .option("inferSchema", infer_schema) \
        .load(file_location)
# Bind the freshly loaded DataFrames to the names used by the rest of the notebook
df_pin = loaded_dataframes['df_pin']
df_geo = loaded_dataframes['df_geo']
df_user = loaded_dataframes['df_user']
# Display each DataFrame once, after all of them have been loaded
display(df_pin)
display(df_geo)
display(df_user)

In [None]:
## Unmount S3 bucket
# "/mnt/mount_name" is a placeholder: replace it with the mount point you want to remove.
# NOTE: this notebook mounts its bucket at "/mnt/user-0e35b2767ae1-bucket" and the
# DataFrames queried in the cells below are loaded from that path, so only unmount it
# after all queries have been run.
UNMOUNT_TARGET = "/mnt/mount_name"
# Only call unmount when the path is currently mounted, so that running the whole
# notebook does not attempt to unmount something that was never mounted
currently_mounted = [mount.mountPoint for mount in dbutils.fs.mounts()]
if UNMOUNT_TARGET in currently_mounted:
    dbutils.fs.unmount(UNMOUNT_TARGET)
else:
    print(f"{UNMOUNT_TARGET} is not mounted; nothing to unmount")

## Cleaning the dataframes and sorting columns


In [None]:
# Clean df_pin, step 1: remove duplicate rows, then rename the 'index' column to 'ind'
df_pin = df_pin.dropDuplicates().withColumnRenamed('index', 'ind')
# Helper that nulls out placeholder / error values in a single column
def convert_to_null(df, column, bad_values):
    '''Replace entries of `column` that match `bad_values` with null.

    Args:
        df: Spark DataFrame to transform.
        column: name of the column to clean.
        bad_values: SQL LIKE pattern identifying the entries to null out
            (so `%` and `_` act as wildcards).

    Returns:
        A new DataFrame in which matching entries of `column` are null and
        all other entries are unchanged.
    '''
    is_bad_value = col(column).like(bad_values)
    cleaned_column = when(is_bad_value, None).otherwise(col(column))
    return df.withColumn(column, cleaned_column)
# Map each column to the SQL LIKE pattern that marks a missing or erroneous entry;
# matching entries are replaced with null below
bad_values_dict = {
    "description": "No description available%",
    "follower_count": "User Info Error",
    "image_src": "Image src error.",
    "poster_name": "User Info Error",
    "tag_list": "N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e",
    "title": "No Title Data Available"
}
# Apply convert_to_null to every (column, pattern) pair of the dictionary
for key, value in bad_values_dict.items():
    df_pin = convert_to_null(df_pin, key, value)
# Expand the 'k' (thousand) and 'M' (million) suffixes so every follower_count entry is a plain number
df_pin = df_pin.withColumn("follower_count", regexp_replace("follower_count", "k", "000"))
df_pin = df_pin.withColumn("follower_count", regexp_replace("follower_count", "M", "000000"))
# Cast all columns with numbers only to integer type
df_pin = df_pin.withColumn("follower_count", col("follower_count").cast('int'))
df_pin = df_pin.withColumn('downloaded', df_pin['downloaded'].cast('int'))
# Strip the "Local save in " prefix so save_location contains only the path
df_pin = df_pin.withColumn("save_location", regexp_replace("save_location", "Local save in ", ""))
# Re-order the dataframe columns.
df_pin = df_pin.select("ind", "unique_id", "title", "description", "follower_count", "poster_name", "tag_list", "is_image_or_video", "image_src", "save_location", "category", "downloaded")
# Drop rows in which every column is null. na.drop returns a new DataFrame,
# so the result has to be assigned back for the rows to actually be removed.
df_pin = df_pin.na.drop(how = "all")

## Clean df_geo data

In [None]:
# Clean df_geo in a single chained transformation
df_geo = (
    # Remove duplicate rows
    df_geo.dropDuplicates()
    # Combine latitude and longitude into one array column, then drop the two originals
    .withColumn("coordinates", array(col("latitude"), col("longitude")))
    .drop('latitude', 'longitude')
    # Cast the timestamp column to Timestamp type
    .withColumn("timestamp", col("timestamp").cast('Timestamp'))
    # Put the columns in their final order
    .select("ind", "country", "coordinates", "timestamp")
)

## Clean df_user data

In [None]:
# Clean df_user in a single chained transformation
df_user = (
    # Remove duplicate rows
    df_user.dropDuplicates()
    # Build user_name from first and last name (space-separated), then drop the two originals
    .withColumn("user_name", concat_ws(' ', "first_name", "last_name"))
    .drop("first_name", "last_name")
    # Cast date_joined to Timestamp type
    .withColumn('date_joined', col('date_joined').cast('Timestamp'))
    # Put the columns in their final order
    .select("ind", "user_name", "age", "date_joined")
)

# Querying the data

Find the most popular Pinterest category people post to based on their country.

In [None]:
# Step 1: join pins with their geo data on 'ind' and count the posts per (country, category)
category_counts_by_country = (
    df_pin.join(df_geo, 'ind')
    .groupBy('country', 'category')
    .agg(count('*').alias('count'))
)
# Step 2: per country keep the largest (count, category) struct; the struct is
# compared on its first field (count) first, so this picks the most frequent category
top_category_by_country = (
    category_counts_by_country
    .groupBy('country')
    .agg(max(struct('count', 'category')).alias('max_count'))
    .select('country', 'max_count.category', 'max_count.count')
)
top_category_by_country.show()

In [None]:
# The next cell shows the categories of every country, ordered by their popularity

In [None]:
# Same wildcard import as the notebook's first cell; kept so this cell also runs on its own
from pyspark.sql.functions import *
# Count the posts per (country, category) pair and list the pairs from most to least popular.
# After the first groupBy each pair occurs exactly once, so a second groupBy/max over the
# same keys is unnecessary; the count is named max_category_count directly to keep the
# output columns unchanged.
df_pin.join(df_geo, 'ind').groupBy('country', 'category')\
    .agg(count('*').alias('max_category_count'))\
    .select('country', 'category', 'max_category_count')\
    .sort(desc("max_category_count")).show()



 Find how many posts each category had between 2018 and 2022

In [None]:
# Same wildcard import as the notebook's first cell; kept so this cell also runs on its own
from pyspark.sql.functions import *

# Join df_pin with df_geo on 'ind', keep the rows whose timestamp year is between
# 2018 and 2022 (inclusive) and count the posts per (post_year, category)
df_category_by_specific_years = df_pin.join(df_geo, on='ind') \
    .where((year('timestamp') >= 2018) & (year('timestamp') <= 2022))\
    .groupBy(year('timestamp').alias('post_year'), 'category') \
    .agg(count('*').alias('category_count'))
# show() returns None, so it is called separately to keep the DataFrame in the variable
df_category_by_specific_years.show()

In [None]:
# Join df_pin with df_geo on 'ind', keep the rows whose timestamp year is between
# 2018 and 2022 (inclusive) and count the posts per category over the whole period.
# Counting directly per category gives the same totals as counting per (year, category)
# and summing afterwards, and yields a properly named post_count column.
df_category_by_specific_years = df_pin.join(df_geo, on='ind')\
    .where((year('timestamp') >= 2018) & (year('timestamp') <= 2022))\
    .groupBy('category')\
    .agg(count('*').alias('post_count'))
# show() returns None, so it is called separately to keep the DataFrame in the variable
df_category_by_specific_years.show()

For each country find the user with the most followers.

In [None]:
# Step 1: for each country, find the poster with the most followers.
# max over a (follower_count, poster_name) struct compares follower_count first,
# so it returns the highest follower count together with the matching poster name.
df_most_followers_by_country = df_pin.join(df_geo, 'ind')\
 .groupBy('country')\
 .agg(max(struct('follower_count', 'poster_name')).alias('max'))\
 .selectExpr('country', 'max.poster_name', 'max.follower_count')\
 .orderBy(col('follower_count').desc())
df_most_followers_by_country.show()
# Step 2: the country whose top poster has the most followers overall.
# The step 1 result is already ordered by follower_count (descending), so its
# first row is reused instead of rebuilding the whole query.
df_most_followers_by_country.limit(1).show()

7. What is the most popular category people post to based on the following age groups:

In [None]:
# Same wildcard import as the notebook's first cell; kept so this cell also runs on its own
from pyspark.sql.functions import *
# Most popular category per age group:
#  1. join pins with users on 'ind' and derive age_group from age
#     (the CASE has no ELSE branch, so ages matching no WHEN get a null age_group)
#  2. count the posts per (age_group, category)
#  3. per age group keep the largest (count, category) struct, i.e. the category with the highest count
df_max_popular_cat_posted_to_by_age_group = df_pin.join(df_user, 'ind').withColumn('age_group', expr("""case
 WHEN age BETWEEN 18 AND 24 THEN '18-24'
 WHEN age BETWEEN 25 AND 35 THEN '25-35'
 WHEN age BETWEEN 36 AND 50 THEN '36-50'
 WHEN age> 50 THEN '+50' END """))\
.groupBy('age_group', 'category') \
.agg(count('*').alias('count')) \
.groupBy('age_group') \
.agg(max(struct('count', 'category')).alias('max_count')) \
.select('age_group', 'max_count.category', 'max_count.count')
# show() returns None, so it is called separately to keep the DataFrame in the variable
df_max_popular_cat_posted_to_by_age_group.show()

8. The median follower count for users in the following age groups:

In [None]:
# Same wildcard import as the notebook's first cell; kept so this cell also runs on its own
from pyspark.sql.functions import *
# Median follower count per age group:
# join pins with users on 'ind', derive age_group from age (ages matching no WHEN
# get a null age_group) and take the approximate 50th percentile of follower_count
df_median_followercount_by_age_group = df_pin.join(df_user, 'ind')\
    .withColumn('age_group', expr("""case\
 WHEN age BETWEEN 18 AND 24 THEN '18-24'
 WHEN age BETWEEN 25 AND 35 THEN '25-35'
 WHEN age BETWEEN 36 AND 50 THEN '36-50'
 WHEN age> 50 THEN '+50' END """))\
    .groupBy("age_group") \
    .agg(percentile_approx("follower_count", 0.5).alias("median_follower_count")) \
    .select("age_group", "median_follower_count")
# show() returns None, so it is called separately to keep the DataFrame in the variable
df_median_followercount_by_age_group.show()

9. Find how many users have joined between 2015 and 2020.

In [None]:
# Same wildcard import as the notebook's first cell; kept so this cell also runs on its own
from pyspark.sql.functions import *
# Count how many users joined in each year from 2015 to 2020 (inclusive).
# The year is taken from df_user's date_joined column, so no join with df_geo is needed.
# NOTE(review): the previous version filtered and grouped on df_geo's timestamp column;
# date_joined is used here, as in the median-by-join-year query further down - confirm
# this matches the intended definition of "joined".
df_users_joined_by_years = df_user \
    .where((year('date_joined') >= 2015) & (year('date_joined') <= 2020))\
    .groupBy(year('date_joined').alias('post_year'))\
    .agg(count('*').alias('number_users_joined'))
# show() returns None, so it is called separately to keep the DataFrame in the variable
df_users_joined_by_years.show()


Find the median follower count of users who have joined between 2015 and 2020.

post_year, a new column that contains only the year from the timestamp column
median_follower_count, a new column containing the desired query output

In [None]:
# Median follower count per join year for users who joined between 2015 and 2020 (inclusive):
# join users with pins on 'ind', group by the year of date_joined and take the
# approximate 50th percentile of follower_count
df_median_followercount_by_years = df_user.join(df_pin, 'ind')\
    .groupBy(year('date_joined').alias('post_year'))\
    .agg(percentile_approx("follower_count", 0.5).alias("median_follower_count")) \
    .select('post_year', 'median_follower_count')\
    .where(col('post_year').between(2015, 2020))  # integer bounds: post_year is an integer year
# show() returns None, so it is called separately to keep the DataFrame in the variable
df_median_followercount_by_years.show()

Find the median follower count of users that have joined between 2015 and 2020, based on which age group they are part of.

In [None]:
# Median follower count per join year (2015 to 2020) and age group
# Age bucket derived from the age column
age_group_column = expr("""case\
 WHEN age BETWEEN 18 AND 24 THEN '18-24'
 WHEN age BETWEEN 25 AND 35 THEN '25-35'
 WHEN age BETWEEN 36 AND 50 THEN '36-50'
 WHEN age> 50 THEN '+50' END """)
# Grouping key: year in which the user joined
joined_year = year('date_joined').alias('post_year')
# Aggregate: approximate 50th percentile of the follower count
median_followers = percentile_approx("follower_count", 0.5).alias("median_follower_count")
# Users joined with their pins, tagged with the age bucket
users_with_pins = df_user.join(df_pin, 'ind').withColumn('age_group', age_group_column)
(
    users_with_pins
    .groupBy(joined_year, 'age_group')
    .agg(median_followers)
    .select('post_year', 'age_group', 'median_follower_count')
    .where(col('post_year').between('2015', '2020'))
    .orderBy('post_year', 'age_group')
    .show()
)