In [0]:
from pyspark.sql.types import*
from pyspark.sql.functions import*
# Sample passengers as flat tuples:
# (first name, last name, passport, gender, meal preference, DOB).
# The DOB values are stored encoded and are decoded later in the notebook.
passenger_rows = [
    ("Phil", "Hawkins", "R-68106487", "M", "Veg", "MTk4NS0wNy0xNw=="),
    ("Jay", "Singh", "P-64549526", "F", "Any", "MTk5NS0wMi0wOA=="),
    ("Russell", "Constantine", "H-44102824", "M", "Any", "MjAwMS0wNS0yMw=="),
    ("Phil", "Singh", "D-61495909", "M", "Veg", "MTk4NS0wOS0xMQ=="),
    ("Robert", "Banner", "U-49850866", "M", "Any", "MTk5MS0wNC0yNA=="),
    ("Heather", "Singh", "S-26384273", "M", "Any", "MTk4Ny0xMS0xOA=="),
    ("Billy", "Kohli", "J-42970426", "M", "Veg", "MTk5MS0wMS0zMA=="),
    ("Mary", "Singh", "A-69169562", "F", "Veg", "MTk5Ny0wMy0wNQ=="),
]

# Expand each tuple into the nested dictionary layout described by `schema`.
data = [
    {
        "FirstName": first_name,
        "LastName": last_name,
        "MoreDetails": {
            "Passport": passport,
            "PersonalDetails": {
                "Gender": gender,
                "MealPreference": meal_preference,
                "DOB": dob,
            },
        },
    }
    for first_name, last_name, passport, gender, meal_preference, dob in passenger_rows
]

# Schema built inside-out: innermost struct first, then its parents.
personal_details_type = StructType([
    StructField("Gender", StringType(), True),
    StructField("MealPreference", StringType(), True),
    StructField("DOB", StringType(), True),
])
more_details_type = StructType([
    StructField("Passport", StringType(), True),
    StructField("PersonalDetails", personal_details_type, True),
])
schema = StructType([
    StructField("FirstName", StringType(), True),
    StructField("LastName", StringType(), True),
    StructField("MoreDetails", more_details_type, True),
])

# Create the nested DataFrame and display it without truncating cell values.
df = spark.createDataFrame(data, schema)
df.show(truncate=False)

+---------+-----------+----------------------------------------+
|FirstName|LastName   |MoreDetails                             |
+---------+-----------+----------------------------------------+
|Phil     |Hawkins    |{R-68106487, {M, Veg, MTk4NS0wNy0xNw==}}|
|Jay      |Singh      |{P-64549526, {F, Any, MTk5NS0wMi0wOA==}}|
|Russell  |Constantine|{H-44102824, {M, Any, MjAwMS0wNS0yMw==}}|
|Phil     |Singh      |{D-61495909, {M, Veg, MTk4NS0wOS0xMQ==}}|
|Robert   |Banner     |{U-49850866, {M, Any, MTk5MS0wNC0yNA==}}|
|Heather  |Singh      |{S-26384273, {M, Any, MTk4Ny0xMS0xOA==}}|
|Billy    |Kohli      |{J-42970426, {M, Veg, MTk5MS0wMS0zMA==}}|
|Mary     |Singh      |{A-69169562, {F, Veg, MTk5Ny0wMy0wNQ==}}|
+---------+-----------+----------------------------------------+



In [0]:
from pyspark.sql.types import*
from pyspark.sql.functions import*
# Nested fields to lift to the top level: output column name -> nested path.
nested_field_paths = {
    "Passport": "MoreDetails.Passport",
    "Gender": "MoreDetails.PersonalDetails.Gender",
    "MealPreference": "MoreDetails.PersonalDetails.MealPreference",
    "DOB": "MoreDetails.PersonalDetails.DOB",
}

# Keep the two name columns as they are and alias every nested field
# to its flat column name.
df_flat = df.select(
    col("FirstName"),
    col("LastName"),
    *[col(path).alias(name) for name, path in nested_field_paths.items()]
)

# Display the flattened rows, then the resulting schema.
df_flat.show(truncate=False)
df_flat.printSchema()

+---------+-----------+----------+------+--------------+----------------+
|FirstName|LastName   |Passport  |Gender|MealPreference|DOB             |
+---------+-----------+----------+------+--------------+----------------+
|Phil     |Hawkins    |R-68106487|M     |Veg           |MTk4NS0wNy0xNw==|
|Jay      |Singh      |P-64549526|F     |Any           |MTk5NS0wMi0wOA==|
|Russell  |Constantine|H-44102824|M     |Any           |MjAwMS0wNS0yMw==|
|Phil     |Singh      |D-61495909|M     |Veg           |MTk4NS0wOS0xMQ==|
|Robert   |Banner     |U-49850866|M     |Any           |MTk5MS0wNC0yNA==|
|Heather  |Singh      |S-26384273|M     |Any           |MTk4Ny0xMS0xOA==|
|Billy    |Kohli      |J-42970426|M     |Veg           |MTk5MS0wMS0zMA==|
|Mary     |Singh      |A-69169562|F     |Veg           |MTk5Ny0wMy0wNQ==|
+---------+-----------+----------+------+--------------+----------------+

root
 |-- FirstName: string (nullable = true)
 |-- LastName: string (nullable = true)
 |-- Passport: string (nu

In [0]:
# Where the workbook goes: first the local file system, then DBFS.
local_path = "/tmp/passenger_details.xlsx"
dbfs_path = "dbfs:/FileStore/tables/passenger_details.xlsx"

# Convert the flattened Spark DataFrame to pandas and write it out as an
# Excel file, leaving out the pandas index column.
result_pd = df_flat.toPandas()
result_pd.to_excel(local_path, index=False)

# Copy the local file into DBFS.
dbutils.fs.cp(f"file:{local_path}", dbfs_path)

# Print the URL to download the file.
print("Download URL: https://community.cloud.databricks.com/files/tables/passenger_details.xlsx")


Download URL: https://community.cloud.databricks.com/files/tables/passenger_details.xlsx


Transform layer - Decrypt (Base64-decode) the DOB.

In [0]:
# decode_base64(encoded_str): A function that takes a base64 encoded string as input, decodes it, and returns the decoded string.
# base64.b64decode(encoded_str): Decodes the base64 encoded string to bytes.
# decoded_bytes.decode('utf-8'): Converts the decoded bytes to a UTF-8 string.
%pip install openpyxl

import base64
import base64
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from datetime import datetime


def decode_base64(encoded_str):
    """Decode a Base64-encoded value into UTF-8 text.

    Args:
        encoded_str: Base64 text (str or bytes-like), or None.

    Returns:
        The decoded string, or None when the input is None, is not valid
        Base64, or does not decode to valid UTF-8.
    """
    if encoded_str is None:
        return None
    try:
        return base64.b64decode(encoded_str).decode('utf-8')
    except (TypeError, ValueError):
        # binascii.Error (bad Base64) and UnicodeDecodeError are both
        # ValueError subclasses. Return None rather than the exception text
        # so an error message is never stored as if it were a decoded value.
        return None

# Wrap decode_base64 as a Spark UDF that returns strings.
decode_base64_udf = udf(decode_base64, returnType=StringType())

# Replace the DOB column with its decoded value and display the result.
df_flat_decrypted = df_flat.withColumn("DOB", decode_base64_udf(col("DOB")))
df_flat_decrypted.show(truncate=False)

# Where the workbook goes: first the local file system, then DBFS.
local_path = "/tmp/decrypted_dob.xlsx"
dbfs_path = "dbfs:/FileStore/tables/decrypted_dob.xlsx"

# Convert to pandas and write an Excel file without the pandas index column.
result_pd_df = df_flat_decrypted.toPandas()
result_pd_df.to_excel(local_path, index=False)

# Copy the local file into DBFS.
dbutils.fs.cp(f"file:{local_path}", dbfs_path)

# Print the URL to download the file.
url = "https://community.cloud.databricks.com/files/tables/decrypted_dob.xlsx"
print("Download URL:", url)



Python interpreter will be restarted.
Collecting openpyxl
  Using cached openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Collecting et-xmlfile
  Using cached et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.5
Python interpreter will be restarted.
+---------+-----------+----------+------+--------------+----------+
|FirstName|LastName   |Passport  |Gender|MealPreference|DOB       |
+---------+-----------+----------+------+--------------+----------+
|Phil     |Hawkins    |R-68106487|M     |Veg           |1985-07-17|
|Jay      |Singh      |P-64549526|F     |Any           |1995-02-08|
|Russell  |Constantine|H-44102824|M     |Any           |2001-05-23|
|Phil     |Singh      |D-61495909|M     |Veg           |1985-09-11|
|Robert   |Banner     |U-49850866|M     |Any           |1991-04-24|
|Heather  |Singh      |S-26384273|M     |Any           |1987-11-18|
|Billy    |Kohli      |J-42970426|M     |Veg 

Agg layer - Find the passengers who were travelling on their birthday, by travel month.


In [0]:
from pyspark.sql.functions import col, month, dayofmonth, to_date

# Load trips data
df_trips = spark.read.csv("/FileStore/tables/travelmonth.csv", header=True, inferSchema=True)

# Convert TravelDate to a date column using the yyyy-MM-dd pattern
df_trips = df_trips.withColumn("TravelDate", to_date(col("TravelDate"), "yyyy-MM-dd"))

# Rename the trip columns so they cannot clash with passenger columns after the join
df_trips_renamed = df_trips.withColumnRenamed("FirstName", "TripFirstName") \
                           .withColumnRenamed("LastName", "TripLastName") \
                           .withColumnRenamed("Revenue", "TripRevenue") \
                           .withColumnRenamed("Origin", "TripOrigin") \
                           .withColumnRenamed("Destination", "TripDestination")

# Join passengers to their trips on Passport.
# The passenger side is df_flat_decrypted (the flattened frame with the decoded
# DOB). The name used here before, df_passengers_with_dob, is not defined
# anywhere in this notebook, so the cell failed on a fresh kernel.
df_combined = df_flat_decrypted.join(df_trips_renamed, on="Passport", how="inner")

# Add columns for travel month and birth month/day
df_combined = df_combined.withColumn("TravelMonth", month(col("TravelDate"))) \
                         .withColumn("BirthMonth", month(col("DOB"))) \
                         .withColumn("BirthDay", dayofmonth(col("DOB")))

# Keep trips whose travel month and day-of-month both match the passenger's birth date
df_birthday_travelers = df_combined.filter(
    (col("TravelMonth") == col("BirthMonth")) &
    (col("BirthDay") == dayofmonth(col("TravelDate")))
)

# Select the reporting columns, restoring the original trip column names
df_result = df_birthday_travelers.select(
    col("FirstName"),
    col("LastName"),
    col("TripRevenue").alias("Revenue"),
    col("TripOrigin").alias("Origin"),
    col("TripDestination").alias("Destination"),
    col("DOB"),
    col("TravelMonth")
)

# Show the result
df_result.show(truncate=False)


+---------+--------+-------+---------+------------+----------+-----------+
|FirstName|LastName|Revenue|Origin   |Destination |DOB       |TravelMonth|
+---------+--------+-------+---------+------------+----------+-----------+
|Phil     |Hawkins |67.73  |Atlanta  |Phoenix     |1985-07-17|7          |
|Jay      |Singh   |349.07 |Dallas   |Charlotte   |1995-02-08|2          |
|Heather  |Singh   |384.76 |Las Vegas|Phoenix     |1987-11-18|11         |
|Billy    |Kohli   |148.16 |Dallas   |Philadelphia|1991-01-30|1          |
+---------+--------+-------+---------+------------+----------+-----------+



In [0]:
import pandas as pd

# Where the workbook goes: first the local file system, then DBFS.
local_path = "/tmp/Birthday_Travelers.xlsx"
dbfs_path = "dbfs:/FileStore/tables/Birthday_Travelers.xlsx"

# Convert the birthday-traveller result to pandas and write an Excel file
# without the pandas index column.
result_pd = df_result.toPandas()
result_pd.to_excel(local_path, index=False)

# Copy the local file into DBFS.
dbutils.fs.cp(f"file:{local_path}", dbfs_path)

# Print the URL to download the file.
print("Download URL: https://community.cloud.databricks.com/files/tables/Birthday_Travelers.xlsx")


Download URL: https://community.cloud.databricks.com/files/tables/Birthday_Travelers.xlsx


agg layer - What is the most common meal preference for travel month Jan-2021?

In [0]:
# Import the functions this cell uses so it does not depend on wildcard
# imports made in earlier cells.
from pyspark.sql.functions import col, year

# Filter for travel month January 2021
df_january_2021 = df_combined.filter((col("TravelMonth") == 1) & (year(col("TravelDate")) == 2021))

# Group by MealPreference and count occurrences
meal_preference_counts = df_january_2021.groupBy("MealPreference").count()

# Keep the single row with the highest count
most_common_meal_preference = meal_preference_counts.orderBy(col("count").desc()).limit(1)

# Show the result
most_common_meal_preference.show()


+--------------+-----+
|MealPreference|count|
+--------------+-----+
|           Veg|  143|
+--------------+-----+



In [0]:
import pandas as pd
from pyspark.sql.functions import col, year

# Where the workbook goes: first the local file system, then DBFS.
local_path = "/tmp/Most_Common_Meal_Preference_Jan2021.xlsx"
dbfs_path = "dbfs:/FileStore/tables/Most_Common_Meal_Preference_Jan2021.xlsx"

# Restrict the joined data to trips made in January 2021.
is_january = col("TravelMonth") == 1
is_2021 = year(col("TravelDate")) == 2021
df_january_2021 = df_combined.filter(is_january & is_2021)

# Count trips per meal preference and keep the row with the highest count.
meal_preference_counts = df_january_2021.groupBy("MealPreference").count()
most_common_meal_preference = meal_preference_counts.orderBy(col("count").desc()).limit(1)

# Convert to pandas and write an Excel file without the pandas index column.
most_common_meal_preference_pd = most_common_meal_preference.toPandas()
most_common_meal_preference_pd.to_excel(local_path, index=False)

# Copy the local file into DBFS.
dbutils.fs.cp(f"file:{local_path}", dbfs_path)

# Print the URL to download the file.
print("Download URL: https://community.cloud.databricks.com/files/tables/Most_Common_Meal_Preference_Jan2021.xlsx")


Download URL: https://community.cloud.databricks.com/files/tables/Most_Common_Meal_Preference_Jan2021.xlsx
