### Importing Libraries & Starting Spark Session

In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

import numpy as np
np.bool = np.bool_

#current notebook name
notebook_name = __session__.replace('.ipynb','')[__session__.rfind('/')+1:] 

# HDFS base paths
hdfs_lakehouse_base_path = 'hdfs://localhost:9000/lakehouse/'
hdfs_warehouse_base_path = 'hdfs://localhost:9000/warehouse'

In [2]:
import os
dependencies = ["org.apache.spark:spark-avro_2.12:3.5.0",
                "io.delta:delta-iceberg_2.12:3.0.0"]
os.environ['PYSPARK_SUBMIT_ARGS']= f"--packages {','.join(dependencies)} pyspark-shell"
os.environ['PYARROW_IGNORE_TIMEZONE'] = 'true'

In [3]:
from pyspark.sql.session import SparkSession

spark = (SparkSession.builder
    .appName(notebook_name)
    .config("spark.log.level","ERROR")
    .config("spark.sql.warehouse.dir",hdfs_warehouse_base_path)
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .enableHiveSupport()
    .getOrCreate()
)

24/12/11 20:24:06 WARN Utils: Your hostname, osbdet resolves to a loopback address: 127.0.0.1; using 10.0.2.15 instead (on interface enp0s3)
24/12/11 20:24:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/osbdet/.jupyter_venv/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/osbdet/.ivy2/cache
The jars for the packages stored in: /home/osbdet/.ivy2/jars
org.apache.spark#spark-avro_2.12 added as a dependency
io.delta#delta-iceberg_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-6deec668-bf59-418e-8dfe-a7cc8868e647;1.0
	confs: [default]
	found org.apache.spark#spark-avro_2.12;3.5.0 in central
	found org.tukaani#xz;1.9 in central
	found io.delta#delta-iceberg_2.12;3.0.0 in central
	found io.delta#delta-spark_2.12;3.0.0 in central
	found io.delta#delta-storage;3.0.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found org.scala-lang.modules#scala-collection-compat_2.12;2.1.1 in central
	found com.github.ben-manes.caffeine#caffeine;2.9.3 in central
	found org.checkerframework#checker-qual;3.19.0 in central
	found com.google.errorprone#error_prone_annotations;2.10.0 in central
:: resolution report :: resolve 573ms :: artifacts dl 33ms
	:: modules in use:
	com.github.ben-ma

### Reading Files & Checking

In [4]:
brz_listings = (spark.read
              .option("header","true")
                .option("inferschema", "true")
              .option("escape","\"")
              .csv(f"{hdfs_lakehouse_base_path}/bronze/airbnb/listings/"))

brz_reviews = (spark.read
              .option("header","true")
                .option("inferschema", "true")
              .option("escape","\"")
              .csv(f"{hdfs_lakehouse_base_path}/bronze/airbnb/reviews/"))

                                                                                

In [None]:
brz_listings.limit(5).toPandas()

In [None]:
brz_reviews.limit(5).toPandas()

In [None]:
brz_listings.printSchema()

### Casting Values

In [5]:
from pyspark.sql.functions import *

# columns_to_convert = ['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'instant_bookable']

# for column in columns_to_convert:
#     slv_listings = brz_listings.withColumn(column, (col(column) == 't').cast("boolean"))

slv_listings = brz_listings.select(
    col("listing_id"),
    col("name"),
    col("host_id"),
    col("host_since"),
    col("host_location"),
    col("host_response_time"),
    col("host_response_rate"),
    col("host_acceptance_rate"),
    col("host_acceptance_rate"),
    col("host_is_superhost").cast("boolean"),
    col("host_total_listings_count"),
    col("host_has_profile_pic").cast("boolean"),
    col("host_identity_verified").cast("boolean"),
    col("neighbourhood"),
    col("district"),
    col("city"),
    col("latitude"),
    col("longitude"),
    col("property_type"),
    col("room_type"),
    col("accommodates"),
    col("bedrooms"),
    col("amenities"),
    col("price"),
    col("minimum_nights"),
    col("maximum_nights"),
    col("review_scores_rating"),
    col("review_scores_accuracy"),
    col("review_scores_cleanliness"),
    col("review_scores_checkin"),
    col("review_scores_communication"),
    col("review_scores_location"),
    col("review_scores_value"),
    col("instant_bookable").cast("boolean")
)

slv_listings.limit(5).toPandas()

Unnamed: 0,listing_id,name,host_id,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_acceptance_rate.1,host_is_superhost,host_total_listings_count,host_has_profile_pic,host_identity_verified,neighbourhood,district,city,latitude,longitude,property_type,room_type,accommodates,bedrooms,amenities,price,minimum_nights,maximum_nights,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable
0,281420,"Beautiful Flat in le Village Montmartre, Paris",1466919,2011-12-03,"Paris, Ile-de-France, France",,,,,False,1.0,True,False,Buttes-Montmartre,,Paris,48.88668,2.33343,Entire apartment,Entire place,2,1.0,"[""Heating"", ""Kitchen"", ""Washer"", ""Wifi"", ""Long term stays allowed""]",53,2,1125,100.0,10.0,10.0,10.0,10.0,10.0,10.0,False
1,3705183,39 mÃÂ² Paris (Sacre CÃ âur),10328771,2013-11-29,"Paris, Ile-de-France, France",,,,,False,1.0,True,True,Buttes-Montmartre,,Paris,48.88617,2.34515,Entire apartment,Entire place,2,1.0,"[""Shampoo"", ""Heating"", ""Kitchen"", ""Essentials"", ""Washer"", ""Dryer"", ""Wifi"", ""Long term stays allowed""]",120,2,1125,100.0,10.0,10.0,10.0,10.0,10.0,10.0,False
2,4082273,"Lovely apartment with Terrace, 60m2",19252768,2014-07-31,"Paris, Ile-de-France, France",,,,,False,1.0,True,False,Elysee,,Paris,48.88112,2.31712,Entire apartment,Entire place,2,1.0,"[""Heating"", ""TV"", ""Kitchen"", ""Washer"", ""Wifi"", ""Long term stays allowed""]",89,2,1125,100.0,10.0,10.0,10.0,10.0,10.0,10.0,False
3,4797344,Cosy studio (close to Eiffel tower),10668311,2013-12-17,"Paris, Ile-de-France, France",,,,,False,1.0,True,True,Vaugirard,,Paris,48.84571,2.30584,Entire apartment,Entire place,2,1.0,"[""Heating"", ""TV"", ""Kitchen"", ""Wifi"", ""Long term stays allowed""]",58,2,1125,100.0,10.0,10.0,10.0,10.0,10.0,10.0,False
4,4823489,Close to Eiffel Tower - Beautiful flat : 2 rooms,24837558,2014-12-14,"Paris, Ile-de-France, France",,,,,False,1.0,True,False,Passy,,Paris,48.855,2.26979,Entire apartment,Entire place,2,1.0,"[""Heating"", ""TV"", ""Kitchen"", ""Essentials"", ""Hair dryer"", ""Washer"", ""Dryer"", ""Bathtub"", ""Wifi"", ""Elevator"", ""Long term stays allowed"", ""Cable TV""]",60,2,1125,100.0,10.0,10.0,10.0,10.0,10.0,10.0,False


### Attempt to Change Amenities

In [12]:
import pyspark.sql.functions as F

test_listings = brz_listings.limit(5)

# Remove the opening and closing brackets
test_listings = test_listings.withColumn("amenities", F.regexp_replace("amenities", r'^\[|\]$', ''))

# Split the string into an array
test_listings = test_listings.withColumn("amenities", F.split(F.col("amenities"), '", "'))

# Remove any residual quotes from each amenity
test_listings = test_listings.withColumn("amenities", F.expr("transform(amenities, x -> regexp_replace(x, '\"', ''))"))

In [13]:
# Explode the amenities array to individual rows
amenities_exploded = test_listings.select(F.explode("amenities").alias("amenity"))

# Clean up any leading/trailing whitespace
amenities_exploded = amenities_exploded.withColumn("amenity", F.trim("amenity"))

# Get the distinct list of amenities
unique_amenities = amenities_exploded.select("amenity").distinct().rdd.flatMap(lambda x: x).collect()

In [14]:
# Function to sanitize column names
def sanitize_column_name(name):
    name = name.lower().replace(' ', '_').replace('-', '_').replace('/', '_')
    name = ''.join(e for e in name if e.isalnum() or e == '_')
    return name

# Iterate over the unique amenities and create columns
for amenity in unique_amenities:
    # Sanitize the amenity name to create a valid column name
    amenity_column = sanitize_column_name(amenity)
    
    # Create a boolean column indicating the presence of the amenity
    test_listings = test_listings.withColumn(amenity_column, F.array_contains("amenities", amenity))

In [16]:
# Drop the original 'amenities' column
test_listings = test_listings.drop("amenities")

In [17]:
test_listings.limit(5).toPandas()

Unnamed: 0,listing_id,name,host_id,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_total_listings_count,host_has_profile_pic,host_identity_verified,neighbourhood,district,city,latitude,longitude,property_type,room_type,accommodates,bedrooms,price,minimum_nights,maximum_nights,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,heating,kitchen,washer,wifi,long_term_stays_allowed,shampoo,essentials,dryer,tv,hair_dryer,bathtub,elevator,cable_tv
0,281420,"Beautiful Flat in le Village Montmartre, Paris",1466919,2011-12-03,"Paris, Ile-de-France, France",,,,f,1.0,t,f,Buttes-Montmartre,,Paris,48.88668,2.33343,Entire apartment,Entire place,2,1.0,53,2,1125,100.0,10.0,10.0,10.0,10.0,10.0,10.0,f,True,True,True,True,True,False,False,False,False,False,False,False,False
1,3705183,39 mÃÂ² Paris (Sacre CÃ âur),10328771,2013-11-29,"Paris, Ile-de-France, France",,,,f,1.0,t,t,Buttes-Montmartre,,Paris,48.88617,2.34515,Entire apartment,Entire place,2,1.0,120,2,1125,100.0,10.0,10.0,10.0,10.0,10.0,10.0,f,True,True,True,True,True,True,True,True,False,False,False,False,False
2,4082273,"Lovely apartment with Terrace, 60m2",19252768,2014-07-31,"Paris, Ile-de-France, France",,,,f,1.0,t,f,Elysee,,Paris,48.88112,2.31712,Entire apartment,Entire place,2,1.0,89,2,1125,100.0,10.0,10.0,10.0,10.0,10.0,10.0,f,True,True,True,True,True,False,False,False,True,False,False,False,False
3,4797344,Cosy studio (close to Eiffel tower),10668311,2013-12-17,"Paris, Ile-de-France, France",,,,f,1.0,t,t,Vaugirard,,Paris,48.84571,2.30584,Entire apartment,Entire place,2,1.0,58,2,1125,100.0,10.0,10.0,10.0,10.0,10.0,10.0,f,True,True,False,True,True,False,False,False,True,False,False,False,False
4,4823489,Close to Eiffel Tower - Beautiful flat : 2 rooms,24837558,2014-12-14,"Paris, Ile-de-France, France",,,,f,1.0,t,f,Passy,,Paris,48.855,2.26979,Entire apartment,Entire place,2,1.0,60,2,1125,100.0,10.0,10.0,10.0,10.0,10.0,10.0,f,True,True,True,True,True,False,True,True,True,True,True,True,True
