# 1. Environment Setup

In [1]:
import pandas as pd
# Do not truncate wide text columns when DataFrames are displayed.
pd.set_option("display.max_colwidth", None)

import numpy as np
# Bind np.bool to np.bool_ -- presumably for libraries that still reference
# the np.bool alias; TODO confirm this shim is still required.
np.bool = np.bool_

In [2]:
# Current notebook name: the last path component of __session__ without the
# ".ipynb" extension (__session__ is expected to be provided by the notebook
# runtime; it is not defined in this file).
# The basename is sliced out *before* ".ipynb" is removed, so the slice index
# stays valid even when ".ipynb" also occurs in a parent directory name.
notebook_name = __session__[__session__.rfind("/") + 1:].replace(".ipynb", "")

In [3]:
# HDFS base locations (HDFS at localhost:9000), one per storage area.
# Note: the lakehouse and tmp paths end with "/", the warehouse path does not.
hdfs_lakehouse_base_path = "hdfs://localhost:9000/lakehouse/"
hdfs_warehouse_base_path = "hdfs://localhost:9000/warehouse"
hdfs_temp_base_path = "hdfs://localhost:9000/tmp/"

In [4]:
# Create Spark Session
import os

# Maven coordinates handed to Spark through the --packages submit argument.
dependencies = [
    "org.apache.spark:spark-avro_2.12:3.5.0",
    "io.delta:delta-iceberg_2.12:3.0.0",
]

# Both environment variables are set before the session is created below.
os.environ["PYSPARK_SUBMIT_ARGS"] = "--packages {} pyspark-shell".format(",".join(dependencies))
os.environ["PYARROW_IGNORE_TIMEZONE"] = "true"

from pyspark.sql.session import SparkSession

# Apply the session settings one by one, in the same order as listed.
_session_builder = SparkSession.builder.appName(notebook_name)
for _conf_key, _conf_value in (
    ("spark.log.level", "ERROR"),
    ("spark.sql.warehouse.dir", hdfs_warehouse_base_path),
    ("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"),
    ("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"),
):
    _session_builder = _session_builder.config(_conf_key, _conf_value)

# Hive support is enabled before the session is obtained (or reused if one exists).
spark = _session_builder.enableHiveSupport().getOrCreate()

24/12/11 15:23:42 WARN Utils: Your hostname, osbdet resolves to a loopback address: 127.0.0.1; using 10.0.2.15 instead (on interface enp0s3)
24/12/11 15:23:42 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/osbdet/.jupyter_venv/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/osbdet/.ivy2/cache
The jars for the packages stored in: /home/osbdet/.ivy2/jars
org.apache.spark#spark-avro_2.12 added as a dependency
io.delta#delta-iceberg_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-9397b77b-f2b4-4e19-bef6-505c065bcdc7;1.0
	confs: [default]
	found org.apache.spark#spark-avro_2.12;3.5.0 in central
	found org.tukaani#xz;1.9 in central
	found io.delta#delta-iceberg_2.12;3.0.0 in central
	found io.delta#delta-spark_2.12;3.0.0 in central
	found io.delta#delta-storage;3.0.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found org.scala-lang.modules#scala-collection-compat_2.12;2.1.1 in central
	found com.github.ben-manes.caffeine#caffeine;2.9.3 in central
	found org.checkerframework#checker-qual;3.19.0 in central
	found com.google.errorprone#error_prone_annotations;2.10.0 in central
:: resolution report :: resolve 1716ms :: artifacts dl 42ms
	:: modules in use:
	com.github.ben-m

# WORKING WITH THE DATA

### Listings data processing

In [5]:
# Read the listings CSV files from the bronze layer: the first line is the
# header, column types are inferred, and '"' is the escape character.
listings_brz = (
    spark.read
    .options(header="true", inferschema="true", escape="\"")
    .csv(f"{hdfs_lakehouse_base_path}/bronze/airbnb/listings_data/")
)

                                                                                

In [6]:
# Preview two rows of the listings data as a pandas DataFrame.
listings_brz.limit(2).toPandas()

Unnamed: 0,listing_id,name,host_id,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_total_listings_count,...,minimum_nights,maximum_nights,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable
0,281420,"Beautiful Flat in le Village Montmartre, Paris",1466919,2011-12-03,"Paris, Ile-de-France, France",,,,f,1,...,2,1125,100,10,10,10,10,10,10,f
1,3705183,39 mÂ² Paris (Sacre CÅ“ur),10328771,2013-11-29,"Paris, Ile-de-France, France",,,,f,1,...,2,1125,100,10,10,10,10,10,10,f


In [7]:
from pyspark.sql.functions import col, lower, when

def add_amenity_columns(df, amenities_list, column_name="amenities"):
    """Add one boolean flag column per amenity to ``df``.

    Each flag column is named after its amenity. A flag is True when the
    lower-cased text of ``column_name`` contains the lower-cased amenity as a
    literal substring, and False otherwise (a null source value also yields
    False, because the ``otherwise`` branch is taken).

    Args:
        df: Spark DataFrame containing the column ``column_name``.
        amenities_list: Iterable of amenity names. Each name is matched as
            plain text (not as a regular expression) and is also used as the
            name of the flag column that is added or replaced.
        column_name: Name of the column whose text is searched.
            Defaults to "amenities".

    Returns:
        A DataFrame with the flag columns added. ``df`` itself is returned
        when ``amenities_list`` is empty.
    """
    amenities = list(amenities_list)
    if not amenities:
        return df

    # Lower-case the source text once and reuse the expression for every flag.
    searchable_text = lower(col(column_name))

    # The amenity is lower-cased to agree with the lower-cased text and is
    # matched literally, so characters such as "(" or "+" in an amenity name
    # are searched for as written instead of being parsed as regex syntax.
    flag_columns = {
        amenity: when(searchable_text.contains(amenity.lower()), True).otherwise(False)
        for amenity in amenities
    }

    # A single withColumns call adds every flag in one projection instead of
    # stacking one projection per amenity.
    return df.withColumns(flag_columns)

# Amenity keywords to look for; each one becomes a boolean column of the same name
amenities_list = ["pool", "wifi", "balcony", "parking"]

# Add the amenity flag columns (listings_brz is rebound to the returned DataFrame)
listings_brz = add_amenity_columns(listings_brz, amenities_list)

def convert_to_boolean(df, columns):
    """Replace the given columns of ``df`` with boolean columns.

    A value becomes True when it equals "t"; every other value becomes False
    (a null value also yields False, because the ``otherwise`` branch is
    taken).

    Args:
        df: Spark DataFrame containing every column named in ``columns``.
        columns: Iterable of column names to convert.

    Returns:
        A DataFrame in which each listed column has been replaced by its
        boolean version. ``df`` itself is returned when ``columns`` is empty.
    """
    column_names = list(columns)
    if not column_names:
        return df

    boolean_columns = {
        name: when(col(name) == "t", True).otherwise(False)
        for name in column_names
    }

    # A single withColumns call replaces all columns in one projection
    # instead of stacking one projection per column.
    return df.withColumns(boolean_columns)

# Flag columns to turn into booleans (compared against "t" by convert_to_boolean)
columns_to_convert = [
    "host_is_superhost", 
    "host_has_profile_pic", 
    "host_identity_verified",
    "instant_bookable"
]

# Replace each listed column with its boolean version (listings_brz is rebound)
listings_brz = convert_to_boolean(listings_brz, columns_to_convert)

# Preview two rows of the result as a pandas DataFrame
listings_brz.limit(2).toPandas()


Unnamed: 0,listing_id,name,host_id,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_total_listings_count,...,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,pool,wifi,balcony,parking
0,281420,"Beautiful Flat in le Village Montmartre, Paris",1466919,2011-12-03,"Paris, Ile-de-France, France",,,,False,1,...,10,10,10,10,10,False,False,True,False,False
1,3705183,39 mÂ² Paris (Sacre CÅ“ur),10328771,2013-11-29,"Paris, Ile-de-France, France",,,,False,1,...,10,10,10,10,10,False,False,True,False,False


In [8]:
# Print the column names, types and nullability of the transformed listings data.
listings_brz.printSchema()

root
 |-- listing_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_since: date (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: double (nullable = true)
 |-- host_acceptance_rate: double (nullable = true)
 |-- host_is_superhost: boolean (nullable = false)
 |-- host_total_listings_count: integer (nullable = true)
 |-- host_has_profile_pic: boolean (nullable = false)
 |-- host_identity_verified: boolean (nullable = false)
 |-- neighbourhood: string (nullable = true)
 |-- district: string (nullable = true)
 |-- city: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- property_type: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- accommodates: integer (nullable = true)
 |-- bedrooms: integer (nullable = true)
 |-- amenities: string (nullable = true)
 |-- 

In [9]:
# Start from a clean slate: drop the airbnb schema if it exists, then recreate it.
# Note: CASCADE also drops every table already registered in this schema.
spark.sql("DROP SCHEMA IF EXISTS airbnb CASCADE")
spark.sql("CREATE SCHEMA IF NOT EXISTS airbnb")

# Save the listings as the Delta table airbnb.listings, stored under the
# silver layer; an existing table is overwritten.
listings_brz.write.saveAsTable(
    "airbnb.listings",
    format="delta",
    mode="overwrite",
    path=f"{hdfs_lakehouse_base_path}/silver/airbnb/listings/",
)

DataFrame[]

DataFrame[]

                                                                                

### Reviews data processing

In [10]:
# Read the reviews CSV files from the bronze layer: the first line is the
# header, column types are inferred, and '"' is the escape character.
reviews_brz = (
    spark.read
    .options(header="true", inferschema="true", escape="\"")
    .csv(f"{hdfs_lakehouse_base_path}/bronze/airbnb/reviews_data/")
)

                                                                                

In [11]:
# Preview ten rows of the reviews data as a pandas DataFrame.
reviews_brz.limit(10).toPandas()

Unnamed: 0,listing_id,review_id,date,reviewer_id
0,11798,330265172,2018-09-30,11863072
1,15383,330103585,2018-09-30,39147453
2,16455,329985788,2018-09-30,1125378
3,17919,330016899,2018-09-30,172717984
4,26827,329995638,2018-09-30,17542859
5,74561,330089224,2018-09-30,173044789
6,140355,330194958,2018-09-30,160093807
7,162163,329980859,2018-09-30,94026758
8,167998,329950677,2018-09-30,35388162
9,178188,330213008,2018-09-30,3652511


In [12]:
# Print the column names, types and nullability of the reviews data.
reviews_brz.printSchema()

root
 |-- listing_id: integer (nullable = true)
 |-- review_id: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- reviewer_id: integer (nullable = true)



In [13]:
# Save the reviews as the Delta table airbnb.reviews, stored under the
# silver layer; an existing table is overwritten.
reviews_brz.write.saveAsTable(
    "airbnb.reviews",
    format="delta",
    mode="overwrite",
    path=f"{hdfs_lakehouse_base_path}/silver/airbnb/reviews/",
)

                                                                                