# Access4All — 01_airbnb_data_loader

This notebook loads the raw Airbnb listings dataset and performs basic cleaning.
The processing here is limited to removing irrelevant columns and preparing a clean base dataset for downstream layers.
The SAS token has been removed from this notebook: insert it in the first cell after the imports before running, otherwise the raw data cannot be loaded.


In [0]:

from pyspark.sql.window import Window
from pyspark.sql.types import *
from pyspark.sql import functions as F

### Configure secure access to the Azure Blob Storage container


In [0]:
# Target storage account and container for the raw Airbnb data.
storage_account = "lab94290"
container = "airbnb"

# The real SAS token is intentionally not stored in this notebook; paste it below before running.
SAS_TOKEN_PLACEHOLDER = "<Insert sas token here>"
sas_token = "<Insert sas token here>"

# Fail fast with an explicit message rather than configuring Spark with a
# value that is obviously not a token.
if not sas_token.strip() or sas_token == SAS_TOKEN_PLACEHOLDER:
    raise ValueError(
        "sas_token is not set: replace the placeholder with a valid SAS token before running this cell."
    )

# Drop any leading '?' so the token is stored without it.
sas_token = sas_token.lstrip('?')

# All three settings are scoped to this storage account's DFS endpoint.
account_host = f"{storage_account}.dfs.core.windows.net"
spark.conf.set(f"fs.azure.account.auth.type.{account_host}", "SAS")
spark.conf.set(
    f"fs.azure.sas.token.provider.type.{account_host}",
    "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider",
)
spark.conf.set(f"fs.azure.sas.fixed.token.{account_host}", sas_token)

### Load the raw Airbnb dataset from cloud storage


In [0]:
# Location of the raw listings inside the configured container.
path = f"abfss://{container}@{storage_account}.dfs.core.windows.net/airbnb_1_12_parquet"

# Read the raw listings as a Spark DataFrame using the parquet reader.
air_bnb_data = spark.read.format("parquet").load(path)


### Select only the core columns required for downstream processing


In [0]:
# Columns kept for the v1 base dataset; every other raw column is dropped.
v1_columns = [
    "amenities",
    "details",
    "lat",
    "long",
    "location",
    "country",
    "property_id",
    "url"
]

# Project the raw listings onto the core columns. This defines `airbnb_v1`,
# which the row-filtering step below reads.
airbnb_v1 = air_bnb_data.select(*v1_columns)




### Remove rows missing essential identifying or descriptive information


In [0]:

# Keep a listing only when amenities and details are both non-null and at
# least one identifier (property_id or url) is present.
airbnb_v1_clean = (
    airbnb_v1
    .where(F.col("amenities").isNotNull())
    .where(F.col("details").isNotNull())
    .where(F.col("url").isNotNull() | F.col("property_id").isNotNull())
)


### Derive a basic elevator availability indicator from amenities text


In [0]:

# Flag listings whose lower-cased amenities value contains the substring "elevator".
airbnb_v1_f1 = airbnb_v1_clean.withColumn(
    colName="has_elevator",
    col=F.lower("amenities").contains("elevator"),
)




### Extract on-premises parking availability from amenities text


In [0]:
# Flag listings whose lower-cased amenities value contains "parking on premises".
airbnb_v1_f2 = airbnb_v1_f1.withColumn(
    colName="has_parking_on_premises",
    col=F.lower("amenities").contains("parking on premises"),
)



### Identify listings with a private entrance based on amenities text


In [0]:
# Flag listings whose lower-cased amenities value contains "private entrance".
airbnb_v1_f3 = airbnb_v1_f2.withColumn(
    colName="has_private_entrance",
    col=F.lower("amenities").contains("private entrance"),
)



### Detect explicit allowance of assistance animals from amenities text


In [0]:

# Flag listings whose lower-cased amenities value contains "assistance animals".
airbnb_v1_f4 = airbnb_v1_f3.withColumn(
    colName="assistance_animals_allowed_explicit",
    col=F.lower("amenities").contains("assistance animals"),
)



### Inspect the most frequent cities (used to define the project’s city scope)


In [0]:


# Rank cities by listing count. The city is taken as the trimmed text before
# the first comma of `location`; rows with a null or blank location are skipped.
top_cities = (
    airbnb_v1_f4
    .filter(F.col("location").isNotNull())
    .filter(F.length(F.trim(F.col("location"))) > 0)
    .withColumn("city", F.trim(F.split(F.col("location"), ",")[0]))
    .groupBy("city")
    .count()
    .withColumnRenamed("count", "listing_count")
    .orderBy(F.desc("listing_count"))
    .limit(15)
)
display(top_cities)


### Filter to the final 9 target cities (chosen randomly by us) and persist the v1 city-scope table


In [0]:


# The nine cities that make up the v1 city scope.
final_cities = [
    "Paris", "Rome", "Dubai",
    "San Francisco", "São Paulo", "Los Angeles",
    "Rio de Janeiro", "New York", "Las Vegas",
]

# Derive `city` as the trimmed text before the first comma of `location`,
# then keep only listings in the target cities.
airbnb_v1_9cities = (
    airbnb_v1_f4
    .withColumn("city", F.trim(F.split(F.col("location"), ",")[0]))
    .filter(F.col("city").isin(*final_cities))
)

# Persist the result as a Delta table, replacing any existing contents.
(
    airbnb_v1_9cities.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("access4all_airbnb_v1_9cities")
)
