## 1. Initialize Spark Session (Local Mode)


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import time
import pandas as pd
spark = SparkSession.builder \
    .appName("AirbnbPricePredictor") \
    .master("local[*]") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.sql.shuffle.partitions", "16") \
    .config("spark.driver.extraJavaOptions", "-Djava.security.manager=allow") \
    .config("spark.executor.extraJavaOptions", "-Djava.security.manager=allow") \
    .getOrCreate()
print(f"spark session created")
print(f"version: {spark.version}")
print(f"master: {spark.sparkContext.master}")


25/11/12 15:56:10 WARN Utils: Your hostname, MacBook-Pro-110.local resolves to a loopback address: 127.0.0.1; using 172.16.61.1 instead (on interface bridge101)
25/11/12 15:56:10 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/12 15:56:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


spark session created
version: 3.5.0
master: local[*]


## 2. Load Data from CSV Files


In [2]:
BASE_PATH = "../data/raw/"
DATA_PATHS = {
    'nyc': f"{BASE_PATH}nyc/listings.csv",
    'la': f"{BASE_PATH}la/listings.csv",
    'paris': f"{BASE_PATH}paris/listings.csv"
}


### 2.1 Load NYC Listings


In [3]:
print("loading nyc listings")
start_time = time.time()
try:
    df_nyc = spark.read \
        .option("header", True) \
        .option("inferSchema", True) \
        .option("escape", "\"") \
        .option("multiLine", True) \
        .csv(DATA_PATHS['nyc'])
    df_nyc = df_nyc.withColumn("city", lit("NYC"))
    load_time = time.time() - start_time
    nyc_count = df_nyc.count()
    print(f"nyc loaded: {nyc_count:,} rows in {load_time:.2f}s")
    print(f"columns: {len(df_nyc.columns)}")
    nyc_columns = set(df_nyc.columns)
    HAS_NYC = True
except Exception as e:
    print(f"nyc failed: {e}")
    df_nyc = None
    nyc_columns = set()
    HAS_NYC = False


loading nyc listings
nyc loaded: 36,111 rows in 1.92s
columns: 80


### 2.2 Load LA Listings


In [4]:
print("loading la listings")
start_time = time.time()
try:
    df_la = spark.read \
        .option("header", True) \
        .option("inferSchema", True) \
        .option("escape", "\"") \
        .option("multiLine", True) \
        .csv(DATA_PATHS['la'])
    df_la = df_la.withColumn("city", lit("LA"))
    load_time = time.time() - start_time
    la_count = df_la.count()
    print(f"la loaded: {la_count:,} rows in {load_time:.2f}s")
    print(f"columns: {len(df_la.columns)}")
    la_columns = set(df_la.columns)
    HAS_LA = True
except Exception as e:
    print(f"la failed: {e}")
    df_la = None
    la_columns = set()
    HAS_LA = False


loading la listings
la loaded: 45,886 rows in 0.49s
columns: 80


### 2.3 Load Paris Listings


In [5]:
print("loading paris listings")
start_time = time.time()
try:
    df_paris = spark.read \
        .option("header", True) \
        .option("inferSchema", True) \
        .option("escape", "\"") \
        .option("multiLine", True) \
        .csv(DATA_PATHS['paris'])
    df_paris = df_paris.withColumn("city", lit("Paris"))
    load_time = time.time() - start_time
    paris_count = df_paris.count()
    print(f"paris loaded: {paris_count:,} rows in {load_time:.2f}s")
    print(f"columns: {len(df_paris.columns)}")
    paris_columns = set(df_paris.columns)
    HAS_PARIS = True
except Exception as e:
    print(f"paris failed: {e}")
    df_paris = None
    paris_columns = set()
    HAS_PARIS = False


loading paris listings


                                                                                

paris loaded: 91,031 rows in 0.79s
columns: 76


## 3. Schema Consistency Check


In [6]:
all_column_sets = []
if HAS_NYC:
    all_column_sets.append(nyc_columns)
if HAS_LA:
    all_column_sets.append(la_columns)
if HAS_PARIS:
    all_column_sets.append(paris_columns)
if all_column_sets:
    common_columns = set.intersection(*all_column_sets)
    print(f"datasets loaded: ", end="")
    loaded = []
    if HAS_NYC: loaded.append("NYC")
    if HAS_LA: loaded.append("LA")
    if HAS_PARIS: loaded.append("Paris")
    print(",".join(loaded))
    print(f"common columns: {len(common_columns)}")
else:
    print("no datasets loaded")
    raise Exception("no data loaded")


datasets loaded: NYC,LA,Paris
common columns: 76


25/11/12 15:56:23 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


## 4. Union All Cities


In [7]:
common_cols_list = sorted(list(common_columns))
df_all_cities = None
if HAS_NYC:
    df_all_cities = df_nyc.select(common_cols_list)
    print("added nyc")
if HAS_LA:
    df_la_common = df_la.select(common_cols_list)
    if df_all_cities is None:
        df_all_cities = df_la_common
    else:
        df_all_cities = df_all_cities.union(df_la_common)
    print("added la")
if HAS_PARIS:
    df_paris_common = df_paris.select(common_cols_list)
    if df_all_cities is None:
        df_all_cities = df_paris_common
    else:
        df_all_cities = df_all_cities.union(df_paris_common)
    print("added paris")
df_all_cities.cache()
total_count = df_all_cities.count()
print(f"\nmulti-city dataset created")
print(f"total rows: {total_count:,}")
print(f"total columns: {len(df_all_cities.columns)}")


added nyc
added la
added paris


25/11/12 15:56:27 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


multi-city dataset created
total rows: 173,028
total columns: 76


                                                                                

## 5. Data Quality Checks


### 5.1 Row Counts by City


In [8]:
print("listings by city:")
city_counts = df_all_cities.groupBy("city").count().orderBy(desc("count"))
city_counts.show()
city_counts_pd = city_counts.toPandas()
city_counts_pd['percentage'] = (city_counts_pd['count'] / total_count * 100).round(2)
print("\npercentage distribution:")
print(city_counts_pd)


listings by city:
+-----+-----+
| city|count|
+-----+-----+
|Paris|91031|
|   LA|45886|
|  NYC|36111|
+-----+-----+


percentage distribution:
    city  count  percentage
0  Paris  91031       52.61
1     LA  45886       26.52
2    NYC  36111       20.87


### 5.2 Null Value Analysis


In [9]:
key_columns = [
    'id', 'name', 'host_id', 'latitude', 'longitude',
    'room_type', 'property_type', 'price', 'bedrooms', 'beds',
    'accommodates', 'number_of_reviews', 'review_scores_rating'
]
key_columns = [col for col in key_columns if col in df_all_cities.columns]
null_analysis = []
for col_name in key_columns:
    null_count = df_all_cities.filter(col(col_name).isNull()).count()
    null_pct = (null_count / total_count * 100)
    null_analysis.append({
        'column': col_name,
        'null_count': null_count,
        'null_pct': float(f"{null_pct:.2f}")
    })
null_df = pd.DataFrame(null_analysis).sort_values('null_pct', ascending=False)
print("top 10 columns with null values:")
print(null_df.head(10))


top 10 columns with null values:
                  column  null_count  null_pct
7                  price       54788     31.66
9                   beds       54586     31.55
12  review_scores_rating       47544     27.48
8               bedrooms       16319      9.43
0                     id           0      0.00
1                   name           0      0.00
2                host_id           0      0.00
3               latitude           0      0.00
4              longitude           0      0.00
5              room_type           0      0.00


### 5.3 Price Field Inspection


In [10]:
print("price field samples:")
df_all_cities.select("city", "price").show(10, truncate=False)


price field samples:
+----+-------+
|city|price  |
+----+-------+
|NYC |$66.00 |
|NYC |NULL   |
|NYC |NULL   |
|NYC |NULL   |
|NYC |$76.00 |
|NYC |$97.00 |
|NYC |NULL   |
|NYC |$60.00 |
|NYC |$425.00|
|NYC |$240.00|
+----+-------+
only showing top 10 rows



### 5.4 Room Type Distribution


In [11]:
print("room type distribution by city:")
df_all_cities.groupBy("city", "room_type").count() \
    .orderBy("city", desc("count")).show(15)


room type distribution by city:
+-----+---------------+-----+
| city|      room_type|count|
+-----+---------------+-----+
|   LA|Entire home/apt|33550|
|   LA|   Private room|11624|
|   LA|    Shared room|  387|
|   LA|     Hotel room|  325|
|  NYC|Entire home/apt|19167|
|  NYC|   Private room|16350|
|  NYC|     Hotel room|  351|
|  NYC|    Shared room|  243|
|Paris|Entire home/apt|81068|
|Paris|   Private room| 8850|
|Paris|     Hotel room|  732|
|Paris|    Shared room|  381|
+-----+---------------+-----+



### 5.5 Summary Statistics


In [12]:
numeric_cols = [
    'accommodates', 'bedrooms', 'beds',
    'minimum_nights', 'maximum_nights',
    'number_of_reviews', 'availability_365'
]
numeric_cols = [c for c in numeric_cols if c in df_all_cities.columns]
print("summary statistics:")
df_all_cities.select(numeric_cols).describe().show()


summary statistics:
+-------+------------------+-----------------+------------------+------------------+-----------------+-----------------+------------------+
|summary|      accommodates|         bedrooms|              beds|    minimum_nights|   maximum_nights|number_of_reviews|  availability_365|
+-------+------------------+-----------------+------------------+------------------+-----------------+-----------------+------------------+
|  count|            173028|           156709|            118442|            173028|           173028|           173028|            173028|
|   mean|3.3431698915782415|1.475122679616359|1.9022981712568177|32.834396745035484|12960.45935339945|27.72357075155466|160.51575467554383|
| stddev|2.1432718095711665|1.058008962890606|  1.39913442648272| 81.36317198621622|5162692.638368316|67.90362858233858|142.31959745102614|
|    min|                 1|                0|                 0|                 1|                1|                0|                 0|


## 6. Register Temporary Views


In [13]:
df_all_cities.createOrReplaceTempView("listings_raw")
print("listings_raw view created")
if HAS_NYC:
    df_nyc.select(common_cols_list).createOrReplaceTempView("nyc_raw")
    print("nyc_raw view created")
if HAS_LA:
    df_la.select(common_cols_list).createOrReplaceTempView("la_raw")
    print("la_raw view created")
if HAS_PARIS:
    df_paris.select(common_cols_list).createOrReplaceTempView("paris_raw")
    print("paris_raw view created")


listings_raw view created
nyc_raw view created
la_raw view created
paris_raw view created


### Test SQL Query


In [14]:
result = spark.sql("""
    SELECT
        city,
        COUNT(*) as total_listings,
        COUNT(DISTINCT neighbourhood_cleansed) as neighborhoods,
        COUNT(DISTINCT host_id) as unique_hosts
    FROM listings_raw
    GROUP BY city
    ORDER BY total_listings DESC
""")
result.show()


+-----+--------------+-------------+------------+
| city|total_listings|neighborhoods|unique_hosts|
+-----+--------------+-------------+------------+
|Paris|         91031|           20|       67331|
|   LA|         45886|          266|       23025|
|  NYC|         36111|          224|       21382|
+-----+--------------+-------------+------------+



## 7. Save Data to Parquet (For Next Notebooks)

In [15]:
output_path = "../data/processed/listings_raw.parquet"
print("saving data to parquet")
df_all_cities.write.mode("overwrite").parquet(output_path)
print(f"data saved to {output_path}")
print(f"rows: {df_all_cities.count():,}")
print(f"columns: {len(df_all_cities.columns)}")

saving data to parquet




data saved to ../data/processed/listings_raw.parquet
rows: 173,028
columns: 76


                                                                                

25/11/12 19:23:24 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 735005 ms exceeds timeout 120000 ms
25/11/12 19:23:24 WARN SparkContext: Killing executors is not supported by current scheduler.
25/11/12 19:23:32 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$