In [22]:
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder \
    .appName("Airbnb EDA") \
    .getOrCreate()

In [42]:
# Define the directory path to the cleaned CSV files
file_path = "datasets/Final_cleaned_dataset/mad_final_cleaned_data_csv.csv"

# Load the cleaned data from the directory
df_mad = spark.read.csv(file_path, header=True, inferSchema=True)

df_mad.createOrReplaceTempView("airbnb_listings")

df_mad.show(5)

+----------------------------+-------------------+--------------------+--------------------+---------+---------------+------------------------------+-----------------+-----------------+-------------------+-----+-------+----------------+--------+----------------+---------+----------+------------------+-----------------+--------------------+-----------------+----------------+
|neighbourhood_group_cleansed|                 id|         listing_url|                name|  host_id|      host_name|calculated_host_listings_count|host_is_superhost|         latitude|          longitude|price|kitchen|patio or balcony|elevator|air conditioning|long_term|short_term|possible_long_term|number_of_reviews|review_scores_rating|room_type_encoded|bedrooms_encoded|
+----------------------------+-------------------+--------------------+--------------------+---------+---------------+------------------------------+-----------------+-----------------+-------------------+-----+-------+----------------+--------+-

In [43]:
# Display the schema to verify data types
df_mad.printSchema()

root
 |-- neighbourhood_group_cleansed: string (nullable = true)
 |-- id: long (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- name: string (nullable = true)
 |-- host_id: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- calculated_host_listings_count: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- price: double (nullable = true)
 |-- kitchen: string (nullable = true)
 |-- patio or balcony: integer (nullable = true)
 |-- elevator: integer (nullable = true)
 |-- air conditioning: integer (nullable = true)
 |-- long_term: integer (nullable = true)
 |-- short_term: integer (nullable = true)
 |-- possible_long_term: integer (nullable = true)
 |-- number_of_reviews: integer (nullable = true)
 |-- review_scores_rating: double (nullable = true)
 |-- room_type_encoded: integer (nullable = true)
 |-- bedrooms_encoded: integer (nullable = true)


In [44]:
 df_mad.describe().show()



+-------+----------------------------+--------------------+--------------------+--------------------+--------------------+--------------+------------------------------+-----------------+--------------------+-------------------+-----------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+------------------+--------------------+------------------+------------------+
|summary|neighbourhood_group_cleansed|                  id|         listing_url|                name|             host_id|     host_name|calculated_host_listings_count|host_is_superhost|            latitude|          longitude|            price|            kitchen|   patio or balcony|           elevator|   air conditioning|          long_term|         short_term| possible_long_term| number_of_reviews|review_scores_rating| room_type_encoded|  bedrooms_encoded|
+-------+----------------------------+--------------------+-------------

                                                                                

## Summary of Columns:
    neighbourhood_group_cleansed: The neighborhood where the listing is located.
    id: Unique identifier for the listing.
    listing_url: URL of the listing.
    name: Name of the listing.
    host_id: Unique identifier for the host.
    host_name: Name of the host.
    calculated_host_listings_count: Number of listings managed by the host.
    host_is_superhost: Whether the host is a superhost (a trusted, highly-rated host).
    latitude & longitude: Geographic coordinates of the listing.
    price: Price per night for the listing.
    kitchen: Indicates if the listing has a kitchen.
    patio or balcony, elevator, air conditioning: Amenities available in the listing.
    long_term, short_term, possible_long_term: Indicates if the listing is available for long or short term.
    number_of_reviews: Number of reviews the listing has received.
    review_scores_rating: Average rating of the listing.
    room_type_encoded: Encoded value representing the type of room (e.g., entire home, private room).
    bedrooms_encoded: Encoded value indicating if the listing has more than one bedroom.

In [48]:
df_mad.columns

['neighbourhood_group_cleansed',
 'id',
 'listing_url',
 'name',
 'host_id',
 'host_name',
 'calculated_host_listings_count',
 'host_is_superhost',
 'latitude',
 'longitude',
 'price',
 'kitchen',
 'patio or balcony',
 'elevator',
 'air conditioning',
 'long_term',
 'short_term',
 'possible_long_term',
 'number_of_reviews',
 'review_scores_rating',
 'room_type_encoded',
 'bedrooms_encoded']

## 1. Basic Descriptive Statistics:

In [49]:
# Get an overview of numeric columns such as price, number_of_reviews, and review_scores_rating.
df_mad.describe(["price", "number_of_reviews", "review_scores_rating"]).show()

+-------+-----------------+------------------+--------------------+
|summary|            price| number_of_reviews|review_scores_rating|
+-------+-----------------+------------------+--------------------+
|  count|            26873|             26873|               26873|
|   mean|136.8754511963681|44.257284263014924|  3.6586134782123416|
| stddev|268.5689458209769| 83.58524552484097|  2.0964308256393567|
|    min|              5.0|                 0|                 0.0|
|    max|          21000.0|              1060|               126.0|
+-------+-----------------+------------------+--------------------+



## 2. Neighborhood Analysis:
### 2.1 Top Neighborhoods by Listing Count:

In [50]:
df_mad.groupBy("neighbourhood_group_cleansed").count().orderBy("count", ascending=False).show()

+----------------------------+-----+
|neighbourhood_group_cleansed|count|
+----------------------------+-----+
|                      Centro|11281|
|                    Chamberí| 1800|
|                   Salamanca| 1733|
|                      Tetuán| 1605|
|                  Arganzuela| 1404|
|                 Carabanchel|  988|
|                      Retiro|  941|
|               Ciudad Lineal|  925|
|                   Chamartín|  868|
|          Puente de Vallecas|  823|
|                      Latina|  773|
|           Moncloa - Aravaca|  727|
|                       Usera|  610|
|        San Blas - Canill...|  584|
|                   Hortaleza|  531|
|        Fuencarral - El P...|  412|
|                  Villaverde|  260|
|                   Moratalaz|  195|
|                     Barajas|  195|
|           Villa de Vallecas|  134|
+----------------------------+-----+
only showing top 20 rows



### 2.2 Average Price by Neighborhood:

In [51]:
df_mad.groupBy("neighbourhood_group_cleansed").agg({"price": "mean"}).orderBy("avg(price)", ascending=False).show()

+----------------------------+------------------+
|neighbourhood_group_cleansed|        avg(price)|
+----------------------------+------------------+
|                   Salamanca|177.08424697057126|
|                      Centro|157.27426646573886|
|                      Tetuán|145.66230529595015|
|                   Hortaleza|142.91337099811676|
|                  Arganzuela| 141.0462962962963|
|                   Chamartín| 129.2258064516129|
|                    Chamberí|127.67611111111111|
|        San Blas - Canill...|125.88356164383562|
|                      Retiro|122.25398512221041|
|           Moncloa - Aravaca|  121.353507565337|
|          Puente de Vallecas|111.86391251518833|
|                     Barajas|100.93333333333334|
|               Ciudad Lineal| 98.71675675675675|
|        Fuencarral - El P...| 95.23543689320388|
|                 Carabanchel| 82.37348178137651|
|                       Usera| 73.94590163934426|
|           Villa de Vallecas| 72.67164179104478|


## 3. Price Distribution:

In [52]:
df_mad.select("price").summary().show()

+-------+-----------------+
|summary|            price|
+-------+-----------------+
|  count|            26873|
|   mean|136.8754511963681|
| stddev|268.5689458209769|
|    min|              5.0|
|    25%|             70.0|
|    50%|            112.0|
|    75%|            157.0|
|    max|          21000.0|
+-------+-----------------+



## 4.  Review Analysis:
### 4.1 Review Scores Distribution:

In [53]:
df_mad.groupBy("review_scores_rating").count().orderBy("review_scores_rating").show()

+--------------------+-----+
|review_scores_rating|count|
+--------------------+-----+
|                 0.0| 5822|
|                 1.0|   71|
|                 1.5|    2|
|                1.67|    1|
|                1.75|    1|
|                 2.0|   61|
|                2.33|    2|
|                 2.4|    1|
|                 2.5|   24|
|                 2.6|    1|
|                2.67|    6|
|                2.86|    1|
|                 3.0|  143|
|                3.13|    1|
|                3.14|    1|
|                3.17|    3|
|                 3.2|    4|
|                3.22|    1|
|                3.25|   11|
|                3.29|    1|
+--------------------+-----+
only showing top 20 rows



In [54]:
# 4.2 Average Review Scores by Neighborhood:
df_mad.groupBy("neighbourhood_group_cleansed").agg({"review_scores_rating": "mean"}).orderBy("avg(review_scores_rating)", ascending=False).show()

+----------------------------+-------------------------+
|neighbourhood_group_cleansed|avg(review_scores_rating)|
+----------------------------+-------------------------+
|                     VT-2282|                    126.0|
|                      Centro|        3.896200691428062|
|                   Hortaleza|       3.7812994350282496|
|                  Arganzuela|        3.744444444444445|
|          Puente de Vallecas|       3.6881652490887014|
|                      Retiro|        3.657492029755581|
|                     Barajas|       3.6524615384615378|
|                      Latina|       3.6012160413971577|
|                 Carabanchel|        3.591072874493933|
|               Ciudad Lineal|       3.5673621621621625|
|                      Tetuán|        3.534647975077887|
|                   Salamanca|       3.4532487016733975|
|        Fuencarral - El P...|       3.4328883495145623|
|                       Usera|       3.4282622950819697|
|           Moncloa - Aravaca| 

In [55]:
# There is no correlation between price and number of reviews

from pyspark.sql.functions import corr

df_mad.select(corr("price", "number_of_reviews").alias("correlation")).show()

+--------------------+
|         correlation|
+--------------------+
|-0.01640629518843...|
+--------------------+



## 5.  Amenities Analysis:
### Count of Listings with Specific Amenities:

In [56]:
df_mad.groupBy("kitchen", "air conditioning", "elevator", "patio or balcony").count().show()

+-------+----------------+--------+----------------+-----+
|kitchen|air conditioning|elevator|patio or balcony|count|
+-------+----------------+--------+----------------+-----+
|      0|               1|       1|               0|  497|
|      0|               0|       0|               0|  704|
|      0|               0|       1|               0|  330|
|      1|               0|       0|               0| 5083|
|      1|               1|       0|               0| 6127|
|      0|               0|       1|               1|   94|
|      1|               0|       1|               0| 2664|
|      0|               1|       0|               1|   61|
|      0|               1|       1|               1|   54|
|      1|               1|       1|               1| 2103|
|      1|               1|       0|               1| 1264|
|      0|               0|       0|               1|   96|
|      1|               0|       1|               1| 1098|
|      0|               1|       0|               0|  67

## 6. Room Type:

### Distribution of listings by Room Type:

In [57]:
df_mad.groupBy("room_type_encoded") \
    .count() \
    .withColumnRenamed("count", "listing_count") \
    .orderBy("listing_count", ascending=False) \
    .show()

+-----------------+-------------+
|room_type_encoded|listing_count|
+-----------------+-------------+
|                2|        17324|
|                1|         9168|
|                0|          381|
+-----------------+-------------+



### Average Price by Room Type:

In [58]:
df_mad.groupBy("room_type_encoded").agg({"price": "mean"}).orderBy("avg(price)", ascending=False).show()

+-----------------+------------------+
|room_type_encoded|        avg(price)|
+-----------------+------------------+
|                2|162.75906257215425|
|                0|104.83464566929133|
|                1|  89.2969022687609|
+-----------------+------------------+



### Average Price by Room Type and Number of Bedroooms:

In [59]:
df_mad.groupBy("room_type_encoded", "bedrooms_encoded") \
    .agg({"price": "avg"}) \
    .withColumnRenamed("avg(price)", "avg_price") \
    .orderBy("room_type_encoded", "bedrooms_encoded") \
    .show()

+-----------------+----------------+------------------+
|room_type_encoded|bedrooms_encoded|         avg_price|
+-----------------+----------------+------------------+
|                0|               0|102.34840425531915|
|                0|               1|             291.8|
|                1|               0| 87.04648152506472|
|                1|               1|117.84029850746269|
|                2|               0|130.97321607035778|
|                2|               1| 206.2202787646898|
+-----------------+----------------+------------------+



## 7. Host Analysis:
### 7.1 Superhost vs. Price:

In [60]:
df_mad.groupBy("host_is_superhost").agg({"price": "mean"}).orderBy("avg(price)", ascending=False).show()

+-----------------+------------------+
|host_is_superhost|        avg(price)|
+-----------------+------------------+
|                f|137.46706762505502|
|                t|135.01168042361002|
|                2|               5.0|
+-----------------+------------------+



### 7.2 Host Listings Count:

In [61]:
df_mad.groupBy("calculated_host_listings_count").agg({"price": "mean"}).orderBy("avg(price)", ascending=False).show()

+------------------------------+------------------+
|calculated_host_listings_count|        avg(price)|
+------------------------------+------------------+
|                            75|350.49333333333334|
|                            10|287.97352941176473|
|                            43|284.48837209302326|
|                           115|280.04347826086956|
|                            50|            267.08|
|                            89| 227.1123595505618|
|                            63|226.36507936507937|
|                            36|215.01388888888889|
|                            34|213.51470588235293|
|                            59|203.38983050847457|
|                           289|203.17301038062283|
|                            47| 183.7659574468085|
|                            30|178.23333333333332|
|                            16|             174.5|
|                            28| 173.6547619047619|
|                           290|163.09310344827585|
|           

## Geographic Distribution of Listings:

In [41]:
# Collect latitude and longitude data for visualization
geo_data = df_mad.select("latitude", "longitude").toPandas()

# Save to CSV or another format for use in visualization tools
geo_data.to_csv("geographic_distribution.csv", index=False)

In [90]:
# Stop the Spark session
spark.stop()