In [27]:
from pyspark.sql import SparkSession

# Create the Spark session for this analysis (or reuse the one already active).
spark = (
    SparkSession.builder
    .appName("AirbnbListingsAnalysis")
    .getOrCreate()
)

In [28]:
# Location of the cleaned Barcelona listings.
# Point at the Spark output *directory* rather than a single part file: the
# part-file name embeds a job-specific UUID, so a hard-coded part file stops
# existing as soon as the upstream cleaning job is re-run. Spark reads every
# part file in the directory and skips marker files such as _SUCCESS.
file_path = 'cleaned_listing_bcn.csv'

# Load the cleaned data: the header row supplies the column names and the
# column types are inferred from the values.
df_bcn = spark.read.csv(file_path, header=True, inferSchema=True)

# NOTE(review): the aggregations further down show values that look like they
# come from rows with shifted fields (e.g. "$500.00" under `kitchen`, "207"
# under `host_is_superhost`, "HUTB-002422" as a neighbourhood) — confirm the
# upstream cleaning/CSV parsing before trusting the per-group figures.

# Preview the first rows to sanity-check the load
df_bcn.show(5)


+----------------------------+------+--------------------+--------------------+-------+--------------+------------------------------+-----------------+-----------------+-----------------+-----+-------+----------------+--------+----------------+---------+----------+------------------+-----------------+--------------------+-----------------+----------------+
|neighbourhood_group_cleansed|    id|         listing_url|                name|host_id|     host_name|calculated_host_listings_count|host_is_superhost|         latitude|        longitude|price|kitchen|patio or balcony|elevator|air conditioning|long_term|short_term|possible_long_term|number_of_reviews|review_scores_rating|room_type_encoded|bedrooms_encoded|
+----------------------------+------+--------------------+--------------------+-------+--------------+------------------------------+-----------------+-----------------+-----------------+-----+-------+----------------+--------+----------------+---------+----------+-----------------

## Summary of Columns:
- `neighbourhood_group_cleansed`: The neighborhood where the listing is located.
- `id`: Unique identifier for the listing.
- `listing_url`: URL of the listing.
- `name`: Name of the listing.
- `host_id`: Unique identifier for the host.
- `host_name`: Name of the host.
- `calculated_host_listings_count`: Number of listings managed by the host.
- `host_is_superhost`: Whether the host is a superhost (a trusted, highly-rated host).
- `latitude` & `longitude`: Geographic coordinates of the listing.
- `price`: Price per night for the listing.
- `kitchen`: Indicates whether the listing has a kitchen.
- `patio or balcony`, `elevator`, `air conditioning`: Amenities available in the listing.
- `long_term`, `short_term`, `possible_long_term`: Indicate whether the listing is available for long-term or short-term stays.
- `number_of_reviews`: Number of reviews the listing has received.
- `review_scores_rating`: Average rating of the listing.
- `room_type_encoded`: Encoded value representing the type of room (e.g., entire home, private room).
- `bedrooms_encoded`: Encoded value indicating whether the listing has more than one bedroom.

## 1. Basic Descriptive Statistics:

In [29]:
# Overview (count, mean, stddev, min, max) of the main numeric columns.
df_bcn.describe(
    "price",
    "number_of_reviews",
    "review_scores_rating",
).show()


+-------+-----------------+------------------+--------------------+
|summary|            price| number_of_reviews|review_scores_rating|
+-------+-----------------+------------------+--------------------+
|  count|            18898|             18898|               18898|
|   mean|195.6262567467457|46.669383003492435|  3.4487617737326484|
| stddev|294.5524571009359| 96.76817892359264|  2.4109771236735402|
|    min|              0.0|                 0|                 0.0|
|    max|          13714.0|              2121|               167.0|
+-------+-----------------+------------------+--------------------+



## 2. Neighborhood Analysis:
### 2.1 Top Neighborhoods by Listing Count:

In [30]:
# Number of listings per neighbourhood group, largest group first.
(
    df_bcn
    .groupBy("neighbourhood_group_cleansed")
    .count()
    .sort("count", ascending=False)
    .show()
)

+----------------------------+-----+
|neighbourhood_group_cleansed|count|
+----------------------------+-----+
|                    Eixample| 6706|
|                Ciutat Vella| 4398|
|              Sants-Montjuïc| 1973|
|                  Sant Martí| 1749|
|                      Gràcia| 1575|
|         Sarrià-Sant Gervasi|  991|
|              Horta-Guinardó|  556|
|                   Les Corts|  404|
|                 Sant Andreu|  314|
|                  Nou Barris|  225|
|                      Exempt|    5|
|        Numero registro: ...|    1|
|                 HUTB-002422|    1|
+----------------------------+-----+



### 2.2 Average Price by Neighborhood:

In [31]:
# Mean price per neighbourhood group, most expensive first.
(
    df_bcn
    .groupBy("neighbourhood_group_cleansed")
    .agg({"price": "mean"})
    .sort("avg(price)", ascending=False)
    .show()
)

+----------------------------+------------------+
|neighbourhood_group_cleansed|        avg(price)|
+----------------------------+------------------+
|                    Eixample|239.30808231434537|
|                      Gràcia|194.26222222222222|
|                  Sant Martí|193.27730131503716|
|              Sants-Montjuïc| 188.7389761784085|
|                Ciutat Vella|167.07980900409277|
|         Sarrià-Sant Gervasi|164.08072653884966|
|                   Les Corts|156.77970297029702|
|              Horta-Guinardó| 119.9658273381295|
|                 Sant Andreu| 93.79617834394904|
|                  Nou Barris|             83.72|
|                 HUTB-002422|               3.0|
|                      Exempt|               1.0|
|        Numero registro: ...|               0.0|
+----------------------------+------------------+



## 3. Price Distribution:

In [32]:
# Distribution of price: count, mean, stddev, min, quartiles and max.
(
    df_bcn
    .select("price")
    .summary()
    .show()
)

+-------+-----------------+
|summary|            price|
+-------+-----------------+
|  count|            18898|
|   mean|195.6262567467457|
| stddev|294.5524571009359|
|    min|              0.0|
|    25%|             82.0|
|    50%|            167.0|
|    75%|            239.0|
|    max|          13714.0|
+-------+-----------------+



## 4. Review Analysis:
### 4.1 Review Scores Distribution:

In [33]:
# Number of listings for each distinct review score, lowest score first.
(
    df_bcn
    .groupBy("review_scores_rating")
    .count()
    .sort("review_scores_rating")
    .show()
)

+--------------------+-----+
|review_scores_rating|count|
+--------------------+-----+
|                 0.0| 4768|
|                 1.0|   68|
|                1.25|    1|
|                 1.5|    2|
|                1.67|    1|
|                 2.0|   49|
|                2.17|    1|
|                2.25|    1|
|                2.33|    2|
|                2.45|    1|
|                 2.5|   19|
|                 2.6|    1|
|                2.63|    1|
|                2.67|   10|
|                2.71|    1|
|                2.75|    1|
|                 2.8|    1|
|                2.83|    1|
|                2.86|    1|
|                2.88|    1|
+--------------------+-----+
only showing top 20 rows



In [34]:
# 4.2 Average Review Scores by Neighborhood:
# Mean review score per neighbourhood group, best rated first.
# NOTE(review): the score distribution contains several thousand 0.0 ratings;
# if those stand for "no rating yet" they pull these means down — confirm
# before interpreting the ranking.
(
    df_bcn
    .groupBy("neighbourhood_group_cleansed")
    .agg({"review_scores_rating": "mean"})
    .sort("avg(review_scores_rating)", ascending=False)
    .show()
)


+----------------------------+-------------------------+
|neighbourhood_group_cleansed|avg(review_scores_rating)|
+----------------------------+-------------------------+
|        Numero registro: ...|                    167.0|
|                      Exempt|                     20.2|
|                    Eixample|        3.558294065016386|
|                  Sant Martí|       3.5499656946826788|
|              Sants-Montjuïc|       3.5253522554485524|
|                      Gràcia|        3.469161904761904|
|                Ciutat Vella|       3.3784424738517447|
|              Horta-Guinardó|       3.2126079136690637|
|                   Les Corts|       3.2003960396039592|
|                 Sant Andreu|        3.186210191082802|
|                  Nou Barris|       3.0543111111111103|
|         Sarrià-Sant Gervasi|       2.8145206861755807|
|                 HUTB-002422|                      2.0|
+----------------------------+-------------------------+



## 5. Amenities Analysis:
### Count of Listings with Specific Amenities:

In [39]:
# Listing count for every observed combination of the four amenity columns.
(
    df_bcn
    .groupBy(
        "kitchen",
        "air conditioning",
        "elevator",
        "patio or balcony",
    )
    .count()
    .show()
)

+-------+----------------+--------+----------------+-----+
|kitchen|air conditioning|elevator|patio or balcony|count|
+-------+----------------+--------+----------------+-----+
|      0|               1|       1|               0|  469|
|      0|               0|       0|               0|  391|
|      0|               0|       1|               0|  244|
|      1|               0|       0|               0| 2532|
|      1|               1|       0|               0| 2951|
|$500.00|               1|       1|               1|    1|
|      0|               0|       1|               1|   57|
| $92.00|               0|       0|               1|    3|
|      1|               0|       1|               0| 1571|
|      0|               1|       0|               1|   78|
| $50.00|               1|       0|               0|    1|
|      0|               1|       1|               1|  145|
|      1|               1|       1|               1| 3328|
|      1|               1|       0|               1| 148

## 6. Room Type vs. Price: 
### Average Price by Room Type:

In [40]:
# Mean price per encoded room type, most expensive first.
(
    df_bcn
    .groupBy("room_type_encoded")
    .agg({"price": "mean"})
    .sort("avg(price)", ascending=False)
    .show()
)

+-----------------+------------------+
|room_type_encoded|        avg(price)|
+-----------------+------------------+
|                2|239.86529884032115|
|                1| 131.5107541157727|
|                0|112.28205128205128|
+-----------------+------------------+



## 7. Host Analysis:
### 7.1 Superhost vs. Price:

In [41]:
# Mean price per value of the superhost flag, most expensive first.
(
    df_bcn
    .groupBy("host_is_superhost")
    .agg({"price": "mean"})
    .sort("avg(price)", ascending=False)
    .show()
)

+-----------------+------------------+
|host_is_superhost|        avg(price)|
+-----------------+------------------+
|                t|209.02000533475592|
|                f| 192.4000132082948|
|              207|               3.0|
|               15|               1.0|
|                2|               1.0|
|                1|               0.0|
+-----------------+------------------+



### 7.2 Host Listings Count:

In [42]:
# Mean price grouped by how many listings the host manages, most expensive first.
(
    df_bcn
    .groupBy("calculated_host_listings_count")
    .agg({"price": "mean"})
    .sort("avg(price)", ascending=False)
    .show()
)

+------------------------------+------------------+
|calculated_host_listings_count|        avg(price)|
+------------------------------+------------------+
|                           207| 767.2038834951456|
|                            23| 449.3623188405797|
|                            36| 379.6111111111111|
|                           195| 366.9846153846154|
|                            43| 332.3953488372093|
|                            30|328.26666666666665|
|                           140|            325.65|
|                            52| 302.9807692307692|
|                            80|          296.1625|
|                            34|292.19117647058823|
|                            35| 292.1771428571429|
|                            76|287.88157894736844|
|                            66| 285.3333333333333|
|                            44|274.40909090909093|
|                            50|            273.68|
|                            37| 273.1261261261261|
|           

In [None]:
## 8. Geographic Distribution of Listings:

In [None]:
# Collect latitude and longitude data for visualization.
# df_bcn is the listings DataFrame loaded at the top of this notebook; the
# coordinates are pulled to the driver as a pandas DataFrame.
geo_data = df_bcn.select("latitude", "longitude").toPandas()

# Save to CSV (without the pandas index) for use in visualization tools
geo_data.to_csv("geographic_distribution.csv", index=False)