In [27]:
from pyspark.sql import SparkSession
import pandas as pd

# Initialize a Spark session
spark = SparkSession.builder.appName("AirbnbPriceFilling").getOrCreate()

# Cargar dataset de Madrid
# Load your datasets into Spark DataFrames
orig_bcn = spark.read.csv('datasets/datasets_originales/listings-detailed-bcn-original.csv', header=True, inferSchema=True, multiLine=True, sep=',', quote='"', escape='\\')
df_bcn = spark.read.csv('datasets/listings_prepared_bcn.csv', header=True, inferSchema=True)

24/08/14 13:15:55 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [28]:
df_bcn.show(5)

+------+--------------------+--------------------+-------+--------------+------------------------------+-----------------+--------------------+----------------------------+-----------------+-----------------+---------------+--------+-------+-------+----------------+--------+----------------+---------+----------+------------------+-----------------+--------------------+
|    id|         listing_url|                name|host_id|     host_name|calculated_host_listings_count|host_is_superhost|             license|neighbourhood_group_cleansed|         latitude|        longitude|      room_type|bedrooms|  price|kitchen|patio or balcony|elevator|air conditioning|long_term|short_term|possible_long_term|number_of_reviews|review_scores_rating|
+------+--------------------+--------------------+-------+--------------+------------------------------+-----------------+--------------------+----------------------------+-----------------+-----------------+---------------+--------+-------+-------+-------

In [29]:
orig_bcn.printSchema()
from pyspark.sql.functions import col, regexp_extract

# Use a regular expression to filter out non-numeric 'id' values
# Regular expression to match only numeric strings
df_numeric_ids = orig_bcn.filter(regexp_extract(col("id"), r'^\d+$', 0) != '')

# Show the result to verify the changes
df_numeric_ids.show(5)

# Optionally, save the cleaxned DataFrame to a new CSV file
df_numeric_ids.write.csv("temp_ids_bcn.csv", header=True)

root
 |-- id: string (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: string (nullable = true)
 |-- last_scraped: string (nullable = true)
 |-- source: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: string (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: string (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_cou

                                                                                

In [30]:
df_numeric_ids.count()

18925

In [31]:
# Display the schema of the DataFrame
df_bcn.printSchema()

# Get the number of rows and columns
print(f"Number of rows: {df_bcn.count()}, Number of columns: {len(df_bcn.columns)}")

root
 |-- id: long (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- name: string (nullable = true)
 |-- host_id: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- calculated_host_listings_count: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- license: string (nullable = true)
 |-- neighbourhood_group_cleansed: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- room_type: string (nullable = true)
 |-- bedrooms: string (nullable = true)
 |-- price: string (nullable = true)
 |-- kitchen: string (nullable = true)
 |-- patio or balcony: integer (nullable = true)
 |-- elevator: integer (nullable = true)
 |-- air conditioning: integer (nullable = true)
 |-- long_term: integer (nullable = true)
 |-- short_term: integer (nullable = true)
 |-- possible_long_term: integer (nullable = true)
 |-- number_of_reviews: integer (nullable = true)
 |-- review_scores_rating: doub

In [32]:
# Remove the dollar sign and commas, and convert the column to a numerical format:
from pyspark.sql.functions import regexp_replace, col

df_bcn = df_bcn.withColumn("price", regexp_replace(col("price"), "[\$,]", "").cast("float"))

  df_bcn = df_bcn.withColumn("price", regexp_replace(col("price"), "[\$,]", "").cast("float"))


In [33]:
#  Identify Missing Values
from pyspark.sql.functions import col, sum as _sum

# Count missing values for each column
df_bcn.select([_sum(col(c).isNull().cast("int")).alias(c) for c in df_bcn.columns]).show()

# The dataset has several columns with missing values:

# host_name: Missing in 2 entries > fill in with 'No Disponible'
# host_is_superhost: Missing in 124 entries. > Fill in with f;
# license: Missing in 6137 entries (almost all of them). This column might be dropped unless it's crucial for your analysis.
# neighbourhood_group_cleansed: 1 missing > manual review
# bedrooms: Missing in 1882 entries. > check for more than one bedroom > mark 1, if less than 1 > mark 0
# price: Missing in 3736 entries. Drop or fill in depending on the cases.
# review_scores_rating: Missing in 5,856 entries. > fill in with 0

+---+-----------+----+-------+---------+------------------------------+-----------------+-------+----------------------------+--------+---------+---------+--------+-----+-------+----------------+--------+----------------+---------+----------+------------------+-----------------+--------------------+
| id|listing_url|name|host_id|host_name|calculated_host_listings_count|host_is_superhost|license|neighbourhood_group_cleansed|latitude|longitude|room_type|bedrooms|price|kitchen|patio or balcony|elevator|air conditioning|long_term|short_term|possible_long_term|number_of_reviews|review_scores_rating|
+---+-----------+----+-------+---------+------------------------------+-----------------+-------+----------------------------+--------+---------+---------+--------+-----+-------+----------------+--------+----------------+---------+----------+------------------+-----------------+--------------------+
|  0|          0|   0|      0|        2|                             0|              124|   6137|

In [34]:
# Fill host_name with 'No Disponible'
df_bcn = df_bcn.na.fill({"host_name": "No Disponible"})

In [35]:
# Fill host_is_superhost with 'f'
df_bcn = df_bcn.na.fill({"host_is_superhost": "f"})

In [36]:
# Drop the license Column
df_bcn = df_bcn.drop("license")

In [37]:
# Check by the URL link to fill in the neighbourhood for missing value

# Filter the DataFrame to find the row with the missing value in 'neighbourhood_group_cleansed'
missing_neighbourhood = df_bcn.filter(col("neighbourhood_group_cleansed").isNull())

# Show the row(s) with the missing value
missing_neighbourhood.show()

+--------+--------------------+----------------+--------------------+---------+------------------------------+-----------------+----------------------------+-------------------+---------+---------+---------------+-----+-------+----------------+--------+----------------+---------+----------+------------------+-----------------+--------------------+
|      id|         listing_url|            name|             host_id|host_name|calculated_host_listings_count|host_is_superhost|neighbourhood_group_cleansed|           latitude|longitude|room_type|       bedrooms|price|kitchen|patio or balcony|elevator|air conditioning|long_term|short_term|possible_long_term|number_of_reviews|review_scores_rating|
+--------+--------------------+----------------+--------------------+---------+------------------------------+-----------------+----------------------------+-------------------+---------+---------+---------------+-----+-------+----------------+--------+----------------+---------+----------+---------

In [38]:
# Drop the id = 27217399 as it only provides long term rental and it not available for the next 6 months
df_bcn = df_bcn.na.drop(subset=['neighbourhood_group_cleansed'])

In [39]:
# To fill in "bedrooms" columns and also encode 'bedroom' and 'room type' with the following thoery

# if the type is "shared bedroom" or null, we encode it as 0
# if the typr is "private room" or "hotel room", we encode it as 1
# and if the type if "entire home/apt", we encode it with 2
from pyspark.sql.functions import when, col

# Apply the Encoding Based on room_type
# Apply the encoding to the 'room_type' column
df_bcn = df_bcn.withColumn("room_type_encoded", 
                   when(col("room_type").isNull(), 0)
                   .when(col("room_type") == "Shared room", 0)
                   .when(col("room_type") == "Private room", 1)
                   .when(col("room_type") == "Hotel room", 1)
                   .when(col("room_type") == "Entire home/apt", 2)
                   .otherwise(0))  # This covers any unexpected or additional room types

# Now combine with the 'bedrooms' logic
df_bcn = df_bcn.withColumn("bedrooms_encoded", 
                   when(col("bedrooms") > 1, 1).otherwise(0))

# Show the result to verify the changes
df_bcn.select("room_type", "room_type_encoded", "bedrooms", "bedrooms_encoded").show(10)

# Then remove 'room_type' and 'bedrooms'
df_bcn = df_bcn.drop("room_type")
df_bcn = df_bcn.drop("bedrooms")


+---------------+-----------------+--------+----------------+
|      room_type|room_type_encoded|bedrooms|bedrooms_encoded|
+---------------+-----------------+--------+----------------+
|Entire home/apt|                2|     1.0|               0|
|Entire home/apt|                2|     4.0|               1|
|Entire home/apt|                2|     3.0|               1|
|Entire home/apt|                2|     3.0|               1|
|Entire home/apt|                2|     3.0|               1|
|Entire home/apt|                2|     2.0|               1|
|Entire home/apt|                2|     1.0|               0|
|Entire home/apt|                2|     1.0|               0|
|Entire home/apt|                2|     2.0|               1|
|Entire home/apt|                2|     2.0|               1|
+---------------+-----------------+--------+----------------+
only showing top 10 rows



In [40]:
# Fill review_scores_rating with 0

df_bcn =df_bcn.na.fill({"review_scores_rating": 0})

In [41]:
# Fill in "price" column with the following theory

# 1. Mark the rows where the "availaility_30" and "availableity_60" are both '0' in the orig_bcn dataframe
# 2. Found the correspondent rows by using its "id" in df_bcn and drop the entrie rows
# 3. Fill in the rest of the null values with the everage price in the same neighbourhood

In [42]:
# Load the orig_bcn dataframe
zero_availability_ids = df_numeric_ids.filter((col("availability_30") == 0) & (col("availability_60") == 0)).select("id")
zero_availability_ids.count()

26

In [43]:
# # Convert zero_availability_ids to a list
zero_availability_ids_list = [row.id for row in zero_availability_ids.distinct().collect()]

# # Filter df_bcn to remove rows where id is in zero_availability_ids_list
df_bcn_filtered = df_bcn.filter(~col("id").isin(zero_availability_ids_list))

In [44]:
df_bcn_filtered.count()

18898

In [45]:
from pyspark.sql import functions as F

# # Calculate the average price per neighbourhood
avg_price_neighbourhood = df_bcn_filtered.groupBy("neighbourhood_group_cleansed").agg(F.round(F.mean("price")).alias("avg_price"))

# Join the original DataFrame with the average prices DataFrame
df_bcn_with_avg = df_bcn_filtered.join(avg_price_neighbourhood, on="neighbourhood_group_cleansed", how="left")

# # Fill missing price values with the calculated average price
df_bcn_final = df_bcn_with_avg.withColumn("price", F.coalesce(col("price"), col("avg_price")))

# # Drop the avg_price column as it's no longer needed
df_bcn_final = df_bcn_final.drop("avg_price")

# # Show the result to verify the changes
df_bcn_final.select("id", "neighbourhood_group_cleansed", "price").show(10)

+------+----------------------------+-----+
|    id|neighbourhood_group_cleansed|price|
+------+----------------------------+-----+
|269467|                    Eixample|176.0|
|270197|                Ciutat Vella| 74.0|
| 18674|                    Eixample|257.0|
|272282|                    Eixample|239.0|
| 23197|                  Sant Martí|249.0|
| 32711|                      Gràcia|240.0|
|273229|                    Eixample|126.0|
| 34241|                Ciutat Vella|229.0|
|273932|                    Eixample|281.0|
|274221|                Ciutat Vella|393.0|
+------+----------------------------+-----+
only showing top 10 rows



In [46]:
df_bcn_final.count()

18898

In [47]:
# Count missing values for each column
df_bcn_final.select([_sum(col(c).isNull().cast("int")).alias(c) for c in df_bcn_final.columns]).show()

+----------------------------+---+-----------+----+-------+---------+------------------------------+-----------------+--------+---------+-----+-------+----------------+--------+----------------+---------+----------+------------------+-----------------+--------------------+-----------------+----------------+
|neighbourhood_group_cleansed| id|listing_url|name|host_id|host_name|calculated_host_listings_count|host_is_superhost|latitude|longitude|price|kitchen|patio or balcony|elevator|air conditioning|long_term|short_term|possible_long_term|number_of_reviews|review_scores_rating|room_type_encoded|bedrooms_encoded|
+----------------------------+---+-----------+----+-------+---------+------------------------------+-----------------+--------+---------+-----+-------+----------------+--------+----------------+---------+----------+------------------+-----------------+--------------------+-----------------+----------------+
|                           0|  0|          0|   0|      0|        0|    

In [48]:
# Save the cleaned dinal DataFrame to a new CSV file
df_bcn_final.write.csv("cleaned_listing_bcn.csv", header=True)

In [49]:
spark.stop()