# Listings Data Cleaning

In [0]:
from pyspark.sql.functions import col
import pandas as pd

In [0]:
# Keep only listings whose minimum stay lies between 30 and 91 nights (inclusive),
# then collect each city's filtered listings to the driver as a pandas DataFrame.
min_nights = col("minimum_nights")
stay_filter = (min_nights >= 30) & (min_nights <= 91)

Dublin = (
    spark.table("silver.edgar_sarto_revenue.dublin_inside_airbnb_listings")
    .filter(stay_filter)
    .toPandas()
)
Milan = (
    spark.table("silver.edgar_sarto_revenue.milan_inside_airbnb_listings")
    .filter(stay_filter)
    .toPandas()
)
London = (
    spark.table("silver.edgar_sarto_revenue.london_inside_airbnb_listings")
    .filter(stay_filter)
    .toPandas()
)

In [0]:
import pandas as pd

# Tag every row with its source city, then stack the three frames into one
# listings frame with a fresh 0..n-1 index.
for city_label, city_frame in (('Dublin', Dublin), ('Milan', Milan), ('London', London)):
    city_frame['City'] = city_label

IR = pd.concat([Dublin, Milan, London], ignore_index=True)

IR.head(1)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license,City
0,245903.0,Single cosy bedroom in Lucan.,1289720.0,Sheila,,South Dublin,53.34596,-6.41719,Private room,30.0,30,48.0,2024-11-30,0.32,1.0,13,4.0,,Dublin


In [0]:
# Summarise the column dtypes and non-null counts of the combined listings frame.
IR.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              3186 non-null   float64
 1   name                            3263 non-null   object 
 2   host_id                         3186 non-null   float64
 3   host_name                       3187 non-null   object 
 4   neighbourhood_group             77 non-null     object 
 5   neighbourhood                   3262 non-null   object 
 6   latitude                        3262 non-null   float64
 7   longitude                       3187 non-null   float64
 8   room_type                       3254 non-null   object 
 9   price                           2516 non-null   float64
 10  minimum_nights                  3263 non-null   int64  
 11  number_of_reviews               3187 non-null   float64
 12  last_review                     18

In [0]:
# NOTE(review): repeats the IR.info() call of the previous cell; IR is not
# modified in between, so the output is identical.
IR.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              3186 non-null   float64
 1   name                            3263 non-null   object 
 2   host_id                         3186 non-null   float64
 3   host_name                       3187 non-null   object 
 4   neighbourhood_group             77 non-null     object 
 5   neighbourhood                   3262 non-null   object 
 6   latitude                        3262 non-null   float64
 7   longitude                       3187 non-null   float64
 8   room_type                       3254 non-null   object 
 9   price                           2516 non-null   float64
 10  minimum_nights                  3263 non-null   int64  
 11  number_of_reviews               3187 non-null   float64
 12  last_review                     18

In [0]:
# Remove the columns that are not carried into the clean table, then make sure
# the id and both coordinates are stored as floats.
unused_columns = ['name', 'neighbourhood_group', 'license']
IR = IR.drop(columns=unused_columns)

IR = IR.astype({'id': float, 'latitude': float, 'longitude': float})

In [0]:
# Render the coordinates as text that uses a decimal comma instead of a decimal
# point. Missing coordinates are kept as nulls: a plain astype(str) would turn
# NaN into the literal text "nan", which would then be written to the saved
# table as if it were a real value.
columnas = ['latitude','longitude']


def _to_decimal_comma(value):
    """Return ``value`` as text with '.' replaced by ',', or None when it is missing."""
    if pd.isna(value):
        return None
    return str(value).replace(".", ",")


for columna in columnas:
    IR[columna] = IR[columna].map(_to_decimal_comma)

In [0]:
# Replace the room type label with a flag: True for 'Entire home/apt', False for
# every other value (missing values compare as False too).
# The dtype guard makes re-running the cell harmless: comparing an already
# boolean column with the label would silently set every row to False.
if not pd.api.types.is_bool_dtype(IR['room_type']):
    IR['room_type'] = IR['room_type'] == 'Entire home/apt'

In [0]:
# Convert the cleaned listings back to a Spark DataFrame and persist them,
# replacing whatever the target table currently holds.
IR_Spark = spark.createDataFrame(IR)

listings_writer = IR_Spark.write.mode("overwrite")
listings_writer.saveAsTable("silver.edgar_sarto_revenue.Inside_airbnb_listings_clean")

# Calendar Data Cleaning

In [0]:
# Load each city's calendar and keep only the rows whose minimum stay is
# between 30 and 91 nights; `between` includes both bounds.
nights = col("minimum_nights")

Dublin = spark.table("silver.edgar_sarto_revenue.dublin_inside_airbnb_calendar").filter(
    nights.between(30, 91)
)
Milan = spark.table("silver.edgar_sarto_revenue.milan_inside_airbnb_calendar").filter(
    nights.between(30, 91)
)
London = spark.table("silver.edgar_sarto_revenue.london_inside_airbnb_calendar").filter(
    nights.between(30, 91)
)

In [0]:
# Collect each filtered Spark calendar to the driver as a pandas DataFrame,
# in the order Dublin, Milan, London.
Dublin, Milan, London = (
    spark_frame.toPandas() for spark_frame in (Dublin, Milan, London)
)

In [0]:
# Label each calendar frame with its city, stack them into a single frame with
# a fresh index, and discard the 'adjusted_price' column.
city_frames = {'Dublin': Dublin, 'Milan': Milan, 'London': London}
for city_name, frame in city_frames.items():
    frame['City'] = city_name

IRC = pd.concat(list(city_frames.values()), ignore_index=True)

IRC = IRC.drop(columns=['adjusted_price'])

In [0]:
# Summarise the column dtypes and non-null counts of the combined calendar frame.
IRC.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1310581 entries, 0 to 1310580
Data columns (total 7 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   listing_id      1310581 non-null  int64 
 1   date            1310581 non-null  object
 2   available       1310581 non-null  object
 3   price           1310581 non-null  object
 4   minimum_nights  1310581 non-null  int64 
 5   maximum_nights  1310581 non-null  int64 
 6   City            1310581 non-null  object
dtypes: int64(3), object(4)
memory usage: 70.0+ MB


In [0]:
# Strip the literal '$' from the price text; the column stays text afterwards.
# NOTE(review): any thousands separators in the source prices would be left in
# place by this step - confirm against the raw data.
price_without_symbol = IRC['price'].str.replace('$', '', regex=False)
IRC['price'] = price_without_symbol
IRC.head(1)

Unnamed: 0,listing_id,date,available,price,minimum_nights,maximum_nights,City
0,245903,2024-12-12,f,30.0,30,180,Dublin


In [0]:
# Pin the listing id to an explicit 64-bit integer. Plain `int` resolves to the
# platform default integer, which is 32-bit on Windows with NumPy < 2 and can be
# too narrow for large listing ids.
IRC['listing_id'] = IRC['listing_id'].astype('int64')


In [0]:
# Attach the host name and neighbourhood of each listing to its calendar rows.
# The lookup is cleaned first: rows without an id can never match a calendar
# row, and a repeated id would multiply the matching calendar rows in the left
# join, so only the first row per id is kept.
listing_lookup = (
    IR[['id', 'host_name', 'neighbourhood']]
    .dropna(subset=['id'])
    .drop_duplicates(subset='id')
)

IRC = IRC.merge(listing_lookup, left_on='listing_id', right_on='id', how='left')

IRC.head(1)

Unnamed: 0,listing_id,date,available,price,minimum_nights,maximum_nights,City,id,host_name,neighbourhood
0,245903,2024-12-12,f,30.0,30,180,Dublin,245903.0,Sheila,South Dublin


In [0]:
# Convert the enriched calendar back to a Spark DataFrame and persist it,
# replacing whatever the target table currently holds.
IRC_Spark = spark.createDataFrame(IRC)

(
    IRC_Spark.write
    .mode("overwrite")
    .saveAsTable("silver.edgar_sarto_revenue.Inside_airbnb_calendar_clean")
)