In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Read Inside Airbnb data") \
    .getOrCreate()

25/08/17 10:47:57 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
listings = spark.read.csv("./data/listings.csv.gz", 
    header=True,
    inferSchema=True,
    sep=",", 
    quote='"',
    escape='"', 
    multiLine=True,
    mode="PERMISSIVE" 
)

                                                                                

In [3]:
listings.printSchema()

root
 |-- id: long (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: long (nullable = true)
 |-- last_scraped: date (nullable = true)
 |-- source: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: date (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_count: int

In [4]:
listings.count()

                                                                                

96651

In [59]:
# 1. Get a non-null picture URL for any property ("picture_url" field)
# Select any non-null picture URL
listings.filter("picture_url IS NOT NULL")\
        .select("id", "price", "picture_url")\
        .limit(1)\
        .show(truncate=False)
        #.dropna()\

+------+-------+----------------------------------------------------------------------------------------------------------+
|id    |price  |picture_url                                                                                               |
+------+-------+----------------------------------------------------------------------------------------------------------+
|264776|$297.00|https://a0.muscache.com/pictures/hosting/Hosting-264776/original/3cc7b93f-dbda-4ded-ac15-e9d96691e7ca.jpeg|
+------+-------+----------------------------------------------------------------------------------------------------------+



In [60]:
# 2. Get number of properties that get more than 10 reviews per month
listings.filter(listings.reviews_per_month > 10).count()

                                                                                

57

In [61]:
# 3. Get properties that have more bathrooms than bedrooms
listings.filter((listings.bedrooms < listings.bathrooms) & (listings.bedrooms > 1))\
        .select("id", "name", "bedrooms", "bathrooms")\
        .show(10, truncate=False)

+-------+--------------------------------------------------+--------+---------+
|id     |name                                              |bedrooms|bathrooms|
+-------+--------------------------------------------------+--------+---------+
|346523 |Luxury, Newly Renovated Chelsea Apartment         |2       |2.5      |
|536121 |Experience London in amazing reconverted Gin house|3       |3.5      |
|536751 |Elegant Paddington Penthouse 2 Bedroom 3 Bathroom |2       |3.0      |
|999835 |Stepney House 3 - affordable LUXURY               |2       |2.5      |
|857938 |Prime location luxury home                        |3       |3.5      |
|1346303|Lovely 2bd flat central Maida Vale                |2       |2.5      |
|1367598|Charming 3 Bedroom House in Pimlico               |3       |3.5      |
|1796751|Superlux flat in Knightsbridge                    |2       |2.5      |
|1796888|Come to stay at the The Beeches in Royal Greenwich|2       |2.5      |
|1665188|Luxury Mews House in quiet cent

In [8]:
# 4. Get properties where the price is greater than 5,000. Collect the result as a Python list
# Remember to convert a price into a number first!
#listings.filter(listings.price > 5000).count()
listings.select(listings.price).show(5)

+-------+
|  price|
+-------+
|$297.00|
| $98.00|
|$148.00|
|$144.00|
|$157.00|
+-------+
only showing top 5 rows


In [64]:
from pyspark.sql.functions import regexp_replace

price_num_df = listings.withColumn('price_num', regexp_replace('price', '[$,]', '').cast('float'))
res = price_num_df.filter('price_num > 5000')\
    .select('name', 'price')\
    .collect()

# res

                                                                                

In [65]:
# 5. Get a list of properties with the following characteristics:
# * price < 150
# * more than 20 reviews
# * review_scores_rating > 4.5
# Consider using the "&" operator
#price_num_df.filter('price_num < 150 AND number_of_reviews > 20 AND review_scores_rating > 4.5').select('id').count()
price_num_df.filter((price_num_df.price_num < 150) &\
                    (price_num_df.number_of_reviews > 20) &\
                    (price_num_df.review_scores_rating > 4.5))\
            .select('name')\
            .count()

                                                                                

9626

In [37]:
# 6. Get a list of properties with the following characteristics:
# * price < 150 OR more than one bathroom
# Use the "|" operator to implement the OR operator
price_num_df.filter((price_num_df.price_num < 150) | (price_num_df.bathrooms > 1))\
            .select('id', 'price_num', 'bathrooms')\
            .count()

                                                                                

48011

In [41]:
# 7. Get the highest listing price in this dataset
# Consider using the "max" function from "pyspark.sql.functions"
import pyspark.sql.functions as sf
price_num_df.select(sf.max(price_num_df.price_num)).show()

[Stage 47:>                                                         (0 + 1) / 1]

+--------------+
|max(price_num)|
+--------------+
|       74100.0|
+--------------+



                                                                                

In [70]:
# 8. Get the name and a price of property with the highest price
# Try to use "collect" method to get the highest price first, and then use it in a "filter" call 
res = price_num_df\
    .select(sf.max(price_num_df.price_num).alias('max_price'))\
    .collect()
res


                                                                                

[Row(max_price=74100.0)]

In [71]:
max_price = res[0]['max_price']
max_price

74100.0

In [73]:
price_num_df.filter(price_num_df.price_num == max_price)\
            .select('name', 'price')\
            .show(truncate=False)

[Stage 90:>                                                         (0 + 1) / 1]

+-----------------------------------------------+----------+
|name                                           |price     |
+-----------------------------------------------+----------+
|Bright & airy DoubleBed with EnSuite in Zone 2!|$74,100.00|
+-----------------------------------------------+----------+



                                                                                

In [74]:
# 9. Get the number of hosts in the dataset
listings.select(listings.host_name).distinct().count()

                                                                                

16659

In [75]:
# 10. Get listings with a first review in 2024
# Consider using the "year" function from "pyspark.sql.functions"
import pyspark.sql.functions as sf
# listings.select(sf.year(listings.first_review), sf.typeof(sf.year(listings.first_review))).show(5)
listings.filter(sf.year(listings.first_review) == 2024)\
    .select(listings.name, listings.first_review, sf.year(listings.first_review))\
    .show(10, truncate=False)

+--------------------------------------------------+------------+------------------+
|name                                              |first_review|year(first_review)|
+--------------------------------------------------+------------+------------------+
|Close to Wimbledon All England Tennis -huge double|2024-08-11  |2024              |
|Bridgerton inspired cottage core apartment        |2024-09-14  |2024              |
|one Double bed room with en-suite facilities      |2024-03-21  |2024              |
|Sm double room  with own bathroom                 |2024-06-04  |2024              |
|Superlux flat in Knightsbridge                    |2024-01-01  |2024              |
|Central, modern pied-a-terre                      |2024-11-29  |2024              |
|Stunning Bright Chelsea 2BR flat                  |2024-09-21  |2024              |
|Victorian 2-bedroom upstairs flat sleeps 4        |2024-12-09  |2024              |
|The Pink House, Notting Hill                      |2024-07-14  |