In [18]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, desc, sum
from pyspark.sql.types import *

In [19]:
sc = SparkSession\
    .builder\
    .master("local")\
    .appName('pyspark-airbnb')\
    .getOrCreate()

In [20]:
#reading a csv file with header
df = sc.read.option("header", "True").csv("AB_NYC_2019.csv")
df.show()

+----+--------------------+-------+----------------+-------------------+------------------+--------+---------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+
|  id|                name|host_id|       host_name|neighbourhood_group|     neighbourhood|latitude|longitude|      room_type|price|minimum_nights|number_of_reviews|last_review|reviews_per_month|calculated_host_listings_count|availability_365|
+----+--------------------+-------+----------------+-------------------+------------------+--------+---------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+
|2539|Clean & quiet apt...|   2787|            John|           Brooklyn|        Kensington|40.64749|-73.97237|   Private room|  149|             1|                9| 2018-10-19|             0.21|                             6|             365|
|2595|Skylit Midtown Ca.

In [21]:
#renaming column and printing updated schema again
df1 = df.withColumnRenamed("name", "address")

#changing price datatype to Integer
df1 = df1.withColumn("price", df1["price"].cast(IntegerType()))
# df2.groupBy("neighbourhood_group").sum("price").show()


#counts the null values in a particular column
null_count = df1.filter(df1['address'].isNull()).count()
null_count

#drops rows with null values
df2 = df1.dropna()

#gives an alias and then sorts the price column in descending order``
df2.groupBy("neighbourhood")\
    .agg(sum("price").alias("n_price")) \
    .sort(desc("n_price")).show()

+------------------+-------+
|     neighbourhood|n_price|
+------------------+-------+
|      Williamsburg| 441505|
|Bedford-Stuyvesant| 331439|
|    Hell's Kitchen| 283364|
|   Upper West Side| 276330|
|           Midtown| 263354|
|      East Village| 260102|
|            Harlem| 256974|
|   Upper East Side| 234269|
|           Chelsea| 182969|
|          Bushwick| 164922|
|      West Village| 152524|
|     Crown Heights| 140922|
|   Lower East Side| 130946|
|        Greenpoint| 125081|
|       East Harlem| 124438|
|Financial District| 111344|
|      Clinton Hill|  84461|
|           Astoria|  82087|
|              SoHo|  80509|
| Greenwich Village|  72611|
+------------------+-------+
only showing top 20 rows



In [16]:
#neigbourhood has 
u = []
u = df.select("neighbourhood_group").distinct()
# unique_values.show()
u.show()

+-------------------+
|neighbourhood_group|
+-------------------+
|         Douglaston|
|             Queens|
|              Nadia|
|            Midtown|
|    Jackson Heights|
|     Hell's Kitchen|
|  Greenwich Village|
|       Clinton Hill|
| Washington Heights|
|   Ditmars Steinway|
|           Longwood|
|          Briarwood|
|        Little Neck|
|           Flushing|
|      Randall Manor|
|             Carmen|
|      East Elmhurst|
|    Upper East Side|
|         Bath Beach|
|           Canarsie|
+-------------------+
only showing top 20 rows

