### The objective behind our project is to identify the locations around which the customers can find the most number of Airbnb listings and predicting the price category of Airbnb listings namely affordable, moderate, or expensive.

In [0]:
# File location and type
file_location = "/FileStore/tables/Airbnb_project.csv"
file_type = "csv"

# CSV options
infer_schema = "false"
first_row_is_header = "true"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

display(df)

host_is_superhost,host_identity_verified,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,Number of amenities,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating,cancellation_policy,price
0,0,Roslindale,42.2826188,-71.13306793,House,Entire home/apt,4,1.5,2.0,3.0,Real Bed,14,1,0,2,0,0,,moderate,250
0,1,Roslindale,42.28624082,-71.13437396,Apartment,Private room,2,1.0,1.0,1.0,Real Bed,20,0,0,2,36,804,94.0,moderate,65
1,1,Roslindale,42.29243789,-71.13576525,Apartment,Private room,2,1.0,1.0,1.0,Real Bed,17,1,20,3,41,2574,98.0,moderate,65
0,0,Roslindale,42.28110619,-71.12102117,House,Private room,4,1.0,1.0,2.0,Real Bed,22,2,25,1,1,0,100.0,moderate,75
1,1,Roslindale,42.28451221,-71.13625805,House,Private room,2,1.5,1.0,2.0,Real Bed,13,1,0,2,29,380,99.0,flexible,79
1,1,Roslindale,42.2916898,-71.13189277,Condominium,Private room,2,1.0,1.0,1.0,Real Bed,12,1,0,2,8,130,100.0,flexible,75
0,1,Roslindale,42.28138963,-71.13119042,Apartment,Entire home/apt,3,1.0,1.0,2.0,Real Bed,12,1,25,1,57,421,90.0,strict,100
1,1,Roslindale,42.2819461,-71.14102161,House,Private room,2,2.0,1.0,1.0,Real Bed,22,1,15,1,67,840,96.0,moderate,75
1,1,Roslindale,42.28587764,-71.12490956,Condominium,Private room,2,1.0,1.0,2.0,Real Bed,9,2,0,2,65,355,96.0,moderate,58
1,1,Roslindale,42.28882028,-71.1395101,Apartment,Entire home/apt,5,1.0,2.0,2.0,Real Bed,21,4,25,4,33,876,94.0,strict,229


In [0]:
df.dtypes

Out[2]: [('host_is_superhost', 'string'),
 ('host_identity_verified', 'string'),
 ('neighbourhood_cleansed', 'string'),
 ('latitude', 'string'),
 ('longitude', 'string'),
 ('property_type', 'string'),
 ('room_type', 'string'),
 ('accommodates', 'string'),
 ('bathrooms', 'string'),
 ('bedrooms', 'string'),
 ('beds', 'string'),
 ('bed_type', 'string'),
 ('Number of amenities', 'string'),
 ('guests_included', 'string'),
 ('price_per_extra_person', 'string'),
 ('minimum_nights', 'string'),
 ('number_of_reviews', 'string'),
 ('number_days_btw_first_last_review', 'string'),
 ('review_scores_rating', 'string'),
 ('cancellation_policy', 'string'),
 ('price', 'string')]

In [0]:
df = df.withColumn("host_is_superhost",df.host_is_superhost.cast("int")).\
    withColumn("host_identity_verified",df.host_identity_verified.cast("int")).\
    withColumn("latitude",df.latitude.cast("double")).\
    withColumn("longitude",df.longitude.cast("double")).\
    withColumn("accommodates",df.accommodates.cast("int")).\
    withColumn("bathrooms",df.bathrooms.cast("float")).\
    withColumn("bedrooms",df.bedrooms.cast("int")).\
    withColumn("beds",df.beds.cast("int")).\
    withColumn("guests_included",df.guests_included.cast("int")).\
    withColumn("price_per_extra_person",df.price_per_extra_person.cast("int")).\
    withColumn("minimum_nights",df.minimum_nights.cast("int")).\
    withColumn("number_of_reviews",df.number_of_reviews.cast("int")).\
    withColumn("number_days_btw_first_last_review",df.number_days_btw_first_last_review.cast("int")).\
    withColumn("review_scores_rating",df.review_scores_rating.cast("int")).\
    withColumn("price",df.price.cast("int"))


In [0]:
df=df.withColumnRenamed("Number of amenities","Number_of_amenities")

In [0]:
df=df.withColumn("Number_of_amenities",df.Number_of_amenities.cast("int"))

In [0]:
df=df.withColumnRenamed("neighbourhood_cleansed","neighbourhood")

In [0]:
df.dtypes

Out[7]: [('host_is_superhost', 'int'),
 ('host_identity_verified', 'int'),
 ('neighbourhood', 'string'),
 ('latitude', 'double'),
 ('longitude', 'double'),
 ('property_type', 'string'),
 ('room_type', 'string'),
 ('accommodates', 'int'),
 ('bathrooms', 'float'),
 ('bedrooms', 'int'),
 ('beds', 'int'),
 ('bed_type', 'string'),
 ('Number_of_amenities', 'int'),
 ('guests_included', 'int'),
 ('price_per_extra_person', 'int'),
 ('minimum_nights', 'int'),
 ('number_of_reviews', 'int'),
 ('number_days_btw_first_last_review', 'int'),
 ('review_scores_rating', 'int'),
 ('cancellation_policy', 'string'),
 ('price', 'int')]

In [0]:
# Create a view or table

temp_table_name = "Airbnb_project"

df.createOrReplaceTempView(temp_table_name)

In [0]:
%sql

/* Query the created temp table in a SQL cell */

select * from `Airbnb_project`

host_is_superhost,host_identity_verified,neighbourhood,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,Number_of_amenities,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating,cancellation_policy,price
0,0,Roslindale,42.2826188,-71.13306793,House,Entire home/apt,4,1.5,2.0,3.0,Real Bed,14,1,0,2,0,0,,moderate,250
0,1,Roslindale,42.28624082,-71.13437396,Apartment,Private room,2,1.0,1.0,1.0,Real Bed,20,0,0,2,36,804,94.0,moderate,65
1,1,Roslindale,42.29243789,-71.13576525,Apartment,Private room,2,1.0,1.0,1.0,Real Bed,17,1,20,3,41,2574,98.0,moderate,65
0,0,Roslindale,42.28110619,-71.12102117,House,Private room,4,1.0,1.0,2.0,Real Bed,22,2,25,1,1,0,100.0,moderate,75
1,1,Roslindale,42.28451221,-71.13625805,House,Private room,2,1.5,1.0,2.0,Real Bed,13,1,0,2,29,380,99.0,flexible,79
1,1,Roslindale,42.2916898,-71.13189277,Condominium,Private room,2,1.0,1.0,1.0,Real Bed,12,1,0,2,8,130,100.0,flexible,75
0,1,Roslindale,42.28138963,-71.13119042,Apartment,Entire home/apt,3,1.0,1.0,2.0,Real Bed,12,1,25,1,57,421,90.0,strict,100
1,1,Roslindale,42.2819461,-71.14102161,House,Private room,2,2.0,1.0,1.0,Real Bed,22,1,15,1,67,840,96.0,moderate,75
1,1,Roslindale,42.28587764,-71.12490956,Condominium,Private room,2,1.0,1.0,2.0,Real Bed,9,2,0,2,65,355,96.0,moderate,58
1,1,Roslindale,42.28882028,-71.1395101,Apartment,Entire home/apt,5,1.0,2.0,2.0,Real Bed,21,4,25,4,33,876,94.0,strict,229


In [0]:
# With this registered as a temp view, it will only be available to this particular notebook. If you'd like other users to be able to query this table, you can also create a table from the DataFrame.
# Once saved, this table will persist across cluster restarts as well as allow various users across different notebooks to query this data.
# To do so, choose your table name and uncomment the bottom line.

permanent_table_name = "Airbnbproject"

# df.write.format("parquet").mode("overwrite").saveAsTable(permanent_table_name)

In [0]:
df=df.dropna()

### Which neighborhood has maximum number of listings?

In [0]:
from pyspark.sql.functions import *

df.groupBy("neighbourhood").count().sort(desc('count')).show()

+--------------------+-----+
|       neighbourhood|count|
+--------------------+-----+
|       Jamaica Plain|  837|
|           South End|  750|
|            Back Bay|  678|
|          Dorchester|  657|
|              Fenway|  546|
|             Allston|  540|
|         Beacon Hill|  477|
|        South Boston|  423|
|         East Boston|  396|
|            Brighton|  390|
|            Downtown|  339|
|           North End|  339|
|             Roxbury|  327|
|        Mission Hill|  243|
|         Charlestown|  207|
|South Boston Wate...|  153|
|          Roslindale|  150|
|           Chinatown|  123|
|        West Roxbury|   99|
|            West End|   81|
+--------------------+-----+
only showing top 20 rows



### Average price for each property type

In [0]:
%sql
select avg(CASE WHEN property_type='Apartment' THEN price else null end) as avg_apartment_price, 
       avg(CASE WHEN property_type='Condominium' THEN price else null end) as avg_condo_price,
       avg(CASE WHEN property_type='House' THEN price else null end) as avg_house_price,
       avg(CASE WHEN property_type='Villa' THEN price else null end) as avg_villa_price,
       avg(CASE WHEN property_type='Loft' THEN price else null end) as avg_loft_price,
       avg(CASE WHEN property_type='Townhouse' THEN price else null end) as avg_townhouse_price
from Airbnb_project

avg_apartment_price,avg_condo_price,avg_house_price,avg_villa_price,avg_loft_price,avg_townhouse_price
168.34115805946792,172.97297297297297,113.69981583793738,189.5,177.1315789473684,123.0


### Neighbourhood with highest average rating (top 5)

In [0]:
df.groupby("neighbourhood").agg(avg("Review_scores_rating").alias('Avg_rating')).sort(desc('Avg_rating')).show(5)

+--------------------+-----------------+
|       neighbourhood|       Avg_rating|
+--------------------+-----------------+
|    Leather District|98.33333333333333|
|          Roslindale|            95.38|
|        West Roxbury|95.21212121212122|
|South Boston Wate...|94.45098039215686|
|        South Boston|94.39716312056737|
+--------------------+-----------------+
only showing top 5 rows



### Neighborhood with highest average price

In [0]:
df.groupby("neighbourhood").avg("price").sort(desc("avg(price)")).show(1)

+----------------+------------------+
|   neighbourhood|        avg(price)|
+----------------+------------------+
|Leather District|242.66666666666666|
+----------------+------------------+
only showing top 1 row



### Property type percentage in the overall region

In [0]:
from pyspark.sql.window import Window

df1=df.groupby(["neighbourhood","property_type"]).count().\
    withColumn("percentage",round(col("count")*100/sum("count").over(Window.partitionBy("neighbourhood")),2))
display(df1.sort("neighbourhood",desc("percentage")))

neighbourhood,property_type,count,percentage
Allston,Apartment,357,66.11
Allston,House,159,29.44
Allston,Condominium,21,3.89
Allston,Townhouse,3,0.56
Back Bay,Apartment,618,91.15
Back Bay,Condominium,42,6.19
Back Bay,Loft,9,1.33
Back Bay,Townhouse,6,0.88
Back Bay,House,3,0.44
Bay Village,Apartment,27,81.82


Output can only be rendered in Databricks

### Creating new column for categorizing price

In [0]:
display(df.select("price")) ### Creating a histogram for categorizing the price

price
65
65
75
79
75
100
75
58
229
60


Output can only be rendered in Databricks

### We will categorize price into 3 categories
1. Affordable (less than $175)
2. Moderate (between $175 and $335)
3. Expensive (greater than $335)

In [0]:
df2=df.select(['host_is_superhost','host_identity_verified','neighbourhood','latitude','longitude','property_type','room_type','accommodates','bathrooms','bedrooms','beds','bed_type','Number_of_amenities','guests_included','price_per_extra_person','minimum_nights','number_of_reviews','number_days_btw_first_last_review','review_scores_rating','cancellation_policy','price'])

In [0]:
def price_categorizer(x):
    price_c=''
    if x.price <= 175:
        price_c = 'Affordable'
    elif x.price > 175 and x.price <=335:
        price_c = 'Moderate'
    else:
        price_c= 'Expensive'
    return(price_c,x.price)
rdd2 = df2.rdd.map(lambda x : price_categorizer(x))

In [0]:
df4=rdd2.toDF(["price_c","price2"])

In [0]:
data = df2.join(df4,df2.price == df4.price2,"inner")

In [0]:
data=data.drop("price2")
display(data)

host_is_superhost,host_identity_verified,neighbourhood,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,Number_of_amenities,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating,cancellation_policy,price,price_c
0,1,Charlestown,42.38522635,-71.08092263,Apartment,Private room,2,1.0,1,1,Real Bed,7,1,30,1,2,8,90,strict,65,Affordable
0,1,Charlestown,42.38774101,-71.07989661,Apartment,Private room,2,1.0,1,1,Real Bed,4,1,30,1,2,12,80,flexible,65,Affordable
0,1,Allston,42.34832593,-71.13613371,House,Entire home/apt,3,1.0,1,2,Real Bed,10,1,0,1,1,0,80,flexible,65,Affordable
0,0,Allston,42.34877625,-71.13132203,Apartment,Entire home/apt,2,1.0,1,1,Futon,13,1,0,3,9,64,77,flexible,65,Affordable
0,1,Allston,42.3533978,-71.14235466,House,Private room,2,2.0,1,1,Real Bed,18,1,20,30,36,591,87,strict,65,Affordable
1,1,Allston,42.34998843,-71.14420208,House,Private room,2,2.0,1,1,Real Bed,12,1,20,1,39,713,96,strict,65,Affordable
0,0,South Boston,42.33084495,-71.05246031,House,Private room,2,1.0,1,1,Real Bed,13,0,0,2,11,67,98,moderate,65,Affordable
0,0,Dorchester,42.31673406,-71.05337817,House,Private room,3,1.0,1,1,Real Bed,4,1,0,1,14,66,80,flexible,65,Affordable
0,1,Dorchester,42.2924911,-71.07464279,House,Private room,2,1.0,1,1,Real Bed,7,1,0,2,15,456,73,strict,65,Affordable
0,1,Dorchester,42.31417775,-71.060367,House,Private room,2,2.5,1,1,Real Bed,14,1,20,2,62,782,91,flexible,65,Affordable


### K means clustering

In [0]:
model_1= data.select(["latitude","longitude"])

#### Creating Elbow Plot for determining optimal number of clusters

In [0]:
display(model_1)

latitude,longitude
42.38522635,-71.08092263
42.38774101,-71.07989661
42.34832593,-71.13613371
42.34877625,-71.13132203
42.3533978,-71.14235466
42.34998843,-71.14420208
42.33084495,-71.05246031
42.31673406,-71.05337817
42.2924911,-71.07464279
42.31417775,-71.060367


In [0]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler,StringIndexer
from pyspark.ml import Pipeline

features = ["latitude","longitude"]
assembler = VectorAssembler(
    inputCols=features, 
    outputCol="airbnb_features")
assembled_data = assembler.transform(model_1)


In [0]:
costs = []
for k in range(2, 10):
    k_means = KMeans(featuresCol='airbnb_features', k=k)
    model = k_means.fit(assembled_data)
    costs.append((k,model.summary.trainingCost))

In [0]:
# import pylab as pl
# pl.plot(costs.keys(), costs.values())
# pl.xlabel('Number of Clusters')
# pl.ylabel('Score')
# pl.title('Elbow Curve')
# pl.show()

In [0]:
elbow_df=spark.createDataFrame(costs,schema=["Number_of_clusters","Score"])

In [0]:
display(elbow_df)

Number_of_clusters,Score
2,738.9625484686453
3,540.8303627714592
4,429.2028074874493
5,290.2114114993138
6,232.23400213932163
7,191.99683530061023
8,142.64928877137388
9,145.84368307872492


Output can only be rendered in Databricks

In [0]:
assembler = VectorAssembler(inputCols=['latitude','longitude'],
                            outputCol="features")

In [0]:
pipe = Pipeline(stages=[assembler])

In [0]:
result_model1=pipe.fit(model_1).transform(model_1)

In [0]:
kmeans_model = KMeans(k=5)

In [0]:
fit_model = kmeans_model.fit(result_model1)

In [0]:
# wsse = fit_model.computeCost(final_data) for spark 2.7
wssse = fit_model.summary.trainingCost # for spark 3.0
print("The within set sum of squared error of the mode is {}".format(wssse))

The within set sum of squared error of the mode is 290.2114114993138


In [0]:
centers = fit_model.clusterCenters()

In [0]:
print("Cluster Centers")
index=1
for cluster in centers:
    print("Centroid {}: {}".format(index,cluster))
    index+=1
#'opened_by_index','location_index','category_index',
# 'subcategory_index','u_symptom_index','assignment_group_index'

Cluster Centers
Centroid 1: [ 42.31555492 -71.05618115]
Centroid 2: [ 42.36325513 -71.05463656]
Centroid 3: [ 42.35114839 -71.13995305]
Centroid 4: [ 42.34186255 -71.08475597]
Centroid 5: [ 42.30114796 -71.11702549]


In [0]:
results = fit_model.transform(result_model1)

In [0]:
display(results.select(['latitude','longitude','prediction']))

latitude,longitude,prediction
42.38522635,-71.08092263,1
42.38774101,-71.07989661,1
42.34832593,-71.13613371,2
42.34877625,-71.13132203,2
42.3533978,-71.14235466,2
42.34998843,-71.14420208,2
42.33084495,-71.05246031,0
42.31673406,-71.05337817,0
42.2924911,-71.07464279,0
42.31417775,-71.060367,0


In [0]:
display(results.groupby(['prediction']).count().sort(desc('count')))

prediction,count
3,253629
1,242766
4,140193
0,123426
2,96246


Output can only be rendered in Databricks

### Interpretation

Most of the listings(about 58%) belong to cluster 3 and 1 which means that:

29.6 % of the listings are centered around location with latitude 42.35114839 and longitude -71.13995305, which are the co-ordinates for <B>50 Gordon St, Boston, MA area.</B>

28.6 % of the listings are centered around location with latitude 42.31555492 and longitude -71.05618115, which are the co-ordinates for <B>68 Belfort St, Dorchester, Boston, MA area.</B>

It can be suggested that tourists have higher chances of finding Airbnb listings for stay around the above suggested areas.

### Classification Model 1 - Decision Tree Classification

In [0]:
#data_model2= data.select(['host_is_superhost','neighbourhood','property_type','room_type','accommodates','bathrooms','bedrooms','Number_of_amenities','price_c'])

data_model2= data.select(['host_is_superhost','host_identity_verified','neighbourhood','property_type','room_type','accommodates','bathrooms','bedrooms','beds','bed_type','Number_of_amenities','guests_included','price_per_extra_person','minimum_nights','number_of_reviews','number_days_btw_first_last_review','cancellation_policy','price_c'])



In [0]:
data_model2.dtypes

Out[42]: [('host_is_superhost', 'int'),
 ('host_identity_verified', 'int'),
 ('neighbourhood', 'string'),
 ('property_type', 'string'),
 ('room_type', 'string'),
 ('accommodates', 'int'),
 ('bathrooms', 'float'),
 ('bedrooms', 'int'),
 ('beds', 'int'),
 ('bed_type', 'string'),
 ('Number_of_amenities', 'int'),
 ('guests_included', 'int'),
 ('price_per_extra_person', 'int'),
 ('minimum_nights', 'int'),
 ('number_of_reviews', 'int'),
 ('number_days_btw_first_last_review', 'int'),
 ('cancellation_policy', 'string'),
 ('price_c', 'string')]

In [0]:
data_model2=data_model2.dropna()

In [0]:
# Create a 70-30 train test split

train_data,test_data=data_model2.randomSplit([0.7,0.3])

In [0]:
# Import the required libraries

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler,StringIndexer
from pyspark.ml import Pipeline


In [0]:
# Use StringIndexer to convert the categorical columns to hold numerical data

neighbourhood_indexer = StringIndexer(inputCol='neighbourhood',outputCol='neighbourhood_index',handleInvalid='keep')
property_type_indexer = StringIndexer(inputCol='property_type',outputCol='property_type_index',handleInvalid='keep')
room_type_indexer = StringIndexer(inputCol='room_type',outputCol='room_type_index',handleInvalid='keep')
bed_type_indexer = StringIndexer(inputCol='bed_type',outputCol='bed_type_index',handleInvalid='keep')
cancellation_policy_indexer = StringIndexer(inputCol='cancellation_policy',outputCol='cancellation_policy_index',handleInvalid='keep')
price_c_indexer = StringIndexer(inputCol='price_c',outputCol='price_c_index',handleInvalid='keep')


In [0]:
# Vector assembler is used to create a vector of input features

assembler = VectorAssembler(inputCols=['host_is_superhost','host_identity_verified','neighbourhood_index','property_type_index','room_type_index','accommodates','bathrooms','bedrooms','beds','bed_type_index','Number_of_amenities','guests_included','price_per_extra_person','minimum_nights','number_of_reviews','number_days_btw_first_last_review','cancellation_policy_index'],outputCol="features")

In [0]:
# Create an object for the Logistic Regression model
# Use the parameter maxBins and assign a value that is equal to or more than the number of categories in any sigle feature

dt_model = DecisionTreeClassifier(labelCol='price_c_index',maxBins=500)

In [0]:
# Pipeline is used to pass the data through indexer and assembler simultaneously. Also, it helps to pre-rocess the test data
# in the same way as that of the train data

pipe = Pipeline(stages=[neighbourhood_indexer,property_type_indexer,room_type_indexer,bed_type_indexer,cancellation_policy_indexer,
                        price_c_indexer,assembler,dt_model])

In [0]:
# It took 1.6 minutes for this step to execute

fit_model=pipe.fit(train_data)

In [0]:
# Store the results in a dataframe

results = fit_model.transform(test_data)

In [0]:
results.select(['price_c_index','prediction']).show()

+-------------+----------+
|price_c_index|prediction|
+-------------+----------+
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
+-------------+----------+
only showing top 20 rows



##### Model Evaluation - Decision Tree Classification
##### 1) Accuracy
##### 2) f1 score

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
ACC_evaluator = MulticlassClassificationEvaluator(
    labelCol="price_c_index", predictionCol="prediction", metricName="accuracy")

In [0]:
accuracy = ACC_evaluator.evaluate(results)

In [0]:
print("The accuracy of the decision tree classifier is {}".format(accuracy))

The accuracy of the decision tree classifier is 0.8365605551963984


In [0]:
f1_evaluator = MulticlassClassificationEvaluator(
    labelCol="price_c_index", predictionCol="prediction", metricName="f1")

In [0]:
f1_1 = f1_evaluator.evaluate(results)

In [0]:
print("The f1_Score of the decision tree classifier is {}".format(f1_1))

The f1_Score of the decision tree classifier is 0.8315319706697342


### Classification Model 2 - Random Forest Classification

In [0]:
from pyspark.ml.classification import RandomForestClassifier

In [0]:
# Train a rf model.
rf_model = RandomForestClassifier(labelCol="price_c_index", featuresCol="features", numTrees=10)

In [0]:
# Pipeline is used to pass the data through indexer and assembler simultaneously. Also, it helps to pre-rocess the test data
# in the same way as that of the train data

pipe = Pipeline(stages=[neighbourhood_indexer,property_type_indexer,room_type_indexer,bed_type_indexer,cancellation_policy_indexer,
                        price_c_indexer,assembler,rf_model])

In [0]:
# It took 1.6 minutes for this step to execute

fit_model=pipe.fit(train_data)

In [0]:
# Store the results in a dataframe

results_2 = fit_model.transform(test_data)

In [0]:
results_2.select(['price_c_index','prediction']).show()

+-------------+----------+
|price_c_index|prediction|
+-------------+----------+
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
+-------------+----------+
only showing top 20 rows



##### Model Evaluation - Random Forest Classification
##### 1) Accuracy
##### 2) f1 score

In [0]:
rf_acc_evaluator = MulticlassClassificationEvaluator(
    labelCol="price_c_index", predictionCol="prediction", metricName="accuracy")

In [0]:
accuracy_2 = rf_acc_evaluator.evaluate(results_2)

In [0]:
print("The accuracy of the random forest classifier is {}".format(accuracy_2))

The accuracy of the random forest classifier is 0.8470483771721437


In [0]:
f1_rf_evaluator = MulticlassClassificationEvaluator(
    labelCol="price_c_index", predictionCol="prediction", metricName="f1")

In [0]:
f1_rf = f1_rf_evaluator.evaluate(results_2)

In [0]:
print("The f1 score of the random forest is {}".format(f1_rf))

The f1 score of the random forest is 0.8314335581232121


### Classification Model 3 - MultilayerPerceptron Classification

In [0]:
from pyspark.ml.feature import OneHotEncoder, VectorAssembler, StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf, StringType
from pyspark.ml.classification import MultilayerPerceptronClassifier
import pyspark.sql.functions


In [0]:
train, test = data_model2.randomSplit([0.7, 0.1], 1234)

In [0]:
categorical_columns = [item[0] for item in data_model2.dtypes if item[1].startswith(
    'string')]
int_columns = [item[0] for item in data_model2.dtypes if item[1].startswith('int')]
float_columns = [item[0] for item in data_model2.dtypes if item[1].startswith('float')]
indexers = [StringIndexer(inputCol=column, outputCol='{0}_index'.format(
    column)) for column in categorical_columns]

In [0]:
featuresCreator = VectorAssembler(
    inputCols=[indexer.getOutputCol() for indexer in indexers] + float_columns + int_columns,
    outputCol='features')
layers = [len(featuresCreator.getInputCols()), 4, 2, 3]

In [0]:
classifier = MultilayerPerceptronClassifier(labelCol='price_c_index',
                                            featuresCol='features',
                                            maxIter=100,
                                            layers=layers,
                                            blockSize=128,
                                            seed=1234)

In [0]:
pipeline = Pipeline(stages=indexers + [featuresCreator, classifier])
model = pipeline.fit(train)

In [0]:
test_output_df = model.transform(test)

In [0]:
test_output_df.select(['price_c_index','prediction']).show()

+-------------+----------+
|price_c_index|prediction|
+-------------+----------+
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
+-------------+----------+
only showing top 20 rows



##### Model Evaluation - Multilayer Perceptron Classification
##### 1) Accuracy
##### 2) f1 score

In [0]:
mpc_evaluator = MulticlassClassificationEvaluator(
    labelCol="price_c_index", predictionCol="prediction", metricName="accuracy")

In [0]:
accuracy_3 = mpc_evaluator.evaluate(test_output_df)

In [0]:
print("The accuracy of the Multilayer Perceptron classifier is {}".format(accuracy_3))

The accuracy of the Multilayer Perceptron classifier is 0.7491991744258805


In [0]:
f1_mpc_evaluator = MulticlassClassificationEvaluator(
    labelCol="price_c_index", predictionCol="prediction", metricName="f1")

In [0]:
f1_mpc = f1_mpc_evaluator.evaluate(test_output_df)

In [0]:
print("The f1 score of the Multilayer perceptron classifier is {}".format(f1_mpc))

The f1 score of the Multilayer perceptron classifier is 0.660557319280575


### Interpretation

The objective behind our project is to identify the locations around which the customers can find the most number of Airbnb listings and predicting the price category of Airbnb listings namely affordable, moderate, or expensive.

For identifying locations, we used K means clustering on latitude and longitude of Airbnb listings. From K means clustering, we were able to infer that:

29.6 % of the listings are centered around location with latitude 42.35114839 and longitude -71.13995305, which are the co-ordinates for 50 Gordon St, Boston, MA area.

28.6 % of the listings are centered around location with latitude 42.31555492and longitude -71.05618115, which are the co-ordinates for 68 Belfort St, Dorchester, Boston, MA area.
It can be suggested that tourists have higher chances of finding Airbnb listings for stay around the above suggested areas.

For predicting the price category of Airbnb listings into affordable (<$175), moderate (between $175 and $335) or expensive (>$335) categories, we have a multiclass classification problem. We used 17 attributes for our prediction models, such as if the host is super host, no. of bedrooms, neighborhood, cancellation policy, to name a few. The analysis was carried out using 3 classification models and below are the evaluation metrics for each model:

#### Decision Tree Classification:

Accuracy: 84%; F1 Score: 0.83

#### Random Forest Classification:

Accuracy: 85.2%; F1 Score: 0.84

#### Multi-Layer Perceptron Classification:
 
Accuracy: 74.9%; F1 Score: 0.66

Since our dataset used is imbalanced, we are not using Accuracy but F1 Score as our preferred evaluation metric for comparing the three models. F1 score provides a way to combine precision and recall metrices (harmonic mean of precision and recall). Precision is a metric that finds out what fraction of predicted positives is actually positive, i.e., out of the total predicted price categories, what fraction of them were correctly identified. Recall is a metric that quantifies the number of correct positive predictions (correctly identified prices) made of all positive predictions that could have been made.

Comparing all the 3 models, we could infer that Random Forest Classifier performs the best among the other models with F1 score of 0.84. Since the score is closer to 1, we conclude that
the model performs well in classifying the price of Airbnb listings into 3 categories- affordable, moderate, and expensive.