### Predict Real Estate's Price

In [1]:
#Import the Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pyspark
import pyspark.pandas as ps



In [2]:
#Initialize the spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('ML Pyspark').getOrCreate()
spark

In [3]:
# Read the data
df = spark.read.csv('Dataset Real Estate.csv',header=True,inferSchema=True)

In [4]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Address: string (nullable = true)
 |-- Beds: string (nullable = true)
 |-- Baths: string (nullable = true)
 |-- Area: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Street: string (nullable = true)
 |-- District: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zip Code: string (nullable = true)



In [5]:
df.show()

+---+--------------------+------+-----+--------------------+-------+--------------------+--------------------+----+--------+
|_c0|             Address|  Beds|Baths|                Area|  Price|              Street|            District|City|Zip Code|
+---+--------------------+------+-----+--------------------+-------+--------------------+--------------------+----+--------+
|  0|4-74 48th Ave  #9...|     1|    1|                 700| 555000|  4-74 48th Ave  #9K|    Long Island City|  NY|   11109|
|  1|469 E  49th St   ...|    10|    6|                 n/a|1250000| 469 E  49th St   #6|            Brooklyn|  NY|   11203|
|  2|715 Avenue L, Bro...|     8|    6|               2,015|1690000|        715 Avenue L|            Brooklyn|  NY|   11230|
|  3|223 E  62nd St, N...|     5|    6|               3,750|7995000|      223 E  62nd St|            New York|  NY|   10065|
|  4|1824 E  17th St  ...|     2|    2|                 888| 579000|1824 E  17th St  ...|            Brooklyn|  NY|   11229|


### Cleaning the Data

In [6]:
from pyspark.sql.functions import *
df = df.withColumn('Area', regexp_replace('Area', ',', ''))
df = df.withColumn('Area', split(df.Area, " ").getItem(0))
df = df.withColumn('Beds', regexp_replace('Beds', 'Studio', '0'))

In [7]:
print((df.count(), len(df.columns)))

(208150, 10)


In [8]:
df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).show()

+---+-------+----+-----+----+-----+------+--------+----+--------+
|_c0|Address|Beds|Baths|Area|Price|Street|District|City|Zip Code|
+---+-------+----+-----+----+-----+------+--------+----+--------+
|  0|      0|   0|    0|   0|  469|     0|       0|   0|       0|
+---+-------+----+-----+----+-----+------+--------+----+--------+



In [9]:
df_clean=df.dropna()
df_clean=df_clean.where(df.Area != r'n/a')
df_clean=df_clean.where(df.Baths != r'n/')
df_clean=df_clean.dropDuplicates()
df_clean.show()

+----+--------------------+----+-----+-----+--------+--------------------+--------------+----+--------+
| _c0|             Address|Beds|Baths| Area|   Price|              Street|      District|City|Zip Code|
+----+--------------------+----+-----+-----+--------+--------------------+--------------+----+--------+
|  58|137 W  12th St #5...|   1|    1|52200|  675000| 137 W  12th St #5-1|      New York|  NY|   10011|
|  80|750 Park Ave #4-B...|   1|    2|  800| 1900000|   750 Park Ave #4-B|      New York|  NY|   10021|
| 198|33 Bionia Ave, St...|   3|    1| 1932|  849999|       33 Bionia Ave| Staten Island|  NY|   10305|
| 832|420 Jansen St, St...|   4|    5| 6800| 2500000|       420 Jansen St| Staten Island|  NY|   10312|
| 943|712 Saint Anns Av...|   3|    2| 1152|  550000|712 Saint Anns Av...|         Bronx|  NY|   10455|
| 955|214 Adams Ave, St...|   3|    1| 1280|  599000|       214 Adams Ave| Staten Island|  NY|   10306|
|1296|62-98 Woodheaven ...|   1|    1|  760|  599000|62-98 Woodh

In [10]:
print((df_clean.count(), len(df_clean.columns)))

(167712, 10)


In [11]:
from pyspark.sql.types import StringType, DateType, FloatType, IntegerType
  
df_clean = df_clean \
    .withColumn("Beds", df_clean["Beds"].cast(IntegerType()))   \
    .withColumn("Baths", df_clean["Baths"].cast(IntegerType()))   \
    .withColumn("Area" , df_clean["Area"].cast(IntegerType()))   \
    .withColumn("Price", df_clean["Price"].cast(IntegerType()))

df_clean.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Address: string (nullable = true)
 |-- Beds: integer (nullable = true)
 |-- Baths: integer (nullable = true)
 |-- Area: integer (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Street: string (nullable = true)
 |-- District: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zip Code: string (nullable = true)



In [12]:
df_clean.select(
    percentile_approx("Price", [0.25, 0.5, 0.75], 1000000).alias("quantiles")).collect()

[Row(quantiles=[439990, 710000, 1249000])]

In [13]:
q1_beds = df_clean.select(percentile_approx("Beds", [0.25], 1000000).alias("quantiles")).collect()[0]['quantiles'][0]
q3_beds = df_clean.select(percentile_approx("Beds", [0.75], 1000000).alias("quantiles")).collect()[0]['quantiles'][0]
iqr_beds = q3_beds - q1_beds

#Check the Outlier Beds
top_outlier_beds = q3_beds + 1.5 * iqr_beds
bottom_outlier_beds= q1_beds - 1.5 * iqr_beds
print(top_outlier_beds, bottom_outlier_beds)

7.0 -1.0


In [14]:
q1_baths = df_clean.select(percentile_approx("Baths", [0.25], 1000000).alias("quantiles")).collect()[0]['quantiles'][0]
q3_baths = df_clean.select(percentile_approx("Baths", [0.75], 1000000).alias("quantiles")).collect()[0]['quantiles'][0]
iqr_baths = q3_baths - q1_baths

#Check the Outlier Baths
top_outlier_baths = q3_baths + 1.5 * iqr_baths
bottom_outlier_baths = q1_baths - 1.5 * iqr_baths
print(top_outlier_baths, bottom_outlier_baths)


4.5 0.5


In [15]:
q1_area = df_clean.select(percentile_approx("Area", [0.25], 1000000).alias("quantiles")).collect()[0]['quantiles'][0]
q3_area = df_clean.select(percentile_approx("Area", [0.75], 1000000).alias("quantiles")).collect()[0]['quantiles'][0]
iqr_area = q3_area - q1_area

#Check the Outlier Area
top_outlier_area = q3_area + 1.5 * iqr_area
bottom_outlier_area = q1_area - 1.5 * iqr_area
print(top_outlier_area, bottom_outlier_area)

4167.0 -705.0


In [16]:
q1_price = df_clean.select(percentile_approx("Price", [0.25], 1000000).alias("quantiles")).collect()[0]['quantiles'][0]
q3_price = df_clean.select(percentile_approx("Price", [0.75], 1000000).alias("quantiles")).collect()[0]['quantiles'][0]
iqr_price = q3_price - q1_price

#Check the Outlier Price
top_outlier_price = q3_price + 1.5 * iqr_price
bottom_outlier_price = q1_price - 1.5 * iqr_price
print(top_outlier_price, bottom_outlier_price)

2462515.0 -773525.0


In [17]:
df_clean = df_clean.filter(df_clean['Beds']<=top_outlier_beds)
df_clean = df_clean.filter(df_clean['Baths']<=top_outlier_baths)
df_clean = df_clean.filter(df_clean['Area']<=top_outlier_area)
df_clean = df_clean.filter(df_clean['Price']<=top_outlier_price)


In [18]:
df_clean.show()

+----+--------------------+----+-----+----+-------+--------------------+-----------------+----+--------+
| _c0|             Address|Beds|Baths|Area|  Price|              Street|         District|City|Zip Code|
+----+--------------------+----+-----+----+-------+--------------------+-----------------+----+--------+
|  80|750 Park Ave #4-B...|   1|    2| 800|1900000|   750 Park Ave #4-B|         New York|  NY|   10021|
| 198|33 Bionia Ave, St...|   3|    1|1932| 849999|       33 Bionia Ave|    Staten Island|  NY|   10305|
| 943|712 Saint Anns Av...|   3|    2|1152| 550000|712 Saint Anns Av...|            Bronx|  NY|   10455|
| 955|214 Adams Ave, St...|   3|    1|1280| 599000|       214 Adams Ave|    Staten Island|  NY|   10306|
|1296|62-98 Woodheaven ...|   1|    1| 760| 599000|62-98 Woodheaven ...|        Rego Park|  NY|   11374|
|1444|1824 E  17th St  ...|   2|    2| 888| 579000|1824 E  17th St  ...|         Brooklyn|  NY|   11229|
|1520|179 E 31st Street...|   6|    3|2046|1349000|   1

In [19]:
df_selected = df_clean.select("Beds", "Baths", "Area", "Price", "District", "City", "Zip code")
df_selected.printSchema()

root
 |-- Beds: integer (nullable = true)
 |-- Baths: integer (nullable = true)
 |-- Area: integer (nullable = true)
 |-- Price: integer (nullable = true)
 |-- District: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zip code: string (nullable = true)



### Convert the Categorical Variables

In [20]:
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCols=["District", "City", "Zip code"], 
                        outputCols=["districtIndex", "cityIndex", "zipIndex"], 
                        stringOrderType="alphabetAsc")
indexed = indexer.fit(df_selected).transform(df_selected)
indexed.show()

+----+-----+----+-------+-----------------+----+--------+-------------+---------+--------+
|Beds|Baths|Area|  Price|         District|City|Zip code|districtIndex|cityIndex|zipIndex|
+----+-----+----+-------+-----------------+----+--------+-------------+---------+--------+
|   1|    2| 800|1900000|         New York|  NY|   10021|         66.0|      6.0|    18.0|
|   3|    1|1932| 849999|    Staten Island|  NY|   10305|        103.0|      6.0|    40.0|
|   3|    2|1152| 550000|            Bronx|  NY|   10455|          9.0|      6.0|    51.0|
|   3|    1|1280| 599000|    Staten Island|  NY|   10306|        103.0|      6.0|    41.0|
|   1|    1| 760| 599000|        Rego Park|  NY|   11374|         87.0|      6.0|   128.0|
|   2|    2| 888| 579000|         Brooklyn|  NY|   11229|         10.0|      6.0|   101.0|
|   6|    3|2046|1349000|         Brooklyn|  NY|   11226|         10.0|      6.0|    99.0|
|   2|    1|1075| 349900|          Bayside|  NY|   11360|          5.0|      6.0|   115.0|

In [21]:
from pyspark.ml.feature import OneHotEncoder

encoder = OneHotEncoder(inputCols=["districtIndex", "cityIndex", "zipIndex"],
                        outputCols=["categoryDistrict", "categoryCity", "categoryZip"])
model = encoder.fit(indexed)
encoded = model.transform(indexed)
encoded.show()

+----+-----+----+-------+-----------------+----+--------+-------------+---------+--------+-----------------+------------+-----------------+
|Beds|Baths|Area|  Price|         District|City|Zip code|districtIndex|cityIndex|zipIndex| categoryDistrict|categoryCity|      categoryZip|
+----+-----+----+-------+-----------------+----+--------+-------------+---------+--------+-----------------+------------+-----------------+
|   1|    2| 800|1900000|         New York|  NY|   10021|         66.0|      6.0|    18.0| (131,[66],[1.0])|   (6,[],[])| (435,[18],[1.0])|
|   3|    1|1932| 849999|    Staten Island|  NY|   10305|        103.0|      6.0|    40.0|(131,[103],[1.0])|   (6,[],[])| (435,[40],[1.0])|
|   3|    2|1152| 550000|            Bronx|  NY|   10455|          9.0|      6.0|    51.0|  (131,[9],[1.0])|   (6,[],[])| (435,[51],[1.0])|
|   3|    1|1280| 599000|    Staten Island|  NY|   10306|        103.0|      6.0|    41.0|(131,[103],[1.0])|   (6,[],[])| (435,[41],[1.0])|
|   1|    1| 760| 59

In [22]:
encoded.columns

['Beds',
 'Baths',
 'Area',
 'Price',
 'District',
 'City',
 'Zip code',
 'districtIndex',
 'cityIndex',
 'zipIndex',
 'categoryDistrict',
 'categoryCity',
 'categoryZip']

In [23]:
# from pyspark.ml import Pipeline
# from pyspark.ml.feature import MinMaxScaler
# columns_to_scale = ["Price"]
# assemblers = [VectorAssembler(inputCols=[col], outputCol=col + "_vec") for col in columns_to_scale]
# scalers = [MinMaxScaler(inputCol=col + "_vec", outputCol=col + "_scaled") for col in columns_to_scale]
# pipeline = Pipeline(stages=assemblers + scalers)
# scalerModel = pipeline.fit(encoded)
# scaledData = scalerModel.transform(encoded)

### Make the Features into a Vector

In [24]:
from pyspark.ml.feature import VectorAssembler
featureassembler=VectorAssembler(inputCols=['Baths', 'Area', 'Beds', 'categoryDistrict', 'categoryCity', 'categoryZip'],
                                 outputCol="Independent Features")
output=featureassembler.transform(encoded)

In [25]:
output.show()

+----+-----+----+-------+-----------------+----+--------+-------------+---------+--------+-----------------+------------+-----------------+--------------------+
|Beds|Baths|Area|  Price|         District|City|Zip code|districtIndex|cityIndex|zipIndex| categoryDistrict|categoryCity|      categoryZip|Independent Features|
+----+-----+----+-------+-----------------+----+--------+-------------+---------+--------+-----------------+------------+-----------------+--------------------+
|   1|    2| 800|1900000|         New York|  NY|   10021|         66.0|      6.0|    18.0| (131,[66],[1.0])|   (6,[],[])| (435,[18],[1.0])|(575,[0,1,2,69,15...|
|   3|    1|1932| 849999|    Staten Island|  NY|   10305|        103.0|      6.0|    40.0|(131,[103],[1.0])|   (6,[],[])| (435,[40],[1.0])|(575,[0,1,2,106,1...|
|   3|    2|1152| 550000|            Bronx|  NY|   10455|          9.0|      6.0|    51.0|  (131,[9],[1.0])|   (6,[],[])| (435,[51],[1.0])|(575,[0,1,2,12,19...|
|   3|    1|1280| 599000|    State

In [26]:
finalized_data=output.select("Independent Features","Price")

In [27]:
finalized_data.show()

+--------------------+-------+
|Independent Features|  Price|
+--------------------+-------+
|(575,[0,1,2,69,15...|1900000|
|(575,[0,1,2,106,1...| 849999|
|(575,[0,1,2,12,19...| 550000|
|(575,[0,1,2,106,1...| 599000|
|(575,[0,1,2,90,26...| 599000|
|(575,[0,1,2,13,24...| 579000|
|(575,[0,1,2,13,23...|1349000|
|(575,[0,1,2,8,255...| 349900|
|(575,[0,1,13,234]...|1599000|
|(575,[0,1,2,106,1...| 659999|
|(575,[0,1,2,13,24...| 278000|
|(575,[0,1,2,106,1...| 739000|
|(575,[0,1,2,92,27...| 660000|
|(575,[0,1,2,69,16...| 799000|
|(575,[0,1,2,12,19...| 550000|
|(575,[0,1,2,106,1...| 799999|
|(575,[0,1,2,12,19...| 330000|
|(575,[0,1,2,89,27...| 469000|
|(575,[0,1,2,57,21...| 790000|
|(575,[0,1,2,13,24...| 775000|
+--------------------+-------+
only showing top 20 rows



In [28]:
##train test split
train_data,test_data=finalized_data.randomSplit([0.8,0.2])

### Simple Linear Regression 

In [29]:
from pyspark.ml.regression import LinearRegression

regressor=LinearRegression(featuresCol='Independent Features', labelCol='Price')
regressor=regressor.fit(train_data)
print("Coefficients: " + str(regressor.coefficients))
print("Intercept: " + str(regressor.intercept))

Coefficients: [99663.04300170258,216.4232342979904,39185.2890888283,-50769.7640933765,-175794.79740775438,-140146.02283080143,206378.92898102873,-240152.6579052283,-317038.61717539094,-46710.96103409387,249745.96531317712,-230698.7147771512,-107035.85350588078,98467.69522379692,-42038.321604549375,-7112.811082971376,-25173.20150807183,-56659.6754166538,-97399.436602962,-124873.44933878005,-155202.72695214523,157518.93511181494,97311.61607320838,-123582.4500260222,-8415.83889990429,-129985.54542649114,-319786.70946384966,282146.5628080423,235888.3420751696,17173.278925151582,-101732.42509973343,40538.2485569563,-159853.97050475184,-147189.07552284256,135338.2385674601,-69266.59501583071,145684.4199222653,-47148.44736229998,-98240.28128693998,-236470.57262652548,5698.338502996121,-26922.057667240115,-359552.9739369702,-70856.75364027981,-198870.1186574118,-28472.679734671063,-165112.41032447945,163357.89630101723,-49645.86244356173,-36929.65864560126,-209253.10213758328,-99419.2059271391

In [30]:
trainingSummary = regressor.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 203245.433807
r2: 0.813443


In [31]:
### Prediction of Linear Regression
pred_results=regressor.evaluate(test_data)

In [32]:
pred_results.meanAbsoluteError,pred_results.rootMeanSquaredError

(143200.4778807347, 204741.45791918394)

### Random Forest Model

In [33]:
from pyspark.ml.regression import RandomForestRegressor

rf=RandomForestRegressor(featuresCol='Independent Features', labelCol='Price')
rf_model=rf.fit(train_data)

In [34]:
### Prediction of random forest model
from pyspark.ml.evaluation import RegressionEvaluator
rf_predictions = rf_model.transform(test_data)
rf_evaluator = RegressionEvaluator(labelCol="Price", predictionCol="prediction", metricName="rmse")
rmse = rf_evaluator.evaluate(rf_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 297281


In [35]:
rf_predictions.show()

+--------------------+-------+------------------+
|Independent Features|  Price|        prediction|
+--------------------+-------+------------------+
|(575,[0,1,2,3,136...| 305000|399703.76929645665|
|(575,[0,1,2,4,136...| 121000|399703.76929645665|
|(575,[0,1,2,4,136...| 169000|399703.76929645665|
|(575,[0,1,2,4,136...| 169000|399703.76929645665|
|(575,[0,1,2,4,136...| 399000| 819804.1101659948|
|(575,[0,1,2,5,134...| 729000| 687775.9001049759|
|(575,[0,1,2,6,215...|1598000| 812526.8887728349|
|(575,[0,1,2,6,215...|1598000| 812526.8887728349|
|(575,[0,1,2,6,215...|1598000| 812526.8887728349|
|(575,[0,1,2,6,215...|1598000| 812526.8887728349|
|(575,[0,1,2,6,215...|1598000| 812526.8887728349|
|(575,[0,1,2,6,215...|1598000| 812526.8887728349|
|(575,[0,1,2,6,215...|1598000| 812526.8887728349|
|(575,[0,1,2,7,134...| 668490| 902781.5698682014|
|(575,[0,1,2,8,255...| 349900|  548010.760723152|
|(575,[0,1,2,8,255...| 349900|  548010.760723152|
|(575,[0,1,2,8,255...| 349900|  548010.760723152|


In [36]:
rf_model.featureImportances

SparseVector(575, {0: 0.1629, 1: 0.1768, 2: 0.0346, 8: 0.0002, 12: 0.0018, 13: 0.0436, 18: 0.0004, 19: 0.0635, 25: 0.002, 26: 0.0002, 34: 0.001, 56: 0.0003, 57: 0.0009, 59: 0.0226, 60: 0.0009, 69: 0.0739, 77: 0.0001, 85: 0.004, 96: 0.0005, 99: 0.0005, 101: 0.0001, 108: 0.0001, 109: 0.0001, 125: 0.0007, 130: 0.0002, 134: 0.0246, 135: 0.2631, 136: 0.0513, 139: 0.0132, 148: 0.0001, 151: 0.0002, 157: 0.0024, 160: 0.0028, 162: 0.0001, 168: 0.0005, 172: 0.0005, 173: 0.003, 195: 0.0, 212: 0.0017, 219: 0.0002, 223: 0.0005, 224: 0.0001, 225: 0.0002, 226: 0.0004, 229: 0.0001, 238: 0.0005, 240: 0.0035, 246: 0.001, 248: 0.001, 256: 0.0014, 257: 0.0004, 300: 0.0, 303: 0.0033, 306: 0.0, 307: 0.0, 314: 0.001, 319: 0.0005, 326: 0.0, 332: 0.0, 335: 0.0, 338: 0.0001, 342: 0.0029, 345: 0.0001, 347: 0.0, 349: 0.0, 366: 0.0008, 393: 0.0, 399: 0.0, 429: 0.0, 433: 0.0, 436: 0.0, 443: 0.0, 462: 0.0005, 482: 0.0002, 484: 0.0009, 485: 0.0002, 491: 0.0008, 501: 0.0024, 502: 0.0007, 504: 0.0008, 508: 0.001, 514: 

In [37]:
finalized_data.select('Independent Features').take(2)

[Row(Independent Features=SparseVector(575, {0: 2.0, 1: 800.0, 2: 1.0, 69: 1.0, 158: 1.0})),
 Row(Independent Features=SparseVector(575, {0: 1.0, 1: 1932.0, 2: 3.0, 106: 1.0, 180: 1.0}))]

feature_columns = ['Baths', 'Area', 'Beds', 'District', 'City', 'Zip']

### Gradient Boosting Tree Model

In [38]:
from pyspark.ml.regression import GBTRegressor
train_data,test_data=finalized_data.randomSplit([0.8,0.2])
gbt=GBTRegressor(featuresCol='Independent Features', labelCol='Price')
gbt_model=gbt.fit(train_data)

In [39]:
### Prediction of GBT Model
gbt_predictions = gbt_model.transform(test_data)
gbt_evaluator = RegressionEvaluator(labelCol="Price", predictionCol="prediction", metricName="rmse")
rmse = gbt_evaluator.evaluate(gbt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 249755


In [40]:
gbt_predictions.show()

+--------------------+-------+------------------+
|Independent Features|  Price|        prediction|
+--------------------+-------+------------------+
|(575,[0,1,2,4,136...| 121000|234946.70238474893|
|(575,[0,1,2,4,136...| 121000|234946.70238474893|
|(575,[0,1,2,4,136...| 169000| 248620.3973794887|
|(575,[0,1,2,4,136...| 169000| 248620.3973794887|
|(575,[0,1,2,4,136...| 169000| 248620.3973794887|
|(575,[0,1,2,4,136...| 345000|281989.20100772195|
|(575,[0,1,2,4,136...| 399000| 733217.4860062656|
|(575,[0,1,2,5,134...| 620000| 599615.2537050897|
|(575,[0,1,2,5,134...| 620000| 599615.2537050897|
|(575,[0,1,2,5,134...| 685000| 786487.7206427073|
|(575,[0,1,2,6,215...|1598000| 1097122.515133293|
|(575,[0,1,2,7,134...| 668490|1080399.7832907895|
|(575,[0,1,2,8,255...| 349900|468617.79749605857|
|(575,[0,1,2,8,255...| 349900|468617.79749605857|
|(575,[0,1,2,8,255...| 374999|468617.79749605857|
|(575,[0,1,2,8,255...| 374999|468617.79749605857|
|(575,[0,1,2,8,256...|1350000|1395682.5539331457|
