## House price prediction using pyspark

In [62]:
from pyspark.sql import SparkSession

In [63]:
ss = SparkSession.builder.appName("House_Price").getOrCreate()

In [64]:
df = ss.read.csv('D:\\Python Projects\\Pyspark\\MLlib\\USA_Housing.csv',header=True,inferSchema=True)

In [65]:
df.printSchema()

root
 |-- Avg Area Income: string (nullable = true)
 |-- Avg Area House Age: string (nullable = true)
 |-- Avg Area Number of Rooms: double (nullable = true)
 |-- Avg Area Number of Bedrooms: double (nullable = true)
 |-- Area Population: double (nullable = true)
 |-- Price: double (nullable = true)
 |-- Address: string (nullable = true)



In [66]:
df.head(4)

[Row(Avg Area Income='79545.45857', Avg Area House Age='5.682861322', Avg Area Number of Rooms=7.009188143, Avg Area Number of Bedrooms=4.09, Area Population=23086.8005, Price=1059033.558, Address='208 Michael Ferry Apt. 674'),
 Row(Avg Area Income='Laurabury', Avg Area House Age=' NE 37010-5101"', Avg Area Number of Rooms=None, Avg Area Number of Bedrooms=None, Area Population=None, Price=None, Address=None),
 Row(Avg Area Income='79248.64245', Avg Area House Age='6.002899808', Avg Area Number of Rooms=6.730821019, Avg Area Number of Bedrooms=3.09, Area Population=40173.07217, Price=1505890.915, Address='188 Johnson Views Suite 079'),
 Row(Avg Area Income='Lake Kathleen', Avg Area House Age=' CA 48958"', Avg Area Number of Rooms=None, Avg Area Number of Bedrooms=None, Area Population=None, Price=None, Address=None)]

In [67]:
df.show()

+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+
|Avg Area Income|Avg Area House Age|Avg Area Number of Rooms|Avg Area Number of Bedrooms|Area Population|      Price|             Address|
+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+
|    79545.45857|       5.682861322|             7.009188143|                       4.09|     23086.8005|1059033.558|208 Michael Ferry...|
|      Laurabury|    NE 37010-5101"|                    NULL|                       NULL|           NULL|       NULL|                NULL|
|    79248.64245|       6.002899808|             6.730821019|                       3.09|    40173.07217|1505890.915|188 Johnson Views...|
|  Lake Kathleen|         CA 48958"|                    NULL|                       NULL|           NULL|       NULL|                NULL|
|    61287.06718|        5.

In [68]:
df.na.drop(how='any').show()

+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+
|Avg Area Income|Avg Area House Age|Avg Area Number of Rooms|Avg Area Number of Bedrooms|Area Population|      Price|             Address|
+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+
|    79545.45857|       5.682861322|             7.009188143|                       4.09|     23086.8005|1059033.558|208 Michael Ferry...|
|    79248.64245|       6.002899808|             6.730821019|                       3.09|    40173.07217|1505890.915|188 Johnson Views...|
|    61287.06718|        5.86588984|              8.51272743|                       5.13|     36882.1594|1058987.988|9127 Elizabeth St...|
|    63345.24005|       7.188236095|             5.586728665|                       3.26|    34310.24283|1260616.807|         USS Barnett|
|    59982.19723|       5.0

In [69]:
df.printSchema()

root
 |-- Avg Area Income: string (nullable = true)
 |-- Avg Area House Age: string (nullable = true)
 |-- Avg Area Number of Rooms: double (nullable = true)
 |-- Avg Area Number of Bedrooms: double (nullable = true)
 |-- Area Population: double (nullable = true)
 |-- Price: double (nullable = true)
 |-- Address: string (nullable = true)



In [70]:
df.columns

['Avg Area Income',
 'Avg Area House Age',
 'Avg Area Number of Rooms',
 'Avg Area Number of Bedrooms',
 'Area Population',
 'Price',
 'Address']

In [71]:
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType, StringType,DoubleType

In [72]:
df = df.withColumn("Avg Area Income", col("Avg Area Income").cast(DoubleType()))

In [73]:
df = df.withColumn("Avg Area House Age", col("Avg Area House Age").cast(DoubleType()))

In [74]:
df.printSchema()

root
 |-- Avg Area Income: double (nullable = true)
 |-- Avg Area House Age: double (nullable = true)
 |-- Avg Area Number of Rooms: double (nullable = true)
 |-- Avg Area Number of Bedrooms: double (nullable = true)
 |-- Area Population: double (nullable = true)
 |-- Price: double (nullable = true)
 |-- Address: string (nullable = true)



In [75]:
df_cleaned = df.na.drop(how='any')

In [79]:
df_cleaned.show()

+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+
|Avg Area Income|Avg Area House Age|Avg Area Number of Rooms|Avg Area Number of Bedrooms|Area Population|      Price|             Address|
+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+
|    79545.45857|       5.682861322|             7.009188143|                       4.09|     23086.8005|1059033.558|208 Michael Ferry...|
|    79248.64245|       6.002899808|             6.730821019|                       3.09|    40173.07217|1505890.915|188 Johnson Views...|
|    61287.06718|        5.86588984|              8.51272743|                       5.13|     36882.1594|1058987.988|9127 Elizabeth St...|
|    63345.24005|       7.188236095|             5.586728665|                       3.26|    34310.24283|1260616.807|         USS Barnett|
|    59982.19723|       5.0

In [80]:
df.show()

+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+
|Avg Area Income|Avg Area House Age|Avg Area Number of Rooms|Avg Area Number of Bedrooms|Area Population|      Price|             Address|
+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+
|    79545.45857|       5.682861322|             7.009188143|                       4.09|     23086.8005|1059033.558|208 Michael Ferry...|
|           NULL|              NULL|                    NULL|                       NULL|           NULL|       NULL|                NULL|
|    79248.64245|       6.002899808|             6.730821019|                       3.09|    40173.07217|1505890.915|188 Johnson Views...|
|           NULL|              NULL|                    NULL|                       NULL|           NULL|       NULL|                NULL|
|    61287.06718|        5.

In [76]:
from pyspark.ml.feature import VectorAssembler
featureAssembler = VectorAssembler(inputCols=['Avg Area Income',
 'Avg Area House Age',
 'Avg Area Number of Rooms',
 'Avg Area Number of Bedrooms',
 'Area Population'],outputCol='Independent Feature')

In [81]:
output = featureAssembler.transform(df_cleaned)

In [82]:
output.show()

+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+--------------------+
|Avg Area Income|Avg Area House Age|Avg Area Number of Rooms|Avg Area Number of Bedrooms|Area Population|      Price|             Address| Independent Feature|
+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+--------------------+
|    79545.45857|       5.682861322|             7.009188143|                       4.09|     23086.8005|1059033.558|208 Michael Ferry...|[79545.45857,5.68...|
|    79248.64245|       6.002899808|             6.730821019|                       3.09|    40173.07217|1505890.915|188 Johnson Views...|[79248.64245,6.00...|
|    61287.06718|        5.86588984|              8.51272743|                       5.13|     36882.1594|1058987.988|9127 Elizabeth St...|[61287.06718,5.86...|
|    63345.24005|       7.188236095|    

In [84]:
final_data = output.select('Independent Feature','Price')
final_data.show()

+--------------------+-----------+
| Independent Feature|      Price|
+--------------------+-----------+
|[79545.45857,5.68...|1059033.558|
|[79248.64245,6.00...|1505890.915|
|[61287.06718,5.86...|1058987.988|
|[63345.24005,7.18...|1260616.807|
|[59982.19723,5.04...|630943.4893|
|[80175.75416,4.98...|1068138.074|
|[64698.46343,6.02...|1502055.817|
|[78394.33928,6.98...|1573936.564|
|[59927.66081,5.36...|798869.5328|
|[81885.92718,4.42...|1545154.813|
|[80527.47208,8.09...|1707045.722|
|[50593.6955,4.496...|663732.3969|
|[39033.80924,7.67...|1042814.098|
|[73163.66344,6.91...|1291331.518|
|[69391.38018,5.34...| 1402818.21|
|[73091.86675,5.44...| 1306674.66|
|[79706.96306,5.06...|  1556786.6|
|[61929.07702,4.78...|528485.2467|
|[63508.1943,5.947...|1019425.937|
|[62085.2764,5.739...|1030591.429|
+--------------------+-----------+
only showing top 20 rows



## Train Test Split

In [86]:
from pyspark.ml.regression import LinearRegression
train_data,test_data = final_data.randomSplit([0.75,0.25])
reg = LinearRegression(featuresCol='Independent Feature',labelCol='Price')
reg = reg.fit(train_data)

In [87]:
reg.coefficients

DenseVector([21.439, 167403.5459, 120825.4112, 2228.4925, 15.2524])

In [89]:
res = reg.evaluate(test_data)
res.predictions.show()

+--------------------+-----------+------------------+
| Independent Feature|      Price|        prediction|
+--------------------+-----------+------------------+
|[17796.63119,4.94...| 302355.836|102260.57401220826|
|[35608.98624,6.93...|449331.5835| 558207.6155187422|
|[38139.91904,5.57...|723750.0653| 579199.6395482747|
|[38530.12448,4.26...|1267986.688| 909029.9507484953|
|[38571.96367,7.42...|968411.6244| 847972.4623029018|
|[39033.80924,7.67...|1042814.098|  958253.186848544|
|[39411.65279,4.38...|539483.3966| 495927.8757370566|
|[40185.73389,5.94...|529282.0844| 493728.0450131488|
|[40581.77809,4.16...| 509499.589| 576822.8604151993|
|[41533.01296,6.85...|682200.3006|497342.54785240674|
|[42348.16246,6.79...|904785.1632| 941098.1747896555|
|[42822.3111,6.612...|513215.9882|444490.69269990595|
|[43401.44163,7.03...|736817.3769|  647927.668452017|
|[43940.87136,7.24...|547918.3264| 686199.8795967908|
|[43952.33621,5.41...| 324981.993| 483623.3029374862|
|[44088.27418,7.55...|624482

In [90]:
res.meanAbsoluteError

80291.62950444534