In [1]:
from pyspark.sql import SparkSession

In [2]:
!pip install pyspark



In [3]:
sp = SparkSession.builder\
    .appName("rent_predictions")\
    .getOrCreate() 

In [78]:
spp = sp.read.csv("House_Rent_Dataset.csv",header = True,inferSchema=True)

In [79]:
spp.show(5)

+----------+---+-----+----+---------------+-----------+--------------------+-------+-----------------+----------------+--------+----------------+
| Posted On|BHK| Rent|Size|          Floor|  Area Type|       Area Locality|   City|Furnishing Status|Tenant Preferred|Bathroom|Point of Contact|
+----------+---+-----+----+---------------+-----------+--------------------+-------+-----------------+----------------+--------+----------------+
|2022-05-18|  2|10000|1100|Ground out of 2| Super Area|              Bandel|Kolkata|      Unfurnished|Bachelors/Family|       2|   Contact Owner|
|2022-05-13|  2|20000| 800|     1 out of 3| Super Area|Phool Bagan, Kank...|Kolkata|   Semi-Furnished|Bachelors/Family|       1|   Contact Owner|
|2022-05-16|  2|17000|1000|     1 out of 3| Super Area|Salt Lake City Se...|Kolkata|   Semi-Furnished|Bachelors/Family|       1|   Contact Owner|
|2022-07-04|  2|10000| 800|     1 out of 2| Super Area|         Dumdum Park|Kolkata|      Unfurnished|Bachelors/Family|     

In [80]:
sph = spp.drop("Posted On")

In [81]:
sph.show(5)

+---+-----+----+---------------+-----------+--------------------+-------+-----------------+----------------+--------+----------------+
|BHK| Rent|Size|          Floor|  Area Type|       Area Locality|   City|Furnishing Status|Tenant Preferred|Bathroom|Point of Contact|
+---+-----+----+---------------+-----------+--------------------+-------+-----------------+----------------+--------+----------------+
|  2|10000|1100|Ground out of 2| Super Area|              Bandel|Kolkata|      Unfurnished|Bachelors/Family|       2|   Contact Owner|
|  2|20000| 800|     1 out of 3| Super Area|Phool Bagan, Kank...|Kolkata|   Semi-Furnished|Bachelors/Family|       1|   Contact Owner|
|  2|17000|1000|     1 out of 3| Super Area|Salt Lake City Se...|Kolkata|   Semi-Furnished|Bachelors/Family|       1|   Contact Owner|
|  2|10000| 800|     1 out of 2| Super Area|         Dumdum Park|Kolkata|      Unfurnished|Bachelors/Family|       1|   Contact Owner|
|  2| 7500| 850|     1 out of 2|Carpet Area|       Sout

In [82]:
sph.describe().show()

+-------+------------------+-----------------+-----------------+--------------------+----------+--------------------+---------+-----------------+----------------+-----------------+----------------+
|summary|               BHK|             Rent|             Size|               Floor| Area Type|       Area Locality|     City|Furnishing Status|Tenant Preferred|         Bathroom|Point of Contact|
+-------+------------------+-----------------+-----------------+--------------------+----------+--------------------+---------+-----------------+----------------+-----------------+----------------+
|  count|              4746|             4746|             4746|                4746|      4746|                4746|     4746|             4746|            4746|             4746|            4746|
|   mean|2.0838600927096502|34993.45132743363|967.4907290349768|  1.6666666666666667|      NULL|            352525.5|     NULL|             NULL|            NULL|1.965865992414665|            NULL|
| stddev|0

# Linear algothrim

In [83]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer

In [84]:
from pyspark.sql import functions as F

q1, q3 = sph.approxQuantile("Rent", [0.25, 0.75], 0.05)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
sph = sph.withColumn("Rent_capped", 
                            F.when(sph["Rent"] < lower_bound, lower_bound)
                            .when(sph["Rent"] > upper_bound, upper_bound) 
                            .otherwise(sph["Rent"]))  




q1, q3 = sph.approxQuantile("BHK", [0.25, 0.75], 0.05)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
sph = sph.withColumn("BHK", 
                            F.when(sph["BHK"] < lower_bound, lower_bound)  
                            .when(sph["BHK"] > upper_bound, upper_bound)  
                            .otherwise(sph["BHK"])) 





q1, q3 = sph.approxQuantile("Size", [0.25, 0.75], 0.05)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
sph = sph.withColumn("Size", 
                            F.when(sph["Size"] < lower_bound, lower_bound) 
                            .when(sph["Size"] > upper_bound, upper_bound) 
                            .otherwise(sph["Size"]))  



q1, q3 = sph.approxQuantile("Bathroom", [0.25, 0.75], 0.05)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
sph = sph.withColumn("Bathroom", 
                            F.when(sph["Bathroom"] < lower_bound, lower_bound)  
                            .when(sph["Bathroom"] > upper_bound, upper_bound)  
                            .otherwise(sph["Bathroom"]))  

In [85]:
encode = StringIndexer(inputCols=["Floor","Area Type","Area Locality","City","Furnishing Status","Tenant Preferred","Point of Contact"],outputCols=["Floor1","Area Type1","Area Locality1","City1","Furnishing Status1","Tenant Preferred1","Point of Contact1"])

In [86]:
sph = encode.fit(sph).transform(sph)
sph.show(5)

+---+-----+------+---------------+-----------+--------------------+-------+-----------------+----------------+--------+----------------+-----------+------+----------+--------------+-----+------------------+-----------------+-----------------+
|BHK| Rent|  Size|          Floor|  Area Type|       Area Locality|   City|Furnishing Status|Tenant Preferred|Bathroom|Point of Contact|Rent_capped|Floor1|Area Type1|Area Locality1|City1|Furnishing Status1|Tenant Preferred1|Point of Contact1|
+---+-----+------+---------------+-----------+--------------------+-------+-----------------+----------------+--------+----------------+-----------+------+----------+--------------+-----+------------------+-----------------+-----------------+
|2.0|10000|1100.0|Ground out of 2| Super Area|              Bandel|Kolkata|      Unfurnished|Bachelors/Family|     2.0|   Contact Owner|    10000.0|   1.0|       0.0|         472.0|  5.0|               1.0|              0.0|              0.0|
|2.0|20000| 800.0|     1 out

In [87]:
vec = VectorAssembler(inputCols=["Floor1","Area Type1","Area Locality1","City1","Furnishing Status1","Tenant Preferred1","Point of Contact1"],outputCol= "indepent_col")

In [88]:
sph = vec.transform(sph)
sph.show(5)

+---+-----+------+---------------+-----------+--------------------+-------+-----------------+----------------+--------+----------------+-----------+------+----------+--------------+-----+------------------+-----------------+-----------------+--------------------+
|BHK| Rent|  Size|          Floor|  Area Type|       Area Locality|   City|Furnishing Status|Tenant Preferred|Bathroom|Point of Contact|Rent_capped|Floor1|Area Type1|Area Locality1|City1|Furnishing Status1|Tenant Preferred1|Point of Contact1|        indepent_col|
+---+-----+------+---------------+-----------+--------------------+-------+-----------------+----------------+--------+----------------+-----------+------+----------+--------------+-----+------------------+-----------------+-----------------+--------------------+
|2.0|10000|1100.0|Ground out of 2| Super Area|              Bandel|Kolkata|      Unfurnished|Bachelors/Family|     2.0|   Contact Owner|    10000.0|   1.0|       0.0|         472.0|  5.0|               1.0|  

In [89]:
data = sph.select("indepent_col","Rent")
data.show(4)

+--------------------+-----+
|        indepent_col| Rent|
+--------------------+-----+
|[1.0,0.0,472.0,5....|10000|
|(7,[0,2,3],[4.0,6...|20000|
|(7,[0,2,3],[4.0,1...|17000|
|(7,[2,3,4],[516.0...|10000|
+--------------------+-----+
only showing top 4 rows



In [90]:
train,test = data.randomSplit([0.8,0.2])

In [91]:
model = LinearRegression(featuresCol="indepent_col",labelCol="Rent")
model = model.fit(train)

In [92]:
model.coefficients

DenseVector([147.1233, 4575.1206, -0.0719, -4505.1162, 1657.3293, -1241.8334, 39324.1914])

In [93]:
model.intercept

23100.44719787785

In [94]:
pred = model.evaluate(test)
pred.predictions.show()

+--------------------+------+------------------+
|        indepent_col|  Rent|        prediction|
+--------------------+------+------------------+
|(7,[0,1,2],[20.0,...| 45000| 30560.63631438744|
|(7,[0,1,2],[40.0,...| 28000|   33548.129035814|
|(7,[0,1,2],[43.0,...| 61000| 33893.47629969861|
|(7,[0,1,6],[18.0,...|270000| 69647.97889541498|
|(7,[0,1,6],[49.0,...|190000| 74208.80174093193|
|(7,[0,1,6],[80.0,...|170000| 78769.62458644886|
|(7,[0,1,6],[85.0,...|130000| 79505.24117443546|
|(7,[0,2],[2.0,219...| 12000| 23236.88575856857|
|(7,[0,2],[18.0,57...| 25000|25707.236900539257|
|(7,[0,2],[24.0,19...| 25000| 26492.37172084432|
|(7,[0,2],[25.0,15...| 40000| 26777.45123119393|
|(7,[0,2],[29.0,43...| 18000|27335.519334985598|
|(7,[0,2],[33.0,17...| 21000|27943.145216048557|
|(7,[0,2,3],[1.0,7...|  7500|14236.834596787017|
|(7,[0,2,3],[1.0,3...| 14000|14234.604856445349|
|(7,[0,2,3],[1.0,4...| 35000| 9728.697443792802|
|(7,[0,2,3],[1.0,8...|  9500|14231.583917917926|
|(7,[0,2,3],[1.0,9..

In [95]:
print("rmse",pred.rootMeanSquaredError)
print("meanAbsoluteError",pred.meanAbsoluteError)
print("meaanSqaurederror",pred.meanSquaredError)
print("r sqaure",pred.r2)

rmse 50700.29967368507
meanAbsoluteError 24440.754781533327
meaanSqaurederror 2570520387.00147
r sqaure 0.2780756009969415
