In [2]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.3/317.3 MB 4.6 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=6e2218b09e226d9c81656323a1a7c86890b4d05b4951d275d84a1d0e9d56f1b6
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg

In [4]:
# Obtain the SparkSession for this notebook: getOrCreate() returns an already
# active session if one exists, otherwise it builds a new one using the
# application name configured on the builder.
spark = (
    SparkSession.builder
    .appName("Real Estate Analysis")
    .getOrCreate()
)

In [5]:
# Read the real-estate CSV into a DataFrame. The first line of the file is
# treated as the header row, and column types are inferred from the data
# instead of defaulting every column to string.
data = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("/content/Real estate.csv")
)


In [6]:
# Print the DataFrame's schema as a tree (column name, type, nullability).
# The types shown are the ones produced by schema inference at load time, so
# this is the place to check that the numeric columns were not read as strings.
data.printSchema()

root
 |-- No: integer (nullable = true)
 |-- X1 transaction date: double (nullable = true)
 |-- X2 house age: double (nullable = true)
 |-- X3 distance to the nearest MRT station: double (nullable = true)
 |-- X4 number of convenience stores: integer (nullable = true)
 |-- X5 latitude: double (nullable = true)
 |-- X6 longitude: double (nullable = true)
 |-- Y house price of unit area: double (nullable = true)



In [7]:
# Preview the loaded rows. With no arguments, show() prints only the first
# 20 rows and truncates long cell values, so this is a sample, not the full
# dataset.
data.show()

+---+-------------------+------------+--------------------------------------+-------------------------------+-----------+------------+--------------------------+
| No|X1 transaction date|X2 house age|X3 distance to the nearest MRT station|X4 number of convenience stores|X5 latitude|X6 longitude|Y house price of unit area|
+---+-------------------+------------+--------------------------------------+-------------------------------+-----------+------------+--------------------------+
|  1|           2012.917|        32.0|                              84.87882|                             10|   24.98298|   121.54024|                      37.9|
|  2|           2012.917|        19.5|                              306.5947|                              9|   24.98034|   121.53951|                      42.2|
|  3|           2013.583|        13.3|                              561.9845|                              5|   24.98746|   121.54391|                      47.3|
|  4|             2013.5|   

In [8]:
# Keep only the rows whose "X2 house age" value is strictly greater than 20.
# where() is an alias of filter(), so the resulting DataFrame is the same.
filtered_data = data.where(
    col("X2 house age") > 20
)


In [9]:
# Preview the rows that passed the house-age filter. With no arguments,
# show() prints only the first 20 rows.
filtered_data.show()

+---+-------------------+------------+--------------------------------------+-------------------------------+-----------+------------+--------------------------+
| No|X1 transaction date|X2 house age|X3 distance to the nearest MRT station|X4 number of convenience stores|X5 latitude|X6 longitude|Y house price of unit area|
+---+-------------------+------------+--------------------------------------+-------------------------------+-----------+------------+--------------------------+
|  1|           2012.917|        32.0|                              84.87882|                             10|   24.98298|   121.54024|                      37.9|
|  7|           2012.667|        34.5|                              623.4731|                              7|   24.97933|   121.53642|                      40.3|
|  8|           2013.417|        20.3|                              287.6025|                              6|   24.98042|   121.54228|                      46.7|
|  9|             2013.5|   

In [10]:
# For the filtered rows, compute the mean of "Y house price of unit area"
# within each distinct value of "X4 number of convenience stores". The
# aggregate column is exposed under the name "avg_house_price".
grouped_data = (
    filtered_data
    .groupBy("X4 number of convenience stores")
    .agg(
        avg("Y house price of unit area").alias("avg_house_price")
    )
)


In [11]:
# Display the average house price for each convenience-store count.
# NOTE: no ordering is applied to grouped_data, so the rows are printed in
# whatever order the aggregation produces them; that order is not sorted by
# store count and should not be relied on between runs.
grouped_data.show()


+-------------------------------+------------------+
|X4 number of convenience stores|   avg_house_price|
+-------------------------------+------------------+
|                              1|24.516666666666666|
|                              6| 40.25714285714286|
|                              3|           33.1125|
|                              5|              34.2|
|                              9| 48.41250000000001|
|                              4| 37.50909090909091|
|                              8|44.605000000000004|
|                              7| 38.51666666666667|
|                             10| 45.26666666666667|
|                              2| 34.82222222222222|
|                              0|23.060000000000002|
+-------------------------------+------------------+



In [12]:
# Stop the Spark session, which shuts down its underlying SparkContext.
# DataFrames created from `spark` (data, filtered_data, grouped_data) can no
# longer be evaluated after this call; re-run the session-creation cell to
# continue working.
spark.stop()