In [None]:
from pyspark import *;
from pyspark.sql import *;
from pyspark.sql.functions import *;
import pandas as pd;

# Create (or reuse) the Spark session for this notebook; "Q10" is the application name.
spark = (
    SparkSession.builder
    .appName("Q10")
    .getOrCreate()
)

In [None]:
# Load the raw CSV: the first row is the header and column types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("Real_estate.csv")
)
# Print the inferred schema, then preview the first rows.
df.printSchema()
df.show()

root
 |-- Serial Number: integer (nullable = true)
 |-- List Year: integer (nullable = true)
 |-- Date Recorded: string (nullable = true)
 |-- Town: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Assessed Value: integer (nullable = true)
 |-- Sale Amount: integer (nullable = true)
 |-- Property Type: string (nullable = true)
 |-- Residential Type: string (nullable = true)

+-------------+---------+-------------+------------+--------------------+--------------+-----------+-------------+----------------+
|Serial Number|List Year|Date Recorded|        Town|             Address|Assessed Value|Sale Amount|Property Type|Residential Type|
+-------------+---------+-------------+------------+--------------------+--------------+-----------+-------------+----------------+
|       220008|     2022|   01/30/2023|     Andover|         618 ROUTE 6|        139020|     232000|  Residential|   Single Family|
|      2020348|     2020|   09/13/2021|     Ansonia|     230 WAKELEE AVE| 

In [None]:
# Per-column count of missing entries: a row is counted when its value is NULL or NaN.
missing_counts = [
    count(when(col(name).isNull() | isnan(col(name)), name)).alias(name)
    for name in df.columns
]
df.select(missing_counts).show()

+-------------+---------+-------------+----+-------+--------------+-----------+-------------+----------------+
|Serial Number|List Year|Date Recorded|Town|Address|Assessed Value|Sale Amount|Property Type|Residential Type|
+-------------+---------+-------------+----+-------+--------------+-----------+-------------+----------------+
|            0|        0|            0|  10|      0|             0|          0|           72|             168|
+-------------+---------+-------------+----+-------+--------------+-----------+-------------+----------------+



In [None]:
# Normalise "Date Recorded" to "yyyy-MM-dd" text in one pass:
#   1. swap "/" separators for "-",
#   2. parse the result as MM-dd-yyyy,
#   3. render the parsed date back to a string.
date_with_dashes = regexp_replace("Date Recorded", "/", "-")
iso_date_text = date_format(to_date(date_with_dashes, "MM-dd-yyyy"), "yyyy-MM-dd")
df = df.withColumn("Date Recorded", iso_date_text)
df.show()

+-------------+---------+-------------+------------+--------------------+--------------+-----------+-------------+----------------+
|Serial Number|List Year|Date Recorded|        Town|             Address|Assessed Value|Sale Amount|Property Type|Residential Type|
+-------------+---------+-------------+------------+--------------------+--------------+-----------+-------------+----------------+
|       220008|     2022|   2023-01-30|     Andover|         618 ROUTE 6|        139020|     232000|  Residential|   Single Family|
|      2020348|     2020|   2021-09-13|     Ansonia|     230 WAKELEE AVE|        150500|     325000|   Commercial|            NULL|
|        20002|     2020|   2020-10-02|     Ashford|     390 TURNPIKE RD|        253000|     430000|  Residential|   Single Family|
|       210317|     2021|   2022-07-05|        Avon|     53 COTSWOLD WAY|        329730|     805000|  Residential|   Single Family|
|       200212|     2020|   2021-03-09|        Avon|    5 CHESTNUT DRIVE|   

In [None]:
# Sales_Ratio = Assessed Value / Sale Amount, rounded to 2 decimals.
# Rows whose Sale Amount is zero (or NULL) get an explicit NULL ratio. Without the
# guard the bare division only yields NULL when ANSI mode is off; with
# spark.sql.ansi.enabled=true it raises a DIVIDE_BY_ZERO error instead.
sale_amount = col("Sale Amount")
df = df.withColumn(
    "Sales_Ratio",
    when(sale_amount != 0, round(col("Assessed Value") / sale_amount, 2)),
)
df.show()

+-------------+---------+-------------+------------+--------------------+--------------+-----------+-------------+----------------+-----------+
|Serial Number|List Year|Date Recorded|        Town|             Address|Assessed Value|Sale Amount|Property Type|Residential Type|Sales_Ratio|
+-------------+---------+-------------+------------+--------------------+--------------+-----------+-------------+----------------+-----------+
|       220008|     2022|   2023-01-30|     Andover|         618 ROUTE 6|        139020|     232000|  Residential|   Single Family|        0.6|
|      2020348|     2020|   2021-09-13|     Ansonia|     230 WAKELEE AVE|        150500|     325000|   Commercial|            NULL|       0.46|
|        20002|     2020|   2020-10-02|     Ashford|     390 TURNPIKE RD|        253000|     430000|  Residential|   Single Family|       0.59|
|       210317|     2021|   2022-07-05|        Avon|     53 COTSWOLD WAY|        329730|     805000|  Residential|   Single Family|     

In [None]:
# Label missing categorical values as "Unknown" in both type columns with a single fill.
df = df.fillna("Unknown", subset=["Residential Type", "Property Type"])
df.show()

+-------------+---------+-------------+------------+--------------------+--------------+-----------+-------------+----------------+-----------+
|Serial Number|List Year|Date Recorded|        Town|             Address|Assessed Value|Sale Amount|Property Type|Residential Type|Sales_Ratio|
+-------------+---------+-------------+------------+--------------------+--------------+-----------+-------------+----------------+-----------+
|       220008|     2022|   2023-01-30|     Andover|         618 ROUTE 6|        139020|     232000|  Residential|   Single Family|        0.6|
|      2020348|     2020|   2021-09-13|     Ansonia|     230 WAKELEE AVE|        150500|     325000|   Commercial|         Unknown|       0.46|
|        20002|     2020|   2020-10-02|     Ashford|     390 TURNPIKE RD|        253000|     430000|  Residential|   Single Family|       0.59|
|       210317|     2021|   2022-07-05|        Avon|     53 COTSWOLD WAY|        329730|     805000|  Residential|   Single Family|     

In [None]:
# Show the distinct list years, keep only 2001 through 2022 (both ends inclusive),
# then show the distinct years again to confirm the filter.
df.select("List Year").distinct().show()
df = df.where(col("List Year").between(2001, 2022))
df.select("List Year").distinct().show()

+---------+
|List Year|
+---------+
|     2025|
|     2022|
|     2020|
|     2001|
|     2002|
|     2021|
+---------+

+---------+
|List Year|
+---------+
|     2022|
|     2020|
|     2001|
|     2002|
|     2021|
+---------+



In [None]:
# Impute missing Town values with the most frequent Town value in the frame.
mode_town = df.agg(mode("Town")).first()[0]
df = df.na.fill({"Town": mode_town})
df.show()

+-------------+---------+-------------+------------+--------------------+--------------+-----------+-------------+----------------+-----------+
|Serial Number|List Year|Date Recorded|        Town|             Address|Assessed Value|Sale Amount|Property Type|Residential Type|Sales_Ratio|
+-------------+---------+-------------+------------+--------------------+--------------+-----------+-------------+----------------+-----------+
|       220008|     2022|   2023-01-30|     Andover|         618 ROUTE 6|        139020|     232000|  Residential|   Single Family|        0.6|
|      2020348|     2020|   2021-09-13|     Ansonia|     230 WAKELEE AVE|        150500|     325000|   Commercial|         Unknown|       0.46|
|        20002|     2020|   2020-10-02|     Ashford|     390 TURNPIKE RD|        253000|     430000|  Residential|   Single Family|       0.59|
|       210317|     2021|   2022-07-05|        Avon|     53 COTSWOLD WAY|        329730|     805000|  Residential|   Single Family|     

In [None]:
# Re-count NULLs per column after the cleaning steps.
remaining_nulls = [
    count(when(isnull(name), name)).alias(name)
    for name in df.columns
]
df.select(remaining_nulls).show()

+-------------+---------+-------------+----+-------+--------------+-----------+-------------+----------------+-----------+
|Serial Number|List Year|Date Recorded|Town|Address|Assessed Value|Sale Amount|Property Type|Residential Type|Sales_Ratio|
+-------------+---------+-------------+----+-------+--------------+-----------+-------------+----------------+-----------+
|            0|        0|            0|   0|      0|             0|          0|            0|               0|          7|
+-------------+---------+-------------+----+-------+--------------+-----------+-------------+----------------+-----------+



In [None]:
# Collect the Spark DataFrame to the driver as a pandas DataFrame ...
data = df.toPandas()
# ... and write it to a single CSV file, omitting the pandas index column.
data.to_csv(path_or_buf="cleaned_10.csv", index=False)

In [None]:
# Stop the Spark session; later cells that use `spark` would need a new session.
spark.stop()