In [3]:
!pip install pyspark



In [4]:
from pyspark.sql import SparkSession

# Reuse the active Spark session when one exists; otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Read the Part01 CSV, taking column names from the header row. No schema is
# supplied or inferred, so every column arrives as a string (see the output).
csv_reader = spark.read.option("header", True)
df = csv_reader.csv("/content/AIT664-Group001-Connecticut-RealEstateData-Preprocessed-Part01.csv")

# Show the column types twice: as (name, type) pairs and as a schema tree.
print(df.dtypes)
df.printSchema()

[('Serial Number', 'string'), ('List Year', 'string'), ('Date Recorded', 'string'), ('Town', 'string'), ('Address', 'string'), ('Assessed Value', 'string'), ('Sale Amount', 'string'), ('Sales Ratio', 'string'), ('Property Type', 'string'), ('Residential Type', 'string'), ('Non Use Code', 'string'), ('Assessor Remarks', 'string'), ('OPM remarks', 'string'), ('Location', 'string')]
root
 |-- Serial Number: string (nullable = true)
 |-- List Year: string (nullable = true)
 |-- Date Recorded: string (nullable = true)
 |-- Town: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Assessed Value: string (nullable = true)
 |-- Sale Amount: string (nullable = true)
 |-- Sales Ratio: string (nullable = true)
 |-- Property Type: string (nullable = true)
 |-- Residential Type: string (nullable = true)
 |-- Non Use Code: string (nullable = true)
 |-- Assessor Remarks: string (nullable = true)
 |-- OPM remarks: string (nullable = true)
 |-- Location: string (nullable = true)



In [5]:
# Property types to keep in the data set.
desired_types = [
    "Residential",
    "Condo",
    "Single Family",
    "Two Family",
    "Three Family",
    "Four Family",
]

# Keep only the rows whose Property Type is one of the values above,
# then report how many rows remain.
is_desired_type = df["Property Type"].isin(desired_types)
df = df.where(is_desired_type)
print(df.count())

243680


In [6]:
# Row count once exact duplicate rows are collapsed; compare with the total
# row count to see how many duplicates are present.
print(df.dropDuplicates().count())

242714


In [7]:
from pyspark.sql import functions as F

# Report, per column, how many values are missing (null or empty string),
# followed by the total row count.
#
# All counts are computed in ONE aggregation instead of one Spark job per
# column plus another for the total, so the data is scanned once.
# F.when(...) without otherwise() yields null for non-matching rows, and
# F.count() only counts non-null values, so each expression counts exactly
# the rows where the column is null or "".
missing_exprs = [
    F.count(F.when(df[name].isNull() | (df[name] == ""), 1)).alias(name)
    for name in df.columns
]
summary = df.agg(F.count(F.lit(1)), *missing_exprs).first()

# The first field is the row total; the rest follow df.columns order.
total_count, *null_counts = summary

for name, null_count in zip(df.columns, null_counts):
    print(f"Column: {name}\t Null Count: {null_count}")
print(f"Total Count: {total_count}")

Column: Serial Number	 Null Count: 0
Column: List Year	 Null Count: 0
Column: Date Recorded	 Null Count: 0
Column: Town	 Null Count: 0
Column: Address	 Null Count: 0
Column: Assessed Value	 Null Count: 0
Column: Sale Amount	 Null Count: 0
Column: Sales Ratio	 Null Count: 0
Column: Property Type	 Null Count: 0
Column: Residential Type	 Null Count: 0
Column: Non Use Code	 Null Count: 180806
Column: Assessor Remarks	 Null Count: 198506
Column: OPM remarks	 Null Count: 239027
Column: Location	 Null Count: 15
Total Count: 243680


In [8]:
# Drop Non Use Code, OPM remarks and Assessor Remarks (mostly empty according
# to the missing-value report above) together with Property Type, which was
# already used for the filter earlier. Location is kept.
columns_to_drop = ["Non Use Code", "Property Type", "OPM remarks", "Assessor Remarks"]
df = df.drop(*columns_to_drop)

In [9]:
# Discard rows that have no Address, then report the remaining row count.
has_address = df["Address"].isNotNull()
df = df.where(has_address)
print(df.count())

243680


In [10]:
from pyspark.sql.functions import to_date, date_format

# Every column was loaded as a string; give the numeric ones real types.

# List Year becomes an integer.
df = df.withColumn("List Year", df["List Year"].cast("integer"))

# Date Recorded is parsed from M/d/yyyy and immediately rendered back as a
# zero-padded MM/dd/yyyy string, so the column stays a string, not a date.
df = df.withColumn(
    "Date Recorded",
    date_format(to_date(df["Date Recorded"], "M/d/yyyy"), "MM/dd/yyyy"),
)

# The three monetary / ratio columns all become doubles. One loop keeps the
# type name in a single place (the original had a stray trailing space in
# one of the three copies).
for numeric_column in ("Assessed Value", "Sale Amount", "Sales Ratio"):
    df = df.withColumn(numeric_column, df[numeric_column].cast("double"))

df.show()

+-------------+---------+-------------+----------+--------------------+--------------+-----------+-----------+----------------+--------------------+
|Serial Number|List Year|Date Recorded|      Town|             Address|Assessed Value|Sale Amount|Sales Ratio|Residential Type|            Location|
+-------------+---------+-------------+----------+--------------------+--------------+-----------+-----------+----------------+--------------------+
|        20002|     2020|   10/02/2020|   Ashford|     390 TURNPIKE RD|      253000.0|   430000.0|     0.5883|   Single Family|POINT (-72.20731 ...|
|       210317|     2021|   07/05/2022|      Avon|     53 COTSWOLD WAY|      329730.0|   805000.0|     0.4096|   Single Family|POINT (-72.846365...|
|       200212|     2020|   03/09/2021|      Avon|    5 CHESTNUT DRIVE|      130400.0|   179900.0|     0.7248|           Condo|POINT (-72.875940...|
|       200243|     2020|   04/13/2021|      Avon|111 NORTHINGTON D...|      619290.0|   890000.0|     0.6

In [11]:
# NOTE: this import rebinds `round` to pyspark.sql.functions.round for the
# rest of the notebook; later cells call round() on Spark columns.
from pyspark.sql.functions import round, mean, when

# Replace the 0 placeholder in Assessed Value with the average assessed value
# of comparable properties (same Town and same Residential Type).

# Rows carrying the 0 placeholder; not consumed by the steps below.
zero_assessed_value = df.filter(df["Assessed Value"] == 0)

# Average per (Town, Residential Type), rounded to a whole number.
# The zeros being replaced are excluded from the average: when() without
# otherwise() turns them into nulls, and mean() skips nulls. Including them
# would drag the replacement value towards zero.
nonzero_assessed_value = when(df["Assessed Value"] != 0, df["Assessed Value"])
avg_assessed_value = df.groupby(["Town", "Residential Type"]).agg(
    round(mean(nonzero_assessed_value), 0).alias("Avg Assessed Value")
)
display(avg_assessed_value)

# Attach each row's group average. Joining on the two keys moves Town and
# Residential Type to the front of the column list.
df = df.join(avg_assessed_value, on=["Town", "Residential Type"], how="left")

# Fill a zero only when a group average exists. A group made up solely of
# zeros has a null average, and those rows keep their 0 rather than
# becoming null.
can_impute = (df["Assessed Value"] == 0) & df["Avg Assessed Value"].isNotNull()
df = df.withColumn(
    "Assessed Value",
    when(can_impute, df["Avg Assessed Value"]).otherwise(df["Assessed Value"]),
)

# The helper column is no longer needed.
df = df.drop("Avg Assessed Value")
display(df)

DataFrame[Town: string, Residential Type: string, Avg Assessed Value: double]

DataFrame[Town: string, Residential Type: string, Serial Number: string, List Year: int, Date Recorded: string, Address: string, Assessed Value: double, Sale Amount: double, Sales Ratio: double, Location: string]

In [12]:
# Recompute Sales Ratio as Assessed Value / Sale Amount, rounded to two
# decimals. `round` here is pyspark.sql.functions.round (imported in an
# earlier cell), not the Python builtin.
sales_ratio = df["Assessed Value"] / df["Sale Amount"]
df = df.withColumn("Sales Ratio", round(sales_ratio, 2))

# Peek at the first five rows.
first_rows = df.head(5)
display(first_rows)

[Row(Town='Ashford', Residential Type='Single Family', Serial Number='20002', List Year=2020, Date Recorded='10/02/2020', Address='390 TURNPIKE RD', Assessed Value=253000.0, Sale Amount=430000.0, Sales Ratio=0.59, Location='POINT (-72.20731 41.9177359)'),
 Row(Town='Avon', Residential Type='Single Family', Serial Number='210317', List Year=2021, Date Recorded='07/05/2022', Address='53 COTSWOLD WAY', Assessed Value=329730.0, Sale Amount=805000.0, Sales Ratio=0.41, Location='POINT (-72.846365959 41.781677018)'),
 Row(Town='Avon', Residential Type='Condo', Serial Number='200212', List Year=2020, Date Recorded='03/09/2021', Address='5 CHESTNUT DRIVE', Assessed Value=130400.0, Sale Amount=179900.0, Sales Ratio=0.72, Location='POINT (-72.8759407 41.7710138)'),
 Row(Town='Avon', Residential Type='Single Family', Serial Number='200243', List Year=2020, Date Recorded='04/13/2021', Address='111 NORTHINGTON DRIVE', Assessed Value=619290.0, Sale Amount=890000.0, Sales Ratio=0.7, Location='POINT (-

In [13]:
# Remove the rows whose Town is the "***Unknown***" placeholder.
is_known_town = df["Town"] != "***Unknown***"
df = df.where(is_known_town)

In [14]:
# Spark DataFrames are immutable: dropDuplicates() returns a NEW DataFrame and
# leaves `df` untouched, so the result must be assigned back for the duplicate
# rows to actually be removed from what is exported later.
df = df.dropDuplicates()
df

DataFrame[Town: string, Residential Type: string, Serial Number: string, List Year: int, Date Recorded: string, Address: string, Assessed Value: double, Sale Amount: double, Sales Ratio: double, Location: string]

In [16]:
# Row count once exact duplicate rows are collapsed.
print(df.dropDuplicates().count())

242714


In [18]:
import pandas as pd
from google.colab import files

# One name for the export so the write and the download always agree.
output_csv = 'AIT664-Group001-Connecticut-RealEstateData-Preprocessed-Part02.csv'

# Collect the Spark DataFrame into a pandas DataFrame, write it as CSV
# without the index column, and hand the file to the browser.
pandas_df = df.toPandas()
pandas_df.to_csv(output_csv, index=False)
files.download(output_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>