Q2. USA House Pricing Dataset

1. Create Spark Session & Load Dataset

In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, avg, round as _round

# Start (or reuse) the Spark session that the rest of the notebook relies on.
spark = (
    SparkSession.builder
    .appName("USA_House_Pricing")
    .getOrCreate()
)

# Read the CSV: the first row supplies column names, and Spark infers the column types.
df = spark.read.csv("USA_Housing.csv", header=True, inferSchema=True)

print("Original columns:", df.columns)

Original columns: ['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms', 'Avg. Area Number of Bedrooms', 'Area Population', 'Price', 'Address']


Creates a Spark session and loads the USA House Pricing dataset into a PySpark DataFrame.

2. Rename columns to remove dots/spaces

In [11]:
# Old header -> new identifier; dots and spaces are replaced by underscores so
# the columns can be referenced without escaping in expressions and SQL.
column_renames = {
    "Avg. Area Income": "Avg_Area_Income",
    "Avg. Area House Age": "Avg_Area_House_Age",
    "Avg. Area Number of Rooms": "Avg_Area_Number_of_Rooms",
    "Avg. Area Number of Bedrooms": "Avg_Area_Number_of_Bedrooms",
    "Area Population": "Area_Population",
}

for old_name, new_name in column_renames.items():
    df = df.withColumnRenamed(old_name, new_name)

print("Renamed columns:", df.columns)

Renamed columns: ['Avg_Area_Income', 'Avg_Area_House_Age', 'Avg_Area_Number_of_Rooms', 'Avg_Area_Number_of_Bedrooms', 'Area_Population', 'Price', 'Address']


3. Data Cleaning (Handle Corrupted Rows)

Some rows may have non-numeric data due to CSV misalignment.
We keep only rows in which every numeric column holds a valid number.

In [12]:
# Columns that must hold numeric values for the analysis below.
numeric_cols = [
    "Avg_Area_Income",
    "Avg_Area_House_Age",
    "Avg_Area_Number_of_Rooms",
    "Avg_Area_Number_of_Bedrooms",
    "Area_Population",
    "Price"
]

# Cast every numeric column to double in a single projection. Values that
# cannot be parsed become null. Using one select() instead of calling
# withColumn() inside a loop keeps the query plan small, and the column order
# is preserved because we iterate over df.columns.
df = df.select([
    col(name).cast("double").alias(name) if name in numeric_cols else col(name)
    for name in df.columns
])

# Keep only rows where every numeric column is non-null after the cast.
# Note: this removes rows with genuinely missing values as well as corrupted
# ones, so the later imputation step is expected to have little or nothing
# left to fill.
all_numeric_valid = col(numeric_cols[0]).isNotNull()
for name in numeric_cols[1:]:
    all_numeric_valid = all_numeric_valid & col(name).isNotNull()
df = df.filter(all_numeric_valid)

print("Cleaned numeric columns successfully")

Cleaned numeric columns successfully


4. Basic Transformations

In [13]:
# Summary statistics (count, mean, stddev, min, max) for every numeric column.
df.select(numeric_cols).describe().show()

# Dataset-wide averages, rounded to two decimals for readability.
df.select(
    _round(avg("Price"), 2).alias("Avg_Price"),
    _round(avg("Avg_Area_Income"), 2).alias("Avg_Income")
).show()

# Sample of high-value listings: addresses whose price exceeds the threshold.
HIGH_PRICE_THRESHOLD = 1000000
(
    df.filter(col("Price") > HIGH_PRICE_THRESHOLD)
      .select("Address", "Price")
      .show(5, truncate=False)
)

# The temp-view creation and SQL summary that were pasted here duplicated the
# "Create SQL Table" cell and have been removed from this cell.

+-------+------------------+------------------+------------------------+---------------------------+------------------+------------------+
|summary|   Avg_Area_Income|Avg_Area_House_Age|Avg_Area_Number_of_Rooms|Avg_Area_Number_of_Bedrooms|   Area_Population|             Price|
+-------+------------------+------------------+------------------------+---------------------------+------------------+------------------+
|  count|              5000|              5000|                    5000|                       5000|              5000|              5000|
|   mean| 68583.10898395971|  5.97722203528029|       6.987791850907942|         3.9813299999999967| 36163.51603857463|  1232072.65414236|
| stddev|10657.991213830363|0.9914561798281722|      1.0058332312773866|         1.2341372654846832| 9925.650113501246|353117.62658106035|
|    min|17796.631189543397| 2.644304186036705|      3.2361940234262048|                        2.0|172.61068627290044|15938.657923287848|
|    max|107701.74837763935

5. Create SQL Table

In [14]:
# Register the DataFrame under the name "house_data" so it can be queried with SQL.
df.createOrReplaceTempView("house_data")

# One-row summary: rounded average price, rounded average income, and row count.
summary_query = """
SELECT ROUND(AVG(Price),2) AS Avg_Price,
       ROUND(AVG(Avg_Area_Income),2) AS Avg_Income,
       COUNT(*) AS Total_Records
FROM house_data
"""
summary_df = spark.sql(summary_query)
summary_df.show()

+----------+----------+-------------+
| Avg_Price|Avg_Income|Total_Records|
+----------+----------+-------------+
|1232072.65|  68583.11|         5000|
+----------+----------+-------------+



6. Imputation

In [15]:
from pyspark.ml.feature import Imputer

# Each numeric column gets a companion "<name>_imputed" column; the originals
# are left untouched.
imputed_cols = [f"{name}_imputed" for name in numeric_cols]

# Imputer is used with its default settings.
# NOTE(review): the cleaning cell already filtered out rows with null numeric
# values, so this step is expected to change few or no values - confirm.
imputer = Imputer(inputCols=numeric_cols, outputCols=imputed_cols)
imputer_model = imputer.fit(df)
df = imputer_model.transform(df)

7. Normalization

In [16]:
from pyspark.ml.feature import VectorAssembler, MinMaxScaler

# Pack the imputed numeric columns into a single vector column, "features".
# NOTE(review): numeric_cols contains "Price", so "Price_imputed" is part of
# this vector - exclude it if Price is later used as a prediction target.
feature_inputs = [f"{name}_imputed" for name in numeric_cols]
assembler = VectorAssembler(inputCols=feature_inputs, outputCol="features")
df = assembler.transform(df)

# Fit a min-max scaler on "features" and write the rescaled vector to "features_norm".
scaler = MinMaxScaler(inputCol="features", outputCol="features_norm")
minmax_model = scaler.fit(df)
df = minmax_model.transform(df)

8. Standardization

In [17]:
from pyspark.ml.feature import StandardScaler

# Standardize the "features" vector with both centering (withMean) and
# scaling (withStd) enabled; the result is written to "features_std".
scaler_std = StandardScaler(
    inputCol="features",
    outputCol="features_std",
    withMean=True,
    withStd=True,
)
std_model = scaler_std.fit(df)
df = std_model.transform(df)

9. Feature Engineering

In [19]:
from pyspark.sql.functions import lit

# Derived features, each computed from the cleaned (non-imputed) columns.
income_to_price = col("Avg_Area_Income") / col("Price")
people_per_room = col("Area_Population") / col("Avg_Area_Number_of_Rooms")
# Despite the column name, this is 2025 minus the area's average house age,
# i.e. roughly an average construction year rather than an age.
year_minus_age = lit(2025) - col("Avg_Area_House_Age")

df = (
    df
    .withColumn("Income_to_Price_Ratio", income_to_price)
    .withColumn("People_to_Room_Ratio", people_per_room)
    .withColumn("House_Age_2025", year_minus_age)
)

preview_cols = ["Price", "Income_to_Price_Ratio", "People_to_Room_Ratio", "House_Age_2025"]
df.select(*preview_cols).show(5, truncate=False)


+------------------+---------------------+--------------------+------------------+
|Price             |Income_to_Price_Ratio|People_to_Room_Ratio|House_Age_2025    |
+------------------+---------------------+--------------------+------------------+
|1059033.5578701235|0.07511136732465279  |3293.790954438471   |2019.3171386783845|
|1505890.91484695  |0.0526257524190456   |5968.524799526881   |2018.9971001917247|
|1058987.9878760849|0.05787324113238965  |4332.5901952530185  |2019.13411015969  |
|1260616.8066294468|0.0502494014938578   |6141.383426568383   |2017.8117639054813|
|630943.4893385402 |0.0950674636306832   |3361.756070049812   |2019.9594454768937|
+------------------+---------------------+--------------------+------------------+
only showing top 5 rows

