# **Task 2**

# Step 1: Install and Import

In [22]:
!pip install -q pyspark


# Import Libraries

In [31]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import RegressionEvaluator, MulticlassClassificationEvaluator


# Step 2: Create Spark Session and Load Data

In [32]:
# Start (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("HousingMLModels")
    .getOrCreate()
)

# Read the CSV with a header row and inferred column types, then drop every
# row that has a null in any column.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("housing.csv")
    .dropna()
)
df.printSchema()


root
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- housing_median_age: double (nullable = true)
 |-- total_rooms: double (nullable = true)
 |-- total_bedrooms: double (nullable = true)
 |-- population: double (nullable = true)
 |-- households: double (nullable = true)
 |-- median_income: double (nullable = true)
 |-- median_house_value: double (nullable = true)
 |-- ocean_proximity: string (nullable = true)



# REGRESSION MODEL (Predict House Value)

In [33]:
# Numeric columns packed into the feature vector. longitude, latitude and
# ocean_proximity are not part of this list.
feature_cols = [
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
]

# Combine the listed columns into a single "features" vector column and keep
# only that vector together with the regression target.
assembler = VectorAssembler(outputCol="features", inputCols=feature_cols)
reg_df = (
    assembler
    .transform(df)
    .select("features", "median_house_value")
)


# Model Training

In [34]:
# 80/20 train/test split with a fixed seed.
train_reg, test_reg = reg_df.randomSplit(weights=[0.8, 0.2], seed=42)

# Fit a linear regression of median_house_value on the assembled features.
lr = LinearRegression(
    featuresCol="features",
    labelCol="median_house_value",
)
lr_model = lr.fit(train_reg)

# Score the held-out rows with the fitted model.
pred_reg = lr_model.transform(test_reg)


# 📉 Evaluation

In [35]:
# Build one evaluator per metric; both compare the "prediction" column
# against the median_house_value label.
reg_eval_rmse, reg_eval_r2 = (
    RegressionEvaluator(
        labelCol="median_house_value",
        predictionCol="prediction",
        metricName=metric,
    )
    for metric in ("rmse", "r2")
)

# Report both metrics on the test-set predictions.
print("Regression RMSE:", reg_eval_rmse.evaluate(pred_reg))
print("Regression R²:", reg_eval_r2.evaluate(pred_reg))


Regression RMSE: 76768.14224622416
Regression R²: 0.5468983835741774


# CLASSIFICATION MODEL (Predict Price Category)
# 🎯 Step 1: Create Label Column

In [36]:
# Derive the classification target from median_house_value:
#   0 -> below 150000
#   1 -> 150000 to 300000, both bounds inclusive
#   2 -> every remaining row
df_class = df.withColumn(
    "price_label",
    when(df["median_house_value"] < 150000, 0)
    .when(df["median_house_value"].between(150000, 300000), 1)
    .otherwise(2),
)


# Feature Selection

In [37]:
# Assemble the feature_cols columns into a "features" vector and keep only
# that vector plus the price_label target.
assembler = VectorAssembler(outputCol="features", inputCols=feature_cols)
class_df = (
    assembler
    .transform(df_class)
    .select("features", "price_label")
)


# Model Training (Logistic Regression)

In [38]:
# 80/20 train/test split with a fixed seed.
train_cls, test_cls = class_df.randomSplit(weights=[0.8, 0.2], seed=42)

# Logistic regression on price_label, limited to 10 optimizer iterations.
logr = LogisticRegression(
    featuresCol="features",
    labelCol="price_label",
    maxIter=10,
)
logr_model = logr.fit(train_cls)

# Score the held-out rows with the fitted classifier.
pred_cls = logr_model.transform(test_cls)


# 📉 Evaluation

In [39]:
# Accuracy of the "prediction" column against price_label on the test split.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="price_label",
)
print("Classification Accuracy:", evaluator.evaluate(pred_cls))


Classification Accuracy: 0.6948984645864289


### Insights:
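- Linear Regression models how income, rooms, and the other numeric features relate to house value; on the test split it reaches R² ≈ 0.55 with an RMSE of about 76,800.
- Classification simplifies house values into three price classes: low (below 150,000), mid (150,000–300,000), and high (above 300,000).
- Classification accuracy (about 0.69 on the test split) gives a quick view of model effectiveness for categorical prediction.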
