In [1]:
from pyspark.sql import SparkSession

# recall that we need a spark session to work with spark
# SparkSession.builder creates that session and master("local[*]") runs it localy using whaterver cores are avaiable
# finlay, the session is created, or an existing session is returned
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark

In [2]:
df = spark.read.csv('housing_data.csv', header=True)
df.show()

+-----------+--------+---------+--------------+---+------+
|Size (sqft)|Bedrooms|Bathrooms|Location_Score|Age| Price|
+-----------+--------+---------+--------------+---+------+
|       1360|       1|        3|             5|  2|158336|
|       4272|       1|        2|             6| 41|251463|
|       3592|       3|        2|             4| 53|203428|
|        966|       5|        2|             4| 38|100910|
|       4926|       4|        1|             4| 40|316409|
|       3944|       1|        2|             2| 48|183604|
|       3671|       4|        2|             4| 10|305304|
|       3419|       1|        2|             1| 38|175073|
|        630|       1|        3|             6| 23| 74297|
|       2185|       1|        3|             1| 59| 40815|
|       1269|       5|        1|            10| 45|114709|
|       2891|       2|        1|             6| 18|206371|
|       2933|       4|        3|             1|  3|303013|
|       1684|       5|        3|             6| 25|18157

In [19]:
from pyspark.sql.functions import *

# cheching for null values
df.select([sum(col(c).isNull().cast("int")).alias(c) for c in df.columns]).show()
df = df.withColumn("Price", df.Price.cast("float"))
df = df.withColumn("Bedrooms", df.Bedrooms.cast("int"))
df = df.withColumn("Bathrooms", df.Bathrooms.cast("int"))
df = df.withColumn("Location_Score", df.Location_Score.cast("float"))
df = df.withColumn("Age", df.Age.cast("int"))
df = df.withColumn("Size (sqft)", col("Size (sqft)").cast("int"))
df.show()

+-----------+--------+---------+--------------+---+-----+
|Size (sqft)|Bedrooms|Bathrooms|Location_Score|Age|Price|
+-----------+--------+---------+--------------+---+-----+
|          0|       0|        0|             0|  0|    0|
+-----------+--------+---------+--------------+---+-----+

+-----------+--------+---------+--------------+---+--------+
|Size (sqft)|Bedrooms|Bathrooms|Location_Score|Age|   Price|
+-----------+--------+---------+--------------+---+--------+
|       1360|       1|        3|           5.0|  2|158336.0|
|       4272|       1|        2|           6.0| 41|251463.0|
|       3592|       3|        2|           4.0| 53|203428.0|
|        966|       5|        2|           4.0| 38|100910.0|
|       4926|       4|        1|           4.0| 40|316409.0|
|       3944|       1|        2|           2.0| 48|183604.0|
|       3671|       4|        2|           4.0| 10|305304.0|
|       3419|       1|        2|           1.0| 38|175073.0|
|        630|       1|        3|      

Spark ML stuff (which we never saw in class btw)

In [27]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

# the idea is similar to how sklearn works - but we MUST use the Spark libraries, since spark is distributed and its libraries are optimized for handeling distributed data
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

# Assemble feature (Size) - The VectorAssembler takes all the values from the input columns and combines them into a single vector
assembler = VectorAssembler(inputCols=["Size (sqft)"], outputCol="features")

# using .transform to trasform the data itnto feature vectors
train_transformed = assembler.transform(train_data)
test_transformed = assembler.transform(test_data)

train_transformed.show()

# Train the Simple Linear Regression Model - we use column features as the "feature" column
# and the "Size (sqft)" as the label (what we want to predict)
lr = LinearRegression(featuresCol="features", labelCol="Price")
lr_model = lr.fit(train_transformed)

# Predictions
predictions = lr_model.transform(test_transformed)

+-----------+--------+---------+--------------+---+--------+--------+
|Size (sqft)|Bedrooms|Bathrooms|Location_Score|Age|   Price|features|
+-----------+--------+---------+--------------+---+--------+--------+
|        504|       3|        3|           6.0| 35| 90691.0| [504.0]|
|        516|       4|        3|           4.0| 14|148090.0| [516.0]|
|        564|       2|        3|           7.0| 46| 18921.0| [564.0]|
|        598|       3|        3|           5.0| 59| 37600.0| [598.0]|
|        614|       4|        3|           7.0| 45| 63473.0| [614.0]|
|        619|       2|        2|           1.0| 52|  1392.0| [619.0]|
|        630|       1|        3|           6.0| 23| 74297.0| [630.0]|
|        643|       4|        1|           7.0| 57| 38908.0| [643.0]|
|        646|       3|        1|          10.0|  1|164197.0| [646.0]|
|        654|       3|        2|           3.0| 34| 27073.0| [654.0]|
|        661|       5|        1|           2.0| 37| 50481.0| [661.0]|
|        689|       

In [31]:
# Assemble all feature columns, that we want to make into a feature vector - we explude price since that is what we want to predict
feature_cols = [col for col in df.columns if col != "Price"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

train_transformed = assembler.transform(train_data)
test_transformed = assembler.transform(test_data)

# we can see by this output that the feature vector is a vector with the feature for each column - we dont just the mix the data, we keep it sperated for the model to learn
train_transformed.show()

# since the train_transformed is a DF, then we just tell it where the label colum is located by name - this eliminates the need of having x-train and y-train.
lr_multi = LinearRegression(featuresCol="features", labelCol="Price")
lr_multi_model = lr_multi.fit(train_transformed)

# Predictions - also a datafram with a new column called "prediction"
predictions_multi = lr_multi_model.transform(test_transformed)
predictions_multi.show()

+-----------+--------+---------+--------------+---+--------+--------------------+
|Size (sqft)|Bedrooms|Bathrooms|Location_Score|Age|   Price|            features|
+-----------+--------+---------+--------------+---+--------+--------------------+
|        504|       3|        3|           6.0| 35| 90691.0|[504.0,3.0,3.0,6....|
|        516|       4|        3|           4.0| 14|148090.0|[516.0,4.0,3.0,4....|
|        564|       2|        3|           7.0| 46| 18921.0|[564.0,2.0,3.0,7....|
|        598|       3|        3|           5.0| 59| 37600.0|[598.0,3.0,3.0,5....|
|        614|       4|        3|           7.0| 45| 63473.0|[614.0,4.0,3.0,7....|
|        619|       2|        2|           1.0| 52|  1392.0|[619.0,2.0,2.0,1....|
|        630|       1|        3|           6.0| 23| 74297.0|[630.0,1.0,3.0,6....|
|        643|       4|        1|           7.0| 57| 38908.0|[643.0,4.0,1.0,7....|
|        646|       3|        1|          10.0|  1|164197.0|[646.0,3.0,1.0,10...|
|        654|   

In [32]:
from pyspark.ml.evaluation import RegressionEvaluator

# Define evaluator - get the label col (real values) and the predicted colum
evaluator = RegressionEvaluator(labelCol="Price", predictionCol="prediction", metricName="mae")
mae = evaluator.evaluate(predictions)
mse = evaluator.setMetricName("mse").evaluate(predictions)
rmse = evaluator.setMetricName("rmse").evaluate(predictions)

print(f"Simple Linear Regression - MAE: {mae}, MSE: {mse}, RMSE: {rmse}")

# Evaluate Multiple Linear Regression
mae_multi = evaluator.evaluate(predictions_multi)
mse_multi = evaluator.setMetricName("mse").evaluate(predictions_multi)
rmse_multi = evaluator.setMetricName("rmse").evaluate(predictions_multi)

print(f"Multiple Linear Regression - MAE: {mae_multi}, MSE: {mse_multi}, RMSE: {rmse_multi}")


Simple Linear Regression - MAE: 41942.70127471511, MSE: 2537222031.9864306, RMSE: 50370.84505928435
Multiple Linear Regression - MAE: 18541.454047942676, MSE: 343785518.21196985, RMSE: 18541.454047942676


In [24]:
from pyspark.ml.feature import PCA, VectorAssembler
# Assemble all feature columns
feature_cols = [col for col in df.columns if col != "Size (sqft)"]  # Exclude label
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data_transformed = assembler.transform(df)

# Apply PCA to reduce to 2 components
pca = PCA(k=2, inputCol="features", outputCol="pca_features")
pca_model = pca.fit(data_transformed)

# Transform data
pca_result = pca_model.transform(data_transformed)

# Show PCA components
pca_result.select("pca_features").show(truncate=False)

+----------------------------------------+
|pca_features                            |
+----------------------------------------+
|[-158335.99935680928,15.00574254616826] |
|[-251462.99587220702,61.57607715985136] |
|[-203427.9950436058,69.63285905888372]  |
|[-100909.99661810188,46.32611672950558] |
|[-316408.99572423456,65.81540480152191] |
|[-183603.99549297744,62.95991287410158] |
|[-305303.9982081172,34.9286134059245]   |
|[-175072.99632745053,52.24503351586803] |
|[-74296.9979378276,29.198913642575764]  |
|[-40814.99506962866,62.33839442910039]  |
|[-114708.99605597553,54.594074070250485]|
|[-206370.99790080823,34.92491613120222] |
|[-303012.9987595311,27.67227220682905]  |
|[-181569.9974257208,39.936898705447106] |
|[-323933.9960051366,63.59684862874288]  |
|[-311090.995702459,66.49682798179852]   |
|[-249458.99666914105,51.341151881648386]|
|[-90366.99663614416,45.42276895982722]  |
|[-112860.99608249361,53.28104362992271] |
|[-203984.99568756353,61.66203662936199] |
+----------