In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=92a47514ec4b8ef548f01d0aa41f0370734add59b07ed8e6dac6eb12ffcd49a8
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session running locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [3]:
# Keep a handle on the SparkContext that backs the session.
sc = spark.sparkContext

In [6]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

# Configuration: the column being predicted and the input files.
# Every column other than LABEL_COL is treated as a feature.
LABEL_COL = "price"
TRAIN_PATH = "train_set_pca.csv"
TEST_PATH = "test_set_pca.csv"

# Initialize Spark Session. getOrCreate() hands back an already-running
# session when one exists; otherwise it starts a new one.
spark = SparkSession.builder.appName("PCA Models").getOrCreate()

# Load datasets (first row is the header; column types are inferred)
train_df = spark.read.csv(TRAIN_PATH, header=True, inferSchema=True)
test_df = spark.read.csv(TEST_PATH, header=True, inferSchema=True)

# Fail early with a readable message if the two files do not line up,
# instead of surfacing an opaque Spark error further down the pipeline.
if LABEL_COL not in train_df.columns:
    raise ValueError(f"Label column '{LABEL_COL}' not found in {TRAIN_PATH}")
missing_in_test = [name for name in train_df.columns if name not in test_df.columns]
if missing_in_test:
    raise ValueError(f"Columns missing from {TEST_PATH}: {missing_in_test}")

# Display the first few rows to verify data loading
train_df.show(5)
test_df.show(5)

# Pack every non-label column into a single vector column for Spark ML
feature_columns = [name for name in train_df.columns if name != LABEL_COL]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Transform the data, keeping only what the model needs
train_data = assembler.transform(train_df).select("features", LABEL_COL)
test_data = assembler.transform(test_df).select("features", LABEL_COL)

# Define the linear regression model
linear_model = LinearRegression(featuresCol="features", labelCol=LABEL_COL)

# Train the model
model = linear_model.fit(train_data)

# Predict on the test data
predictions = model.transform(test_data)

# Show predicted vs actual values
predictions.select("prediction", LABEL_COL).show(5)

# Evaluate the model
evaluator = RegressionEvaluator(
    labelCol=LABEL_COL,
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) on test data: {rmse}")

# Stop the Spark session. This also shuts down the underlying SparkContext,
# so a previously captured handle such as `sc` is unusable afterwards and a
# new session must be created before running further Spark code.
spark.stop()

+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+------------------+--------------------+--------------------+--------+
|                PC1|                PC2|                PC3|                PC4|                PC5|                PC6|               PC7|                 PC8|                 PC9|   price|
+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+------------------+--------------------+--------------------+--------+
|  -0.91397416073757|   0.37624919565762| -0.443154732432337| -0.414833995449772| -0.131710604803046|-0.0350293500802002|-0.679320765646838| 1.2847253642867E-15|7.21644966006352E-16|313000.0|
|   0.53567810249467|    0.1299853399346|   1.01797815110796| 0.0149423450565388|  0.925913617301137| -0.741844027836367| 0.075894565785071| 1.4963514639605E-16|-2.22044604925031...|550000.0|
|  -2.64069366662723| -0.594110026357142