In [52]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import col,isnan, when, count
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

#Reading the the data and printing the Schema

In [None]:
rdd = spark.read.csv("test.csv",header=True,inferSchema=True)
rdd.printSchema()

root
 |-- x: integer (nullable = true)
 |-- y: double (nullable = true)



#Checking for null values

In [54]:
rdd.select([count(when(~isnan(c) | col(c).isNull(), c)).alias(c) for c in rdd.columns]).show()

+---+---+
|  x|  y|
+---+---+
|300|300|
+---+---+



#Split that data and apply VectorAssembler to train Linear Regression model

In [None]:
train, test = rdd.randomSplit([0.8,0.2])

vas = VectorAssembler(inputCols=['x'],outputCol='features')
train_vas = vas.transform(train).select('features','y')
test_vas = vas.transform(test).select('features','y')

In [None]:
train_vas.show()

+--------+------------+
|features|           y|
+--------+------------+
|   [0.0]|-3.467883789|
|   [0.0]|-2.344738542|
|   [0.0]|-1.040114209|
|   [1.0]| 5.313686205|
|   [2.0]| -1.63296825|
|   [4.0]| 2.811415826|
|   [4.0]| 5.607664865|
|   [4.0]| 6.079390073|
|   [4.0]| 7.646529763|
|   [4.0]| 8.655714172|
|   [5.0]| 0.676076749|
|   [5.0]| 5.634030902|
|   [5.0]| 8.746747654|
|   [6.0]| 11.89457829|
|   [7.0]| 9.281699753|
|   [7.0]| 10.41468095|
|   [8.0]| 9.391416798|
|   [9.0]| 10.80462727|
|  [11.0]| 8.673336357|
|  [12.0]| 7.468501839|
+--------+------------+
only showing top 20 rows



# Fit the model and predict the target column

In [None]:
lr = LinearRegression(featuresCol='features',labelCol='y',predictionCol='prediction')
model = lr.fit(train_vas)

In [None]:
pred = model.transform(test_vas).show()

+--------+------------+------------------+
|features|           y|        prediction|
+--------+------------+------------------+
|   [1.0]|-2.761182595|0.9011896574318626|
|   [1.0]| 0.275307261|0.9011896574318626|
|   [2.0]|-2.819913974|1.9120423643411601|
|   [6.0]| 2.576625376| 5.955453191978351|
|   [8.0]| 5.405220518| 7.977158605796946|
|  [12.0]| 12.07991648|12.020569433434137|
|  [15.0]| 9.805234876| 15.05312755416203|
|  [19.0]| 17.09537241|19.096538381799217|
|  [19.0]| 21.42637785|19.096538381799217|
|  [21.0]| 16.89085185|21.118243795617815|
|  [23.0]| 18.31396758| 23.13994920943641|
|  [25.0]| 30.41303282|25.161654623255004|
|  [26.0]| 25.75612514|  26.1725073301643|
|  [27.0]| 21.71380347|27.183360037073598|
|  [27.0]| 26.59112396|27.183360037073598|
|  [28.0]| 29.31770045|  28.1942127439829|
|  [31.0]| 31.93063515| 31.22677086471079|
|  [32.0]| 29.38505024| 32.23762357162009|
|  [33.0]| 30.48881287| 33.24847627852939|
|  [36.0]| 38.67780759| 36.28103439925728|
+--------+-

# Evaluate the model performance

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

pred = model.transform(test_vas)

eval_r2 = RegressionEvaluator(labelCol="y", predictionCol="prediction", metricName="r2")
r2 = eval_r2.evaluate(pred)
print(f"R-squared (R2): {r2}")

eval_mae = RegressionEvaluator(labelCol="y", predictionCol="prediction", metricName="mae")
mae = eval_mae.evaluate(pred)
print(f"Mean Absolute Error (MAE): {mae}")

R-squared (R2): 0.9909212479462157
Mean Absolute Error (MAE): 2.371361050748813
