In [16]:
# One-time setup: uncomment the line below to install PySpark if it is not already available.
#!pip install pyspark

In [3]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import ChiSqSelector
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator

In [4]:
# Start the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("SalaryPrediction")
    .getOrCreate()
)

In [5]:
# Load the salary CSV: the first row supplies column names and Spark infers
# each column's type from the values.
data = spark.read.csv(
    "Salary_dataset.csv",
    header=True,
    inferSchema=True,
)

In [6]:
# Preview the first three rows to sanity-check the parsed columns.
data.show(n=3)

+---+------------------+-------+
|_c0|   YearsExperience| Salary|
+---+------------------+-------+
|  0|1.2000000000000002|39344.0|
|  1|1.4000000000000001|46206.0|
|  2|               1.6|37732.0|
+---+------------------+-------+
only showing top 3 rows



In [7]:
# Average of the YearsExperience column over the whole dataset. An aggregate
# without grouping yields exactly one row, whose only field is the mean.
mean_row = data.agg({"YearsExperience": "mean"}).first()
meanYearsExperience = mean_row[0]
print(f"Mean YearsExperience: {meanYearsExperience}")

Mean YearsExperience: 5.413333333333332


In [8]:
# Column roles for the regression: the target column and the list of predictor columns.
label = 'Salary'
feature = ['YearsExperience']

# Assembler that packs the predictor column(s) into one vector column named "feature".
Vector = VectorAssembler(outputCol="feature", inputCols=feature)

In [9]:
# Append the assembled "feature" vector column to the loaded data.
df = Vector.transform(data)

# Split roughly 70% / 30% into training and test sets. The fixed seed makes the
# split repeatable: without it every run draws a different split, so the fitted
# coefficients and the reported RMSE would change on each execution.
x_train, x_test = df.randomSplit([0.7, 0.3], seed=42)

In [10]:
# Linear regression that reads the assembled "feature" vector column and
# learns to predict the label column.
linear = LinearRegression(labelCol=label, featuresCol="feature")

# Fit on the training split only; the test split is kept for evaluation.
model = linear.fit(x_train)

In [11]:
# Score the held-out test split; the fitted model's output is kept as `predictions`.
predictions = model.transform(dataset=x_test)

In [12]:
# Compare the "prediction" column against the true label using root mean squared error.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol=label,
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)

Results: RMSE on the held-out test split

In [13]:
# Report the RMSE that the evaluator computed on the test-split predictions.
print(f"Root Mean Squared Error (RMSE): {rmse}")

Root Mean Squared Error (RMSE): 5421.582162092105


The model coefficients and intercept

In [14]:
# Show the fitted parameters. Only one input column (YearsExperience) was
# assembled, so the coefficient vector has a single entry.
print(f"Coefficients:{model.coefficients}")
print(f"Intercept: {model.intercept}")

Coefficients:[9610.355807116148]
Intercept: 23556.231264625265


In [15]:
workExperience = 5  # Years of experience

# Build a one-row DataFrame for the query. float(...) keeps the column a
# double, the same type the CSV column was loaded with.
query = spark.createDataFrame([(float(workExperience),)], ["YearsExperience"])

# Run the query through the same assembler and fitted model in one pass.
# The assembled vector does not need to be collected to the driver and wrapped
# in a second DataFrame before scoring.
predictedSalary = model.transform(Vector.transform(query)).first()["prediction"]
print(f"Predicted salary for {workExperience} years of experience: {predictedSalary}")

# Stop the SparkSession; cells that use `spark` cannot be re-run after this
# without recreating the session.
spark.stop()

Predicted salary for 5 years of experience: 71608.010300206
