In [1]:
from pyspark.sql import SparkSession

In [3]:
# Obtain the Spark session for this notebook (getOrCreate reuses an existing
# one if present), registered under the application name "Practice".
spark = (
    SparkSession.builder
    .appName("Practice")
    .getOrCreate()
)

In [4]:
# Read Test.csv with the first row as the header and with column types
# inferred from the data rather than defaulting everything to string.
csv_reader = spark.read.options(header=True, inferSchema=True)
df_pyspark = csv_reader.csv("Test.csv")

In [5]:
# Print the loaded rows as a text table to sanity-check the CSV parse.
df_pyspark.show()

+--------+---+----------+------+
|   Name |Age|Experience|Salary|
+--------+---+----------+------+
| Abhinav| 22|         4| 25000|
|   Rishi| 22|         3| 30000|
|   Rohan| 24|         2| 12000|
|Raushan | 25|         1| 20000|
|    Ansh| 21|         2| 35000|
|   Aryan| 23|         3| 22000|
+--------+---+----------+------+



In [6]:
# Print each column's name, inferred type and nullability.
df_pyspark.printSchema()

root
 |-- Name : string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [7]:
# List the column names exactly as read from the CSV header.
# NOTE(review): the recorded output shows the first header as 'Name ' with a
# trailing space; any code selecting that column must include the space.
df_pyspark.columns

['Name ', 'Age', 'Experience', 'Salary']

In [10]:
from pyspark.ml.feature import VectorAssembler

# Combine the two numeric predictor columns into one vector column named
# "Independent Features".
featureAssembler = VectorAssembler(
    inputCols=["Age", "Experience"],
    outputCol="Independent Features",
)

In [11]:
# Apply the assembler: the result keeps the original columns and appends the
# assembled "Independent Features" vector column.
output = featureAssembler.transform(dataset=df_pyspark)

In [12]:
# Display the assembled frame to confirm the feature vector column was added.
output.show()

+--------+---+----------+------+--------------------+
|   Name |Age|Experience|Salary|Independent Features|
+--------+---+----------+------+--------------------+
| Abhinav| 22|         4| 25000|          [22.0,4.0]|
|   Rishi| 22|         3| 30000|          [22.0,3.0]|
|   Rohan| 24|         2| 12000|          [24.0,2.0]|
|Raushan | 25|         1| 20000|          [25.0,1.0]|
|    Ansh| 21|         2| 35000|          [21.0,2.0]|
|   Aryan| 23|         3| 22000|          [23.0,3.0]|
+--------+---+----------+------+--------------------+



In [13]:
# Column names after assembly (original columns plus the feature vector).
output.columns

['Name ', 'Age', 'Experience', 'Salary', 'Independent Features']

In [15]:
# Narrow the frame to the two columns used for modelling: the assembled
# feature vector and the Salary column that serves as the label.
finalized_data = output.select(
    "Independent Features",
    "Salary",
)

In [16]:
# Display the modelling frame (feature vector and label only).
finalized_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|          [22.0,4.0]| 25000|
|          [22.0,3.0]| 30000|
|          [24.0,2.0]| 12000|
|          [25.0,1.0]| 20000|
|          [21.0,2.0]| 35000|
|          [23.0,3.0]| 22000|
+--------------------+------+



In [18]:
from pyspark.ml.regression import LinearRegression

# Split the rows roughly 75% / 25% into training and evaluation sets.
# The seed is fixed so that re-running the notebook on the same input yields
# the same split, and therefore the same coefficients and error metrics;
# without it every Restart & Run All produced different numbers.
# NOTE(review): the split is random per row, so on a dataset this small the
# set sizes can differ a lot from 75/25 (the recorded run evaluated on a
# single row). Confirm the test set is non-empty after changing the seed.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator under its own name so that `regressor` only ever
# refers to the fitted model that the following cells inspect and evaluate.
lr_estimator = LinearRegression(
    featuresCol="Independent Features",
    labelCol="Salary",
)

# Fit on the training split; `regressor` is the fitted linear regression model.
regressor = lr_estimator.fit(train_data)

In [20]:
# Fitted coefficients: one weight per entry of the feature vector, i.e. in the
# assembler's input order (Age, then Experience).
regressor.coefficients

DenseVector([-5778.9474, -3263.1579])

In [21]:
# Intercept (constant term) of the fitted linear model.
regressor.intercept

163894.7368421176

In [27]:
# Evaluate the fitted model on the held-out split. The returned summary
# carries both the per-row predictions and the aggregate error metrics that
# the following cells read.
pred_results = regressor.evaluate(dataset=test_data)

In [28]:
# Show the evaluated rows with the model's `prediction` column next to the
# actual Salary.
pred_results.predictions.show()

+--------------------+------+------------------+
|Independent Features|Salary|        prediction|
+--------------------+------+------------------+
|          [22.0,4.0]| 25000|23705.263157894165|
+--------------------+------+------------------+



In [29]:
# Error metrics on the held-out split, displayed as a tuple in this order:
# (mean absolute error, mean squared error, root mean squared error).
(
    pred_results.meanAbsoluteError,
    pred_results.meanSquaredError,
    pred_results.rootMeanSquaredError,
)

(1294.7368421058345, 1676343.4903061886, 1294.7368421058345)