## Linear Regression using SparkML

In [18]:
!pip install pyspark==3.1.2 -q
!pip install findspark -q

In [19]:
import pandas as pd
import findspark
findspark.init()
from pyspark.sql import SparkSession 
from pyspark.ml.feature import VectorAssembler 
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator


#### Create a Spark Session

In [20]:
# Create the Spark session (or reuse one that is already running).
# Later cells reach it through the name `Spark`.
Spark = (
    SparkSession.builder
    .appName(' Regression using SparkML')
    .getOrCreate()
)

##### Load the data from a CSV file into a DataFrame

In [21]:
!curl -o mpg.csv https://raw.githubusercontent.com/AdelOuledSaid/Machine-learning-with-Apache-Spark-/main/Prediction%20Model%20using%20Linear%20Regression/mpg%20(1).csv



  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 13891  100 13891    0     0  32704      0 --:--:-- --:--:-- --:--:-- 32839


In [22]:
# Read mpg.csv, treating the first row as column names and letting Spark
# infer each column's type from the values.
csv_reader = Spark.read.options(header=True, inferSchema=True)
mpg_data = csv_reader.csv("mpg.csv")

# Print the inferred column names and types.
mpg_data.printSchema()

In [25]:
# Preview the first five rows of the raw data.
mpg_data.show(n=5)

+----+---------+-----------+----------+------+----------+----+--------+
| MPG|Cylinders|Engine Disp|Horsepower|Weight|Accelerate|Year|  Origin|
+----+---------+-----------+----------+------+----------+----+--------+
|15.0|        8|      390.0|       190|  3850|       8.5|  70|American|
|21.0|        6|      199.0|        90|  2648|      15.0|  70|American|
|18.0|        6|      199.0|        97|  2774|      15.5|  70|American|
|16.0|        8|      304.0|       150|  3433|      12.0|  70|American|
|14.0|        8|      455.0|       225|  3086|      10.0|  70|American|
+----+---------+-----------+----------+------+----------+----+--------+
only showing top 5 rows



 ##### Identify the label column and the input columns

In [43]:
# Combine the numeric predictor columns into a single vector column named
# 'features'. MPG (the label) and Origin are left out of the vector.
assembler = VectorAssembler(
    inputCols=[
        'Cylinders',
        'Engine Disp',
        'Horsepower',
        'Weight',
        'Accelerate',
        'Year',
    ],
    outputCol='features',
)

# Append the 'features' column to every row of the loaded data.
mpg_transform_data = assembler.transform(mpg_data)

In [44]:
# Preview the first five rows, now including the assembled 'features' column.
mpg_transform_data.show(n=5)

+----+---------+-----------+----------+------+----------+----+--------+--------------------+
| MPG|Cylinders|Engine Disp|Horsepower|Weight|Accelerate|Year|  Origin|            features|
+----+---------+-----------+----------+------+----------+----+--------+--------------------+
|15.0|        8|      390.0|       190|  3850|       8.5|  70|American|[8.0,390.0,190.0,...|
|21.0|        6|      199.0|        90|  2648|      15.0|  70|American|[6.0,199.0,90.0,2...|
|18.0|        6|      199.0|        97|  2774|      15.5|  70|American|[6.0,199.0,97.0,2...|
|16.0|        8|      304.0|       150|  3433|      12.0|  70|American|[8.0,304.0,150.0,...|
|14.0|        8|      455.0|       225|  3086|      10.0|  70|American|[8.0,455.0,225.0,...|
+----+---------+-----------+----------+------+----------+----+--------+--------------------+
only showing top 5 rows



In [45]:
# Show only the two columns the model uses: the feature vector and the MPG label.
model_columns = ["features", "MPG"]
mpg_transform_data.select(model_columns).show(n=5)

+--------------------+----+
|            features| MPG|
+--------------------+----+
|[8.0,390.0,190.0,...|15.0|
|[6.0,199.0,90.0,2...|21.0|
|[6.0,199.0,97.0,2...|18.0|
|[8.0,304.0,150.0,...|16.0|
|[8.0,455.0,225.0,...|14.0|
+--------------------+----+
only showing top 5 rows



 * Split the data into training (70%) and test (30%) sets

In [52]:
# Randomly split the rows with weights 0.7 (training) and 0.3 (test).
# The fixed seed is passed so the split can be repeated.
train_data, test_data = mpg_transform_data.randomSplit(
    weights=[0.7, 0.3],
    seed=42,
)

 #### Build and Train a Linear Regression Model

In [56]:
# Configure the linear regression estimator: it reads the assembled 'features'
# vector and predicts the 'MPG' column. No fitting happens here.
lr = LinearRegression(
    featuresCol='features',
    labelCol='MPG',
)

In [57]:
# Fit the regression on the training split only.
model = lr.fit(dataset=train_data)

 #### Evaluate the model

In [58]:
# Apply the fitted model to the held-out test split. The evaluation cells below
# read the model output from the 'prediction' column of this DataFrame.
predict = model.transform(dataset=test_data)

 * R Squared

In [61]:
# Evaluator that compares the 'prediction' column against the true 'MPG' values
# using the r2 metric.
evalteur = RegressionEvaluator(
    labelCol="MPG",
    predictionCol="prediction",
    metricName="r2",
)

In [64]:
# Compute the metric configured on `evalteur` over the test-set predictions,
# then display it as the cell output.
r2 = evalteur.evaluate(dataset=predict)
r2

0.8046190375720326

In [65]:
# Root-mean-squared error of the test-set predictions.
# A dedicated evaluator is used instead of rebinding `evalteur`: overwriting the
# shared name would make a later re-run of the R-squared cell silently store
# RMSE in `r2`.
rmse_evaluator = RegressionEvaluator(
    labelCol="MPG",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = rmse_evaluator.evaluate(predict)
rmse

3.453104969079216

In [66]:
# Mean absolute error of the test-set predictions.
# A dedicated evaluator is used instead of rebinding `evalteur`: overwriting the
# shared name would make a later re-run of the R-squared cell silently store
# MAE in `r2`.
mae_evaluator = RegressionEvaluator(
    labelCol="MPG",
    predictionCol="prediction",
    metricName="mae",
)
mae = mae_evaluator.evaluate(predict)
mae

2.842391179195012

In [None]:
# Shut down the Spark session. The session is bound to the name `Spark`
# (capital S) when it is created; lowercase `spark` is never defined in this
# notebook and would raise NameError.
Spark.stop()