In [3]:
import os

import findspark

# Point findspark at the Spark installation before pyspark is imported.
# Prefer the SPARK_HOME environment variable so the notebook is not tied to
# one machine's directory layout; fall back to the original install path
# when the variable is unset.
findspark.init(os.environ.get('SPARK_HOME', '/home/ubuntu/spark-3.2.0-bin-hadoop3.2'))

from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression

In [4]:
# Directory holding the input CSV, relative to the notebook's working directory.
# The trailing slash is required: file names are appended to this value by
# plain string concatenation when the data is loaded.
DATA_PATH = '../data/Spark_for_Machine_Learning/Linear_Regression/'

In [5]:
# Obtain the notebook's Spark session (created on first run, reused afterwards),
# registered under the application name 'lr'.
spark = (
    SparkSession.builder
    .appName('lr')
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/12/27 14:56:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
# Load the customer CSV: the first row supplies the column names and the
# column types are inferred from the values.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(DATA_PATH + 'Ecommerce_Customers.csv')
)

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

In [8]:
# Sanity-check the inferred schema: which columns came through as strings
# and which as doubles.
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



## Feature building with VectorAssembler

In [9]:
from pyspark.ml.feature import VectorAssembler

In [10]:
# List the column names so the feature columns can be copied verbatim
# (several contain spaces).
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [12]:
# Columns used as predictors. The string columns (Email, Address, Avatar)
# and the label ('Yearly Amount Spent') are intentionally left out.
feats = [
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
]

# Packs the predictor columns into one vector column named 'features'.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=feats,
)

In [13]:
# Apply the assembler to the loaded data; the result carries the assembled
# 'features' vector column used for modelling below.
output = assembler.transform(data)

In [15]:
# Keep only the feature vector and the label, then split 70/30 into train/test.
# A fixed seed makes the split -- and therefore every metric computed from it --
# repeatable across kernel restarts instead of changing on each run.
train_data, test_data = (
    output
    .select(['features', 'Yearly Amount Spent'])
    .randomSplit([0.7, 0.3], seed=42)
)

In [16]:
# Linear regression estimator: reads predictors from 'features', the target
# from 'Yearly Amount Spent', with a regularization parameter of 0.3.
lr = LinearRegression(
    labelCol='Yearly Amount Spent',
    featuresCol='features',
    regParam=0.3,
)

In [17]:
# Fit on the training split only; test_data stays held out for evaluation.
lr_model = lr.fit(train_data)

21/12/27 15:07:14 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
21/12/27 15:07:14 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
21/12/27 15:07:14 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [20]:
# Evaluate the fitted model on the held-out split. The returned summary
# exposes the metrics inspected in the following cells
# (rootMeanSquaredError, r2, residuals).
test_results = lr_model.evaluate(test_data)

In [21]:
# Root-mean-squared error on the test split.
test_results.rootMeanSquaredError

9.989903900044967

In [22]:
# R-squared on the test split.
test_results.r2

0.9839337487255126

In [25]:
# Peek at the per-row residuals on the test split (show() prints only the
# first rows, not the whole column).
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
| 10.893085975886379|
| 0.7745048777134684|
|  7.798682311773064|
|  4.439080841447435|
| -5.150604517712338|
|-12.218883267842727|
|-20.354224710384074|
| -3.401592658410209|
| 23.699098048025462|
| 4.1348669779489455|
| -2.914727282233798|
|-7.3452891625805705|
| -4.928336480799317|
|-0.7835574291488001|
| 7.0560341941206275|
|-0.6828863907177265|
|-2.1564072359653323|
|-1.6170897908404527|
|  8.864430822186307|
|-1.9606716818207133|
+-------------------+
only showing top 20 rows



In [28]:
# Mimic scoring unlabeled data: keep only the 'features' column (dropping the
# label) and let the model produce the predictions.
predictions = lr_model.transform(test_data.select('features'))

In [29]:
# Display the first rows of the scored frame: the feature vector next to the
# model's 'prediction' column.
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[29.5324289670579...| 397.7472650967411|
|[30.5743636841713...| 441.2899088803522|
|[30.9716756438877...|486.83992744511966|
|[31.0472221394875...|388.05831834757396|
|[31.0613251567161...|492.70606257561394|
|[31.0662181616375...| 461.1521764755171|
|[31.1239743499119...|507.30127855014985|
|[31.2681042107507...|426.87212583223413|
|[31.2834474760581...|  568.081991377642|
|[31.3662121671876...|  426.454015578536|
|[31.4252268808548...| 533.6814459369957|
|[31.4474464941278...| 425.9480312578046|
|[31.5147378578019...|494.74082447726073|
|[31.5761319713222...| 542.0101414184771|
|[31.6548096756927...| 468.2073895334279|
|[31.6610498227460...|417.04123997061856|
|[31.7216523605090...|  349.933333867838|
|[31.8186165667690...| 448.0357631609761|
|[31.8512531286083...| 464.1278158446121|
|[31.8530748017465...| 461.2457951441727|
+--------------------+------------