In [58]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Spark's MLlib

Check this [link](https://databricks.com/glossary/what-is-machine-learning-library) to understand better what is MLlib

In [2]:
from pyspark.sql import SparkSession
import os

In [3]:
spark = SparkSession.builder.appName('linearReg').getOrCreate()

22/05/26 16:14:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
from pyspark.ml.regression import LinearRegression

## 1. Example - Linear Regression 

In [5]:
df = spark.read.format('libsvm').load(os.path.join('data', 'lr_data_sample.txt'))

22/05/26 16:14:15 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.


### Split your data

In [6]:
# Data Split with 70% for train and 30% for testing
train, test = df.randomSplit([0.7, 0.3])

In [7]:
train.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                345|
|   mean| 0.7885039288412324|
| stddev| 10.002031195425062|
|    min|-28.571478869743427|
|    max|  27.78383192005107|
+-------+-------------------+



In [8]:
test.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                156|
|   mean|-0.9187984328880441|
| stddev| 10.925630659953766|
|    min|-28.046018037776633|
|    max| 26.903524792043335|
+-------+-------------------+



### Data Format

This is the format that **spark** needs to run a machine learning model:

In [9]:
train.show(n=3)

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
|-28.571478869743427|(10,[0,1,2,3,4,5,...|
|-26.736207182601724|(10,[0,1,2,3,4,5,...|
| -23.51088409032297|(10,[0,1,2,3,4,5,...|
+-------------------+--------------------+
only showing top 3 rows



In [10]:
train.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



### Modelling

In [12]:
lr = LinearRegression(
    featuresCol='features', # NAME OF THE FEATURES COLUMN
    labelCol='label', # NAME OF THE LABEL COLUMN
    predictionCol='prediction' # NAME OF THE PREDICTION COLUMN
)

In [13]:
# Train
model = lr.fit(train)

22/05/26 16:16:46 WARN Instrumentation: [991d6db4] regParam is zero, which might cause numerical instability and overfitting.
22/05/26 16:16:46 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/05/26 16:16:46 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
22/05/26 16:16:46 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
22/05/26 16:16:46 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK


#### Specs of the model

In [14]:
model.coefficients

DenseVector([0.1533, 1.412, -1.1241, 2.8744, -0.0085, 1.5498, -0.3403, -0.3233, -1.0591, 1.3222])

In [15]:
model.intercept

0.5594423746755619

#### Score on Training

In [16]:
model_summary = model.summary

In [17]:
model_summary.r2

0.04849394438929033

In [18]:
model_summary.rootMeanSquaredError

9.74234840648498

#### Score on Testing

In [19]:
# Test
test_score = model.evaluate(test)

In [20]:
test_score.residuals.show(n=2)

+-------------------+
|          residuals|
+-------------------+
| -26.36863958626874|
|-28.400912371963727|
+-------------------+
only showing top 2 rows



In [21]:
test_score.rootMeanSquaredError

11.151442319122411

In [22]:
test_score.r2

-0.048484355183125105

### Predict on Unlabel Data

In [23]:
unlabeled_data = test.select('features')
unlabeled_data.show(n=2)

+--------------------+
|            features|
+--------------------+
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
+--------------------+
only showing top 2 rows



In [24]:
predictions = model.transform(unlabeled_data)
predictions.show(n=2)

+--------------------+-------------------+
|            features|         prediction|
+--------------------+-------------------+
|(10,[0,1,2,3,4,5,...|-1.6773784515078956|
|(10,[0,1,2,3,4,5,...|  1.595428943480654|
+--------------------+-------------------+
only showing top 2 rows



## 2. Example - Linear Regression

In [25]:
df = spark.read.csv(os.path.join('data', 'Ecommerce_Customers.csv'), inferSchema=True, header=True)

In [26]:
df.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



We will try to predict the `Yearly Amount Spent`

In [34]:
df.head(2)[1].asDict()

{'Email': 'hduke@hotmail.com',
 'Address': '4547 Archer CommonDiazchester, CA 06566-8576',
 'Avatar': 'DarkGreen',
 'Avg Session Length': 31.92627202636016,
 'Time on App': 11.109460728682564,
 'Time on Website': 37.268958868297744,
 'Length of Membership': 2.66403418213262,
 'Yearly Amount Spent': 392.2049334443264}

### Transform data to fit a model

In [35]:
from pyspark.ml.linalg import Vectors # A requirement ?
from pyspark.ml.feature import VectorAssembler # This is going to be use to convert the features into a vector

In [36]:
assembler = VectorAssembler(
    inputCols=['Avg Session Length', 'Time on App', 'Time on Website', 'Length of Membership'], # List of columns that can be use right away - Numerical
    outputCol='features' # Name of the final vector
)

In [37]:
# Transform the data
output = assembler.transform(df)

In [38]:
# We have everything than we had before plus the dense vector of the features
output.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- features: vector (nullable = true)



In [42]:
# Dense vector as final column of the spark DataFrame
output.head(1)[0][-1]

DenseVector([34.4973, 12.6557, 39.5777, 4.0826])

In [43]:
# Select only the label and the features column
final_data = output.select('features', 'Yearly Amount Spent')

In [44]:
# Split Data
train_data, test_data = final_data.randomSplit([0.7, 0.3])

In [45]:
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                342|
|   mean|  498.0343134097524|
| stddev|  77.89586877584429|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [46]:
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                158|
|   mean|  502.0840755896203|
| stddev|  82.48704495059758|
|    min|  282.4712457199145|
|    max|  744.2218671047146|
+-------+-------------------+



In [47]:
lr = LinearRegression(
    featuresCol='features', # which is the default value
    labelCol='Yearly Amount Spent',
    predictionCol='predictions',
)

lr_model = lr.fit(train_data)

22/05/26 16:28:39 WARN Instrumentation: [30cdd88c] regParam is zero, which might cause numerical instability and overfitting.


In [48]:
# Evaluate in Test Data
test_results = lr_model.evaluate(test_data)

In [54]:
# Residuals - Difference between the actual value and the prediction
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
|   9.30470132544042|
| -4.877881134034908|
|-0.7238967556937155|
| -4.029092545027595|
| -6.742230812539731|
| -22.19918020894022|
|0.09756949945040105|
| -5.675249651060085|
|-2.6034602321134344|
| 17.888804518142024|
|  6.796291415266694|
|-1.2044998730269754|
| -6.252622801098539|
| -3.919734615931702|
|-17.274354519846895|
| 0.9874118486566204|
|  -9.82736239744429|
| 11.858639554215245|
|-13.916339351990928|
| -6.855496241876494|
+-------------------+
only showing top 20 rows



In [55]:
# Actual Predictions
test_results.predictions.show()

+--------------------+-------------------+------------------+
|            features|Yearly Amount Spent|       predictions|
+--------------------+-------------------+------------------+
|[29.5324289670579...|  408.6403510726275|399.33564974718706|
|[30.4925366965402...|  282.4712457199145|287.34912685394943|
|[30.5743636841713...| 442.06441375806565|442.78831051375937|
|[30.8794843441274...|  490.2065999848547| 494.2356925298823|
|[31.0613251567161...|  487.5554580579016|494.29768887044133|
|[31.1239743499119...|  486.9470538397658|  509.146234048706|
|[31.3895854806643...|  410.0696110599829| 409.9720415605325|
|[31.5147378578019...|  489.8124879964614| 495.4877376475215|
|[31.5761319713222...|  541.2265839893283| 543.8300442214418|
|[31.6005122003032...| 479.17285149109694| 461.2840469729549|
|[31.6548096756927...|  475.2634237275485| 468.4671323122818|
|[31.6610498227460...| 416.35835357990084| 417.5628534529278|
|[31.7242025238451...|  503.3878872879605|509.64051008905903|
|[31.812

In [56]:
test_results.rootMeanSquaredError

9.998779727933742

In [57]:
test_results.r2

0.985213004336492