# What is Spark MLlib?

Apache Spark’s Machine Learning Library (MLlib) is designed for simplicity, scalability, and easy integration with other tools. With the scalability, language compatibility, and speed of Spark, data scientists can focus on their data problems and models instead of solving the complexities surrounding distributed data (such as infrastructure, configurations, and so on). Built on top of Spark, MLlib is a scalable machine learning library consisting of common learning algorithms and utilities, including classification, regression, clustering, collaborative filtering, dimensionality reduction, and underlying optimization primitives. Spark MLLib seamlessly integrates with other Spark components such as Spark SQL, Spark Streaming, and DataFrames and is installed in the Databricks runtime. The library is usable in Java, Scala, and Python as part of Spark applications, so that you can include it in complete workflows. MLlib allows for preprocessing, munging, training of models, and making predictions at scale on data. You can even use models trained in MLlib to make predictions in Structured Streaming. Spark provides a sophisticated machine learning API for performing a variety of machine learning tasks, from classification to regression, clustering to deep learning. 

(https://databricks.com/glossary/what-is-machine-learning-library)

# Loading Dataset

In [0]:
data = spark.sql('SELECT * FROM boston')
data.display()

crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2
0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21,28.7
0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43,22.9
0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15,27.1
0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311,15.2,386.63,29.93,16.5
0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1,18.9


In [0]:
data.printSchema()

# Transform the Data to work with MLlib

In [0]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [0]:
data.columns

In [0]:
assembler = VectorAssembler(inputCols=['crim', 'zn', 'indus', 'chas','nox','rm','age','dis','rad','tax','ptratio','black','lstat'],
                            outputCol='features')

In [0]:
output = assembler.transform(data)

In [0]:
output.display()

crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv,features
0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0,"Map(vectorType -> dense, length -> 13, values -> List(0.00632, 18.0, 2.31, 0.0, 0.538, 6.575, 65.2, 4.09, 1.0, 296.0, 15.3, 396.9, 4.98))"
0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6,"Map(vectorType -> dense, length -> 13, values -> List(0.02731, 0.0, 7.07, 0.0, 0.469, 6.421, 78.9, 4.9671, 2.0, 242.0, 17.8, 396.9, 9.14))"
0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7,"Map(vectorType -> dense, length -> 13, values -> List(0.02729, 0.0, 7.07, 0.0, 0.469, 7.185, 61.1, 4.9671, 2.0, 242.0, 17.8, 392.83, 4.03))"
0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4,"Map(vectorType -> dense, length -> 13, values -> List(0.03237, 0.0, 2.18, 0.0, 0.458, 6.998, 45.8, 6.0622, 3.0, 222.0, 18.7, 394.63, 2.94))"
0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2,"Map(vectorType -> dense, length -> 13, values -> List(0.06905, 0.0, 2.18, 0.0, 0.458, 7.147, 54.2, 6.0622, 3.0, 222.0, 18.7, 396.9, 5.33))"
0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21,28.7,"Map(vectorType -> dense, length -> 13, values -> List(0.02985, 0.0, 2.18, 0.0, 0.458, 6.43, 58.7, 6.0622, 3.0, 222.0, 18.7, 394.12, 5.21))"
0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43,22.9,"Map(vectorType -> dense, length -> 13, values -> List(0.08829, 12.5, 7.87, 0.0, 0.524, 6.012, 66.6, 5.5605, 5.0, 311.0, 15.2, 395.6, 12.43))"
0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15,27.1,"Map(vectorType -> dense, length -> 13, values -> List(0.14455, 12.5, 7.87, 0.0, 0.524, 6.172, 96.1, 5.9505, 5.0, 311.0, 15.2, 396.9, 19.15))"
0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311,15.2,386.63,29.93,16.5,"Map(vectorType -> dense, length -> 13, values -> List(0.21124, 12.5, 7.87, 0.0, 0.524, 5.631, 100.0, 6.0821, 5.0, 311.0, 15.2, 386.63, 29.93))"
0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1,18.9,"Map(vectorType -> dense, length -> 13, values -> List(0.17004, 12.5, 7.87, 0.0, 0.524, 6.004, 85.9, 6.5921, 5.0, 311.0, 15.2, 386.71, 17.1))"


In [0]:
final_data = output.select('features', 'medv')

In [0]:
final_data.display()

features,medv
"Map(vectorType -> dense, length -> 13, values -> List(0.00632, 18.0, 2.31, 0.0, 0.538, 6.575, 65.2, 4.09, 1.0, 296.0, 15.3, 396.9, 4.98))",24.0
"Map(vectorType -> dense, length -> 13, values -> List(0.02731, 0.0, 7.07, 0.0, 0.469, 6.421, 78.9, 4.9671, 2.0, 242.0, 17.8, 396.9, 9.14))",21.6
"Map(vectorType -> dense, length -> 13, values -> List(0.02729, 0.0, 7.07, 0.0, 0.469, 7.185, 61.1, 4.9671, 2.0, 242.0, 17.8, 392.83, 4.03))",34.7
"Map(vectorType -> dense, length -> 13, values -> List(0.03237, 0.0, 2.18, 0.0, 0.458, 6.998, 45.8, 6.0622, 3.0, 222.0, 18.7, 394.63, 2.94))",33.4
"Map(vectorType -> dense, length -> 13, values -> List(0.06905, 0.0, 2.18, 0.0, 0.458, 7.147, 54.2, 6.0622, 3.0, 222.0, 18.7, 396.9, 5.33))",36.2
"Map(vectorType -> dense, length -> 13, values -> List(0.02985, 0.0, 2.18, 0.0, 0.458, 6.43, 58.7, 6.0622, 3.0, 222.0, 18.7, 394.12, 5.21))",28.7
"Map(vectorType -> dense, length -> 13, values -> List(0.08829, 12.5, 7.87, 0.0, 0.524, 6.012, 66.6, 5.5605, 5.0, 311.0, 15.2, 395.6, 12.43))",22.9
"Map(vectorType -> dense, length -> 13, values -> List(0.14455, 12.5, 7.87, 0.0, 0.524, 6.172, 96.1, 5.9505, 5.0, 311.0, 15.2, 396.9, 19.15))",27.1
"Map(vectorType -> dense, length -> 13, values -> List(0.21124, 12.5, 7.87, 0.0, 0.524, 5.631, 100.0, 6.0821, 5.0, 311.0, 15.2, 386.63, 29.93))",16.5
"Map(vectorType -> dense, length -> 13, values -> List(0.17004, 12.5, 7.87, 0.0, 0.524, 6.004, 85.9, 6.5921, 5.0, 311.0, 15.2, 386.71, 17.1))",18.9


# Train Test Split

In [0]:
train, test = final_data.randomSplit([0.7,0.3])

In [0]:
train.describe().display()

summary,medv
count,347.0
mean,22.26628242074929
stddev,9.08488478272475
min,5.0
max,50.0


In [0]:
test.describe().display()

summary,medv
count,159.0
mean,23.11446540880503
stddev,9.44025072396866
min,5.0
max,50.0


# Linear Regression

In [0]:
from pyspark.ml.regression import LinearRegression

In [0]:
lr = LinearRegression(labelCol='medv')

In [0]:
lr_fit = lr.fit(train)

# Model Evaluation

In [0]:
# calling evaluate function on fitted model
test_results = lr_fit.evaluate(test)

In [0]:
# Root Mean Squared Error (RMSE) on test set
test_results.rootMeanSquaredError

In [0]:
# R2 on test set
test_results.r2

In [0]:
# Mean Absolute Error (MAE) on test set
test_results.meanAbsoluteError

In [0]:
# Mean Squared Error (MSE) on test set
test_results.meanSquaredError

In [0]:
# residuals on test set
test_results.residuals.display()

residuals
2.070631321501459
6.328831176356559
-0.1149484810903409
3.975089345993688
-0.6584406061486838
2.404158820948041
-1.5732771118778572
-3.666336941530005
-6.7246584257784505
-6.733719622699944


# Scoring on new data

In [0]:
# lets create a new data by removing target col from test set
new_data = test.select('features')
new_data.display()

features
"Map(vectorType -> dense, length -> 13, values -> List(0.01301, 35.0, 1.52, 0.0, 0.442, 7.241, 49.3, 7.0379, 1.0, 284.0, 15.5, 394.74, 5.49))"
"Map(vectorType -> dense, length -> 13, values -> List(0.01538, 90.0, 3.75, 0.0, 0.394, 7.454, 34.2, 6.3361, 3.0, 244.0, 15.9, 386.34, 3.11))"
"Map(vectorType -> dense, length -> 13, values -> List(0.01965, 80.0, 1.76, 0.0, 0.385, 6.23, 31.5, 9.0892, 1.0, 241.0, 18.2, 341.6, 12.93))"
"Map(vectorType -> dense, length -> 13, values -> List(0.02729, 0.0, 7.07, 0.0, 0.469, 7.185, 61.1, 4.9671, 2.0, 242.0, 17.8, 392.83, 4.03))"
"Map(vectorType -> dense, length -> 13, values -> List(0.02763, 75.0, 2.95, 0.0, 0.428, 6.595, 21.8, 5.4011, 3.0, 252.0, 18.3, 395.63, 4.32))"
"Map(vectorType -> dense, length -> 13, values -> List(0.03049, 55.0, 3.78, 0.0, 0.484, 6.874, 28.1, 6.4654, 5.0, 370.0, 17.6, 387.97, 4.61))"
"Map(vectorType -> dense, length -> 13, values -> List(0.03306, 0.0, 5.19, 0.0, 0.515, 6.059, 37.3, 4.8122, 5.0, 224.0, 20.2, 396.14, 8.51))"
"Map(vectorType -> dense, length -> 13, values -> List(0.03466, 35.0, 6.06, 0.0, 0.4379, 6.031, 23.3, 6.6407, 1.0, 304.0, 16.9, 362.25, 7.83))"
"Map(vectorType -> dense, length -> 13, values -> List(0.03537, 34.0, 6.09, 0.0, 0.433, 6.59, 40.4, 5.4917, 7.0, 329.0, 16.1, 395.75, 9.5))"
"Map(vectorType -> dense, length -> 13, values -> List(0.03584, 80.0, 3.37, 0.0, 0.398, 6.29, 17.8, 6.6115, 4.0, 337.0, 16.1, 396.9, 4.67))"


In [0]:
pred = lr_fit.transform(new_data)
pred.display()

features,prediction
"Map(vectorType -> dense, length -> 13, values -> List(0.01301, 35.0, 1.52, 0.0, 0.442, 7.241, 49.3, 7.0379, 1.0, 284.0, 15.5, 394.74, 5.49))",30.629368678498544
"Map(vectorType -> dense, length -> 13, values -> List(0.01538, 90.0, 3.75, 0.0, 0.394, 7.454, 34.2, 6.3361, 3.0, 244.0, 15.9, 386.34, 3.11))",37.67116882364344
"Map(vectorType -> dense, length -> 13, values -> List(0.01965, 80.0, 1.76, 0.0, 0.385, 6.23, 31.5, 9.0892, 1.0, 241.0, 18.2, 341.6, 12.93))",20.214948481090342
"Map(vectorType -> dense, length -> 13, values -> List(0.02729, 0.0, 7.07, 0.0, 0.469, 7.185, 61.1, 4.9671, 2.0, 242.0, 17.8, 392.83, 4.03))",30.724910654006315
"Map(vectorType -> dense, length -> 13, values -> List(0.02763, 75.0, 2.95, 0.0, 0.428, 6.595, 21.8, 5.4011, 3.0, 252.0, 18.3, 395.63, 4.32))",31.458440606148685
"Map(vectorType -> dense, length -> 13, values -> List(0.03049, 55.0, 3.78, 0.0, 0.484, 6.874, 28.1, 6.4654, 5.0, 370.0, 17.6, 387.97, 4.61))",28.79584117905196
"Map(vectorType -> dense, length -> 13, values -> List(0.03306, 0.0, 5.19, 0.0, 0.515, 6.059, 37.3, 4.8122, 5.0, 224.0, 20.2, 396.14, 8.51))",22.17327711187786
"Map(vectorType -> dense, length -> 13, values -> List(0.03466, 35.0, 6.06, 0.0, 0.4379, 6.031, 23.3, 6.6407, 1.0, 304.0, 16.9, 362.25, 7.83))",23.066336941530004
"Map(vectorType -> dense, length -> 13, values -> List(0.03537, 34.0, 6.09, 0.0, 0.433, 6.59, 40.4, 5.4917, 7.0, 329.0, 16.1, 395.75, 9.5))",28.72465842577845
"Map(vectorType -> dense, length -> 13, values -> List(0.03584, 80.0, 3.37, 0.0, 0.398, 6.29, 17.8, 6.6115, 4.0, 337.0, 16.1, 396.9, 4.67))",30.233719622699944


# Random Forest

In [0]:
from pyspark.ml.regression import RandomForestRegressor

In [0]:
# create random forest instance
rf = RandomForestRegressor(labelCol='medv')

In [0]:
# fit the model
rf_fit = rf.fit(train)

In [0]:
# generate predictions on new data
rf_fit.transform(new_data).display()

features,prediction
"Map(vectorType -> dense, length -> 13, values -> List(0.01301, 35.0, 1.52, 0.0, 0.442, 7.241, 49.3, 7.0379, 1.0, 284.0, 15.5, 394.74, 5.49))",32.487828068881015
"Map(vectorType -> dense, length -> 13, values -> List(0.01538, 90.0, 3.75, 0.0, 0.394, 7.454, 34.2, 6.3361, 3.0, 244.0, 15.9, 386.34, 3.11))",40.205171178086616
"Map(vectorType -> dense, length -> 13, values -> List(0.01965, 80.0, 1.76, 0.0, 0.385, 6.23, 31.5, 9.0892, 1.0, 241.0, 18.2, 341.6, 12.93))",23.85091929379118
"Map(vectorType -> dense, length -> 13, values -> List(0.02729, 0.0, 7.07, 0.0, 0.469, 7.185, 61.1, 4.9671, 2.0, 242.0, 17.8, 392.83, 4.03))",38.18507612539966
"Map(vectorType -> dense, length -> 13, values -> List(0.02763, 75.0, 2.95, 0.0, 0.428, 6.595, 21.8, 5.4011, 3.0, 252.0, 18.3, 395.63, 4.32))",27.99750001393778
"Map(vectorType -> dense, length -> 13, values -> List(0.03049, 55.0, 3.78, 0.0, 0.484, 6.874, 28.1, 6.4654, 5.0, 370.0, 17.6, 387.97, 4.61))",31.338530116664884
"Map(vectorType -> dense, length -> 13, values -> List(0.03306, 0.0, 5.19, 0.0, 0.515, 6.059, 37.3, 4.8122, 5.0, 224.0, 20.2, 396.14, 8.51))",22.06062588386747
"Map(vectorType -> dense, length -> 13, values -> List(0.03466, 35.0, 6.06, 0.0, 0.4379, 6.031, 23.3, 6.6407, 1.0, 304.0, 16.9, 362.25, 7.83))",22.827832140107592
"Map(vectorType -> dense, length -> 13, values -> List(0.03537, 34.0, 6.09, 0.0, 0.433, 6.59, 40.4, 5.4917, 7.0, 329.0, 16.1, 395.75, 9.5))",24.103309818785824
"Map(vectorType -> dense, length -> 13, values -> List(0.03584, 80.0, 3.37, 0.0, 0.398, 6.29, 17.8, 6.6115, 4.0, 337.0, 16.1, 396.9, 4.67))",24.4807572647382


In [0]:
# feature importance
rf_fit.featureImportances

# Cross-Validation / Hyperparameter Tuning, etc.

Read Documentation: https://spark.apache.org/docs/latest/ml-tuning.html