# Spark Machine Learning Basics

In [7]:
import pyspark
from pyspark.sql import SparkSession
import pyspark.ml as ml
import pyspark.sql.functions as spark_f
import pyspark.sql.types as spark_types
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

In [187]:
spark = SparkSession.builder.appName('Spark Test App').getOrCreate() 
sc = spark.sparkContext

In [188]:
# read some test data
df = spark.read.parquet('hmp.parquet')
df.show()

+---+---+---+--------------------+-----------+
|  x|  y|  z|              source|      class|
+---+---+---+--------------------+-----------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|
| 22| 51| 34|Accelerometer-201...|Brush_teeth|
| 20| 50| 35|Accelerometer-201...|Brush_teeth|
| 22| 52| 34|Accelerometer-201...|Brush_teeth|
| 22| 50| 34|Accelerometer-201...|Brush_teeth|
| 22| 51| 35|Accelerometer-201...|Brush_teeth|
| 21| 51| 33|Accelerometer-201...|Brush_teeth|
| 20| 50| 34|Accelerometer-201...|Brush_teeth|
| 21| 49| 33|Accelerometer-201...|Brush_teeth|
| 21| 49| 33|Accelerometer-201...|Brush_teeth|
| 20| 51| 35|Accelerometer-201...|Brush_teeth|
| 18| 49| 34|Accelerometer-201...|Brush_teeth|
| 19| 48| 34|Accelerometer-201...|Brush_teeth|
| 16| 53| 34|Accelerometer-201...|Brush_teeth|
| 18| 52| 35|

In [189]:
df.count()

446529

### Preprocessing Steps

In [190]:
# Encode class as a number
indexer=ml.feature.StringIndexer(inputCol='class', outputCol='classIndex')
df_indexed=indexer.fit(df).transform(df)
df_indexed.show()

+---+---+---+--------------------+-----------+----------+
|  x|  y|  z|              source|      class|classIndex|
+---+---+---+--------------------+-----------+----------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|       6.0|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|       6.0|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|       6.0|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|       6.0|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|       6.0|
| 22| 51| 34|Accelerometer-201...|Brush_teeth|       6.0|
| 20| 50| 35|Accelerometer-201...|Brush_teeth|       6.0|
| 22| 52| 34|Accelerometer-201...|Brush_teeth|       6.0|
| 22| 50| 34|Accelerometer-201...|Brush_teeth|       6.0|
| 22| 51| 35|Accelerometer-201...|Brush_teeth|       6.0|
| 21| 51| 33|Accelerometer-201...|Brush_teeth|       6.0|
| 20| 50| 34|Accelerometer-201...|Brush_teeth|       6.0|
| 21| 49| 33|Accelerometer-201...|Brush_teeth|       6.0|
| 21| 49| 33|Accelerometer-201...|Brush_teeth|       6.0|
| 20| 51| 35|A

In [191]:
# One-hot-encoding the class
encoder=ml.feature.OneHotEncoderEstimator(inputCols=['classIndex'], outputCols=['classVector'])
df_enc=encoder.fit(df_indexed).transform(df_indexed)
df_enc.show()

+---+---+---+--------------------+-----------+----------+--------------+
|  x|  y|  z|              source|      class|classIndex|   classVector|
+---+---+---+--------------------+-----------+----------+--------------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|
| 22| 51| 34|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|
| 20| 50| 35|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|
| 22| 52| 34|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|
| 22| 50| 34|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|
| 22| 51| 35|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|
| 21| 51| 33|Accelerometer-201...|Brush_teeth|     

In [192]:
# Create Vector from columns x,y,z
vec_assembler=ml.feature.VectorAssembler(inputCols=['x', 'y', 'z'], outputCol='features')
df_vec=vec_assembler.transform(df_enc)
df_vec.show()

+---+---+---+--------------------+-----------+----------+--------------+----------------+
|  x|  y|  z|              source|      class|classIndex|   classVector|        features|
+---+---+---+--------------------+-----------+----------+--------------+----------------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[22.0,49.0,35.0]|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[22.0,49.0,35.0]|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[22.0,52.0,35.0]|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[22.0,52.0,35.0]|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[21.0,52.0,34.0]|
| 22| 51| 34|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[22.0,51.0,34.0]|
| 20| 50| 35|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[20.0,50.0,35.0]|
| 22| 52| 34|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[22.0,52.0,34.0]|
| 22| 50| 

### Using Spark Pipelines

In [193]:
# Instead of conducting each step manually, gather steps into a pipeline
pipeline=ml.pipeline.Pipeline(stages=[indexer, encoder, vec_assembler])

In [194]:
model=pipeline.fit(df)

In [195]:
prediction=model.transform(df)

In [196]:
prediction.show()

+---+---+---+--------------------+-----------+----------+--------------+----------------+
|  x|  y|  z|              source|      class|classIndex|   classVector|        features|
+---+---+---+--------------------+-----------+----------+--------------+----------------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[22.0,49.0,35.0]|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[22.0,49.0,35.0]|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[22.0,52.0,35.0]|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[22.0,52.0,35.0]|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[21.0,52.0,34.0]|
| 22| 51| 34|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[22.0,51.0,34.0]|
| 20| 50| 35|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[20.0,50.0,35.0]|
| 22| 52| 34|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[22.0,52.0,34.0]|
| 22| 50| 

**-> Same result as with single steps**

In [197]:
df_train=prediction.drop('x', 'y', 'z', 'source', 'class', 'classIndex')
df_train.show()

+--------------+----------------+
|   classVector|        features|
+--------------+----------------+
|(13,[6],[1.0])|[22.0,49.0,35.0]|
|(13,[6],[1.0])|[22.0,49.0,35.0]|
|(13,[6],[1.0])|[22.0,52.0,35.0]|
|(13,[6],[1.0])|[22.0,52.0,35.0]|
|(13,[6],[1.0])|[21.0,52.0,34.0]|
|(13,[6],[1.0])|[22.0,51.0,34.0]|
|(13,[6],[1.0])|[20.0,50.0,35.0]|
|(13,[6],[1.0])|[22.0,52.0,34.0]|
|(13,[6],[1.0])|[22.0,50.0,34.0]|
|(13,[6],[1.0])|[22.0,51.0,35.0]|
|(13,[6],[1.0])|[21.0,51.0,33.0]|
|(13,[6],[1.0])|[20.0,50.0,34.0]|
|(13,[6],[1.0])|[21.0,49.0,33.0]|
|(13,[6],[1.0])|[21.0,49.0,33.0]|
|(13,[6],[1.0])|[20.0,51.0,35.0]|
|(13,[6],[1.0])|[18.0,49.0,34.0]|
|(13,[6],[1.0])|[19.0,48.0,34.0]|
|(13,[6],[1.0])|[16.0,53.0,34.0]|
|(13,[6],[1.0])|[18.0,52.0,35.0]|
|(13,[6],[1.0])|[18.0,51.0,32.0]|
+--------------+----------------+
only showing top 20 rows



### k-Means Clustering

In [198]:
kmeans=ml.clustering.KMeans(k=13, seed=1, featuresCol='features')
pipeline=ml.pipeline.Pipeline(stages=[vec_assembler, kmeans])
model=pipeline.fit(df)
prediction=model.transform(df)

In [39]:
prediction.show()

+---+---+---+--------------------+-----------+----------------+----------+
|  x|  y|  z|              source|      class|        features|prediction|
+---+---+---+--------------------+-----------+----------------+----------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|[22.0,49.0,35.0]|         7|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|[22.0,49.0,35.0]|         7|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|[22.0,52.0,35.0]|         7|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|[22.0,52.0,35.0]|         7|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|[21.0,52.0,34.0]|         7|
| 22| 51| 34|Accelerometer-201...|Brush_teeth|[22.0,51.0,34.0]|         7|
| 20| 50| 35|Accelerometer-201...|Brush_teeth|[20.0,50.0,35.0]|         7|
| 22| 52| 34|Accelerometer-201...|Brush_teeth|[22.0,52.0,34.0]|         7|
| 22| 50| 34|Accelerometer-201...|Brush_teeth|[22.0,50.0,34.0]|         7|
| 22| 51| 35|Accelerometer-201...|Brush_teeth|[22.0,51.0,35.0]|         7|
| 21| 51| 33|Acceleromete

In [40]:
# Evaluate
evaluator = ml.evaluation.ClusteringEvaluator()
evaluator.evaluate(prediction)

0.4153293521373778

In [46]:
# Evaluate for different values of k
eval_results=[]
for k in range(2,20):
    kmeans=ml.clustering.KMeans(k=k, seed=1, featuresCol='features')
    pipeline=ml.pipeline.Pipeline(stages=[vec_assembler, kmeans])
    model=pipeline.fit(df)
    prediction=model.transform(df)
    evaluator = ml.evaluation.ClusteringEvaluator()
    eval_results.append((k, evaluator.evaluate(prediction)))

In [47]:
eval_results

[(2, 0.6875664014387497),
 (3, 0.6147915951361759),
 (4, 0.6333227654128869),
 (5, 0.5937447997439024),
 (6, 0.592463658820136),
 (7, 0.5484627422401509),
 (8, 0.46686489256383346),
 (9, 0.48034893889849645),
 (10, 0.47370428136987536),
 (11, 0.4819049717562352),
 (12, 0.40964155503229643),
 (13, 0.4153293521373778),
 (14, 0.41244594513295846),
 (15, 0.41771495579360896),
 (16, 0.39594610810727193),
 (17, 0.40512075095291467),
 (18, 0.4058090075137995),
 (19, 0.3794290531790819)]

### Linear Regression with Spark ML

In [50]:
df_ad = spark.read.csv('../Stat_Learning/data/Advertising.csv', header=True, inferSchema=True)
df_ad.show(5)

+---+-----+-----+---------+-----+
|_c0|   TV|Radio|Newspaper|Sales|
+---+-----+-----+---------+-----+
|  1|230.1| 37.8|     69.2| 22.1|
|  2| 44.5| 39.3|     45.1| 10.4|
|  3| 17.2| 45.9|     69.3|  9.3|
|  4|151.5| 41.3|     58.5| 18.5|
|  5|180.8| 10.8|     58.4| 12.9|
+---+-----+-----+---------+-----+
only showing top 5 rows



In [52]:
df_ad=df_ad.drop('_c0')
df_ad.show(5)

+-----+-----+---------+-----+
|   TV|Radio|Newspaper|Sales|
+-----+-----+---------+-----+
|230.1| 37.8|     69.2| 22.1|
| 44.5| 39.3|     45.1| 10.4|
| 17.2| 45.9|     69.3|  9.3|
|151.5| 41.3|     58.5| 18.5|
|180.8| 10.8|     58.4| 12.9|
+-----+-----+---------+-----+
only showing top 5 rows



In [93]:
# Gather features in vector
vec_assembler_ad=ml.feature.VectorAssembler(inputCols=['TV', 'Radio', 'Newspaper'], outputCol='features')

In [65]:
# Linear Regression Model in Spark
lr=ml.regression.LinearRegression(labelCol='Sales')

In [94]:
# Create pipeline
pipeline=ml.pipeline.Pipeline(stages=[vec_assembler_ad, lr])

In [95]:
# Fit Pipeline Model
model=pipeline.fit(df_ad)

In [96]:
# Access individual fitted models
model.stages

[VectorAssembler_293fdd7f8d71, LinearRegression_6f497c8560f9]

In [97]:
# LR coefficients
model.stages[-1].coefficients

DenseVector([0.0458, 0.1885, -0.001])

In [85]:
# LR intercept
model.stages[-1].intercept

2.9388893694594134

In [70]:
# Predictions
predictions=model.transform(df_ad)
predictions.show()

+-----+-----+---------+-----+-----------------+------------------+
|   TV|Radio|Newspaper|Sales|         features|        prediction|
+-----+-----+---------+-----+-----------------+------------------+
|230.1| 37.8|     69.2| 22.1|[230.1,37.8,69.2]| 20.52397440971517|
| 44.5| 39.3|     45.1| 10.4| [44.5,39.3,45.1]|12.337854820894362|
| 17.2| 45.9|     69.3|  9.3| [17.2,45.9,69.3]|12.307670779994238|
|151.5| 41.3|     58.5| 18.5|[151.5,41.3,58.5]| 17.59782951168913|
|180.8| 10.8|     58.4| 12.9|[180.8,10.8,58.4]|13.188671856831299|
|  8.7| 48.9|     75.0|  7.2|  [8.7,48.9,75.0]|12.478347634035858|
| 57.5| 32.8|     23.5| 11.8| [57.5,32.8,23.5]|11.729759951563684|
|120.2| 19.6|     11.6| 13.2|[120.2,19.6,11.6]| 12.12295316550228|
|  8.6|  2.1|      1.0|  4.8|    [8.6,2.1,1.0]| 3.727340862861585|
|199.8|  2.6|     21.2| 10.6| [199.8,2.6,21.2]|12.550848722934685|
| 66.1|  5.8|     24.2|  8.6|  [66.1,5.8,24.2]| 7.032299200558857|
|214.7| 24.0|      4.0| 17.4| [214.7,24.0,4.0]| 17.28512918260

In [86]:
# R2
model.stages[-1].summary.r2

0.897210638178952

### Logistic Regression

In [199]:
# Accelerometer Dataset
df = spark.read.parquet('hmp.parquet')
df.show()

+---+---+---+--------------------+-----------+
|  x|  y|  z|              source|      class|
+---+---+---+--------------------+-----------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|
| 22| 51| 34|Accelerometer-201...|Brush_teeth|
| 20| 50| 35|Accelerometer-201...|Brush_teeth|
| 22| 52| 34|Accelerometer-201...|Brush_teeth|
| 22| 50| 34|Accelerometer-201...|Brush_teeth|
| 22| 51| 35|Accelerometer-201...|Brush_teeth|
| 21| 51| 33|Accelerometer-201...|Brush_teeth|
| 20| 50| 34|Accelerometer-201...|Brush_teeth|
| 21| 49| 33|Accelerometer-201...|Brush_teeth|
| 21| 49| 33|Accelerometer-201...|Brush_teeth|
| 20| 51| 35|Accelerometer-201...|Brush_teeth|
| 18| 49| 34|Accelerometer-201...|Brush_teeth|
| 19| 48| 34|Accelerometer-201...|Brush_teeth|
| 16| 53| 34|Accelerometer-201...|Brush_teeth|
| 18| 52| 35|

In [200]:
# Logistic Regression on Accelerometer Dataset
logr=ml.classification.LogisticRegression(labelCol='classIndex')
pipeline=ml.pipeline.Pipeline(stages=[indexer, vec_assembler, logr])
model=pipeline.fit(df)

In [201]:
prediction=model.transform(df)
prediction.toPandas()

Unnamed: 0,x,y,z,source,class,classIndex,features,rawPrediction,probability,prediction
0,22,49,35,Accelerometer-2011-04-11-13-28-18-brush_teeth-...,Brush_teeth,6.0,"[22.0, 49.0, 35.0]","[1.694142296063741, 0.48981129401865875, -0.81...","[0.18172223142384303, 0.05449714517748826, 0.0...",6.0
1,22,49,35,Accelerometer-2011-04-11-13-28-18-brush_teeth-...,Brush_teeth,6.0,"[22.0, 49.0, 35.0]","[1.694142296063741, 0.48981129401865875, -0.81...","[0.18172223142384303, 0.05449714517748826, 0.0...",6.0
2,22,52,35,Accelerometer-2011-04-11-13-28-18-brush_teeth-...,Brush_teeth,6.0,"[22.0, 52.0, 35.0]","[1.6136877353802426, 0.23804621106274215, -0.8...","[0.13583200747809968, 0.03432168460870159, 0.0...",6.0
3,22,52,35,Accelerometer-2011-04-11-13-28-18-brush_teeth-...,Brush_teeth,6.0,"[22.0, 52.0, 35.0]","[1.6136877353802426, 0.23804621106274215, -0.8...","[0.13583200747809968, 0.03432168460870159, 0.0...",6.0
4,21,52,34,Accelerometer-2011-04-11-13-28-18-brush_teeth-...,Brush_teeth,6.0,"[21.0, 52.0, 34.0]","[1.8235840570079764, 0.2901075033198124, -1.00...","[0.1494040462578127, 0.03223902908738586, 0.00...",6.0
...,...,...,...,...,...,...,...,...,...,...
446524,41,35,51,Accelerometer-2012-06-11-11-39-29-walk-m1.txt,Walk,0.0,"[41.0, 35.0, 51.0]","[-1.6404568327269402, 0.7277860143050034, 2.38...","[0.005422594863362025, 0.05790578568106644, 0....",2.0
446525,40,35,52,Accelerometer-2012-06-11-11-39-29-walk-m1.txt,Walk,0.0,"[40.0, 35.0, 52.0]","[-1.6158780881747177, 0.7450236047555885, 2.35...","[0.005543681668093255, 0.058765828263808176, 0...",2.0
446526,39,37,51,Accelerometer-2012-06-11-11-39-29-walk-m1.txt,Walk,0.0,"[39.0, 37.0, 51.0]","[-1.4596181403359836, 0.6292415083753804, 2.17...","[0.007725074908689002, 0.0623853934324205, 0.2...",2.0
446527,39,37,53,Accelerometer-2012-06-11-11-39-29-walk-m1.txt,Walk,0.0,"[39.0, 37.0, 53.0]","[-1.6449357174114922, 0.5944178065688952, 2.32...","[0.00558038809234513, 0.05238455775458134, 0.2...",2.0


In [202]:
# Evaluate
evaluator=ml.evaluation.MulticlassClassificationEvaluator(predictionCol='prediction', 
                                                          labelCol='classIndex', 
                                                          metricName='f1')
evaluator.evaluate(prediction)

0.28317855513437706

In [203]:
# With train test split
df_train, df_test = df.randomSplit([0.8, 0.2])
model2=pipeline.fit(df_train)
prediction2=model2.transform(df_test)
evaluator.evaluate(prediction2)

0.2844002439404077