# Install Spark

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz
!tar xf spark-3.0.0-bin-hadoop2.7.tgz
!pip install -q findspark

In [2]:
!pip install pyspark==2.4.5



In [3]:
# List the working directory to confirm the Spark tarball was downloaded and unpacked.
!ls

sample_data  spark-3.0.0-bin-hadoop2.7	spark-3.0.0-bin-hadoop2.7.tgz


In [3]:
import os
# Export the Java and Spark install locations in one step; findspark and the
# Spark launcher read these from the process environment.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.0.0-bin-hadoop2.7",
})

In [4]:
import findspark
# Must run before any pyspark import: init() makes the Spark installation
# referenced by SPARK_HOME importable from this kernel.
findspark.init()
from pyspark.sql import SparkSession

In [5]:
try:
    from pyspark import SparkContext, SparkConf
except ImportError:
    # `printmd` is not defined anywhere in this notebook, so calling it here
    # raised a NameError that hid the real problem. Use plain print instead.
    print('<<<<<!!!!! Please restart your kernel after installing Apache Spark !!!!!>>>>>')
    # Re-raise so the failure stops here rather than surfacing in the next cell
    # as a confusing NameError on SparkContext.
    raise

In [6]:
# Reuse the running SparkContext if there is one; otherwise start one with
# "local[*]" as the master URL.
sc = SparkContext.getOrCreate(
    conf=SparkConf().setMaster("local[*]"),
)

# Obtain (or lazily create) the SparkSession used for DataFrame and SQL work.
spark = SparkSession.builder.getOrCreate()

## Exercise

In [7]:
# delete files from previous runs
!rm -f hmp.parquet*

# download the file containing the data in PARQUET format
!wget https://github.com/IBM/coursera/raw/master/hmp.parquet
    
# create a dataframe out of it
df = spark.read.parquet('hmp.parquet')

# register a corresponding query table
df.createOrReplaceTempView('df')

--2020-07-22 05:03:11--  https://github.com/IBM/coursera/raw/master/hmp.parquet
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://github.com/IBM/skillsnetwork/raw/master/hmp.parquet [following]
--2020-07-22 05:03:11--  https://github.com/IBM/skillsnetwork/raw/master/hmp.parquet
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/IBM/skillsnetwork/master/hmp.parquet [following]
--2020-07-22 05:03:11--  https://raw.githubusercontent.com/IBM/skillsnetwork/master/hmp.parquet
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 932997 (91

In [8]:
# Preview the first rows of the loaded data (columns: x, y, z, source, class).
df.show()

+---+---+---+--------------------+-----------+
|  x|  y|  z|              source|      class|
+---+---+---+--------------------+-----------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|
| 22| 51| 34|Accelerometer-201...|Brush_teeth|
| 20| 50| 35|Accelerometer-201...|Brush_teeth|
| 22| 52| 34|Accelerometer-201...|Brush_teeth|
| 22| 50| 34|Accelerometer-201...|Brush_teeth|
| 22| 51| 35|Accelerometer-201...|Brush_teeth|
| 21| 51| 33|Accelerometer-201...|Brush_teeth|
| 20| 50| 34|Accelerometer-201...|Brush_teeth|
| 21| 49| 33|Accelerometer-201...|Brush_teeth|
| 21| 49| 33|Accelerometer-201...|Brush_teeth|
| 20| 51| 35|Accelerometer-201...|Brush_teeth|
| 18| 49| 34|Accelerometer-201...|Brush_teeth|
| 19| 48| 34|Accelerometer-201...|Brush_teeth|
| 16| 53| 34|Accelerometer-201...|Brush_teeth|
| 18| 52| 35|

In [9]:
# Restrict the data to the two activity classes used for the classifier below.
df_two_class = spark.sql(
    "select * from df "
    "where class in ('Use_telephone','Standup_chair')"
)

In [10]:
# Preview the filtered rows to check that the class filter was applied.
df_two_class.show()

+---+---+---+--------------------+-------------+
|  x|  y|  z|              source|        class|
+---+---+---+--------------------+-------------+
| 30| 40| 51|Accelerometer-201...|Standup_chair|
| 30| 41| 51|Accelerometer-201...|Standup_chair|
| 31| 41| 51|Accelerometer-201...|Standup_chair|
| 29| 42| 51|Accelerometer-201...|Standup_chair|
| 30| 43| 52|Accelerometer-201...|Standup_chair|
| 30| 40| 52|Accelerometer-201...|Standup_chair|
| 31| 41| 52|Accelerometer-201...|Standup_chair|
| 32| 39| 52|Accelerometer-201...|Standup_chair|
| 29| 38| 52|Accelerometer-201...|Standup_chair|
| 29| 38| 50|Accelerometer-201...|Standup_chair|
| 28| 40| 50|Accelerometer-201...|Standup_chair|
| 31| 38| 51|Accelerometer-201...|Standup_chair|
| 30| 39| 51|Accelerometer-201...|Standup_chair|
| 30| 39| 50|Accelerometer-201...|Standup_chair|
| 31| 39| 51|Accelerometer-201...|Standup_chair|
| 30| 38| 52|Accelerometer-201...|Standup_chair|
| 29| 39| 53|Accelerometer-201...|Standup_chair|
| 31| 38| 52|Acceler

In [11]:
# 80/20 train/test split. Without a seed the split differs on every run, which
# makes the accuracy figures below impossible to reproduce; fix it explicitly.
splits = df_two_class.randomSplit([0.8, 0.2], seed=42)
df_train, df_test = splits

In [12]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import MinMaxScaler


# Stage 1: encode the string column "class" as a numeric "label" column.
indexer = StringIndexer(
    inputCol="class",
    outputCol="label",
)

# Stage 2: pack the three accelerometer axes into one "features" vector.
vectorAssembler = VectorAssembler(
    inputCols=["x", "y", "z"],
    outputCol="features",
)

# Stage 3: min-max rescale "features" into "features_norm".
normalizer = MinMaxScaler(
    inputCol="features",
    outputCol="features_norm",
)

In [13]:
from pyspark.ml.classification import GBTClassifier

# Gradient-boosted tree classifier trained on the rescaled features,
# limited to 8 boosting iterations.
gbt = GBTClassifier(
    featuresCol="features_norm",
    labelCol="label",
    maxIter=8,
)

In [14]:
from pyspark.ml import Pipeline
# Chain the stages in execution order: label indexing, feature assembly,
# rescaling, and finally the classifier.
pipeline = Pipeline(
    stages=[
        indexer,
        vectorAssembler,
        normalizer,
        gbt,
    ]
)

In [15]:
# Fit every pipeline stage on the training split only.
model = pipeline.fit(df_train)

In [16]:
# Score the training split with the fitted pipeline (in-sample predictions).
prediction = model.transform(df_train)

In [17]:
# Inspect the columns added by the pipeline (label, features, features_norm,
# rawPrediction, probability, prediction).
prediction.show()

+---+---+---+--------------------+-------------+-----+---------------+--------------------+--------------------+--------------------+----------+
|  x|  y|  z|              source|        class|label|       features|       features_norm|       rawPrediction|         probability|prediction|
+---+---+---+--------------------+-------------+-----+---------------+--------------------+--------------------+--------------------+----------+
|  0| 30| 24|Accelerometer-201...|Standup_chair|  0.0|[0.0,30.0,24.0]|[0.0,0.3653846153...|[1.26132470993969...|[0.92571445365143...|       0.0|
|  0| 31| 17|Accelerometer-201...|Standup_chair|  0.0|[0.0,31.0,17.0]|[0.0,0.3846153846...|[1.26132470993969...|[0.92571445365143...|       0.0|
|  0| 31| 30|Accelerometer-201...|Standup_chair|  0.0|[0.0,31.0,30.0]|[0.0,0.3846153846...|[1.26132470993969...|[0.92571445365143...|       0.0|
|  0| 31| 32|Accelerometer-201...|Standup_chair|  0.0|[0.0,31.0,32.0]|[0.0,0.3846153846...|[1.26132470993969...|[0.92571445365143.

In [18]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Accuracy evaluator comparing the "prediction" column against "label".
binEval = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName="accuracy",
)

# Accuracy of the predictions currently bound to `prediction`.
binEval.evaluate(prediction)

0.9104927821724276

In [19]:
# Score the held-out test split. NOTE: this rebinds `prediction`, replacing the
# training-set predictions computed earlier.
prediction = model.transform(df_test)

In [20]:
# Accuracy on the test split (`prediction` now holds the df_test output).
binEval.evaluate(prediction) 

0.9039617318778363