# Install Spark

In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.0.0-preview2/spark-3.0.0-preview2-bin-hadoop2.7.tgz
!tar xf spark-3.0.0-preview2-bin-hadoop2.7.tgz
!pip install -q findspark

In [None]:
!pip install pyspark==2.4.5



In [None]:
# List the working directory (presumably to check that the Spark archive was
# downloaded and unpacked).
!ls

hmp.parquet			    spark-3.0.0-preview2-bin-hadoop2.7.tgz
sample_data			    spark-3.0.0-preview2-bin-hadoop2.7.tgz.1
spark-3.0.0-preview2-bin-hadoop2.7  spark-3.0.0-preview2-bin-hadoop2.7.tgz.2


In [None]:
import os

# Point JAVA_HOME and SPARK_HOME at the Java 8 JDK and the unpacked Spark
# distribution for this process and its children.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.0.0-preview2-bin-hadoop2.7",
    }
)

In [None]:
import findspark
# Keep init() ahead of the pyspark import below.
# NOTE(review): presumably init() makes the Spark install referenced by
# SPARK_HOME importable — confirm against the findspark documentation.
findspark.init()
from pyspark.sql import SparkSession

# Left disabled: the SparkSession is created further down in this notebook.
# spark = SparkSession.builder.master("local[*]").getOrCreate()

In [None]:
# Check that PySpark can be imported and show a hint when it cannot.
try:
    from pyspark import SparkContext, SparkConf
except ImportError:
    # Use the built-in print: no `printmd` helper is defined in the visible
    # notebook, so calling one here would raise NameError and hide the hint.
    print('<<<<<!!!!! Please restart your kernel after installing Apache Spark !!!!!>>>>>')

In [None]:
# Reuse the active SparkContext if there is one; otherwise start one with the
# "local[*]" master.
sc = SparkContext.getOrCreate(conf=SparkConf().setMaster("local[*]"))

# Obtain the SparkSession (getOrCreate returns an existing session if present).
spark = SparkSession.builder.getOrCreate()

## Exercise

In [None]:
# download the file containing the data in PARQUET format
!wget https://github.com/IBM/coursera/raw/master/hmp.parquet
    
# create a dataframe out of it
df = spark.read.parquet('hmp.parquet')

# register a corresponding query table
df.createOrReplaceTempView('df')

--2020-07-15 04:15:44--  https://github.com/IBM/coursera/raw/master/hmp.parquet
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://github.com/IBM/skillsnetwork/raw/master/hmp.parquet [following]
--2020-07-15 04:15:44--  https://github.com/IBM/skillsnetwork/raw/master/hmp.parquet
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/IBM/skillsnetwork/master/hmp.parquet [following]
--2020-07-15 04:15:44--  https://raw.githubusercontent.com/IBM/skillsnetwork/master/hmp.parquet
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 932997 (91

In [None]:
# Preview the raw rows; n=20 and truncate=True are show()'s defaults, written out.
df.show(n=20, truncate=True)

+---+---+---+--------------------+-----------+
|  x|  y|  z|              source|      class|
+---+---+---+--------------------+-----------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|
| 22| 51| 34|Accelerometer-201...|Brush_teeth|
| 20| 50| 35|Accelerometer-201...|Brush_teeth|
| 22| 52| 34|Accelerometer-201...|Brush_teeth|
| 22| 50| 34|Accelerometer-201...|Brush_teeth|
| 22| 51| 35|Accelerometer-201...|Brush_teeth|
| 21| 51| 33|Accelerometer-201...|Brush_teeth|
| 20| 50| 34|Accelerometer-201...|Brush_teeth|
| 21| 49| 33|Accelerometer-201...|Brush_teeth|
| 21| 49| 33|Accelerometer-201...|Brush_teeth|
| 20| 51| 35|Accelerometer-201...|Brush_teeth|
| 18| 49| 34|Accelerometer-201...|Brush_teeth|
| 19| 48| 34|Accelerometer-201...|Brush_teeth|
| 16| 53| 34|Accelerometer-201...|Brush_teeth|
| 18| 52| 35|

In [None]:
# For every class, compute sqrt(sum(x^2) + sum(y^2) + sum(z^2)) over all rows of
# that class and expose it as "label"; the result has one row per class.
df_energy = spark.sql("""
select sqrt(sum(x*x)+sum(y*y)+sum(z*z)) as label, class from df group by class
""")      
# Register the aggregate as the temp view "df_energy" for use in later SQL.
df_energy.createOrReplaceTempView('df_energy')  

In [None]:
# Display the per-class aggregate; n=20 and truncate=True are show()'s defaults.
df_energy.show(n=20, truncate=True)

+------------------+--------------+
|             label|         class|
+------------------+--------------+
| 11785.39634462923|   Brush_teeth|
|11082.626493751379|  Climb_stairs|
|10616.408809008817|     Comb_hair|
| 7173.493500380411|Descend_stairs|
|15003.269043778426|   Drink_glass|
| 12542.96539897962|      Eat_meat|
| 6071.460120926432|      Eat_soup|
|13225.945637269193|     Getup_bed|
|6783.4063714331605|   Liedown_bed|
|14454.885091207056|    Pour_water|
|10261.338314274606| Sitdown_chair|
| 9737.511232342687| Standup_chair|
| 8959.680239829991| Use_telephone|
|16537.370891408344|          Walk|
+------------------+--------------+



In [None]:
# Attach each class's aggregate "label" to every row of df. Selecting df.* plus
# the label (instead of *) keeps a single "class" column; with * the "class"
# column of both sides lands in the result, which makes any later reference to
# "class" ambiguous.
df_join = spark.sql('select df.*, df_energy.label from df inner join df_energy on df.class = df_energy.class')

In [None]:
# Preview the joined rows; n=20 and truncate=True are show()'s defaults.
df_join.show(n=20, truncate=True)

+---+---+---+--------------------+-----------+-----------------+-----------+
|  x|  y|  z|              source|      class|            label|      class|
+---+---+---+--------------------+-----------+-----------------+-----------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|
| 22| 51| 34|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|
| 20| 50| 35|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|
| 22| 52| 34|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|
| 22| 50| 34|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|
| 22| 51| 35|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|

In [None]:
from pyspark.ml.feature import Normalizer, VectorAssembler

# Pack the x, y and z columns into a single vector column named "features".
vectorAssembler = VectorAssembler(
    inputCols=["x", "y", "z"],
    outputCol="features",
)

# Normalize each "features" vector with the p-norm for p=1.0 and write the
# result to "features_norm".
normalizer = Normalizer(
    inputCol="features",
    outputCol="features_norm",
    p=1.0,
)

In [None]:
from pyspark.ml.regression import LinearRegression

# featuresCol and labelCol are set to their default values, written out to make
# the model's inputs explicit.
# NOTE(review): the regression reads the raw "features" column, not
# "features_norm" — confirm that is intended.
lr = LinearRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

In [None]:
from pyspark.ml import Pipeline

# Chain the stages in execution order: assemble -> normalize -> regress.
pipeline = Pipeline(
    stages=[
        vectorAssembler,
        normalizer,
        lr,
    ]
)

In [None]:
# Fit every pipeline stage on the joined data.
model = pipeline.fit(dataset=df_join)

In [None]:
# Apply the fitted pipeline to the same data it was trained on.
prediction = model.transform(dataset=df_join)

In [None]:
# Preview the predictions; n=20 and truncate=True are show()'s defaults.
prediction.show(n=20, truncate=True)

+---+---+---+--------------------+-----------+-----------------+-----------+----------------+--------------------+------------------+
|  x|  y|  z|              source|      class|            label|      class|        features|       features_norm|        prediction|
+---+---+---+--------------------+-----------+-----------------+-----------+----------------+--------------------+------------------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|[22.0,49.0,35.0]|[0.20754716981132...|12586.729735016828|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|[22.0,49.0,35.0]|[0.20754716981132...|12586.729735016828|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|[22.0,52.0,35.0]|[0.20183486238532...|12542.703337345756|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|[22.0,52.0,35.0]|[0.20183486238532...|12542.703337345756|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|11785.3963446292

In [None]:
# R^2 from the training summary of the last pipeline stage (the fitted
# linear regression; the pipeline has three stages, so [-1] is stage 2).
model.stages[-1].summary.r2

0.03259100556263628