In [None]:
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# 2022: Docker with Jupyter Notebook and Apache Spark 

### Pull the appropriate Docker image from Docker Hub

`docker pull jupyter/pyspark-notebook`

### Launch Docker Container

`docker run -it --rm -p 8888:8888 -v "${PWD}:/home/jovyan/work" jupyter/pyspark-notebook`




# Estimating Pi value with Monte Carlo simulation using Spark

Based on https://docs.cloudera.com/machine-learning/cloud/spark/topics/ml-example--montecarlo-estimation.html

In [None]:
# Estimating $\pi$
# This PySpark example shows you how to estimate $\pi$ in parallel
# using Monte Carlo integration.

import sys
from random import random
from operator import add

# import findspark
# findspark.find()
# findspark.init()

def parse_partitions(argv, default=2):
    """Return the number of partitions requested on the command line.

    Args:
        argv: Argument list in ``sys.argv`` layout (program name first).
        default: Value used when no usable partition count is supplied.

    Returns:
        ``int(argv[1])`` when that argument exists and is a positive integer,
        otherwise ``default``.
    """
    if len(argv) > 1:
        try:
            requested = int(argv[1])
        except ValueError:
            # Inside a Jupyter kernel argv[1] is a launcher flag such as "-f",
            # not a number, so fall back instead of crashing.
            return default
        # A zero or negative count would later give n == 0 and a division by zero.
        if requested > 0:
            return requested
    return default


print(sys.argv)
partitions = parse_partitions(sys.argv)

import pyspark

# Report which PySpark release the kernel picked up.
pyspark_version = pyspark.__version__
print("PySpark Version: " + pyspark_version)

In [None]:
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Only the application name is configured; every other setting keeps its default.
conf = SparkConf().setAppName('my-pyspark')

# NOTE: despite its name, `sc` is a SparkSession, not a SparkContext.
# The underlying context is reached through `sc.sparkContext`.
session_builder = SparkSession.builder.config(conf=conf)
sc = session_builder.getOrCreate()

print("Spark Version: " + sc.version)

In [None]:
# Number of random points drawn per partition. Raise it (e.g. 10000, 100000
# or more) to get a better estimate of pi.
# Be aware of computational power and runtime !!!
SAMPLES_PER_PARTITION = 5000

# Total number of points sampled across all partitions.
n = SAMPLES_PER_PARTITION * partitions

def f(_):
    """Sample one random point and report whether it hits the unit circle.

    Two coordinates are drawn independently, each as ``random() * 2 - 1``.
    The argument is ignored; it only lets the function be mapped over a
    sequence of sample indices.

    Returns:
        1 if ``x ** 2 + y ** 2 < 1`` for the sampled point, otherwise 0.
    """
    x, y = (random() * 2 - 1 for _axis in range(2))
    inside_unit_circle = x ** 2 + y ** 2 < 1
    return int(inside_unit_circle)

# `sc` is a SparkSession, so the associated SparkContext is taken from it.
spark_context = sc.sparkContext

# Distribute the n sample indices over `partitions` slices, score each index
# with f (1 = inside the circle, 0 = outside) and add up the hits.
sample_indices = spark_context.parallelize(range(1, n + 1), partitions)
count = sample_indices.map(f).reduce(add)

# The hit fraction approximates pi / 4 (circle area over square area).
pi_estimate = 4.0 * count / n
print("Pi is roughly %f" % pi_estimate)

In [None]:
# Shut down the SparkSession bound to `sc` now that the estimate has been printed.
sc.stop()

## 2021: Local Setting Apache Spark with jupyter notebook 

#### Install the right version of Java and Scala before installing Spark !!!

Spark runs on both Windows and UNIX-like systems (e.g. Linux, Mac OS), and it should run on any platform that runs a supported version of Java. This should include JVMs on x86_64 and ARM64. It’s easy to run locally on one machine — all you need is to have Java installed on your system PATH, or the JAVA_HOME environment variable pointing to a Java installation.

### !!! Important !!! Spark runs on Java 8/11, Scala 2.12, and Python 3.6+ 

### Configure Spark with jupyter notebook
Reference: https://www.javacodemonk.com/installing-pyspark-with-jupyter-notebook-on-ubuntu-18-04-lts-31cd3781

**Install pyspark**
```
$ pip install pyspark findspark
```

**Configure environment using Bash profile**
```
# Add the following lines to ~/.bashrc
export SPARK_HOME=/home/<username>/<path-to-your-venv>/<python-version>/site-packages/pyspark/
export PYSPARK_DRIVER_PYTHON=jupyter
export PYSPARK_DRIVER_PYTHON_OPTS='notebook'
```

**Reload**
```
$ source ~/.bashrc
```

**Start Jupyter notebook from command line with pyspark**
```
pyspark
```