### Spark notebook ###

This notebook will only work in a Jupyter session running on `mathmadslinux2p`.

You can start your own Jupyter session on `mathmadslinux2p` and open this notebook in Chrome on the MADS Windows server by following the steps below.

**Steps**

1. Login to the MADS Windows server using https://mathportal.canterbury.ac.nz/.
2. Download or copy this notebook to your home directory.
3. Open PowerShell and run `ssh mathmadslinux2p`.
4. Run `start_pyspark_notebook` or `/opt/anaconda3/bin/jupyter-notebook --ip 132.181.129.68 --port $((8000 + $((RANDOM % 999))))`.
5. Copy and paste the URL provided in the shell window into Chrome on the MADS Windows server.
6. Open the notebook from the Jupyter root directory (which is your home directory).
7. Run `start_spark()` to start a spark session in the notebook.
8. Run `stop_spark()` before closing the notebook or kill your spark application by hand using the link in the Spark UI.

In [1]:
# Run this cell to import pyspark and to define start_spark() and stop_spark()

import findspark

findspark.init()

import getpass
import pandas
import pyspark
import random
import re

from IPython.display import display, HTML
from pyspark import SparkContext
from pyspark.sql import SparkSession


# Functions used below

def username():
    """Return the current login name, dropping an `@domain` suffix if one is present.
    """

    login = getpass.getuser()
    domain_suffix = re.compile('@.*')

    return domain_suffix.sub('', login)


def dict_to_html(d):
    """Convert a Python dictionary into a two column HTML table for display.

    Keys and values are converted to text and HTML-escaped, so entries that contain
    characters such as `<`, `>` or `&` are shown literally instead of being interpreted
    as markup and breaking the table.

    Args:
        d (dict): mapping to render; each item becomes one table row

    Returns:
        str: the HTML source of the table
    """

    # Imported locally because this file also uses `html` as a variable name.
    from html import escape

    rows = [
        f'<tr><td style="text-align:left;">{escape(str(k))}</td><td>{escape(str(v))}</td></tr>'
        for k, v in d.items()
    ]

    return ''.join([
        '<table width="100%" style="width:100%; font-family: monospace;">',
        *rows,
        '</table>',
    ])


def show_as_html(df, n=20):
    """Show the first rows of a spark dataframe using the pandas jupyter integration.

    Args:
        df: spark dataframe to preview
        n (int): number of rows to show (default: 20)
    """

    preview = df.limit(n)
    as_pandas = preview.toPandas()
    display(as_pandas)

    
def display_spark():
    """Show the state of the notebook's Spark session as HTML.

    The session counts as active when both the `spark` and `sc` globals are defined. In that
    case the output names the application, links to the Spark UI and the application UI, and
    lists the configuration of the spark context. Otherwise the session is reported as stopped.
    """

    spark_ui_item = '<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>'

    if 'spark' not in globals() or 'sc' not in globals():
        # No session defined: point the user at the completed applications list instead
        app_name = username() + " (jupyter)"
        parts = [
            '<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{app_name}</code> is under the completed applications section in the Spark UI.</p>',
            '<ul>',
            spark_ui_item,
            '</ul>',
        ]
        display(HTML(''.join(parts)))
        return

    conf = sc.getConf()
    app_name = conf.get("spark.app.name")

    parts = [
        '<p><b>Spark</b></p>',
        f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{app_name}</code> under the running applications section in the Spark UI.</p>',
        '<ul>',
        spark_ui_item,
        f'<li><a href="{sc.uiWebUrl}" target="_blank">Spark Application UI</a></li>',
        '</ul>',
        '<p><b>Config</b></p>',
        dict_to_html(dict(conf.getAll())),
        '<p><b>Notes</b></p>',
        '<ul>',
        '<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>',
        f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{app_name}</code> by hand using the link in the Spark UI.</li>',
        '</ul>',
    ]
    display(HTML(''.join(parts)))


# Functions to start and stop spark

def _spark_memory(gigabytes):
    """Format a memory size given in gigabytes as a Spark memory string.

    Whole numbers keep the gigabyte suffix (4 -> "4g"); fractional sizes are converted
    to whole megabytes (1.5 -> "1536m") because Spark does not accept fractional values
    in memory strings.
    """

    size = float(gigabytes)
    if size.is_integer():
        return f"{int(size)}g"
    return f"{round(size * 1024)}m"


def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).

    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): memory per executor in gigabytes (default: 1)
        master_memory (float): driver memory in gigabytes (default: 1)
    """

    global spark
    global sc

    user = username()

    # Total cores requested across all executors; shuffle partitions are set to 4 per core
    cores = executor_instances * executor_cores
    partitions = cores * 4
    # Random UI port in the range 4001-4999
    port = 4000 + random.randint(1, 999)

    spark = (
        SparkSession.builder
        .master("spark://masternode2:7077")
        .config("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{user}/spark/")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.executor.instances", str(executor_instances))
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.cores.max", str(cores))
        .config("spark.executor.memory", _spark_memory(worker_memory))
        .config("spark.driver.memory", _spark_memory(master_memory))
        .config("spark.driver.maxResultSize", "0")
        .config("spark.sql.shuffle.partitions", str(partitions))
        .config("spark.ui.port", str(port))
        .appName(user + " (jupyter)")
        .getOrCreate()
    )
    sc = SparkContext.getOrCreate()

    display_spark()

    
def stop_spark():
    """Shut down the active Spark session, if there is one, and display the resulting status.

    The SparkSession (spark) and SparkContext (sc) globals are deleted after the session stops.
    """

    global spark, sc

    session_defined = 'spark' in globals() and 'sc' in globals()
    if session_defined:
        spark.stop()
        del spark, sc

    display_spark()


# Inject css to improve spark output readability: no wrapping inside <pre> blocks or
# dataframe cells, and the dataframe index column is hidden

html = ['<style>']
html.extend([
    'pre { white-space: pre !important; }',
    'table.dataframe td { white-space: nowrap !important; }',
    'table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }',
])
html.append('</style>')
display(HTML(''.join(html)))

In [2]:
# Run this cell to start a spark session in this notebook
# (4 executors with 2 cores and 4 GB each, 4 GB for the driver)

start_spark(
    executor_instances=4,
    executor_cores=2,
    worker_memory=4,
    master_memory=4,
)

0,1
spark.app.name,kda115 (jupyter)
spark.dynamicAllocation.enabled,false
spark.executor.instances,4
spark.driver.memory,4g
spark.executor.memory,4g
spark.master,spark://masternode2:7077
spark.executor.id,driver
spark.executor.cores,2
spark.driver.host,mathmadslinux2p.canterbury.ac.nz
spark.sql.shuffle.partitions,32


In [3]:
# Import the pyspark functions API and the data types used to define schemas
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql import Row

In [4]:
# Schema of the daily data: every column is a nullable string except the integer VALUE
schema_daily = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("Station_ID", StringType()),
        ("DATE", StringType()),
        ("Element", StringType()),
        ("VALUE", IntegerType()),
        ("Measurement_Flag", StringType()),
        ("Quality_Flag", StringType()),
        ("Source_Flag", StringType()),
        ("Observation_Time", StringType()),
    )
])

In [5]:
# Load the daily data from HDFS with the explicit schema (no header row, no type inference)
daily_data = (
    spark.read
    .format('csv')
    .option('header', False)
    .option('inferSchema', False)
    .schema(schema_daily)
    .load('hdfs:///data/ghcnd/daily')
)

# Check the schema and preview the first rows
daily_data.printSchema()
show_as_html(daily_data, 10)

root
 |-- Station_ID: string (nullable = true)
 |-- DATE: string (nullable = true)
 |-- Element: string (nullable = true)
 |-- VALUE: integer (nullable = true)
 |-- Measurement_Flag: string (nullable = true)
 |-- Quality_Flag: string (nullable = true)
 |-- Source_Flag: string (nullable = true)
 |-- Observation_Time: string (nullable = true)



Unnamed: 0,Station_ID,DATE,Element,VALUE,Measurement_Flag,Quality_Flag,Source_Flag,Observation_Time
0,AE000041196,20100101,TMAX,259,,,S,
1,AE000041196,20100101,TMIN,120,,,S,
2,AE000041196,20100101,TAVG,181,H,,S,
3,AEM00041194,20100101,TMAX,250,,,S,
4,AEM00041194,20100101,TMIN,168,,,S,
5,AEM00041194,20100101,PRCP,0,,,S,
6,AEM00041194,20100101,TAVG,194,H,,S,
7,AEM00041217,20100101,TMAX,250,,,S,
8,AEM00041217,20100101,TMIN,146,,,S,
9,AEM00041217,20100101,TAVG,199,H,,S,


### Question 01: Plot the observations of TMIN and TMAX for stations in New Zealand

#### A.1 Filter daily to obtain all observations of TMIN and TMAX for all stations in New Zealand, and save the result

In [9]:
# Keep only the TMIN and TMAX observations from stations whose Station_ID starts with 'NZ'
NZ_temp_stations = daily_data.filter(
    F.col("Element").isin("TMIN", "TMAX")
    & F.col('Station_ID').startswith('NZ')
)

# Preview the first rows
show_as_html(NZ_temp_stations, 5)

Unnamed: 0,Station_ID,DATE,Element,VALUE,Measurement_Flag,Quality_Flag,Source_Flag,Observation_Time
0,NZ000093292,20100101,TMAX,297,,,S,NaT
1,NZ000093292,20100101,TMIN,74,,,S,NaT
2,NZ000093417,20100101,TMAX,180,,,S,NaT
3,NZ000093417,20100101,TMIN,125,,,S,NaT
4,NZ000093844,20100101,TMAX,232,,,S,NaT


#### A.2 How many observations are there, and how many years are covered by the observations?

In [7]:
# Count the TMIN and TMAX observations recorded at the New Zealand stations
num_observations = NZ_temp_stations.count()

# Show the result
print(f"Number of observations: {num_observations}")

Number of observations: 487760


In [10]:
# Extract the year (the first four characters of DATE) and store it in a new integer Year column.
# The column is referenced as 'DATE', exactly as declared in the schema, so the lookup does not
# rely on Spark's case-insensitive column name resolution.
NZ_temp_stations = NZ_temp_stations.withColumn(
    'Year', F.substring(F.col('DATE'), 1, 4).cast('integer')
)

# Show the result
show_as_html(NZ_temp_stations, 5)

Unnamed: 0,Station_ID,DATE,Element,VALUE,Measurement_Flag,Quality_Flag,Source_Flag,Observation_Time,Year
0,NZ000093292,20100101,TMAX,297,,,S,NaT,2010
1,NZ000093292,20100101,TMIN,74,,,S,NaT,2010
2,NZ000093417,20100101,TMAX,180,,,S,NaT,2010
3,NZ000093417,20100101,TMIN,125,,,S,NaT,2010
4,NZ000093844,20100101,TMAX,232,,,S,NaT,2010


In [9]:
# Count the distinct years that have at least one observation
# (this is the number of years with data, not the span between the first and last year)
years_covered = NZ_temp_stations.select("Year").distinct().count()


# Show the result
print(f"Number of years covered: {years_covered} years.")

Number of years covered: 85 years.


#### A.3 Save to HDFS and copy to local machine 

In [None]:
# Save the result to the output directory as CSV with a header row, replacing any previous output
(
    NZ_temp_stations.write
    .option('header', True)
    .mode('overwrite')
    .csv('hdfs:///user/kda115/ghcnd/result/NZ_temp_stations.csv')
)

In [None]:
# List the HDFS result directory to confirm that the output was written
! hdfs dfs -ls 'hdfs:///user/kda115/ghcnd/result/'

In [None]:
# Create a local folder under the home directory for the copied output
# (-p: no error if the folder already exists)
!mkdir -p /users/home/kda115/NZ_Temperature

In [None]:
# Copy every file in the HDFS output directory into the local folder
!hdfs dfs -copyToLocal /user/kda115/ghcnd/result/NZ_temp_stations.csv/* /users/home/kda115/NZ_Temperature

In [None]:
# Count the csv part files in the output directory; the data was written with the header
# option enabled, so this is also the number of header rows to expect

!hdfs dfs -ls /user/kda115/ghcnd/result/NZ_temp_stations.csv/*.csv | wc -l

In [None]:
# Count the total number of lines across all part files using the wc -l bash command

!hdfs dfs -cat /user/kda115/ghcnd/result/NZ_temp_stations.csv/*.csv | wc -l

# This number includes the header rows of the part files

In [6]:
# Run this cell before closing the notebook, or kill your spark application by hand using the link in the Spark UI

stop_spark()