### Spark notebook ###

This notebook will only work in a Jupyter session running on `mathmadslinux2p`.

You can start your own Jupyter session on `mathmadslinux2p` and open this notebook in Chrome on the MADS Windows server by following the steps below.

**Steps**

1. Login to the MADS Windows server using https://mathportal.canterbury.ac.nz/.
2. Download or copy this notebook to your home directory.
3. Open powershell and run `ssh mathmadslinux2p`.
4. Run `start_pyspark_notebook` or `/opt/anaconda3/bin/jupyter-notebook --ip 132.181.129.68 --port $((8000 + $((RANDOM % 999))))`.
5. Copy / paste the url provided in the shell window into Chrome on the MADS Windows server.
6. Open the notebook from the Jupyter root directory (which is your home directory).
7. Run `start_spark()` to start a spark session in the notebook.
8. Run `stop_spark()` before closing the notebook or kill your spark application by hand using the link in the Spark UI.

In [1]:
# Run this cell to import pyspark and to define start_spark() and stop_spark()

import findspark

findspark.init()

import getpass
import pandas
import pyspark
import random
import re

from IPython.display import display, HTML
from pyspark import SparkContext
from pyspark.sql import SparkSession


# Functions used below

def username():
    """Return the current login name with everything from the first '@' onwards removed.
    """

    login = getpass.getuser()
    domain_suffix = re.compile('@.*')

    return domain_suffix.sub('', login)


def dict_to_html(d):
    """Convert a Python dictionary into a two column table for display.

    Keys and values are converted with str() and HTML-escaped before being
    placed in the table, so entries containing characters such as '<', '>' or
    '&' are shown literally instead of being interpreted as markup.

    Args:
        d (dict): mapping to render, one table row per item in iteration order

    Returns:
        str: html for a full-width monospace table (no rows if d is empty)
    """

    # Imported locally because this notebook also uses `html` as a variable name
    from html import escape

    rows = [
        f'<tr><td style="text-align:left;">{escape(str(k))}</td><td>{escape(str(v))}</td></tr>'
        for k, v in d.items()
    ]

    return ''.join([
        '<table width="100%" style="width:100%; font-family: monospace;">',
        *rows,
        '</table>',
    ])


def show_as_html(df, n=20):
    """Render the first rows of a spark dataframe as an html table via pandas.

    The dataframe is limited to n rows, converted to a pandas dataframe, and
    passed to display().

    Args:
        df: spark dataframe to show
        n (int): number of rows to show (default: 20)
    """

    preview = df.limit(n)
    preview_pandas = preview.toPandas()
    display(preview_pandas)

    
def display_spark():
    """Show an html status panel describing whether a Spark session is active.

    The session counts as active when both `spark` and `sc` exist as globals.
    In that case the panel links to the Spark UI and the application UI and
    lists the full Spark configuration; otherwise it reports the session as
    stopped and links to the Spark UI only.
    """

    spark_ui_item = '<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>'
    session_defined = all(key in globals() for key in ('spark', 'sc'))

    # Stopped: nothing to inspect, so report the expected application name and return
    if not session_defined:
        expected_name = username() + " (jupyter)"
        stopped_parts = (
            '<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{expected_name}</code> is under the completed applications section in the Spark UI.</p>',
            '<ul>',
            spark_ui_item,
            '</ul>',
        )
        display(HTML(''.join(stopped_parts)))
        return

    # Active: read the application name, UI url and config from the spark context
    name = sc.getConf().get("spark.app.name")
    app_ui_item = f'<li><a href="{sc.uiWebUrl}" target="_blank">Spark Application UI</a></li>'
    config_table = dict_to_html(dict(sc.getConf().getAll()))

    active_parts = (
        '<p><b>Spark</b></p>',
        f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{name}</code> under the running applications section in the Spark UI.</p>',
        '<ul>',
        spark_ui_item,
        app_ui_item,
        '</ul>',
        '<p><b>Config</b></p>',
        config_table,
        '<p><b>Notes</b></p>',
        '<ul>',
        '<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>',
        f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{name}</code> by hand using the link in the Spark UI.</li>',
        '</ul>',
    )
    display(HTML(''.join(active_parts)))


# Functions to start and stop spark

def _memory_string(gigabytes):
    """Format a memory size given in gigabytes as a Spark memory string, e.g. '4g' or '1536m'.
    """

    # Spark memory strings are a whole number plus a unit suffix, so whole
    # gigabyte sizes stay in 'g' and fractional sizes are expressed in 'm'.
    gigabytes = float(gigabytes)
    if gigabytes.is_integer():
        return f"{int(gigabytes)}g"
    return f"{int(round(gigabytes * 1024))}m"


def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).

    The application is named "<username> (jupyter)" and connects to the
    standalone master at spark://masternode2:7077. Once the session exists the
    status panel from display_spark() is shown.

    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): executor memory in gigabytes; fractional values
            are passed to Spark in megabytes (default: 1)
        master_memory (float): driver memory in gigabytes; fractional values
            are passed to Spark in megabytes (default: 1)
    """

    global spark
    global sc

    user = username()

    # Total cores requested across all executors; shuffle partitions are set to 4 per core
    cores = executor_instances * executor_cores
    partitions = cores * 4

    # Pick a random UI port in 4001-4999 (presumably to avoid clashing with other users' sessions)
    port = 4000 + random.randint(1, 999)

    spark = (
        SparkSession.builder
        .master("spark://masternode2:7077")
        .config("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{user}/spark/")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.executor.instances", str(executor_instances))
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.cores.max", str(cores))
        .config("spark.executor.memory", _memory_string(worker_memory))
        .config("spark.driver.memory", _memory_string(master_memory))
        .config("spark.driver.maxResultSize", "0")
        .config("spark.sql.shuffle.partitions", str(partitions))
        .config("spark.ui.port", str(port))
        .appName(user + " (jupyter)")
        .getOrCreate()
    )
    sc = SparkContext.getOrCreate()

    display_spark()

    
def stop_spark():
    """Stop the active Spark session, if any, and remove the `spark` and `sc` globals.

    The status panel from display_spark() is shown afterwards whether or not a
    session was running.
    """

    global spark, sc

    # Only act when both globals exist
    session_defined = globals().keys() >= {'spark', 'sc'}
    if session_defined:
        spark.stop()
        del spark, sc

    display_spark()


# Inject css overrides so that spark and pandas output is easier to read

html = ['<style>']
html.extend([
    # keep preformatted text on a single line
    'pre { white-space: pre !important; }',
    # keep dataframe cells on a single line
    'table.dataframe td { white-space: nowrap !important; }',
    # hide the first header cell and the body header cells of dataframe tables
    'table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }',
])
html.append('</style>')
display(HTML(''.join(html)))

In [2]:
# Run this cell to start a spark session in this notebook
# (4 executors with 2 cores each, 4g of executor memory and 4g of driver memory)

start_spark(executor_instances=4, executor_cores=2, worker_memory=4, master_memory=4)

0,1
spark.app.name,kda115 (jupyter)
spark.dynamicAllocation.enabled,false
spark.executor.instances,4
spark.driver.memory,4g
spark.executor.memory,4g
spark.master,spark://masternode2:7077
spark.executor.id,driver
spark.app.startTime,1726369449974
spark.executor.cores,2
spark.driver.host,mathmadslinux2p.canterbury.ac.nz


In [3]:
# Import the pyspark API used to define data types and build column expressions
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql import Row

In [4]:
# Schema for the daily files: every column is a nullable string except VALUE,
# which is a nullable integer
schema_daily = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("Station_ID", StringType()),
        ("DATE", StringType()),
        ("Element", StringType()),
        ("VALUE", IntegerType()),
        ("Measurement_Flag", StringType()),
        ("Quality_Flag", StringType()),
        ("Source_Flag", StringType()),
        ("Observation_Time", StringType()),
    )
])

In [5]:
# Load the gzipped 2023 daily csv from HDFS using the explicit daily schema
# (the file has no header row and no schema inference is requested)
daily_2023 = (
    spark.read
    .format('csv')
    .schema(schema_daily)
    .option('inferSchema', False)
    .option('header', False)
    .load('hdfs:///data/ghcnd/daily/2023.csv.gz')
)

# Print the schema and preview the first 10 rows
daily_2023.printSchema()
show_as_html(daily_2023, 10)

root
 |-- Station_ID: string (nullable = true)
 |-- DATE: string (nullable = true)
 |-- Element: string (nullable = true)
 |-- VALUE: integer (nullable = true)
 |-- Measurement_Flag: string (nullable = true)
 |-- Quality_Flag: string (nullable = true)
 |-- Source_Flag: string (nullable = true)
 |-- Observation_Time: string (nullable = true)



Unnamed: 0,Station_ID,DATE,Element,VALUE,Measurement_Flag,Quality_Flag,Source_Flag,Observation_Time
0,AE000041196,20230101,TMAX,252,,,S,
1,AE000041196,20230101,TMIN,149,,,S,
2,AE000041196,20230101,PRCP,0,D,,S,
3,AE000041196,20230101,TAVG,207,H,,S,
4,AEM00041194,20230101,TMAX,255,,,S,
5,AEM00041194,20230101,TMIN,186,,,S,
6,AEM00041194,20230101,PRCP,0,,,S,
7,AEM00041194,20230101,TAVG,223,H,,S,
8,AEM00041217,20230101,TMAX,248,,,S,
9,AEM00041217,20230101,TMIN,184,,,S,


### Question 02: Plot the average rainfall for each country in 2023

#### 1. Process the daily data and recalculate the average rainfall in a more sensible way

In [10]:
# Keep only the precipitation (PRCP) observations
prcp_data = daily_2023.where(F.col("Element") == "PRCP")

# Preview the first 5 rows
show_as_html(prcp_data, 5)

Unnamed: 0,Station_ID,DATE,Element,VALUE,Measurement_Flag,Quality_Flag,Source_Flag,Observation_Time
0,AE000041196,20230101,PRCP,0,D,,S,NaT
1,AEM00041194,20230101,PRCP,0,,,S,NaT
2,AGE00147708,20230101,PRCP,0,,,S,NaT
3,AGE00147716,20230101,PRCP,0,,,S,NaT
4,AGE00147719,20230101,PRCP,0,,,S,NaT


In [11]:
# Keep non-negative precipitation values, then derive the year from the first
# four characters of the DATE column and the country code from the first two
# characters of the Station_ID.
# The column is referenced as "DATE", exactly as declared in the schema, so the
# lookup does not depend on Spark's case-insensitive column resolution.
prcp_data_2023 = (
    prcp_data
    .filter(F.col("VALUE") >= 0)
    .withColumn("Year", F.substring(F.col("DATE"), 1, 4).cast("integer"))
    .withColumn("Country_Code", F.substring(F.col("Station_ID"), 1, 2))
)

# Show the result
show_as_html(prcp_data_2023, 5)

Unnamed: 0,Station_ID,DATE,Element,VALUE,Measurement_Flag,Quality_Flag,Source_Flag,Observation_Time,Year,Country_Code
0,AE000041196,20230101,PRCP,0,D,,S,NaT,2023,AE
1,AEM00041194,20230101,PRCP,0,,,S,NaT,2023,AE
2,AGE00147708,20230101,PRCP,0,,,S,NaT,2023,AG
3,AGE00147716,20230101,PRCP,0,,,S,NaT,2023,AG
4,AGE00147719,20230101,PRCP,0,,,S,NaT,2023,AG


In [12]:
# Total rainfall recorded by each station (Country_Code is carried along as a grouping key)
station_rainfall = (
    prcp_data_2023
    .groupBy("Station_ID", "Country_Code")
    .agg(F.sum(F.col("VALUE")).alias("Total_Rainfall"))
)

# Preview the first 5 rows
show_as_html(station_rainfall, 5)

Unnamed: 0,Station_ID,Country_Code,Total_Rainfall
0,AJ000037735,AJ,60803
1,AJ000037985,AJ,31058
2,AR000087344,AR,2498
3,ASN00003066,AS,9337
4,ASN00003088,AS,6232


In [13]:
# Add up the station totals within each country
total_rainfall_country = (
    station_rainfall
    .groupBy("Country_Code")
    .agg(F.sum(F.col("Total_Rainfall")).alias("Total_Rainfall_Country"))
)

# Preview the first 5 rows
show_as_html(total_rainfall_country, 5)

Unnamed: 0,Country_Code,Total_Rainfall_Country
0,TI,30791
1,CA,11188297
2,SW,4132097
3,MX,262042
4,MZ,37731


In [14]:
# Number of distinct stations that contributed rainfall data in each country
station_count_country = (
    station_rainfall
    .groupBy("Country_Code")
    .agg(F.countDistinct(F.col("Station_ID")).alias("Station_Count"))
)

# Preview the first 5 rows
show_as_html(station_count_country, 5)

Unnamed: 0,Country_Code,Station_Count
0,TI,19
1,CA,1715
2,SW,545
3,MX,73
4,MZ,8


In [15]:
# Put each country's total rainfall and station count side by side
country_rainfall_with_station_count = total_rainfall_country.join(
    station_count_country,
    on="Country_Code",
    how="inner",
)

# Preview the first 5 rows
show_as_html(country_rainfall_with_station_count, 5)

Unnamed: 0,Country_Code,Total_Rainfall_Country,Station_Count
0,TI,30791,19
1,BG,235582,10
2,CA,11188297,1715
3,MX,262042,73
4,MZ,37731,8


#### 2. Calculate the average rainfall in 2023 

In [16]:
# Average rainfall per station in each country: country total divided by station count
country_avg_rainfall = country_rainfall_with_station_count.withColumn(
    "Average_Rainfall",
    F.col("Total_Rainfall_Country") / F.col("Station_Count"),
)

# Preview the 5 countries with the largest average
show_as_html(country_avg_rainfall.orderBy(F.col("Average_Rainfall").desc()), 5)

Unnamed: 0,Country_Code,Total_Rainfall_Country,Station_Count,Average_Rainfall
0,AJ,239729,6,39954.833333
1,PS,157198,4,39299.5
2,FM,768767,20,38438.35
3,AQ,144969,4,36242.25
4,FG,35343,1,35343.0


#### 3. Which country has the highest / lowest average rainfall in a single year across the entire dataset?

In [17]:
# Read the fixed-width countries file from HDFS, one text line per row
countries_raw = spark.read.text("hdfs:///data/ghcnd/ghcnd-countries.txt")

# Slice each line into trimmed columns: characters 1-2 hold the country code
# and the 61 characters starting at position 4 hold the country name
countries = countries_raw.select(
    F.trim(F.col("value").substr(1, 2)).alias("Country_Code"),
    F.trim(F.col("value").substr(4, 61)).alias("Country_Name"),
)

# Preview the first 5 rows
show_as_html(countries, 5)

Unnamed: 0,Country_Code,Country_Name
0,AC,Antigua and Barbuda
1,AE,United Arab Emirates
2,AF,Afghanistan
3,AG,Algeria
4,AJ,Azerbaijan


In [18]:
# Attach the country name to each country's average rainfall; the countries
# table is marked for broadcast in the join
average_rainfall_2023 = country_avg_rainfall.join(
    F.broadcast(countries),
    on="Country_Code",
    how="inner",
)

# Preview the 5 countries with the largest average
show_as_html(average_rainfall_2023.orderBy(F.col("Average_Rainfall").desc()), 5)

Unnamed: 0,Country_Code,Total_Rainfall_Country,Station_Count,Average_Rainfall,Country_Name
0,AJ,239729,6,39954.833333,Azerbaijan
1,PS,157198,4,39299.5,Palau
2,FM,768767,20,38438.35,Federated States of Micronesia
3,AQ,144969,4,36242.25,American Samoa [United States]
4,FG,35343,1,35343.0,French Guiana [France]


In [20]:
# Add Average_Rainfall_mm by dividing Average_Rainfall by 10 (tenths of mm to mm)
average_rainfall_2023 = average_rainfall_2023.withColumn(
    "Average_Rainfall_mm",
    F.col("Average_Rainfall") / F.lit(10),
)

# Preview the 5 countries with the largest average in mm
show_as_html(average_rainfall_2023.orderBy(F.col("Average_Rainfall_mm").desc()), 5)

Unnamed: 0,Country_Code,Total_Rainfall_Country,Station_Count,Average_Rainfall,Country_Name,Average_Rainfall_mm
0,AJ,239729,6,39954.833333,Azerbaijan,3995.483333
1,PS,157198,4,39299.5,Palau,3929.95
2,FM,768767,20,38438.35,Federated States of Micronesia,3843.835
3,AQ,144969,4,36242.25,American Samoa [United States],3624.225
4,FG,35343,1,35343.0,French Guiana [France],3534.3


In [21]:
# Preview the 5 countries with the smallest average in mm
show_as_html(average_rainfall_2023.orderBy(F.col("Average_Rainfall_mm").asc()), 5)

Unnamed: 0,Country_Code,Total_Rainfall_Country,Station_Count,Average_Rainfall,Country_Name,Average_Rainfall_mm
0,NU,0,1,0.0,Nicaragua,0.0
1,ZA,555,3,185.0,Zambia,18.5
2,EG,1645,7,235.0,Egypt,23.5
3,MR,575,2,287.5,Mauritania,28.75
4,TX,10520,19,553.684211,Turkmenistan,55.368421


#### 4. Save the result

In [22]:
# Write the result to the output directory as csv with a header row,
# replacing any output already at that path
(
    average_rainfall_2023.write
    .mode('overwrite')
    .option('header', True)
    .csv('hdfs:///user/kda115/ghcnd/result/average_rainfall_2023.csv')
)

In [23]:
# Check the result by listing the output directory on HDFS
! hdfs dfs -ls 'hdfs:///user/kda115/ghcnd/result/'

Found 8 items
drwxr-xr-x   - kda115 kda115          0 2024-09-01 10:40 hdfs:///user/kda115/ghcnd/result/NZ_Stations_Distance.parquet
drwxr-xr-x   - kda115 kda115          0 2024-08-29 21:23 hdfs:///user/kda115/ghcnd/result/NZ_temp_stations.csv
drwxr-xr-x   - kda115 kda115          0 2024-09-11 14:31 hdfs:///user/kda115/ghcnd/result/average_rainfall_2023.csv
drwxr-xr-x   - kda115 kda115          0 2024-08-30 14:39 hdfs:///user/kda115/ghcnd/result/country_prcp_2023.csv
drwxr-xr-x   - kda115 kda115          0 2024-08-30 11:03 hdfs:///user/kda115/ghcnd/result/country_prcp_data.parquet
drwxr-xr-x   - kda115 kda115          0 2024-08-28 15:04 hdfs:///user/kda115/ghcnd/result/enriched_stations.parquet
drwxr-xr-x   - kda115 kda115          0 2024-08-28 16:53 hdfs:///user/kda115/ghcnd/result/new_countries.parquet
drwxr-xr-x   - kda115 kda115          0 2024-08-28 16:53 hdfs:///user/kda115/ghcnd/result/states_new.parquet


In [27]:
# Create a new folder in the home directory to hold the local copy of the result
# (-p: no error if the folder already exists).
!mkdir -p /users/home/kda115/rainfall_2023

In [24]:
# Use hdfs dfs -copyToLocal to copy the saved output files from HDFS into the local rainfall_2023 folder
!hdfs dfs -copyToLocal /user/kda115/ghcnd/result/average_rainfall_2023.csv/* /users/home/kda115/rainfall_2023

In [6]:
# Run this cell before closing the notebook or kill your spark application by hand using the link in the Spark UI
# (stop_spark() stops the session and removes the `spark` and `sc` globals)

stop_spark()