In [1]:
# Run this cell to import pyspark and to define start_spark() and stop_spark()

import findspark

findspark.init()

import getpass
import pandas
import pyspark
import random
import re

from IPython.display import display, HTML
from pyspark import SparkContext
from pyspark.sql import SparkSession


# Functions used below

def username():
    """Return the current login name with everything from the first '@' onwards dropped.
    """

    login = getpass.getuser()
    domain_suffix = re.compile('@.*')

    return domain_suffix.sub('', login)


def dict_to_html(d):
    """Convert a Python dictionary into a two column HTML table for display.

    Keys and values are converted to text and HTML-escaped, so characters such
    as <, > and & are shown literally instead of being interpreted as markup.

    Args:
        d (dict): mapping to render, one table row per item in iteration order

    Returns:
        str: HTML markup for the table (a table with no rows if d is empty)
    """

    # Imported locally under its own name because `html` is rebound to a list
    # at module level further down this cell.
    from html import escape

    rows = ''.join(
        f'<tr><td style="text-align:left;">{escape(str(k))}</td><td>{escape(str(v))}</td></tr>'
        for k, v in d.items()
    )

    return f'<table width="100%" style="width:100%; font-family: monospace;">{rows}</table>'


def show_as_html(df, n=20):
    """Preview a spark dataframe by converting its first rows to pandas and displaying them.

    Args:
        df: spark dataframe to preview
        n (int): number of rows to show (default: 20)
    """

    preview = df.limit(n)
    display(preview.toPandas())

    
def display_spark():
    """Render an html status panel saying whether a Spark session is currently active.

    A session counts as active when both the `spark` and `sc` globals exist.
    """

    # Link to the cluster-wide Spark UI, shown in both the active and stopped panels
    spark_ui_item = '<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>'

    session_active = 'spark' in globals() and 'sc' in globals()

    if not session_active:

        stopped = (
            '<p><b>Spark</b></p>'
            + f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{username() + " (jupyter)"}</code> is under the completed applications section in the Spark UI.</p>'
            + '<ul>'
            + spark_ui_item
            + '</ul>'
        )
        display(HTML(stopped))
        return

    name = sc.getConf().get("spark.app.name")

    # Header, links to the cluster UI and to this application's own UI
    header = (
        '<p><b>Spark</b></p>'
        + f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{name}</code> under the running applications section in the Spark UI.</p>'
        + '<ul>'
        + spark_ui_item
        + f'<li><a href="{sc.uiWebUrl}" target="_blank">Spark Application UI</a></li>'
        + '</ul>'
    )

    # Full configuration of the running context as a two column table
    config = '<p><b>Config</b></p>' + dict_to_html(dict(sc.getConf().getAll()))

    notes = (
        '<p><b>Notes</b></p>'
        + '<ul>'
        + '<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>'
        + f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{name}</code> by hand using the link in the Spark UI.</li>'
        + '</ul>'
    )

    display(HTML(header + config + notes))


# Functions to start and stop spark

def _memory_setting(gigabytes):
    """Format a memory size given in GB as a Spark size string with a whole-number value.

    Whole values keep the "<n>g" form (4 -> "4g"); fractional values are
    converted to megabytes (1.5 -> "1536m") because Spark does not accept
    fractional size strings such as "1.5g".
    """

    if float(gigabytes).is_integer():
        return f"{int(gigabytes)}g"

    return f"{int(round(gigabytes * 1024))}m"


def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).

    The builder ends in getOrCreate(), so if a session is already running it is
    reused instead of a second one being started.

    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): executor memory in GB, fractions allowed (default: 1)
        master_memory (float): driver memory in GB, fractions allowed (default: 1)
    """

    global spark
    global sc

    user = username()

    cores = executor_instances * executor_cores
    partitions = cores * 4  # four shuffle partitions per core
    port = 4000 + random.randint(1, 999)  # random UI port in 4001-4999

    spark = (
        SparkSession.builder
        .master("spark://masternode2:7077")
        .config("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{user}/spark/")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.executor.instances", str(executor_instances))
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.cores.max", str(cores))
        .config("spark.executor.memory", _memory_setting(worker_memory))
        .config("spark.driver.memory", _memory_setting(master_memory))
        .config("spark.driver.maxResultSize", "0")
        .config("spark.sql.shuffle.partitions", str(partitions))
        .config("spark.ui.port", str(port))
        .appName(user + " (jupyter)")
        .getOrCreate()
    )
    sc = SparkContext.getOrCreate()

    display_spark()

    
def stop_spark():
    """Stop the running Spark session, if there is one, and remove the spark and sc globals.

    The session status is displayed afterwards in either case.
    """

    global spark, sc

    session_defined = all(name in globals() for name in ('spark', 'sc'))

    if session_defined:
        spark.stop()
        del spark, sc

    display_spark()


# Make css changes to improve spark output readability

# Assemble a <style> element, one css rule per entry, and inject it into the notebook
html = ['<style>']
html += [
    'pre { white-space: pre !important; }',
    'table.dataframe td { white-space: nowrap !important; }',
    'table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }',
]
html += ['</style>']
display(HTML(''.join(html)))

In [48]:

from pyspark.sql import SparkSession

!pip3 install plotly #no permission to install packages to env. 



Collecting plotly
  Downloading https://files.pythonhosted.org/packages/a8/07/72953cf70e3bd3a24cbc3e743e6f8539abe6e3e6d83c3c0c83426eaffd39/plotly-5.18.0-py3-none-any.whl (15.6MB)
[K    100% |████████████████████████████████| 15.6MB 84kB/s  eta 0:00:01
[?25hCollecting packaging (from plotly)
  Downloading https://files.pythonhosted.org/packages/05/8e/8de486cbd03baba4deef4142bd643a3e7bbe954a784dc1bb17142572d127/packaging-21.3-py3-none-any.whl (40kB)
[K    100% |████████████████████████████████| 40kB 826kB/s eta 0:00:01
[?25hCollecting tenacity>=6.2.0 (from plotly)
  Downloading https://files.pythonhosted.org/packages/e7/b0/c23bd61e1b32c9b96fbca996c87784e196a812da8d621d8d04851f6c8181/tenacity-8.2.2-py3-none-any.whl
Installing collected packages: packaging, tenacity, plotly
[31mException:
Traceback (most recent call last):
  File "/usr/lib/python3.6/site-packages/pip/basecommand.py", line 215, in main
    status = self.run(options, args)
  File "/usr/lib/python3.6/site-packages/pip/co

In [2]:
# Resources may be increased up to 4 executors, 2 cores per executor,
# 4 GB of executor memory, and 4 GB of master memory.

start_spark(
    executor_instances=4,
    executor_cores=2,
    worker_memory=4,
    master_memory=4,
)

0,1
spark.dynamicAllocation.enabled,false
spark.executor.instances,4
spark.ui.port,4692
spark.driver.memory,4g
spark.executor.memory,4g
spark.sql.warehouse.dir,file:/users/home/nki38/spark-warehouse/
spark.master,spark://masternode2:7077
spark.executor.id,driver
spark.executor.cores,2
spark.driver.host,mathmadslinux2p.canterbury.ac.nz


In [3]:
# Resources may be increased up to 4 executors, 2 cores per executor,
# 4 GB of executor memory, and 4 GB of master memory.
# NOTE: start_spark() builds the session with getOrCreate(), so calling it
# while a session is already running reuses that session.

start_spark(
    executor_instances=4,
    executor_cores=2,
    worker_memory=4,
    master_memory=4,
)

0,1
spark.dynamicAllocation.enabled,false
spark.executor.instances,4
spark.ui.port,4692
spark.driver.memory,4g
spark.executor.memory,4g
spark.sql.warehouse.dir,file:/users/home/nki38/spark-warehouse/
spark.master,spark://masternode2:7077
spark.executor.id,driver
spark.executor.cores,2
spark.driver.host,mathmadslinux2p.canterbury.ac.nz


In [4]:
import pyspark.sql.functions as F
from pyspark.sql.types import *
# Set LIMITER to True to work on the first 1000 rows of the 2023 file only
LIMITER = False

# Column layout applied to the daily csv files (no header row is read)
schema = StructType([
    StructField(column_name, column_type, nullable)
    for column_name, column_type, nullable in [
        ("ID", StringType(), False),
        ("DATE", IntegerType(), True),
        ("ELEMENT", StringType(), True),
        ("VALUE", DoubleType(), True),
        ("MEASUREMENT FLAG", StringType(), True),
        ("QUALITY FLAG", StringType(), True),
        ("SOURCE FLAG", StringType(), True),
        ("OBSERVATION TIME", StringType(), True),
    ]
])

if not LIMITER:
    # Full dataset: every file under the daily directory
    all_daily = spark.read.csv("hdfs:///data/ghcnd/daily", schema)
    print("unlimited")
else:
    # Small sample for quick iteration
    all_daily = spark.read.csv("hdfs:///data/ghcnd/daily/2023.csv.gz", schema).limit(1000)
    print("limited")


unlimited


In [5]:
# Keep only the rows whose ELEMENT is PRCP
is_precipitation = F.col('ELEMENT') == 'PRCP'
worldwide_precip = all_daily.where(is_precipitation)

In [None]:
# Derive year (first 4 characters of DATE) and country (first 2 characters of ID)
worldwide_precip = (
    worldwide_precip
    .withColumn("year", F.substring(F.col("DATE").cast("string"), 1, 4))
    .withColumn("country", F.substring(F.col("ID").cast("string"), 1, 2))
)

worldwide_precip.show(1)
print(worldwide_precip.count())

In [7]:
filename = "world_prcp.csv"
output_path = f"hdfs:///user/nki38/outputs/ghcnd/{filename}"

# Mean VALUE for every (year, country) pair
avg_measurement_by_year_country = (
    worldwide_precip
    .groupBy("year", "country")
    .agg(F.avg("VALUE").alias("avg_measurement"))
)

# Written with a header row; an existing output at this path is replaced
avg_measurement_by_year_country.write.mode("overwrite").csv(output_path, header=True)


In [9]:
# Peek at a single aggregated row
avg_measurement_by_year_country.show(n=1)

+----+-------+------------------+
|year|country|   avg_measurement|
+----+-------+------------------+
|2011|     CS|100.37719298245614|
+----+-------+------------------+
only showing top 1 row



In [21]:
# Largest yearly average over all (year, country) pairs
max_avg = F.max("avg_measurement").alias("max_measurement")
most_rainfall = avg_measurement_by_year_country.groupBy().agg(max_avg)
most_rainfall.show()


+---------------+
|max_measurement|
+---------------+
|         4361.0|
+---------------+



In [23]:
# Find the (year, country) row holding the largest average by sorting and
# taking the top row, rather than filtering on a float literal copied by hand
# from the previous output: the literal silently matches nothing once the
# data or the aggregation changes. If several rows tie for the maximum, only
# one of them is returned.
country = (
    avg_measurement_by_year_country
    .orderBy(F.col('avg_measurement').desc())
    .limit(1)
)
country.show()

# This separate lookup is only needed because most_rainfall holds the maximum
# value without the country code.

+----+-------+---------------+
|year|country|avg_measurement|
+----+-------+---------------+
|2000|     EK|         4361.0|
+----+-------+---------------+



In [12]:
# Cap the aggregated frame at 500 rows
data = avg_measurement_by_year_country.limit(num=500)


In [47]:
!hdfs dfs -copyToLocal /user/nki38/outputs/ghcnd/

copyToLocal: `ghcnd/NZ_Stations_ANALYSIS.csv/_SUCCESS': File exists
copyToLocal: `ghcnd/NZ_Stations_ANALYSIS.csv/part-00000-222e1ba6-cefe-42fa-94cd-819b864f1840-c000.csv': File exists
copyToLocal: `ghcnd/countries_with_counts.csv/_SUCCESS': File exists
copyToLocal: `ghcnd/countries_with_counts.csv/part-00000-fef3636e-b1d0-49cd-84a5-84f09fa91cae-c000.csv': File exists
copyToLocal: `ghcnd/nz_stations_distance.csv/_SUCCESS': File exists
copyToLocal: `ghcnd/nz_stations_distance.csv/part-00000-c1d4ab45-4643-490f-b8e8-16e0045bd781-c000.csv': File exists
copyToLocal: `ghcnd/rainfall_by_year_country.csv/_SUCCESS': File exists
copyToLocal: `ghcnd/rainfall_by_year_country.csv/part-00000-19bfba1f-87ef-4adb-ba91-aa79469773d0-c000.csv': File exists
copyToLocal: `ghcnd/states_with_counts.csv/_SUCCESS': File exists
copyToLocal: `ghcnd/states_with_counts.csv/part-00000-7a1392fc-c1e2-448b-aa73-2a25bfb76281-c000.csv': File exists
copyToLocal: `ghcnd/stations_augmented.csv/_SUCCESS': File exists
copyToLo

In [46]:
# List the contents of the HDFS output directory.
!hdfs dfs -ls /user/nki38/outputs/ghcnd


Found 10 items
drwxr-xr-x   - nki38 nki38          0 2024-04-29 09:20 /user/nki38/outputs/ghcnd/2023_rainfall
drwxr-xr-x   - nki38 nki38          0 2024-04-28 16:08 /user/nki38/outputs/ghcnd/NZ_Stations_ANALYSIS.csv
drwxr-xr-x   - nki38 nki38          0 2024-04-28 11:15 /user/nki38/outputs/ghcnd/countries_with_counts.csv
drwxr-xr-x   - nki38 nki38          0 2024-04-28 11:15 /user/nki38/outputs/ghcnd/nz_stations_distance.csv
drwxr-xr-x   - nki38 nki38          0 2024-04-27 20:20 /user/nki38/outputs/ghcnd/rainfall_by_year_country.csv
drwxr-xr-x   - nki38 nki38          0 2024-04-28 21:04 /user/nki38/outputs/ghcnd/rainfall_by_year_country_new.csv
drwxr-xr-x   - nki38 nki38          0 2024-04-28 11:15 /user/nki38/outputs/ghcnd/states_with_counts.csv
drwxr-xr-x   - nki38 nki38          0 2024-04-26 10:54 /user/nki38/outputs/ghcnd/stations_augmented
drwxr-xr-x   - nki38 nki38          0 2024-04-26 11:09 /user/nki38/outputs/ghcnd/stations_augmented.csv
drwxr-xr-x   - nki38 nki38   

In [33]:
# Keep the 2023 rows only. cache() is applied before the first action so that
# count() populates the cache and the later show() / write calls reuse it
# instead of recomputing the aggregation over the full dataset.
df_2023 = avg_measurement_by_year_country.filter(F.col("year") == "2023").cache()
print(df_2023.count())
df_2023.show()

182
+----+-------+------------------+
|year|country|   avg_measurement|
+----+-------+------------------+
|2023|     AR| 50.20529482551143|
|2023|     CA| 23.39627178903699|
|2023|     EZ| 22.33065326633166|
|2023|     FM|115.08110944527736|
|2023|     NS|  59.4951768488746|
|2023|     CU| 79.10144927536231|
|2023|     ET| 73.85454545454546|
|2023|     GG|21.219600725952812|
|2023|     GI| 8.418732782369146|
|2023|     MQ|17.701408450704225|
|2023|     KG| 53.66942148760331|
|2023|     IV| 88.92255236239649|
|2023|     CD| 75.08196721311475|
|2023|     PU| 119.5677966101695|
|2023|     ZI| 69.11235955056179|
|2023|     AE|3.9089635854341735|
|2023|     AG| 7.280195274831244|
|2023|     FJ|63.269992663242846|
|2023|     MV| 8.582978723404254|
|2023|     PS|112.04419101924448|
+----+-------+------------------+
only showing top 20 rows



In [45]:
df_2023.show()

# Collapse to one partition so the output directory holds a single csv part file
single_partition = df_2023.repartition(1)
single_partition.write.mode("overwrite").csv("/user/nki38/outputs/ghcnd/2023_rainfall", header=True)

+----+-------+------------------+
|year|country|   avg_measurement|
+----+-------+------------------+
|2023|     AR| 50.20529482551143|
|2023|     CA| 23.39627178903699|
|2023|     EZ| 22.33065326633166|
|2023|     FM|115.08110944527736|
|2023|     NS|  59.4951768488746|
|2023|     CU| 79.10144927536231|
|2023|     ET| 73.85454545454546|
|2023|     GG|21.219600725952812|
|2023|     GI| 8.418732782369146|
|2023|     MQ|17.701408450704225|
|2023|     KG| 53.66942148760331|
|2023|     IV| 88.92255236239649|
|2023|     CD| 75.08196721311475|
|2023|     PU| 119.5677966101695|
|2023|     ZI| 69.11235955056179|
|2023|     AE|3.9089635854341735|
|2023|     AG| 7.280195274831244|
|2023|     FJ|63.269992663242846|
|2023|     MV| 8.582978723404254|
|2023|     PS|112.04419101924448|
+----+-------+------------------+
only showing top 20 rows



In [8]:
# Uncomment and run when finished, to stop the Spark session before closing the notebook.
#stop_spark()