In [118]:
# Run this cell to import pyspark and to define start_spark() and stop_spark()

import findspark

findspark.init()

import getpass
import pandas as pd
import pyspark
import random
import re

from IPython.display import display, HTML
from pyspark import SparkContext
from pyspark.sql import SparkSession


# Functions used below

def username():
    """Return the current login name with everything from the first '@' on each line stripped.
    """

    login = getpass.getuser()
    domain_suffix = re.compile('@.*')

    return domain_suffix.sub('', login)


def dict_to_html(d):
    """Convert a Python dictionary into a two column HTML table for display.

    Keys and values are converted with ``str`` and HTML-escaped, so entries
    containing ``<``, ``>``, ``&`` or quotes are shown literally instead of
    being interpreted as markup.

    Args:
        d (dict): mapping to render, one table row per item in iteration order

    Returns:
        str: the table markup as a single string
    """

    # Imported locally: this notebook later rebinds the global name `html` to a list,
    # which would shadow a module-level `import html`.
    from html import escape

    rows = [
        f'<tr><td style="text-align:left;">{escape(str(k))}</td><td>{escape(str(v))}</td></tr>'
        for k, v in d.items()
    ]

    return ''.join([
        '<table width="100%" style="width:100%; font-family: monospace;">',
        *rows,
        '</table>',
    ])


def show_as_html(df, n=20):
    """Preview a spark dataframe as html by converting its first rows to pandas.

    Args:
        df: spark dataframe to preview
        n (int): number of rows to show (default: 20)
    """

    preview = df.limit(n).toPandas()
    display(preview)

    
def display_spark():
    """Render an HTML summary of the Spark session: active (with config) or stopped.
    """

    # Link to the cluster-wide Spark UI, shown in both states
    spark_ui_item = '<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>'

    session_defined = 'spark' in globals() and 'sc' in globals()

    if not session_defined:

        # Nothing running: point the user at the completed applications list
        parts = [
            '<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{username() + " (jupyter)"}</code> is under the completed applications section in the Spark UI.</p>',
            '<ul>',
            spark_ui_item,
            '</ul>',
        ]
        display(HTML(''.join(parts)))
        return

    name = sc.getConf().get("spark.app.name")

    parts = [
        '<p><b>Spark</b></p>',
        f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{name}</code> under the running applications section in the Spark UI.</p>',
        '<ul>',
        spark_ui_item,
        f'<li><a href="{sc.uiWebUrl}" target="_blank">Spark Application UI</a></li>',
        '</ul>',
        '<p><b>Config</b></p>',
        dict_to_html(dict(sc.getConf().getAll())),
        '<p><b>Notes</b></p>',
        '<ul>',
        '<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>',
        f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{name}</code> by hand using the link in the Spark UI.</li>',
        '</ul>',
    ]
    display(HTML(''.join(parts)))


# Functions to start and stop spark

def _memory_setting(gigabytes):
    """Format a memory amount given in gigabytes as a Spark size string.

    Whole amounts stay in gigabytes (e.g. 4 -> "4g"). Fractional amounts are
    converted to whole megabytes (e.g. 1.5 -> "1536m") because Spark size
    strings do not accept fractional values.
    """

    if float(gigabytes).is_integer():
        return f"{int(gigabytes)}g"

    return f"{int(gigabytes * 1024)}m"


def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).

    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): executor memory in gigabytes (default: 1)
        master_memory (float): driver memory in gigabytes (default: 1)
    """

    global spark
    global sc

    user = username()

    cores = executor_instances * executor_cores
    partitions = cores * 4  # shuffle partitions: four per requested core
    port = 4000 + random.randint(1, 999)  # random UI port in 4001-4999

    spark = (
        SparkSession.builder
        .master("spark://masternode2:7077")
        .config("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{user}/spark/")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.executor.instances", str(executor_instances))
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.cores.max", str(cores))
        .config("spark.executor.memory", _memory_setting(worker_memory))
        .config("spark.driver.memory", _memory_setting(master_memory))
        .config("spark.driver.maxResultSize", "0")
        .config("spark.sql.shuffle.partitions", str(partitions))
        .config("spark.ui.port", str(port))
        .appName(user + " (jupyter)")
        .getOrCreate()
    )
    sc = SparkContext.getOrCreate()

    display_spark()

    
def stop_spark():
    """Stop the running Spark session, if any, and remove the spark / sc globals.

    The session status is displayed afterwards in either case.
    """

    global spark, sc

    session_defined = 'spark' in globals() and 'sc' in globals()

    if session_defined:
        spark.stop()
        del spark, sc

    display_spark()


# Make css changes to improve spark output readability

# CSS overrides: keep whitespace in <pre> output, stop dataframe cells wrapping,
# and hide the pandas index column of rendered dataframes
css_rules = (
    'pre { white-space: pre !important; }',
    'table.dataframe td { white-space: nowrap !important; }',
    'table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }',
)
html = ['<style>', *css_rules, '</style>']
display(HTML(''.join(html)))

In [119]:
from matplotlib import pyplot as plt
import numpy as np
import os
import math
from pyspark.sql import Row, DataFrame, Window, functions as F
from pyspark.sql.types import *

In [144]:
# Request 4 executors with 2 cores each; worker_memory / master_memory are passed to
# spark.executor.memory / spark.driver.memory as "4g" by start_spark()
start_spark(executor_instances=4, executor_cores=2, worker_memory=4, master_memory=4)

0,1
spark.dynamicAllocation.enabled,false
spark.driver.port,38345
spark.executor.instances,4
spark.sql.warehouse.dir,file:/users/home/ywa286/Assignment1/notebook/spark-warehouse
spark.driver.memory,4g
spark.executor.memory,4g
spark.master,spark://masternode2:7077
spark.executor.id,driver
spark.executor.cores,2
spark.driver.host,mathmadslinux2p.canterbury.ac.nz


## Q1, Exploration of Stations

In [99]:
# Read the enriched station table from parquet and preview its first rows
stations_df = spark.read.format("parquet").load("station_enriched.parquet")
show_as_html(stations_df)

Unnamed: 0,ID,STATE_CODE,COUNTRY_CODE,LATITUDE,LONGITUDE,ELEVATION,NAME,GSN_FLAG,HCN_CRN_FLAG,WMO_ID,COUNTRY_NAME,STATE_NAME,CORE_ELEMENTS,CORE_ELEMENTS_COUNT,OTHER_ELEMENTS,OTHER_ELEMENTS_COUNT,FIRSTYEAR,LASTYEAR,TOTAL_ELEMENTS,ACTIVE_YEAR
0,AE000041196,,AE,25.333,55.516998,34.0,SHARJAH INTER. AIRP,GSN,,41196.0,United Arab Emirates,,"[TMAX, TMIN, PRCP]",3,[TAVG],1,1944,2024,4,80
1,AEM00041218,,AE,24.261999,55.609001,264.899994,AL AIN INTL,,,41218.0,United Arab Emirates,,"[TMAX, TMIN, PRCP]",3,[TAVG],1,1994,2024,4,30
2,AGE00147715,,AG,35.419998,8.1197,863.0,TEBESSA,,,,Algeria,,"[TMAX, TMIN, PRCP]",3,[],0,1879,1938,3,59
3,AGE00147794,,AG,36.779999,5.1,225.0,BEJAIA-CAP CARBON,,,,Algeria,,"[TMAX, TMIN]",2,[],0,1926,1938,2,12
4,AGM00060402,,AG,36.712002,5.07,6.1,SOUMMAM,,,60402.0,Algeria,,"[TMAX, TMIN, PRCP, SNWD]",4,[TAVG],1,1973,2024,5,51
5,AGM00060430,,AG,36.299999,2.233,721.0,MILIANA,,,60430.0,Algeria,,"[TMAX, TMIN, PRCP, SNWD]",4,[TAVG],1,1957,2024,5,67
6,AGM00060461,,AG,35.700001,-0.65,22.0,ORAN-PORT,,,60461.0,Algeria,,"[TMAX, TMIN, PRCP]",3,[TAVG],1,1995,2024,4,29
7,AGM00060514,,AG,35.167,2.317,801.0,KSAR CHELLALA,,,60514.0,Algeria,,"[TMAX, TMIN, PRCP, SNWD]",4,[TAVG],1,1995,2024,5,29
8,AGM00060515,,AG,35.333,4.206,459.0,BOU SAADA,,,60515.0,Algeria,,"[TMAX, TMIN, PRCP]",3,[TAVG],1,1984,2024,4,40
9,AGM00060550,,AG,33.667,1.0,1347.0,EL-BAYADH,,,60550.0,Algeria,,"[TMAX, TMIN, PRCP, SNWD]",4,[TAVG],1,1973,2024,5,51


### (a)

In [6]:
# Total number of stations
station_count = stations_df.count()
print(f"There are {station_count} stations in total.")

# Stations whose last recorded year is 2024 are treated as still active
station_count_activate_2024 = stations_df.where(stations_df["LASTYEAR"] == 2024).count()
print(f"{station_count_activate_2024} of these stations were active so far in 2024.")

There are 127994 stations in total.
36516 of these stations were active so far in 2024.


In [90]:
# Membership test for each network, keyed by the network name
network_flags = {
    "GSN": F.col("GSN_FLAG") == "GSN",
    "HCN": F.col("HCN_CRN_FLAG") == "HCN",
    "CRN": F.col("HCN_CRN_FLAG") == "CRN",
}

# One aggregate row holding the number of stations in each network
# (F.when without otherwise yields NULL for non-members, which F.count skips)
network_counts = stations_df.select(*[
    F.count(F.when(is_member, True)).alias(f"{network}_count")
    for network, is_member in network_flags.items()
]).first()

for network in network_flags:
    print(f"Stations in {network}: {network_counts[f'{network}_count']}")

# 0/1 indicator column per network, used to look for stations in more than one network
overlap_df = stations_df.select(
    "ID",
    *[
        F.when(is_member, 1).otherwise(0).alias(network)
        for network, is_member in network_flags.items()
    ],
)

gsn_hcn_overlap = overlap_df.where((F.col("GSN") == 1) & (F.col("HCN") == 1)).count()
gsn_crn_overlap = overlap_df.where((F.col("GSN") == 1) & (F.col("CRN") == 1)).count()

print(f"Stations in both GSN and HCN: {gsn_hcn_overlap}")
print(f"Stations in both GSN and CRN: {gsn_crn_overlap}")

Row(GSN_count=991, HCN_count=1218, CRN_count=234)
Stations in GSN: 991
Stations in HCN: 1218
Stations in CRN: 234


Unnamed: 0,ID,GSN,HCN,CRN
0,AE000041196,1,0,0
1,AEM00041218,0,0,0
2,AGE00147715,0,0,0
3,AGE00147794,0,0,0
4,AGM00060402,0,0,0
5,AGM00060430,0,0,0
6,AGM00060461,0,0,0
7,AGM00060514,0,0,0
8,AGM00060515,0,0,0
9,AGM00060550,0,0,0


Stations in both GSN and HCN: 15
Stations in both GSN and CRN: 0


In [86]:
# Count stations in United States territories: the country name mentions
# "United States" but is not exactly "United States"
us_stations_df = stations_df.where(
    F.col("COUNTRY_NAME").contains('United States')
    & (F.col("COUNTRY_NAME") != 'United States')
)
print(us_stations_df.count())

399


### (c)

In [13]:
# Count stations in the Southern Hemisphere, i.e. with a negative latitude
southhemisphere_count = stations_df.where(stations_df["LATITUDE"] < 0).count()
print(f"There are {southhemisphere_count} stations located in southhemisphere.")

There are 25357 stations located in southhemisphere.


In [26]:
# Number of stations per country, largest first
country_stations_num = (
    stations_df
    .groupBy("COUNTRY_NAME")
    .count()
    .withColumnRenamed("count", "STATION_COUNT")
    .orderBy(F.col("STATION_COUNT").desc())
)
show_as_html(country_stations_num)

Unnamed: 0,COUNTRY_NAME,STATION_COUNT
0,United States,74243
1,Australia,17088
2,Canada,9226
3,Brazil,5989
4,Mexico,5249
5,India,3807
6,Sweden,1721
7,South Africa,1166
8,Russia,1123
9,Germany,1123


In [27]:
# Parse the fixed-width countries file: characters 1-2 are the code,
# 61 characters starting at position 4 are the name
countries_df = (
    spark.read.text("hdfs:///data/ghcnd/ghcnd-countries.txt")
    .select(
        F.trim(F.substring("value", 1, 2)).alias("CODE"),
        F.trim(F.substring("value", 4, 61)).alias("NAME"),
    )
)

# Left join keeps every country from the metadata file, including those with no
# station count (STATION_COUNT is NULL for them)
countries_station_num_df = (
    countries_df
    .withColumnRenamed("CODE", "COUNTRY_CODE")
    .withColumnRenamed("NAME", "COUNTRY_NAME")
    .join(country_stations_num, on="COUNTRY_NAME", how="left")
    .orderBy(F.col("STATION_COUNT").desc())
)
show_as_html(countries_station_num_df)

# Save the enriched country table, replacing any previous copy
countries_station_num_df.write.parquet("./countries_enriched.parquet", mode="overwrite")

Unnamed: 0,CODE,NAME
0,AC,Antigua and Barbuda
1,AE,United Arab Emirates
2,AF,Afghanistan
3,AG,Algeria
4,AJ,Azerbaijan
5,AL,Albania
6,AM,Armenia
7,AO,Angola
8,AQ,American Samoa [United States]
9,AR,Argentina


Unnamed: 0,COUNTRY_NAME,COUNTRY_CODE,STATION_COUNT
0,United States,US,74243
1,Australia,AS,17088
2,Canada,CA,9226
3,Brazil,BR,5989
4,Mexico,MX,5249
5,India,IN,3807
6,Sweden,SW,1721
7,South Africa,SF,1166
8,Germany,GM,1123
9,Russia,RS,1123


In [28]:
# Number of stations per state, largest first
state_stations_num = (
    stations_df
    .groupBy("STATE_NAME")
    .count()
    .withColumnRenamed("count", "STATION_COUNT")
    .orderBy(F.col("STATION_COUNT").desc())
)
show_as_html(state_stations_num)


Unnamed: 0,STATE_NAME,STATION_COUNT
0,,44055
1,TEXAS,6350
2,COLORADO,4735
3,CALIFORNIA,3138
4,NORTH CAROLINA,2692
5,NEBRASKA,2415
6,MINNESOTA,2386
7,KANSAS,2312
8,NEW MEXICO,2273
9,FLORIDA,2205


In [29]:
# Parse the fixed-width states file: characters 1-2 are the code,
# 47 characters starting at position 4 are the name
states_df = (
    spark.read.text("hdfs:///data/ghcnd/ghcnd-states.txt")
    .select(
        F.trim(F.substring("value", 1, 2)).alias("STATE_CODE"),
        F.trim(F.substring("value", 4, 47)).alias("STATE_NAME"),
    )
)

# Left join keeps every state from the metadata file, then sort by station count
states_station_num_df = (
    states_df
    .join(state_stations_num, on="STATE_NAME", how="left")
    .orderBy(F.col("STATION_COUNT").desc())
)
show_as_html(states_station_num_df)

# Save the enriched state table, replacing any previous copy
states_station_num_df.write.parquet("./states_enriched.parquet", mode="overwrite")

Unnamed: 0,STATE_CODE,STATE_NAME
0,AB,ALBERTA
1,AK,ALASKA
2,AL,ALABAMA
3,AR,ARKANSAS
4,AS,AMERICAN SAMOA
5,AZ,ARIZONA
6,BC,BRITISH COLUMBIA
7,CA,CALIFORNIA
8,CO,COLORADO
9,CT,CONNECTICUT


Unnamed: 0,STATE_NAME,STATE_CODE,STATION_COUNT
0,TEXAS,TX,6350
1,COLORADO,CO,4735
2,CALIFORNIA,CA,3138
3,NORTH CAROLINA,NC,2692
4,NEBRASKA,NE,2415
5,MINNESOTA,MN,2386
6,KANSAS,KS,2312
7,NEW MEXICO,NM,2273
8,FLORIDA,FL,2205
9,ILLINOIS,IL,2193


## Q2, Distance between Stations

Since the stations are located on the Earth's surface, which can be approximated as a sphere, the distance between any two stations can be calculated using the Haversine formula \cite{haversine_formula}, based on the latitude and longitude provided by the enriched station data. The Haversine formula is:
$$ distance = 2R \cdot \operatorname{atan2}\left(\sqrt{a}, \sqrt{1-a}\right) $$
where $$ a = \sin^2\left(\frac{\Delta\phi}{2}\right) + \cos(\phi_1) \cdot \cos(\phi_2) \cdot \sin^2\left(\frac{\Delta\lambda}{2}\right) $$ $$\Delta\phi = \phi_2 - \phi_1, \quad \Delta\lambda = \lambda_2 - \lambda_1 $$ Here $R$ is the Earth's radius, $\phi_1, \phi_2$ are the latitudes and $\lambda_1, \lambda_2$ the longitudes of the two stations, all in radians. Based on the formula above, the Python function can be written as follows.


### (a)

In [100]:
def haversine_distance(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points using the Haversine formula.

    Args:
        lat1 (float): latitude of the first point in decimal degrees
        lon1 (float): longitude of the first point in decimal degrees
        lat2 (float): latitude of the second point in decimal degrees
        lon2 (float): longitude of the second point in decimal degrees

    Returns:
        float: distance in kilometres on a sphere of radius 6371 km, or None
        if any coordinate is None
    """

    # A missing coordinate gives a missing distance instead of a TypeError
    # from math.radians(None).
    if lat1 is None or lon1 is None or lat2 is None or lon2 is None:
        return None

    R = 6371  # Earth's radius in kilometers

    phi1 = math.radians(lat1)
    phi2 = math.radians(lat2)
    delta_phi = math.radians(lat2 - lat1)
    delta_lambda = math.radians(lon2 - lon1)

    a = math.sin(delta_phi/2)**2 + \
        math.cos(phi1) * math.cos(phi2) * \
        math.sin(delta_lambda/2)**2

    # Floating point rounding can push `a` marginally outside [0, 1], which would
    # make math.sqrt raise a domain error; clamp before taking square roots.
    a = min(1.0, max(0.0, a))

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))

    return R * c

# Wrap the Python function as a Spark UDF returning a double
haversine_udf = F.udf(haversine_distance, DoubleType())

# Take 10 stations to try the UDF on
subset_stations = stations_df.select("ID", "LATITUDE", "LONGITUDE").limit(10)

# Pair every sampled station with every sampled station
cross_joined = subset_stations.alias("a").crossJoin(subset_stations.alias("b"))

# Distance for each pair, computed from both sides' coordinates
distance_km = haversine_udf(
    F.col("a.LATITUDE"), F.col("a.LONGITUDE"),
    F.col("b.LATITUDE"), F.col("b.LONGITUDE"),
).alias("Distance_km")

distances = (
    cross_joined
    .select(
        F.col("a.ID").alias("Station1"),
        F.col("b.ID").alias("Station2"),
        F.col("a.LATITUDE").alias("Lat1"),
        F.col("a.LONGITUDE").alias("Lon1"),
        F.col("b.LATITUDE").alias("Lat2"),
        F.col("b.LONGITUDE").alias("Lon2"),
        distance_km,
    )
    # Keep each unordered pair once and drop self-pairs
    .where(F.col("Station1") < F.col("Station2"))
)
show_as_html(distances)


Unnamed: 0,ID,LATITUDE,LONGITUDE,ID.1,LATITUDE.1,LONGITUDE.1
0,AEM00041194,25.254999,55.363998,AEM00041194,25.254999,55.363998
1,AEM00041194,25.254999,55.363998,AGE00147705,36.779999,3.07
2,AEM00041194,25.254999,55.363998,AGM00060369,36.766998,3.1
3,AEM00041194,25.254999,55.363998,AGM00060403,36.466999,7.467
4,AEM00041194,25.254999,55.363998,AGM00060457,35.882999,0.117
5,AEM00041194,25.254999,55.363998,AGM00060476,35.466999,7.083
6,AEM00041194,25.254999,55.363998,AGM00060506,35.599998,0.3
7,AEM00041194,25.254999,55.363998,AGM00060620,27.837999,-0.186
8,AEM00041194,25.254999,55.363998,AGM00060656,27.700001,-8.167
9,AEM00041194,25.254999,55.363998,AJ000037636,41.299999,45.5


### (b)

In [114]:
# Pairwise distances between all New Zealand stations
nz_stations_df = stations_df.where(F.col("COUNTRY_CODE") == "NZ")
print(nz_stations_df.count())

src = nz_stations_df.alias("src_station")
dst = nz_stations_df.alias("dst_station")

nz_stations_cross_joined = (
    src.crossJoin(dst)
    .select(
        F.col("src_station.ID").alias("Station1"),
        F.col("dst_station.ID").alias("Station2"),
        F.col("src_station.LATITUDE").alias("Lat1"),
        F.col("src_station.LONGITUDE").alias("Lon1"),
        F.col("dst_station.LATITUDE").alias("Lat2"),
        F.col("dst_station.LONGITUDE").alias("Lon2"),
        haversine_udf(
            F.col("src_station.LATITUDE"), F.col("src_station.LONGITUDE"),
            F.col("dst_station.LATITUDE"), F.col("dst_station.LONGITUDE"),
        ).alias("Distance_km"),
    )
    # Keep each unordered pair once and drop self-pairs
    .where(F.col("Station1") < F.col("Station2"))
)

# Closest pairs first
sorted_distances = nz_stations_cross_joined.orderBy(F.col("Distance_km").asc())
show_as_html(sorted_distances)

15


Unnamed: 0,Station1,Station2,Lat1,Lon1,Lat2,Lon2,Distance_km
0,NZ000093417,NZM00093439,-40.900002,174.983002,-41.333,174.800003,50.52885
1,NZM00093439,NZM00093678,-41.333,174.800003,-42.417,173.699997,151.071707
2,NZ000936150,NZM00093781,-42.716999,170.983002,-43.488998,172.531998,152.258049
3,NZM00093678,NZM00093781,-42.417,173.699997,-43.488998,172.531998,152.458812
4,NZ000093417,NZM00093678,-40.900002,174.983002,-42.417,173.699997,199.529674
5,NZ000936150,NZ000937470,-42.716999,170.983002,-44.516998,169.899994,218.309193
6,NZ000093417,NZ000933090,-40.900002,174.983002,-39.016998,174.182999,220.200217
7,NZ000936150,NZM00093678,-42.716999,170.983002,-42.417,173.699997,224.980882
8,NZ000933090,NZM00093110,-39.016998,174.182999,-37.0,174.800003,230.700766
9,NZ000937470,NZM00093781,-44.516998,169.899994,-43.488998,172.531998,239.530394


## Q3, Summary of Daily

### (a)

In [145]:
# Schema for the daily observation files (headerless CSV, eight columns)
daily_schema = (
    StructType()
    .add("ID", StringType(), nullable=False)
    .add("DATE", StringType(), nullable=False)
    .add("ELEMENT", StringType(), nullable=False)
    .add("VALUE", FloatType(), nullable=False)
    .add("MEASUREMENT_FLAG", StringType(), nullable=True)
    .add("QUALITY_FLAG", StringType(), nullable=True)
    .add("SOURCE_FLAG", StringType(), nullable=True)
    .add("OBSERVATION_TIME", StringType(), nullable=True)
)

# Glob matching every gzipped daily CSV file
daily_path = "/data/ghcnd/daily/*.csv.gz"

# Load all daily files with the explicit schema
daily_df = spark.read.csv(daily_path, schema=daily_schema, header=False, sep=",")

# Total number of observations across all files
row_count = daily_df.count()

print(f"Total number of rows in daily data: {row_count}")

Unnamed: 0,ID,DATE,ELEMENT,VALUE,MEASUREMENT_FLAG,QUALITY_FLAG,SOURCE_FLAG,OBSERVATION_TIME
0,AE000041196,20100101,TMAX,259.0,,,S,
1,AE000041196,20100101,TMIN,120.0,,,S,
2,AE000041196,20100101,TAVG,181.0,H,,S,
3,AEM00041194,20100101,TMAX,250.0,,,S,
4,AEM00041194,20100101,TMIN,168.0,,,S,
5,AEM00041194,20100101,PRCP,0.0,,,S,
6,AEM00041194,20100101,TAVG,194.0,H,,S,
7,AEM00041217,20100101,TMAX,250.0,,,S,
8,AEM00041217,20100101,TMIN,146.0,,,S,
9,AEM00041217,20100101,TAVG,199.0,H,,S,


Total number of rows in daily data: 3119374043


### (b)

In [69]:
# Number of observations recorded for each element
element_count_df = (
    daily_df
    .groupBy("ELEMENT")
    .count()
    .withColumnRenamed("count", "OBSERVATIONS_COUNT")
)

In [73]:
# The five core elements
core_elements = ['PRCP', 'SNOW', 'SNWD', 'TMAX', 'TMIN']

# Keep only the observation counts of the core elements
is_core_element = F.col("ELEMENT").isin(*core_elements)
core_element_counts = element_count_df.where(is_core_element)

show_as_html(core_element_counts)

Unnamed: 0,ELEMENT,OBSERVATIONS_COUNT
0,SNWD,299076145
1,SNOW,356187192
2,TMIN,456739567
3,PRCP,1073530896
4,TMAX,457927581


### (c)

In [146]:
# Station/date keys of every TMAX observation and of every TMIN observation
tmax_data = daily_df.where(F.col("ELEMENT") == "TMAX").select("ID", "DATE")
tmin_data = daily_df.where(F.col("ELEMENT") == "TMIN").select("ID", "DATE")

In [147]:
# A left anti join keeps the TMAX rows that have no TMIN row for the same station and date
tmax_without_tmin = tmax_data.join(tmin_data, on=["ID", "DATE"], how="left_anti")

# Compute both figures in a single aggregation so the anti join over the full daily
# data runs once rather than once per action.
# Note: countDistinct ignores NULL ids, whereas distinct().count() would count NULL as one value.
tmax_without_tmin_summary = tmax_without_tmin.agg(
    F.count("*").alias("OBSERVATIONS"),
    F.countDistinct("ID").alias("STATIONS"),
).first()

# Number of TMAX observations without a matching TMIN
tmax_without_tmin_count = tmax_without_tmin_summary["OBSERVATIONS"]

print(f"Number of TMAX observations without corresponding TMIN: {tmax_without_tmin_count}")

# Number of distinct stations contributing those observations
unique_stations = tmax_without_tmin_summary["STATIONS"]

print(f"Number of unique stations contributing to TMAX observations without corresponding TMIN: {unique_stations}")

Number of TMAX observations without corresponding TMIN: 10567304
Number of unique stations contributing to TMAX observations without corresponding TMIN: 28716


In [1]:
# Stop the Spark session. stop_spark() is defined by the setup cell at the top of the
# notebook, so that cell must have been run in the current kernel first.
stop_spark()

NameError: name 'stop_spark' is not defined