In [44]:
# Run this cell to import pyspark and to define start_spark() and stop_spark()

import findspark

findspark.init()

import getpass
import pandas as pd
import pyspark
import random
import re

from IPython.display import display, HTML
from pyspark import SparkContext
from pyspark.sql import SparkSession


# Functions used below

def username():
    """Return the login name of the current user without any '@domain' suffix.
    """

    login = getpass.getuser()

    # Everything from the first '@' up to the end of the line is discarded
    return re.sub('@.*', '', login)


def dict_to_html(d):
    """Convert a Python dictionary into a two column HTML table for display.

    Keys and values are converted to text and HTML-escaped, so entries that
    contain characters such as '<', '>' or '&' are shown literally instead of
    being interpreted as markup.

    Args:
        d (dict): mapping to render, one table row per item in iteration order

    Returns:
        str: the HTML markup of the table
    """

    # Imported locally: this notebook later rebinds the global name `html` to a list.
    from html import escape

    rows = [
        f'<tr><td style="text-align:left;">{escape(str(k))}</td><td>{escape(str(v))}</td></tr>'
        for k, v in d.items()
    ]

    return ''.join([
        '<table width="100%" style="width:100%; font-family: monospace;">',
        *rows,
        '</table>',
    ])


def show_as_html(df, n=20):
    """Preview a spark dataframe as html by handing its first rows to pandas for display.

    Args:
        df: spark dataframe to preview
        n (int): number of rows to show (default: 20)
    """

    preview = df.limit(n)
    display(preview.toPandas())

    
def display_spark():
    """Show an html panel reporting whether a Spark session is currently active.

    The session counts as active when both the `spark` and `sc` globals exist.
    """

    session_running = 'spark' in globals() and 'sc' in globals()

    if not session_running:

        # Nothing is running: point the user at the completed applications list
        app_name = username() + " (jupyter)"

        stopped = [
            '<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{app_name}</code> is under the completed applications section in the Spark UI.</p>',
            '<ul>',
            '<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
            '</ul>',
        ]
        display(HTML(''.join(stopped)))
        return

    # A session is running: report its name, links, configuration and usage notes
    name = sc.getConf().get("spark.app.name")

    header = [
        '<p><b>Spark</b></p>',
        f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{name}</code> under the running applications section in the Spark UI.</p>',
        '<ul>',
        '<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
        f'<li><a href="{sc.uiWebUrl}" target="_blank">Spark Application UI</a></li>',
        '</ul>',
    ]
    config = [
        '<p><b>Config</b></p>',
        dict_to_html(dict(sc.getConf().getAll())),
    ]
    notes = [
        '<p><b>Notes</b></p>',
        '<ul>',
        '<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>',
        f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{name}</code> by hand using the link in the Spark UI.</li>',
        '</ul>',
    ]
    display(HTML(''.join(header + config + notes)))


# Functions to start and stop spark

def _spark_memory_string(gigabytes):
    """Format a memory size given in gigabytes as a Spark size string with a whole-number value.

    Whole gigabytes are written as e.g. "4g"; fractional sizes are converted to
    whole megabytes (e.g. 1.5 -> "1536m") because Spark size strings must not
    contain a fractional number.
    """

    megabytes = int(round(gigabytes * 1024))
    if megabytes % 1024 == 0:
        return f"{megabytes // 1024}g"
    return f"{megabytes}m"


def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).

    The session is obtained with getOrCreate() and its status is displayed
    afterwards through display_spark().

    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): executor memory in gigabytes, may be fractional (default: 1)
        master_memory (float): driver memory in gigabytes, may be fractional (default: 1)
    """

    global spark
    global sc

    user = username()

    # Total cores requested across all executors; shuffle partitions are set to 4 per core
    cores = executor_instances * executor_cores
    partitions = cores * 4

    # Randomised UI port in the range 4001-4999
    port = 4000 + random.randint(1, 999)

    spark = (
        SparkSession.builder
        .master("spark://masternode2:7077")
        .config("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{user}/spark/")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.executor.instances", str(executor_instances))
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.cores.max", str(cores))
        .config("spark.executor.memory", _spark_memory_string(worker_memory))
        .config("spark.driver.memory", _spark_memory_string(master_memory))
        .config("spark.driver.maxResultSize", "0")
        .config("spark.sql.shuffle.partitions", str(partitions))
        .config("spark.ui.port", str(port))
        .appName(user + " (jupyter)")
        .getOrCreate()
    )
    sc = SparkContext.getOrCreate()

    display_spark()

    
def stop_spark():
    """Shut down the running Spark session, if any, and remove the `spark` and `sc` globals.

    The resulting status is displayed through display_spark() in either case.
    """

    global spark
    global sc

    session_running = 'spark' in globals() and 'sc' in globals()

    if session_running:
        spark.stop()
        del spark, sc

    display_spark()


# Make css changes to improve spark output readability

# Rules applied to <pre> blocks and to the pandas `dataframe` tables
css_rules = (
    'pre { white-space: pre !important; }',
    'table.dataframe td { white-space: nowrap !important; }',
    'table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }',
)
html = ['<style>', *css_rules, '</style>']
display(HTML(''.join(html)))

In [45]:
from matplotlib import pyplot as plt
import numpy as np
import os
from pyspark.sql import Row, DataFrame, Window, functions as F
from pyspark.sql.types import *

In [61]:
# Request 4 executors with 2 cores each; worker_memory and master_memory are passed to
# spark.executor.memory and spark.driver.memory respectively (4g each here)
start_spark(executor_instances=4, executor_cores=2, worker_memory=4, master_memory=4)

0,1
spark.dynamicAllocation.enabled,false
spark.executor.instances,4
spark.sql.warehouse.dir,file:/users/home/ywa286/Assignment1/notebook/spark-warehouse
spark.driver.memory,4g
spark.driver.port,45519
spark.executor.memory,4g
spark.master,spark://masternode2:7077
spark.executor.id,driver
spark.executor.cores,2
spark.driver.host,mathmadslinux2p.canterbury.ac.nz


## Q1, Enrich Station Data

In [47]:
# Fixed-width layout of the stations file: (column name, 1-based start, width)
stations_layout = [
    ("ID", 1, 11),
    ("LATITUDE", 13, 8),
    ("LONGITUDE", 22, 9),
    ("ELEVATION", 32, 6),
    ("STATE", 39, 2),
    ("NAME", 42, 30),
    ("GSN_FLAG", 73, 3),
    ("HCN_CRN_FLAG", 77, 3),
    ("WMO_ID", 81, 5),
]

# Each line of the text file arrives as a single string column called "value"
stations_text = spark.read.text("hdfs:///data/ghcnd/ghcnd-stations.txt")

stations_df = (
    stations_text
    # slice and trim every field as a string first
    .select(*[
        F.trim(F.substring("value", start, width)).alias(name)
        for name, start, width in stations_layout
    ])
    # then convert the numeric fields in place
    .withColumn("LATITUDE", F.col("LATITUDE").cast("float"))
    .withColumn("LONGITUDE", F.col("LONGITUDE").cast("float"))
    .withColumn("ELEVATION", F.col("ELEVATION").cast("float"))
    .withColumn("WMO_ID", F.col("WMO_ID").cast("int"))
)

show_as_html(stations_df)

Unnamed: 0,ID,LATITUDE,LONGITUDE,ELEVATION,STATE,NAME,GSN_FLAG,HCN_CRN_FLAG,WMO_ID
0,ACW00011604,17.116699,-61.783298,10.1,,ST JOHNS COOLIDGE FLD,,,
1,ACW00011647,17.133301,-61.783298,19.200001,,ST JOHNS,,,
2,AE000041196,25.333,55.516998,34.0,,SHARJAH INTER. AIRP,GSN,,41196.0
3,AEM00041194,25.254999,55.363998,10.4,,DUBAI INTL,,,41194.0
4,AEM00041217,24.433001,54.651001,26.799999,,ABU DHABI INTL,,,41217.0
5,AEM00041218,24.261999,55.609001,264.899994,,AL AIN INTL,,,41218.0
6,AF000040930,35.317001,69.016998,3366.0,,NORTH-SALANG,GSN,,40930.0
7,AFM00040938,34.209999,62.228001,977.200012,,HERAT,,,40938.0
8,AFM00040948,34.566002,69.211998,1791.300049,,KABUL INTL,,,40948.0
9,AFM00040990,31.5,65.849998,1010.0,,KANDAHAR AIRPORT,,,40990.0


In [48]:
# Each line of the countries file arrives as a single string column called "value"
countries_text = spark.read.text("hdfs:///data/ghcnd/ghcnd-countries.txt")

# Fixed-width fields: characters 1-2 hold the code, 61 characters from position 4 hold the name
country_code = F.trim(F.substring("value", 1, 2)).alias("COUNTRY_CODE")
country_name = F.trim(F.substring("value", 4, 61)).alias("COUNTRY_NAME")

countries_df = countries_text.select(country_code, country_name)

show_as_html(countries_df)

Unnamed: 0,COUNTRY_CODE,COUNTRY_NAME
0,AC,Antigua and Barbuda
1,AE,United Arab Emirates
2,AF,Afghanistan
3,AG,Algeria
4,AJ,Azerbaijan
5,AL,Albania
6,AM,Armenia
7,AO,Angola
8,AQ,American Samoa [United States]
9,AR,Argentina


In [49]:
# Each line of the states file arrives as a single string column called "value"
states_text = spark.read.text("hdfs:///data/ghcnd/ghcnd-states.txt")

# Fixed-width fields: characters 1-2 hold the code, 47 characters from position 4 hold the name
state_code = F.trim(F.substring("value", 1, 2)).alias("STATE_CODE")
state_name = F.trim(F.substring("value", 4, 47)).alias("STATE_NAME")

states_df = states_text.select(state_code, state_name)

show_as_html(states_df)

Unnamed: 0,STATE_CODE,STATE_NAME
0,AB,ALBERTA
1,AK,ALASKA
2,AL,ALABAMA
3,AR,ARKANSAS
4,AS,AMERICAN SAMOA
5,AZ,ARIZONA
6,BC,BRITISH COLUMBIA
7,CA,CALIFORNIA
8,CO,COLORADO
9,CT,CONNECTICUT


In [50]:
# Fixed-width layout of the inventory file: (column name, 1-based start, width)
inventory_layout = [
    ("ID", 1, 11),
    ("LATITUDE", 13, 8),
    ("LONGITUDE", 22, 9),
    ("ELEMENT", 32, 4),
    ("FIRSTYEAR", 37, 4),
    ("LASTYEAR", 42, 4),
]

# Each line of the text file arrives as a single string column called "value"
inventory_text = spark.read.text("hdfs:///data/ghcnd/ghcnd-inventory.txt")

inventory_df = (
    inventory_text
    # slice and trim every field as a string first
    .select(*[
        F.trim(F.substring("value", start, width)).alias(name)
        for name, start, width in inventory_layout
    ])
    # then convert the numeric fields in place
    .withColumn("LATITUDE", F.col("LATITUDE").cast("float"))
    .withColumn("LONGITUDE", F.col("LONGITUDE").cast("float"))
    .withColumn("FIRSTYEAR", F.col("FIRSTYEAR").cast("int"))
    .withColumn("LASTYEAR", F.col("LASTYEAR").cast("int"))
)

show_as_html(inventory_df)

Unnamed: 0,ID,LATITUDE,LONGITUDE,ELEMENT,FIRSTYEAR,LASTYEAR
0,ACW00011604,17.116699,-61.783298,TMAX,1949,1949
1,ACW00011604,17.116699,-61.783298,TMIN,1949,1949
2,ACW00011604,17.116699,-61.783298,PRCP,1949,1949
3,ACW00011604,17.116699,-61.783298,SNOW,1949,1949
4,ACW00011604,17.116699,-61.783298,SNWD,1949,1949
5,ACW00011604,17.116699,-61.783298,PGTM,1949,1949
6,ACW00011604,17.116699,-61.783298,WDFG,1949,1949
7,ACW00011604,17.116699,-61.783298,WSFG,1949,1949
8,ACW00011604,17.116699,-61.783298,WT03,1949,1949
9,ACW00011604,17.116699,-61.783298,WT08,1949,1949


In [57]:
# Distinct ELEMENT codes that occur anywhere in the inventory data
unique_element = inventory_df.select('ELEMENT').distinct()

# Report how many distinct codes there are, then preview them
unique_element_count = unique_element.count()
print(f"Number of unique elements: {unique_element_count}")
show_as_html(unique_element)

Number of unique elements: 144


Unnamed: 0,ELEMENT
0,PGTM
1,WT08
2,MDPR
3,ACMH
4,ADPT
5,MDEV
6,WDF2
7,WESD
8,WV01
9,SN01


### (a)

In [None]:
# The country code is the first two characters of the station ID
country_code_from_id = F.substring(F.col("ID"), 1, 2)
stations_df = stations_df.withColumn("COUNTRY_CODE", country_code_from_id)
show_as_html(stations_df)

### (b)

In [52]:
# Left join station and country data to enrich station.
# This cell rebinds stations_df, so a COUNTRY_NAME column left by an earlier run is
# dropped first; otherwise re-running the cell would add a second COUNTRY_NAME column.
# drop() does nothing when the column is not present.
stations_df = stations_df.drop("COUNTRY_NAME").join(
    countries_df,
    "COUNTRY_CODE",
    "left"
)
show_as_html(stations_df)

Unnamed: 0,COUNTRY_CODE,ID,LATITUDE,LONGITUDE,ELEVATION,STATE,NAME,GSN_FLAG,HCN_CRN_FLAG,WMO_ID,COUNTRY_NAME
0,AC,ACW00011604,17.116699,-61.783298,10.1,,ST JOHNS COOLIDGE FLD,,,,Antigua and Barbuda
1,AC,ACW00011647,17.133301,-61.783298,19.200001,,ST JOHNS,,,,Antigua and Barbuda
2,AE,AE000041196,25.333,55.516998,34.0,,SHARJAH INTER. AIRP,GSN,,41196.0,United Arab Emirates
3,AE,AEM00041194,25.254999,55.363998,10.4,,DUBAI INTL,,,41194.0,United Arab Emirates
4,AE,AEM00041217,24.433001,54.651001,26.799999,,ABU DHABI INTL,,,41217.0,United Arab Emirates
5,AE,AEM00041218,24.261999,55.609001,264.899994,,AL AIN INTL,,,41218.0,United Arab Emirates
6,AF,AF000040930,35.317001,69.016998,3366.0,,NORTH-SALANG,GSN,,40930.0,Afghanistan
7,AF,AFM00040938,34.209999,62.228001,977.200012,,HERAT,,,40938.0,Afghanistan
8,AF,AFM00040948,34.566002,69.211998,1791.300049,,KABUL INTL,,,40948.0,Afghanistan
9,AF,AFM00040990,31.5,65.849998,1010.0,,KANDAHAR AIRPORT,,,40990.0,Afghanistan


### (c)

In [53]:
# Left join station and states data to enrich station.
# This cell rebinds stations_df, so a STATE_NAME column left by an earlier run is
# dropped first; otherwise re-running the cell would add a second STATE_NAME column.
# Both the rename and the drop do nothing when their column is not present.
stations_df = (
    stations_df
    .withColumnRenamed("STATE", "STATE_CODE")
    .drop("STATE_NAME")
    .join(
        states_df,
        "STATE_CODE",
        "left"
    )
)
show_as_html(stations_df)

Unnamed: 0,STATE_CODE,COUNTRY_CODE,ID,LATITUDE,LONGITUDE,ELEVATION,NAME,GSN_FLAG,HCN_CRN_FLAG,WMO_ID,COUNTRY_NAME,STATE_NAME
0,,AC,ACW00011604,17.116699,-61.783298,10.1,ST JOHNS COOLIDGE FLD,,,,Antigua and Barbuda,
1,,AC,ACW00011647,17.133301,-61.783298,19.200001,ST JOHNS,,,,Antigua and Barbuda,
2,,AE,AE000041196,25.333,55.516998,34.0,SHARJAH INTER. AIRP,GSN,,41196.0,United Arab Emirates,
3,,AE,AEM00041194,25.254999,55.363998,10.4,DUBAI INTL,,,41194.0,United Arab Emirates,
4,,AE,AEM00041217,24.433001,54.651001,26.799999,ABU DHABI INTL,,,41217.0,United Arab Emirates,
5,,AE,AEM00041218,24.261999,55.609001,264.899994,AL AIN INTL,,,41218.0,United Arab Emirates,
6,,AF,AF000040930,35.317001,69.016998,3366.0,NORTH-SALANG,GSN,,40930.0,Afghanistan,
7,,AF,AFM00040938,34.209999,62.228001,977.200012,HERAT,,,40938.0,Afghanistan,
8,,AF,AFM00040948,34.566002,69.211998,1791.300049,KABUL INTL,,,40948.0,Afghanistan,
9,,AF,AFM00040990,31.5,65.849998,1010.0,KANDAHAR AIRPORT,,,40990.0,Afghanistan,


### (d)

In [54]:
# Define core element list
core_elements = ['PRCP', 'SNOW', 'SNWD', 'TMAX', 'TMIN']

inventory_df_summary = (
    inventory_df
    # flag every inventory row whose element is one of the core elements
    .withColumn("is_core", F.col("ELEMENT").isin(core_elements))
    .groupBy("ID")
    .agg(
        # core elements reported by the station; when() without otherwise() yields null
        # for the other rows and collect_set skips nulls, so a station with no core
        # element gets an empty array
        F.collect_set(F.when(F.col("is_core"), F.col("ELEMENT"))).alias("CORE_ELEMENTS"),
        F.sum(F.when(F.col("is_core"), 1).otherwise(0)).alias("CORE_ELEMENTS_COUNT"),
        # all remaining (non-core) elements reported by the station
        F.collect_set(F.when(~F.col("is_core"), F.col("ELEMENT"))).alias("OTHER_ELEMENTS"),
        F.sum(F.when(~F.col("is_core"), 1).otherwise(0)).alias("OTHER_ELEMENTS_COUNT"),
        # earliest first year and latest last year over all elements of the station
        F.min("FIRSTYEAR").alias("FIRSTYEAR"),
        F.max("LASTYEAR").alias("LASTYEAR")
    )
    # total number of elements reported by the station
    .withColumn("TOTAL_ELEMENTS", F.col("CORE_ELEMENTS_COUNT") + F.col("OTHER_ELEMENTS_COUNT"))
    # span between first and last year; a station whose first and last year coincide gets 0
    # NOTE(review): add 1 here if an inclusive count of active years is intended
    .withColumn("ACTIVE_YEAR", F.col("LASTYEAR") - F.col("FIRSTYEAR"))
)

show_as_html(inventory_df_summary)
inventory_df_summary.printSchema()

Unnamed: 0,ID,CORE_ELEMENTS,CORE_ELEMENTS_COUNT,OTHER_ELEMENTS,OTHER_ELEMENTS_COUNT,FIRSTYEAR,LASTYEAR,TOTAL_ELEMENTS,ACTIVE_YEAR
0,AEM00041217,"[TMAX, TMIN, PRCP]",3,[TAVG],1,1983,2024,4,41
1,AGE00147708,"[TMAX, TMIN, PRCP, SNWD]",4,[TAVG],1,1879,2024,5,145
2,AGE00147710,"[TMAX, TMIN, PRCP]",3,[TAVG],1,1909,2009,4,100
3,AGE00147714,"[TMAX, TMIN, PRCP]",3,[],0,1896,1938,3,42
4,AGE00147719,"[TMAX, TMIN, PRCP]",3,[TAVG],1,1888,2024,4,136
5,AGM00060360,"[TMAX, TMIN, PRCP]",3,[TAVG],1,1945,2024,4,79
6,AGM00060445,"[TMAX, TMIN, PRCP, SNWD]",4,[TAVG],1,1957,2024,5,67
7,AGM00060452,"[TMAX, TMIN, PRCP]",3,[TAVG],1,1985,2024,4,39
8,AGM00060511,"[TMAX, TMIN, PRCP, SNWD]",4,[TAVG],1,1983,2024,5,41
9,AGM00060540,"[TMAX, TMIN, PRCP, SNWD]",4,[TAVG],1,1981,2022,5,41


In [25]:
# number of stations present in the inventory (the summary has one row per station ID)
total_stations = inventory_df_summary.count()
# number of stations that report all five core elements
stations_with_all_core = inventory_df_summary.filter(F.size("CORE_ELEMENTS") == 5).count()
# number of stations whose only element is PRCP
stations_only_prcp = inventory_df_summary.filter(
    (F.array_contains(F.col("CORE_ELEMENTS"), "PRCP")) &
    (F.size("CORE_ELEMENTS") == 1) &
    (F.col("OTHER_ELEMENTS_COUNT") == 0)
).count()
# number of rows in the stations table, reported below for comparison with the inventory
total_stations_station = stations_df.count()

print(f"Total number of stations: {total_stations}")
print(f"Number of stations collecting all five core elements: {stations_with_all_core}")
print(f"Number of stations collecting only precipitation (PRCP) and no other elements: {stations_only_prcp}")
print(f"Total number of stations in the stations table: {total_stations_station}")

Total number of stations: 127984
Number of stations collecting all five core elements: 20482
Number of stations collecting only precipitation (PRCP) and no other elements: 16308


### (e)

In [26]:
# Enrich the stations table: every station is kept and gains the columns of its
# inventory summary row when one exists for its ID
station_enriched = stations_df.join(inventory_df_summary, on="ID", how="LEFT")

show_as_html(station_enriched)

Unnamed: 0,ID,STATE_CODE,COUNTRY_CODE,LATITUDE,LONGITUDE,ELEVATION,NAME,GSN_FLAG,HCN_CRN_FLAG,WMO_ID,COUNTRY_NAME,STATE_NAME,CORE_ELEMENTS,CORE_ELEMENTS_COUNT,OTHER_ELEMENTS,OTHER_ELEMENTS_COUNT,FIRSTYEAR,LASTYEAR,TOTAL_ELEMENTS,ACTIVE_YEAR
0,AEM00041217,,AE,24.433001,54.651001,26.799999,ABU DHABI INTL,,,41217.0,United Arab Emirates,,"[TMAX, TMIN, PRCP]",3,[TAVG],1,1983,2024,4,41
1,AGE00147708,,AG,36.720001,4.05,222.0,TIZI OUZOU,,,60395.0,Algeria,,"[TMAX, TMIN, PRCP, SNWD]",4,[TAVG],1,1879,2024,5,145
2,AGE00147710,,AG,36.75,5.1,9.0,BEJAIA-BOUGIE (PORT),,,60401.0,Algeria,,"[TMAX, TMIN, PRCP]",3,[TAVG],1,1909,2009,4,100
3,AGE00147714,,AG,35.77,0.8,78.0,ORAN-CAP FALCON,,,,Algeria,,"[TMAX, TMIN, PRCP]",3,[],0,1896,1938,3,42
4,AGE00147719,,AG,33.799702,2.89,767.0,LAGHOUAT,,,60545.0,Algeria,,"[TMAX, TMIN, PRCP]",3,[TAVG],1,1888,2024,4,136
5,AGM00060360,,AG,36.821999,7.809,4.9,ANNABA,,,60360.0,Algeria,,"[TMAX, TMIN, PRCP]",3,[TAVG],1,1945,2024,4,79
6,AGM00060445,,AG,36.178001,5.324,1050.0,SETIF AIN ARNAT,,,60445.0,Algeria,,"[TMAX, TMIN, PRCP, SNWD]",4,[TAVG],1,1957,2024,5,67
7,AGM00060452,,AG,35.817001,-0.267,4.0,ARZEW,,,60452.0,Algeria,,"[TMAX, TMIN, PRCP]",3,[TAVG],1,1985,2024,4,39
8,AGM00060511,,AG,35.341,1.463,989.099976,BOU CHEKIF,,,60511.0,Algeria,,"[TMAX, TMIN, PRCP, SNWD]",4,[TAVG],1,1983,2024,5,41
9,AGM00060540,,AG,34.150002,0.067,1001.0,EL-KHEITER,,,60540.0,Algeria,,"[TMAX, TMIN, PRCP, SNWD]",4,[TAVG],1,1981,2022,5,41


In [30]:
# Store the enriched stations table as parquet, overwriting any previous copy.
# NOTE(review): the path is relative, so where it lands depends on the session's
# default filesystem and working directory - confirm before sharing the notebook.
station_enriched.write.mode("overwrite").parquet("./station_enriched.parquet")

##  Q2, Detecting Missing Stations in Daily Data

In [64]:
# Load the enriched station table from the parquet output written in Q1.
# NOTE: this rebinds stations_df, the name used in Q1 for the stations table while
# it was being enriched.
stations_df = spark.read.parquet("station_enriched.parquet")
show_as_html(stations_df)

Unnamed: 0,ID,STATE_CODE,COUNTRY_CODE,LATITUDE,LONGITUDE,ELEVATION,NAME,GSN_FLAG,HCN_CRN_FLAG,WMO_ID,COUNTRY_NAME,STATE_NAME,CORE_ELEMENTS,CORE_ELEMENTS_COUNT,OTHER_ELEMENTS,OTHER_ELEMENTS_COUNT,FIRSTYEAR,LASTYEAR,TOTAL_ELEMENTS,ACTIVE_YEAR
0,AE000041196,,AE,25.333,55.516998,34.0,SHARJAH INTER. AIRP,GSN,,41196.0,United Arab Emirates,,"[TMAX, TMIN, PRCP]",3,[TAVG],1,1944,2024,4,80
1,AEM00041218,,AE,24.261999,55.609001,264.899994,AL AIN INTL,,,41218.0,United Arab Emirates,,"[TMAX, TMIN, PRCP]",3,[TAVG],1,1994,2024,4,30
2,AGE00147715,,AG,35.419998,8.1197,863.0,TEBESSA,,,,Algeria,,"[TMAX, TMIN, PRCP]",3,[],0,1879,1938,3,59
3,AGE00147794,,AG,36.779999,5.1,225.0,BEJAIA-CAP CARBON,,,,Algeria,,"[TMAX, TMIN]",2,[],0,1926,1938,2,12
4,AGM00060402,,AG,36.712002,5.07,6.1,SOUMMAM,,,60402.0,Algeria,,"[TMAX, TMIN, PRCP, SNWD]",4,[TAVG],1,1973,2024,5,51
5,AGM00060430,,AG,36.299999,2.233,721.0,MILIANA,,,60430.0,Algeria,,"[TMAX, TMIN, PRCP, SNWD]",4,[TAVG],1,1957,2024,5,67
6,AGM00060461,,AG,35.700001,-0.65,22.0,ORAN-PORT,,,60461.0,Algeria,,"[TMAX, TMIN, PRCP]",3,[TAVG],1,1995,2024,4,29
7,AGM00060514,,AG,35.167,2.317,801.0,KSAR CHELLALA,,,60514.0,Algeria,,"[TMAX, TMIN, PRCP, SNWD]",4,[TAVG],1,1995,2024,5,29
8,AGM00060515,,AG,35.333,4.206,459.0,BOU SAADA,,,60515.0,Algeria,,"[TMAX, TMIN, PRCP]",3,[TAVG],1,1984,2024,4,40
9,AGM00060550,,AG,33.667,1.0,1347.0,EL-BAYADH,,,60550.0,Algeria,,"[TMAX, TMIN, PRCP, SNWD]",4,[TAVG],1,1973,2024,5,51


In [37]:
# Define the schema of the daily observation files.
# NOTE: the schema printed by this cell reports every column as nullable = true,
# so the nullable=False flags below are not enforced when reading the csv.
daily_schema = StructType([
    StructField("ID", StringType(), nullable=False),
    StructField("DATE", StringType(), nullable=False),
    StructField("ELEMENT", StringType(), nullable=False),
    StructField("VALUE", FloatType(), nullable=False),
    StructField("MEASUREMENT_FLAG", StringType(), nullable=True),
    StructField("QUALITY_FLAG", StringType(), nullable=True),
    StructField("SOURCE_FLAG", StringType(), nullable=True),
    StructField("OBSERVATION_TIME", StringType(), nullable=True)
])

# Read a 1000 row sample of the 2023 daily csv and parse the date and time columns
daily_df = (
    spark.read.format("csv")
    .option("header", "false")
    .option("sep", ",")
    .schema(daily_schema)
    .load("hdfs:///data/ghcnd/daily/2023.csv.gz")
    .limit(1000)
    # DATE is stored as yyyyMMdd text
    .withColumn("DATE", F.to_date(F.col("DATE"), "yyyyMMdd"))
    # OBSERVATION_TIME is HHmm text; it is parsed against a dummy date and then
    # formatted back to an "HH:mm" string
    .withColumn(
        "OBSERVATION_TIME",
        F.to_timestamp(F.concat(F.lit("1970-01-01 "), F.col("OBSERVATION_TIME")), "yyyy-MM-dd HHmm")
    )
    .withColumn("OBSERVATION_TIME", F.date_format(F.col("OBSERVATION_TIME"), "HH:mm"))
)

# Show the parsed sample and its schema (this is the output recorded for this cell)
show_as_html(daily_df)
daily_df.printSchema()

Unnamed: 0,ID,DATE,ELEMENT,VALUE,MEASUREMENT_FLAG,QUALITY_FLAG,SOURCE_FLAG,OBSERVATION_TIME
0,AE000041196,2023-01-01,TMAX,252.0,,,S,
1,AE000041196,2023-01-01,TMIN,149.0,,,S,
2,AE000041196,2023-01-01,PRCP,0.0,D,,S,
3,AE000041196,2023-01-01,TAVG,207.0,H,,S,
4,AEM00041194,2023-01-01,TMAX,255.0,,,S,
5,AEM00041194,2023-01-01,TMIN,186.0,,,S,
6,AEM00041194,2023-01-01,PRCP,0.0,,,S,
7,AEM00041194,2023-01-01,TAVG,223.0,H,,S,
8,AEM00041217,2023-01-01,TMAX,248.0,,,S,
9,AEM00041217,2023-01-01,TMIN,184.0,,,S,


root
 |-- ID: string (nullable = true)
 |-- DATE: date (nullable = true)
 |-- ELEMENT: string (nullable = true)
 |-- VALUE: float (nullable = true)
 |-- MEASUREMENT_FLAG: string (nullable = true)
 |-- QUALITY_FLAG: string (nullable = true)
 |-- SOURCE_FLAG: string (nullable = true)
 |-- OBSERVATION_TIME: string (nullable = true)



### (a)

In [38]:
# Attach the station metadata to every daily observation; observations are kept
# even when no station row matches their ID
daily_stations_df = daily_df.join(stations_df, on="ID", how="left")
show_as_html(daily_stations_df)

Unnamed: 0,ID,DATE,ELEMENT,VALUE,MEASUREMENT_FLAG,QUALITY_FLAG,SOURCE_FLAG,OBSERVATION_TIME,STATE_CODE,COUNTRY_CODE,...,COUNTRY_NAME,STATE_NAME,CORE_ELEMENTS,CORE_ELEMENTS_COUNT,OTHER_ELEMENTS,OTHER_ELEMENTS_COUNT,FIRSTYEAR,LASTYEAR,TOTAL_ELEMENTS,ACTIVE_YEAR
0,AE000041196,2023-01-01,TMAX,252.0,,,S,,,AE,...,United Arab Emirates,,"[TMAX, TMIN, PRCP]",3,[TAVG],1,1944,2024,4,80
1,AE000041196,2023-01-01,TMIN,149.0,,,S,,,AE,...,United Arab Emirates,,"[TMAX, TMIN, PRCP]",3,[TAVG],1,1944,2024,4,80
2,AE000041196,2023-01-01,PRCP,0.0,D,,S,,,AE,...,United Arab Emirates,,"[TMAX, TMIN, PRCP]",3,[TAVG],1,1944,2024,4,80
3,AE000041196,2023-01-01,TAVG,207.0,H,,S,,,AE,...,United Arab Emirates,,"[TMAX, TMIN, PRCP]",3,[TAVG],1,1944,2024,4,80
4,AEM00041194,2023-01-01,TMAX,255.0,,,S,,,AE,...,United Arab Emirates,,"[TMAX, TMIN, PRCP]",3,[TAVG],1,1983,2024,4,41
5,AEM00041194,2023-01-01,TMIN,186.0,,,S,,,AE,...,United Arab Emirates,,"[TMAX, TMIN, PRCP]",3,[TAVG],1,1983,2024,4,41
6,AEM00041194,2023-01-01,PRCP,0.0,,,S,,,AE,...,United Arab Emirates,,"[TMAX, TMIN, PRCP]",3,[TAVG],1,1983,2024,4,41
7,AEM00041194,2023-01-01,TAVG,223.0,H,,S,,,AE,...,United Arab Emirates,,"[TMAX, TMIN, PRCP]",3,[TAVG],1,1983,2024,4,41
8,AEM00041217,2023-01-01,TMAX,248.0,,,S,,,AE,...,United Arab Emirates,,"[TMAX, TMIN, PRCP]",3,[TAVG],1,1983,2024,4,41
9,AEM00041217,2023-01-01,TMIN,184.0,,,S,,,AE,...,United Arab Emirates,,"[TMAX, TMIN, PRCP]",3,[TAVG],1,1983,2024,4,41


### (b)

In [40]:
# A left anti join keeps only the daily rows whose ID has no match in stations
missing_ids_df = daily_df.join(stations_df, on="ID", how="left_anti")
show_as_html(missing_ids_df)

Unnamed: 0,ID,DATE,ELEMENT,VALUE,MEASUREMENT_FLAG,QUALITY_FLAG,SOURCE_FLAG,OBSERVATION_TIME


In [62]:
# Repeat the check on the whole daily data set, reusing daily_schema as defined
# for the 2023 sample earlier in this section
daily_path = "hdfs:///data/ghcnd/daily/*.csv.gz"

# Read every yearly daily file
daily_all_df = (
    spark.read.format("csv")
    .option("header", "false")
    .option("sep", ",")
    .schema(daily_schema)
    .load(daily_path)
)

# Reduce the observations to one row per station ID before the anti join, so the
# result lists each missing station once instead of repeating it for every one of
# its daily observations
missing_ids_df = (
    daily_all_df
    .select("ID")
    .distinct()
    .join(stations_df, "ID", "left_anti")
)
show_as_html(missing_ids_df)

In [67]:
# Stop the Spark session and remove the spark / sc globals before closing the notebook
stop_spark()