## Q1, Exploration of Daily

### (a)

In [24]:
# Look up the configured HDFS block size (dfs.blocksize), reported in bytes
!hdfs getconf -confKey "dfs.blocksize"

134217728


In [25]:
# Get the size of the 2023 and 2024 daily files (first column: file size in bytes; second column: bytes used across all replicas)
!hdfs dfs -du /data/ghcnd/daily/2023.csv.gz
!hdfs dfs -du /data/ghcnd/daily/2024.csv.gz

168357302  1346858416  /data/ghcnd/daily/2023.csv.gz
88831735  710653880  /data/ghcnd/daily/2024.csv.gz


In [11]:
# List the HDFS blocks that make up each of the two daily files
!hdfs fsck /data/ghcnd/daily/2023.csv.gz -files -blocks
!hdfs fsck /data/ghcnd/daily/2024.csv.gz -files -blocks

Connecting to namenode via http://masternode2:9870/fsck?ugi=ywa286&files=1&blocks=1&path=%2Fdata%2Fghcnd%2Fdaily%2F2023.csv.gz
FSCK started by ywa286 (auth:SIMPLE) from /192.168.40.11 for path /data/ghcnd/daily/2023.csv.gz at Thu Aug 29 21:02:08 NZST 2024

/data/ghcnd/daily/2023.csv.gz 168357302 bytes, replicated: replication=8, 2 block(s):  OK
0. BP-700027894-132.181.129.68-1626517177804:blk_1074220535_479735 len=134217728 Live_repl=8
1. BP-700027894-132.181.129.68-1626517177804:blk_1074220536_479736 len=34139574 Live_repl=8


Status: HEALTHY
 Number of data-nodes:	32
 Number of racks:		1
 Total dirs:			0
 Total symlinks:		0

Replicated Blocks:
 Total size:	168357302 B
 Total files:	1
 Total blocks (validated):	2 (avg. block size 84178651 B)
 Minimally replicated blocks:	2 (100.0 %)
 Over-replicated blocks:	0 (0.0 %)
 Under-replicated blocks:	0 (0.0 %)
 Mis-replicated blocks:		0 (0.0 %)
 Default replication factor:	4
 Average block replication:	8.0
 Missing blocks:		0
 Corrupt blocks:

In [1]:
# Run this cell to import pyspark and to define start_spark() and stop_spark()

import findspark

findspark.init()

import getpass
import pandas as pd
import pyspark
import random
import re

from IPython.display import display, HTML
from pyspark import SparkContext
from pyspark.sql import SparkSession


# Functions used below

def username():
    """Return the current login name with any domain suffix stripped.

    The login name comes from getpass.getuser(); everything from an "@"
    onwards (up to the end of the line) is removed.
    """

    domain_suffix = re.compile('@.*')
    login = getpass.getuser()

    return domain_suffix.sub('', login)


def dict_to_html(d):
    """Convert a Python dictionary into a two column HTML table for display.

    Keys and values are converted to text and HTML-escaped, so entries that
    contain characters such as "<", ">" or "&" are shown literally instead of
    being interpreted as markup.

    Args:
        d (dict): mapping to render; one table row is emitted per item, in
            the dictionary's iteration order

    Returns:
        str: the table markup as a single string
    """

    # Imported locally under a distinct name: this notebook rebinds the global
    # name `html` to a list of markup fragments, which would shadow the module.
    from html import escape

    rows = ''.join(
        f'<tr><td style="text-align:left;">{escape(str(k))}</td><td>{escape(str(v))}</td></tr>'
        for k, v in d.items()
    )

    return f'<table width="100%" style="width:100%; font-family: monospace;">{rows}</table>'


def show_as_html(df, n=20):
    """Preview a spark dataframe through the pandas rich display in jupyter.

    Args:
        df: dataframe to preview; it is truncated with limit(n) and then
            converted with toPandas() before being displayed
        n (int): number of rows to show (default: 20)
    """

    preview = df.limit(n)
    display(preview.toPandas())

    
def display_spark():
    """Render an HTML status panel describing the Spark session.

    When both the `spark` and `sc` globals exist the panel reports an active
    session together with links to the Spark UIs and the full configuration;
    otherwise it reports that the session is stopped.
    """

    session_running = 'spark' in globals() and 'sc' in globals()

    # Stopped case first: only the expected application name and the UI link.
    if not session_running:
        parts = [
            '<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{username() + " (jupyter)"}</code> is under the completed applications section in the Spark UI.</p>',
            '<ul>',
            '<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
            '</ul>',
        ]
        display(HTML(''.join(parts)))
        return

    # Active case: application name, UI links, configuration table and notes.
    name = sc.getConf().get("spark.app.name")
    parts = [
        '<p><b>Spark</b></p>',
        f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{name}</code> under the running applications section in the Spark UI.</p>',
        '<ul>',
        '<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
        f'<li><a href="{sc.uiWebUrl}" target="_blank">Spark Application UI</a></li>',
        '</ul>',
        '<p><b>Config</b></p>',
        dict_to_html(dict(sc.getConf().getAll())),
        '<p><b>Notes</b></p>',
        '<ul>',
        '<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>',
        f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{name}</code> by hand using the link in the Spark UI.</li>',
        '</ul>',
    ]
    display(HTML(''.join(parts)))


# Functions to start and stop spark

def _memory_setting(gigabytes):
    """Format a memory size given in gigabytes as a Spark size string."""

    # Spark size strings must be whole numbers, so "1.5g" is rejected.
    # Whole gigabytes keep the familiar "<n>g" form; fractional sizes are
    # expressed in whole mebibytes instead (1.5 -> "1536m").
    if float(gigabytes).is_integer():
        return f"{int(float(gigabytes))}g"

    return f"{int(round(float(gigabytes) * 1024))}m"


def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).

    The session is named "<username> (jupyter)", connects to the standalone
    master at spark://masternode2:7077, and uses a randomly chosen UI port in
    the range 4001-4999. The number of shuffle partitions is set to four times
    the total number of executor cores. The session status is displayed once
    the session has been created.

    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): executor memory in gigabytes (default: 1);
            fractional values are converted to whole mebibytes
        master_memory (float): driver memory in gigabytes (default: 1);
            fractional values are converted to whole mebibytes
    """

    global spark
    global sc

    user = username()

    cores = executor_instances * executor_cores
    partitions = cores * 4
    port = 4000 + random.randint(1, 999)

    spark = (
        SparkSession.builder
        .master("spark://masternode2:7077")
        .config("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{user}/spark/")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.executor.instances", str(executor_instances))
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.cores.max", str(cores))
        .config("spark.executor.memory", _memory_setting(worker_memory))
        .config("spark.driver.memory", _memory_setting(master_memory))
        .config("spark.driver.maxResultSize", "0")
        .config("spark.sql.shuffle.partitions", str(partitions))
        .config("spark.ui.port", str(port))
        .appName(user + " (jupyter)")
        .getOrCreate()
    )
    sc = SparkContext.getOrCreate()

    display_spark()

    
def stop_spark():
    """Shut down the running Spark session, if any, and report the status.

    When both the `spark` and `sc` globals exist, the session is stopped and
    both globals are deleted. The session status is displayed in every case.
    """

    global spark, sc

    session_defined = all(name in globals() for name in ('spark', 'sc'))

    if session_defined:
        spark.stop()
        del spark, sc

    display_spark()


# Inject css overrides so that spark output is easier to read in the notebook

html = ['<style>']
html.extend([
    # keep preformatted text on one line instead of wrapping it
    'pre { white-space: pre !important; }',
    # keep dataframe cells on one line
    'table.dataframe td { white-space: nowrap !important; }',
    # hide the index column of rendered dataframes
    'table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }',
])
html.append('</style>')

display(HTML(''.join(html)))

In [2]:
from matplotlib import pyplot as plt
import numpy as np
import os
from pyspark.sql import Row, DataFrame, Window, functions as F
from pyspark.sql.types import *

In [3]:
# Start a Spark session with 2 executors of 1 core each, 1g of executor memory and 1g of driver memory
start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1)

0,1
spark.dynamicAllocation.enabled,false
spark.app.id,app-20240911175051-0447
spark.app.startTime,1726033849582
spark.driver.port,42915
spark.sql.warehouse.dir,file:/users/home/ywa286/Assignment1/notebook/spark-warehouse
spark.master,spark://masternode2:7077
spark.executor.id,driver
spark.ui.port,4273
spark.driver.memory,1g
spark.driver.host,mathmadslinux2p.canterbury.ac.nz


### (b)

In [4]:
# Explicit schema for the header-less daily csv files: one
# (column name, data type, nullable) entry per column, in file order.
daily_schema = StructType([
    StructField(column, data_type, nullable=is_nullable)
    for column, data_type, is_nullable in (
        ("ID", StringType(), False),
        ("DATE", StringType(), False),
        ("ELEMENT", StringType(), False),
        ("VALUE", FloatType(), False),
        ("MEASUREMENT_FLAG", StringType(), True),
        ("QUALITY_FLAG", StringType(), True),
        ("SOURCE_FLAG", StringType(), True),
        ("OBSERVATION_TIME", StringType(), True),
    )
])

In [42]:
# Read the 2023 daily file with the explicit schema, then count and print its rows
daily_df_2023 = (
    spark.read.format("csv")
    .schema(daily_schema)
    .option("sep", ",")
    .option("header", "false")
    .load("/data/ghcnd/daily/2023.csv.gz")
)
count_2023 = daily_df_2023.count()
print(count_2023)

37867272


In [35]:
# Read the 2024 daily file with the explicit schema, then count and print its rows
daily_df_2024 = (
    spark.read.format("csv")
    .schema(daily_schema)
    .option("sep", ",")
    .option("header", "false")
    .load("/data/ghcnd/daily/2024.csv.gz")
)
count_2024 = daily_df_2024.count()
print(count_2024)

19720790


In [33]:
# Report how many partitions each of the two daily dataframes has
partitions_2023, partitions_2024 = (
    frame.rdd.getNumPartitions() for frame in (daily_df_2023, daily_df_2024)
)

print(
    f"Number of partitions for 2023 data: {partitions_2023}",
    f"Number of partitions for 2024 data: {partitions_2024}",
    sep="\n",
)

Number of partitions for 2023 data: 1
Number of partitions for 2024 data: 1


### (c)

In [5]:
# Read the daily files for 2014 to 2023 (two glob patterns) and parse the date/time columns
daily_14_23_df = (
    spark.read.format("csv")
    .schema(daily_schema)
    .option("sep", ",")
    .option("header", "false")
    .load(["/data/ghcnd/daily/201[4-9].csv.gz", "/data/ghcnd/daily/202[0-3].csv.gz"])
    # DATE: "yyyyMMdd" text -> date
    .withColumn("DATE", F.to_date(F.col("DATE"), "yyyyMMdd"))
    # OBSERVATION_TIME: "HHmm" text -> timestamp on a placeholder date ...
    .withColumn(
        "OBSERVATION_TIME",
        F.to_timestamp(
            F.concat(F.lit("1970-01-01 "), F.col("OBSERVATION_TIME")),
            "yyyy-MM-dd HHmm",
        ),
    )
    # ... -> "HH:mm" text
    .withColumn("OBSERVATION_TIME", F.date_format(F.col("OBSERVATION_TIME"), "HH:mm"))
)


In [8]:
# Partition count and row count of the 2014 to 2023 daily dataframe
partitions_all = daily_14_23_df.rdd.getNumPartitions()
count_14_23 = daily_14_23_df.count()

print(partitions_all, count_14_23, sep="\n")

In [10]:
# Stop the active Spark session and remove the `spark` and `sc` globals
stop_spark()