### Notebook used for gathering strong scalability results. 

In [1]:
def parse_hdf5(iterator):
    """Parse HDF5 song files on the worker side and yield artist hotness rows.

    Written to be passed to ``RDD.mapPartitions``: it consumes one partition
    of rows and lazily yields one output row per usable file.

    Args:
        iterator: Iterable of rows exposing ``row["path"]`` (used only in
            error messages) and ``row["content"]`` (the raw bytes of one
            HDF5 file).

    Yields:
        ``Row(artist_name=str, artist_hotttnesss=float)`` built from the first
        record of ``/metadata/songs`` for every file whose hotness is a real
        number in the closed interval [0.0, 1.0]. Files without that dataset,
        with a NaN/out-of-range hotness, or that fail to parse are skipped
        (parse failures are printed, not raised).

    Raises:
        ImportError: If ``h5py`` cannot be imported where this function runs.
    """
    import math
    import time
    from io import BytesIO

    try:
        import h5py
        print("h5py version:", h5py.__version__)
    except ImportError as e:
        print("!!! [ERROR] Worker side has no h5py installed !!!")
        raise e

    # Imported here so the function does not rely on a notebook cell that is
    # executed after this definition.
    from pyspark.sql import Row

    partition_start_time = time.time()  # record partition start time

    for row in iterator:
        file_path = row["path"]
        binary_data = row["content"]
        try:
            # h5py can open an in-memory file-like object, so the bytes read
            # by Spark never have to be written to local disk.
            with h5py.File(BytesIO(binary_data), "r") as h5_file:
                if "metadata" not in h5_file or "songs" not in h5_file["metadata"]:
                    continue

                songs_data = h5_file["metadata"]["songs"][:]
                first_song = songs_data[0]

                raw_name = first_song["artist_name"]
                artist_name = raw_name.decode() if isinstance(raw_name, bytes) else str(raw_name)

                # The field is numeric, so a missing value shows up as a float
                # NaN (comparing it to the string "nan" never matches).
                artist_hotttnesss = float(first_song["artist_hotttnesss"])

                if not math.isnan(artist_hotttnesss) and 0.0 <= artist_hotttnesss <= 1.0:
                    yield Row(artist_name=artist_name, artist_hotttnesss=artist_hotttnesss)
        except Exception as e:
            # Best effort: one corrupt or unexpected file must not fail the
            # whole partition.
            print(f"Error processing {file_path}: {e}")

    partition_end_time = time.time()
    print(f"Partition executed in {partition_end_time - partition_start_time:.2f} seconds")

In [7]:
# Run parameters (edit before each measurement).

# Current number of workers
number_of_workers = 1

# Full datasize for strong scalability
fraction_of_data = 1.0

In [None]:
 # Import libraries
import pyspark
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import avg, desc
from pyspark.sql.types import StructType, StructField, StringType, FloatType
import pandas as pd
from operator import add
import time

try:
    import h5py
    from io import BytesIO
    print("h5py version:", h5py.__version__)
except ImportError as e:
    print("!!! [ERROR] Worker side has no h5py installed !!!")
    raise e


# Seed for the optional down-sampling step so that repeated runs with the
# same fraction request the same sample.
SAMPLE_SEED = 42

# NOTE(review): spark.cores.max is fixed at "12" and is not derived from
# number_of_workers -- confirm that the worker count is controlled elsewhere
# (e.g. by starting/stopping workers on the cluster).
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.130:7077")
    .appName("Group10")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 2)
    .config("spark.driver.port", 9999)
    .config("spark.blockManager.port", 10005)
    .config("spark.cores.max", "12")
    .getOrCreate()
)

try:
    # RDD API
    spark_context = spark_session.sparkContext
    spark_context.setLogLevel("ERROR")

    # record start time (session creation above is not included)
    total_start_time = time.time()

    # read all file paths on HDFS; each row carries a file path and its bytes
    hdfs_base_path = "hdfs://192.168.2.130:9000/data/MillionSongSubset/"
    df_files = spark_session.read.format("binaryFile") \
        .option("recursiveFileLookup", "true") \
        .load(hdfs_base_path) \
        .select("path", "content")

    if 0 < fraction_of_data < 1:
        # Sample a fraction of data (seeded for repeatability)
        df_files = df_files.sample(fraction=fraction_of_data, seed=SAMPLE_SEED)

    # limit the number of worker tasks to prevent overloading of resources
    MAX_PARTITIONS = 30
    df_files = df_files.repartition(MAX_PARTITIONS)

    # parallel processing of HDF5 files
    rdd_parsed = df_files.rdd.mapPartitions(parse_hdf5)

    # convert to DataFrame
    schema = StructType([
        StructField("artist_name", StringType(), True),
        StructField("artist_hotttnesss", FloatType(), True)])
    df_songs = spark_session.createDataFrame(rdd_parsed, schema=schema)

    # calculate avg artist_hotttnesss get Top 5
    # NOTE(review): Spark transformations are lazy and show() is the only
    # action in this cell, so this interval presumably also covers reading
    # and parsing the files, not just the groupBy/avg -- confirm before
    # interpreting aggregation_time as aggregation-only.
    start_time = time.time()  # record aggregation start time

    df_songs.groupBy("artist_name") \
        .agg(avg("artist_hotttnesss").alias("avg_hotttnesss")) \
        .orderBy("avg_hotttnesss", ascending=False) \
        .show(5, truncate=False)

    end_time = time.time()  # record end time

    # Record the total time of Spark task
    total_end_time = time.time()

    # Calculate execution time for aggregation
    aggregation_time = end_time - start_time

    # Calculate execution time for total spark job
    total_time = total_end_time - total_start_time
finally:
    # Stop Spark session even when a step above fails, so the application
    # does not keep holding executors/cores on the cluster.
    spark_session.stop()

h5py version: 3.13.0


                                                                                

+------------------------------+------------------+
|artist_name                   |avg_hotttnesss    |
+------------------------------+------------------+
|Coldplay                      |0.9160532355308533|
|Rihanna                       |0.9082026481628418|
|Taylor Swift                  |0.8974298536777496|
|T.I.                          |0.8728389143943787|
|Usher featuring Jermaine Dupri|0.8546378016471863|
+------------------------------+------------------+
only showing top 5 rows



In [9]:
# Stop Spark session.
# The main job cell already calls spark_session.stop() as its last step, so
# this cell only matters when that cell did not run to completion.
# NOTE(review): presumably calling stop() on an already-stopped session is
# harmless -- confirm for the Spark version in use.
spark_session.stop()

In [10]:
# Display the measured aggregation time in seconds (a difference of two
# time.time() readings); nothing is written to the CSV file in this cell.
aggregation_time

30.219465255737305

In [11]:
# Display the measured total job time in seconds (a difference of two
# time.time() readings).
total_time

103.19308423995972

In [12]:
import os

# One-row frame holding the metrics of the run that just finished.
results = pd.DataFrame({
    "aggregation_time": [aggregation_time],
    "total_time": [total_time],
    "num_of_workers": [number_of_workers],
})

# CSV file that accumulates one row per worker count.
filename = "strong_scalability.csv"

if not os.path.exists(filename):
    # No earlier measurements: the file will contain just this run.
    strong_df = results
else:
    previous_runs = pd.read_csv(filename)

    # Discard any earlier measurement taken with the same number of workers,
    # then append the new one.
    is_other_worker_count = previous_runs["num_of_workers"] != number_of_workers
    strong_df = pd.concat(
        [previous_runs[is_other_worker_count], results],
        ignore_index=True,
    )

# Write to / overwrite the csv file
strong_df.to_csv(filename, index=False)