* Import necessary modules

In [2]:
import glob
from pyspark.sql import SparkSession
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

* Configure the base path

In [20]:
# Root HDFS directory that holds the input Excel workbooks; the host/port
# matches the fs.defaultFS value configured in create_spark_session().
base_path = 'hdfs://localhost:9000/user/hadoop/input'

* Function to create a Spark session that connects to Hadoop

In [5]:
def create_spark_session():
    """
    Build (or reuse) the SparkSession used by this notebook.

    The session is named "Excel Processing", uses hdfs://localhost:9000 as
    its default filesystem, requests the spark-excel package and applies
    small memory / partition settings.
    """
    # Applied in insertion order, one .config() call per entry.
    settings = {
        "spark.hadoop.fs.defaultFS": "hdfs://localhost:9000",
        "spark.jars.packages": "com.crealytics:spark-excel_2.12:0.13.7",
        "spark.driver.memory": "2g",
        "spark.executor.memory": "2g",
        "spark.sql.shuffle.partitions": "2",
        "spark.driver.maxResultSize": "1g",
        "spark.memory.fraction": "0.6",
        "spark.memory.storageFraction": "0.5",
        "spark.executor.instances": "2",
    }

    builder = SparkSession.builder.appName("Excel Processing")
    for key, value in settings.items():
        builder = builder.config(key, value)
    return builder.getOrCreate()

* Create a function to load data from Hadoop

In [34]:
def load_excel_files(spark, file_path, sheet_name="RUA Data", start_cell="A1"):
    """
    Load one Excel workbook into a Spark DataFrame using spark-excel.

    Despite the plural name, each call reads only the workbook located at
    ``file_path``; call it once per file to load several workbooks.

    Parameters
    ----------
    spark : SparkSession
        Active session; its ``read`` attribute is used to build the reader.
    file_path : str
        Path of the workbook to read (e.g. an ``hdfs://`` URI).
    sheet_name : str, optional
        Worksheet to read. Defaults to "RUA Data".
    start_cell : str, optional
        Top-left cell of the data range. Defaults to "A1", so the first row
        of the sheet is used as the header row.
        NOTE(review): in the sample output shown in this notebook the real
        column names (Line#, Date, Time, ...) appear a few rows below a
        title banner; passing e.g. ``start_cell="A5"`` may give proper
        headers -- confirm against the actual workbooks.

    Returns
    -------
    DataFrame
        Whatever the reader's ``load`` call returns for ``file_path``.
    """
    # spark-excel data address syntax: 'Sheet Name'!<cell>
    data_address = f"'{sheet_name}'!{start_cell}"

    reader = (
        spark.read.format("com.crealytics.spark.excel")
        .option("header", "true")
        .option("dataAddress", data_address)
        .option("maxRowsInMemory", 1000)
        .option("treatEmptyValuesAsNulls", "true")
    )
    return reader.load(file_path)

* Test reading data from a single xlsx file

In [35]:
# Smoke test: start the session and read a single workbook from HDFS.
spark = create_spark_session()
hdfs_path = f'{base_path}/jUNE-2021.xlsx'
df = load_excel_files(spark=spark, file_path=hdfs_path)

# Preview the first five rows to check how the sheet was parsed.
df.show(n=5)

+-----------------------+--------+--------+--------------------+--------------------+---------+--------------------+------+
|CE SAIN Weather Station|     _c1|     _c2|                 _c3|                 _c4|      _c5|                 _c6|   _c7|
+-----------------------+--------+--------+--------------------+--------------------+---------+--------------------+------+
|   Export timeframe:...|    NULL|    NULL|                NULL|                NULL|     NULL|                NULL|  NULL|
|   Location: Royal U...|    NULL|    NULL|                NULL|                NULL|     NULL|                NULL|  NULL|
|                   NULL|    NULL|    NULL|                NULL|                NULL|     NULL|                NULL|  NULL|
|                  Line#|    Date|    Time|Water Content (m3...|Solar Radiation (...|Rain (mm)|Temperature (Celc...|RH (%)|
|                      1|21/06/01|00:00:00|              0.2942|                   1|        0|               27.68|  89.6|
+-------

* List all the data files that we work with

In [25]:
# Full HDFS path of every workbook, built from base_path plus the file name
# (the June file really is spelled with a lowercase "j").
file_paths = [
    f"{base_path}/{filename}"
    for filename in (
        "APRIL-2021.xlsx",
        "APRIL-2022.xlsx",
        "AUGUST-2021.xlsx",
        "DECEMBER-2020.xlsx",
        "DECEMBER-2021.xlsx",
        "FEBRUARY-2021.xlsx",
        "FEBRUARY-2022.xlsx",
        "JANUARY-2021.xlsx",
        "JANUARY-2022.xlsx",
        "JULY-2021.xlsx",
        "MARCH-2021.xlsx",
        "MARCH-2022.xlsx",
        "MAY-2021.xlsx",
        "NOVEMBER-2020.xlsx",
        "NOVEMBER-2021.xlsx",
        "OCTOBER-2020.xlsx",
        "OCTOBER-2021.xlsx",
        "SEPTEMBER-2020.xlsx",
        "SEPTEMBER-2021.xlsx",
        "jUNE-2021.xlsx",
    )
]
print(file_paths)


['hdfs://localhost:9000/user/hadoop/input/APRIL-2021.xlsx', 'hdfs://localhost:9000/user/hadoop/input/APRIL-2022.xlsx', 'hdfs://localhost:9000/user/hadoop/input/AUGUST-2021.xlsx', 'hdfs://localhost:9000/user/hadoop/input/DECEMBER-2020.xlsx', 'hdfs://localhost:9000/user/hadoop/input/DECEMBER-2021.xlsx', 'hdfs://localhost:9000/user/hadoop/input/FEBRUARY-2021.xlsx', 'hdfs://localhost:9000/user/hadoop/input/FEBRUARY-2022.xlsx', 'hdfs://localhost:9000/user/hadoop/input/JANUARY-2021.xlsx', 'hdfs://localhost:9000/user/hadoop/input/JANUARY-2022.xlsx', 'hdfs://localhost:9000/user/hadoop/input/JULY-2021.xlsx', 'hdfs://localhost:9000/user/hadoop/input/MARCH-2021.xlsx', 'hdfs://localhost:9000/user/hadoop/input/MARCH-2022.xlsx', 'hdfs://localhost:9000/user/hadoop/input/MAY-2021.xlsx', 'hdfs://localhost:9000/user/hadoop/input/NOVEMBER-2020.xlsx', 'hdfs://localhost:9000/user/hadoop/input/NOVEMBER-2021.xlsx', 'hdfs://localhost:9000/user/hadoop/input/OCTOBER-2020.xlsx', 'hdfs://localhost:9000/user/hadoo

In [36]:
from pyspark.sql import SparkSession
import pandas as pd


# One pandas DataFrame per workbook that could be read successfully.
excl_list = []

# Single source of truth for the output file, so the write and the log
# message cannot drift apart (the data is written as CSV, not Excel).
merged_csv_path = "MERGED_PHNOM_PENH_DATASET.csv"

# Read each workbook through Spark, then pull it to the driver as pandas.
for file in file_paths:
    try:
        print(f"Reading file: {file}")
        # Read the file into a Spark DataFrame
        spark_df = load_excel_files(spark=spark, file_path=file)

        # Convert Spark DataFrame to Pandas DataFrame
        excl_list.append(spark_df.toPandas())
    except Exception as e:
        # Best-effort: report the failing file and keep going with the rest.
        print(f"Error reading {file}: {e}")

# Concatenate all Pandas DataFrames and export the result to CSV
if excl_list:
    excl_merged = pd.concat(excl_list, ignore_index=True)
    excl_merged.to_csv(merged_csv_path, index=False)
    print(f"Merged dataset saved to {merged_csv_path}")
else:
    print("No valid files to merge.")


Reading file: hdfs://localhost:9000/user/hadoop/input/APRIL-2021.xlsx
Reading file: hdfs://localhost:9000/user/hadoop/input/APRIL-2022.xlsx
Reading file: hdfs://localhost:9000/user/hadoop/input/AUGUST-2021.xlsx
Reading file: hdfs://localhost:9000/user/hadoop/input/DECEMBER-2020.xlsx
Reading file: hdfs://localhost:9000/user/hadoop/input/DECEMBER-2021.xlsx
Reading file: hdfs://localhost:9000/user/hadoop/input/FEBRUARY-2021.xlsx
Reading file: hdfs://localhost:9000/user/hadoop/input/FEBRUARY-2022.xlsx
Reading file: hdfs://localhost:9000/user/hadoop/input/JANUARY-2021.xlsx
Reading file: hdfs://localhost:9000/user/hadoop/input/JANUARY-2022.xlsx
Reading file: hdfs://localhost:9000/user/hadoop/input/JULY-2021.xlsx
Reading file: hdfs://localhost:9000/user/hadoop/input/MARCH-2021.xlsx
Reading file: hdfs://localhost:9000/user/hadoop/input/MARCH-2022.xlsx
Reading file: hdfs://localhost:9000/user/hadoop/input/MAY-2021.xlsx
Reading file: hdfs://localhost:9000/user/hadoop/input/NOVEMBER-2020.xlsx
Rea