* Import necessary modules

In [113]:
import glob
from pyspark.sql import SparkSession
import pandas as pd
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")
%matplotlib inline

* Configure the base path

In [12]:
# Root HDFS directory that the Excel workbooks are read from.
base_path = "hdfs://localhost:9000/user/hadoop/input"

* Function to create a Spark session that connects to Hadoop

In [13]:
def create_spark_session():
    """
    Build (or reuse) the SparkSession used by this notebook.

    The session is named "Weather Data Combination", requests the
    spark-excel package so .xlsx files can be read, and sets driver and
    executor memory to 2g each, shuffle partitions to 2 and the driver's
    maximum result size to 1g.

    Returns:
        The SparkSession returned by ``getOrCreate()``.
    """
    settings = {
        "spark.jars.packages": "com.crealytics:spark-excel_2.12:0.13.7",
        "spark.driver.memory": "2g",
        "spark.executor.memory": "2g",
        "spark.sql.shuffle.partitions": "2",
        "spark.driver.maxResultSize": "1g",
    }

    builder = SparkSession.builder.appName("Weather Data Combination")
    # Apply the settings in the order they are listed above.
    for key, value in settings.items():
        builder = builder.config(key, value)
    return builder.getOrCreate()

* Create a function to load an Excel file from Hadoop (HDFS)

In [110]:
def load_excel_files(spark, file_path):
    """
    Read one Excel workbook into a Spark DataFrame via spark-excel.

    Args:
        spark: Active SparkSession whose reader is used.
        file_path: Location of the workbook, passed straight to ``load``.

    Returns:
        The DataFrame produced by the reader. Data is taken from the
        'RUA Data' sheet starting at cell A6, without a header row.
    """
    reader_options = (
        ("header", "false"),
        ("dataAddress", "'RUA Data'!A6"),
        ("maxRowsInMemory", 1000),
        ("treatEmptyValuesAsNulls", "true"),
    )

    reader = spark.read.format("com.crealytics.spark.excel")
    for option_name, option_value in reader_options:
        reader = reader.option(option_name, option_value)
    return reader.load(file_path)

* Test reading a single .xlsx file

In [111]:
# Smoke test: open the session and preview one workbook before loading the rest.
spark = create_spark_session()
hdfs_path = f'{base_path}/APRIL-2021.xlsx'
df = load_excel_files(spark=spark, file_path=hdfs_path)
df.show(n=5)

+---+--------+--------+------+---+---+-----+----+---+---+----+-----+
|_c0|     _c1|     _c2|   _c3|_c4|_c5|  _c6| _c7|_c8|_c9|_c10| _c11|
+---+--------+--------+------+---+---+-----+----+---+---+----+-----+
|  1|21/04/01|00:00:00|0.2534|  1|  0|28.12|80.6|  0|0.3| 149| 24.5|
|  2|21/04/01|00:05:00|0.2532|  1|  0|28.02|  81|  0|  0| 215|24.49|
|  3|21/04/01|00:10:00|0.2524|  1|  0|28.07|  81|  0|1.3| 170|24.53|
|  4|21/04/01|00:15:00|0.2524|  1|  0| 28.1|80.8|  0|1.7| 166|24.52|
|  5|21/04/01|00:20:00|0.2524|  1|  0|28.07|80.8|0.3|2.7| 181|24.49|
+---+--------+--------+------+---+---+-----+----+---+---+----+-----+
only showing top 5 rows



* Build the list of all data files that we will work with

In [25]:
# Full HDFS path for every workbook, in the order the names are listed here.
file_paths = [
    f"{base_path}/{workbook}"
    for workbook in (
        "APRIL-2021.xlsx",
        "APRIL-2022.xlsx",
        "AUGUST-2021.xlsx",
        "DECEMBER-2020.xlsx",
        "DECEMBER-2021.xlsx",
        "FEBRUARY-2021.xlsx",
        "FEBRUARY-2022.xlsx",
        "JANUARY-2021.xlsx",
        "JANUARY-2022.xlsx",
        "JULY-2021.xlsx",
        "MARCH-2021.xlsx",
        "MARCH-2022.xlsx",
        "MAY-2021.xlsx",
        "NOVEMBER-2020.xlsx",
        "NOVEMBER-2021.xlsx",
        "OCTOBER-2020.xlsx",
        "OCTOBER-2021.xlsx",
        "SEPTEMBER-2020.xlsx",
        "SEPTEMBER-2021.xlsx",
        "jUNE-2021.xlsx",
    )
]
print(file_paths)


['hdfs://localhost:9000/user/hadoop/input/APRIL-2021.xlsx', 'hdfs://localhost:9000/user/hadoop/input/APRIL-2022.xlsx', 'hdfs://localhost:9000/user/hadoop/input/AUGUST-2021.xlsx', 'hdfs://localhost:9000/user/hadoop/input/DECEMBER-2020.xlsx', 'hdfs://localhost:9000/user/hadoop/input/DECEMBER-2021.xlsx', 'hdfs://localhost:9000/user/hadoop/input/FEBRUARY-2021.xlsx', 'hdfs://localhost:9000/user/hadoop/input/FEBRUARY-2022.xlsx', 'hdfs://localhost:9000/user/hadoop/input/JANUARY-2021.xlsx', 'hdfs://localhost:9000/user/hadoop/input/JANUARY-2022.xlsx', 'hdfs://localhost:9000/user/hadoop/input/JULY-2021.xlsx', 'hdfs://localhost:9000/user/hadoop/input/MARCH-2021.xlsx', 'hdfs://localhost:9000/user/hadoop/input/MARCH-2022.xlsx', 'hdfs://localhost:9000/user/hadoop/input/MAY-2021.xlsx', 'hdfs://localhost:9000/user/hadoop/input/NOVEMBER-2020.xlsx', 'hdfs://localhost:9000/user/hadoop/input/NOVEMBER-2021.xlsx', 'hdfs://localhost:9000/user/hadoop/input/OCTOBER-2020.xlsx', 'hdfs://localhost:9000/user/hadoo

### Convert each Excel file to a pandas DataFrame and append it to a list

In [60]:
from pyspark.sql import SparkSession
import pandas as pd


# Pandas DataFrames, one per workbook that was read successfully.
excl_list = []
# Paths that could not be read, kept so that missing files are not overlooked
# once the per-file messages have scrolled past.
failed_files = []

# Read Excel files
for file in file_paths:
    try:
        print(f"Reading file: {file}")
        # Read the file into a Spark DataFrame
        spark_df = load_excel_files(spark=spark, file_path=file)

        # Convert Spark DataFrame to Pandas DataFrame
        excl_list.append(spark_df.toPandas())
    except Exception as e:
        # Best effort: keep going with the remaining files, but remember the
        # failure so it shows up in the summary below.
        print(f"Error reading {file}: {e}")
        failed_files.append(file)

# Summarise the run so a partial load is obvious before the frames are combined.
print(f"Loaded {len(excl_list)} of {len(file_paths)} files")
if failed_files:
    print(f"Failed to read: {failed_files}")


Reading file: hdfs://localhost:9000/user/hadoop/input/APRIL-2021.xlsx
Reading file: hdfs://localhost:9000/user/hadoop/input/APRIL-2022.xlsx
Reading file: hdfs://localhost:9000/user/hadoop/input/AUGUST-2021.xlsx
Reading file: hdfs://localhost:9000/user/hadoop/input/DECEMBER-2020.xlsx
Reading file: hdfs://localhost:9000/user/hadoop/input/DECEMBER-2021.xlsx
Reading file: hdfs://localhost:9000/user/hadoop/input/FEBRUARY-2021.xlsx
Reading file: hdfs://localhost:9000/user/hadoop/input/FEBRUARY-2022.xlsx
Reading file: hdfs://localhost:9000/user/hadoop/input/JANUARY-2021.xlsx
Reading file: hdfs://localhost:9000/user/hadoop/input/JANUARY-2022.xlsx
Reading file: hdfs://localhost:9000/user/hadoop/input/JULY-2021.xlsx
Reading file: hdfs://localhost:9000/user/hadoop/input/MARCH-2021.xlsx
Reading file: hdfs://localhost:9000/user/hadoop/input/MARCH-2022.xlsx
Reading file: hdfs://localhost:9000/user/hadoop/input/MAY-2021.xlsx
Reading file: hdfs://localhost:9000/user/hadoop/input/NOVEMBER-2020.xlsx
Rea

### Combine the list into a single DataFrame and preview it

In [98]:
# Stack the per-file frames row-wise and renumber the index from zero.
combined_df = pd.concat(objs=excl_list, axis=0, ignore_index=True)
# Preview the first five rows
combined_df.head(n=5)

Unnamed: 0,_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8,_c9,_c10,_c11,_c12
0,1,21/04/01,00:00:00,0.2534,1,0,28.12,80.6,0.0,0.3,149,24.5,
1,2,21/04/01,00:05:00,0.2532,1,0,28.02,81.0,0.0,0.0,215,24.49,
2,3,21/04/01,00:10:00,0.2524,1,0,28.07,81.0,0.0,1.3,170,24.53,
3,4,21/04/01,00:15:00,0.2524,1,0,28.1,80.8,0.0,1.7,166,24.52,
4,5,21/04/01,00:20:00,0.2524,1,0,28.07,80.8,0.3,2.7,181,24.49,


#### Let's check which columns contain null values. One column appears to be entirely null, so let's drop it.
#### Also drop the column (_c0) that we do not need.

In [99]:
# Null count per column, followed by the total row count: a column whose null
# count equals the row count is completely empty.
print(combined_df.isnull().sum())
# The row count is taken from the frame itself (same value as the length of any
# column), so this line still works after '_c12' has been dropped.
print('len_c12' , len(combined_df))
# Drop the all-null column (_c12) and the unneeded column (_c0).
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell safe to re-run: a second run no longer fails with a KeyError.
combined_df = combined_df.drop(columns=['_c12', '_c0'], errors='ignore')
combined_df.head()

_c0          0
_c1          0
_c2          0
_c3          0
_c4          0
_c5          0
_c6          0
_c7          0
_c8          0
_c9          0
_c10         0
_c11         0
_c12    160438
dtype: int64
len_c12 160438


Unnamed: 0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8,_c9,_c10,_c11
0,21/04/01,00:00:00,0.2534,1,0,28.12,80.6,0.0,0.3,149,24.5
1,21/04/01,00:05:00,0.2532,1,0,28.02,81.0,0.0,0.0,215,24.49
2,21/04/01,00:10:00,0.2524,1,0,28.07,81.0,0.0,1.3,170,24.53
3,21/04/01,00:15:00,0.2524,1,0,28.1,80.8,0.0,1.7,166,24.52
4,21/04/01,00:20:00,0.2524,1,0,28.07,80.8,0.3,2.7,181,24.49


### Renaming certain columns

In [96]:
# Give the positional column names (_c1 .. _c11) descriptive labels.
df = combined_df.rename(
    mapper={
        # timestamp parts
        '_c1': 'date',
        '_c2': 'time',
        # measurements
        '_c3': 'water_content',
        '_c4': 'solar_radiation',
        '_c5': 'rain',
        '_c6': 'temperature',
        '_c7': 'rh',
        '_c8': 'wind_speed',
        '_c9': 'gust_speed',
        '_c10': 'wind_direction',
        '_c11': 'dew_point',
    },
    axis='columns',
)
df.head(n=5)


Unnamed: 0,date,time,water_content,solar_radiation,rain,temperature,rh,wind_speed,gust_speed,wind_direction,dew_point
0,21/04/01,00:00:00,0.2534,1,0,28.12,80.6,0.0,0.3,149,24.5
1,21/04/01,00:05:00,0.2532,1,0,28.02,81.0,0.0,0.0,215,24.49
2,21/04/01,00:10:00,0.2524,1,0,28.07,81.0,0.0,1.3,170,24.53
3,21/04/01,00:15:00,0.2524,1,0,28.1,80.8,0.0,1.7,166,24.52
4,21/04/01,00:20:00,0.2524,1,0,28.07,80.8,0.3,2.7,181,24.49


In [None]:
# Interquartile-range (IQR) outlier check on water_content.
# Coerce to numeric first: the loader does not request schema inference, so the
# column may arrive as text, on which quantile() and the comparisons below
# would fail. Values that cannot be parsed become NaN and are never flagged;
# data that is already numeric passes through unchanged.
water_content = pd.to_numeric(df['water_content'], errors='coerce')

Q1 = water_content.quantile(0.25)
Q3 = water_content.quantile(0.75)
IQR = Q3 - Q1

# Fences: anything more than 1.5 * IQR below Q1 or above Q3 counts as an outlier.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Find outliers based on IQR
outliers_iqr = df[(water_content < lower_bound) | (water_content > upper_bound)]
print("Outliers based on IQR:\n", outliers_iqr)


Outliers based on Z-score:
 Empty DataFrame
Columns: [date, time, water_content, z_score]
Index: []
