## Data Cleaning

##### Import necessary modules

In [26]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date, date_format

##### Configure the Hadoop (HDFS) directory paths

In [32]:
# Root of the HDFS workspace; each stage-specific location below is a
# sub-directory of it.
BASE_PATH = 'hdfs://localhost:9000/user/hadoop'

CLEANED_PATH = f'{BASE_PATH}/cleaned'  # where the cleaned data is written
MERGED_PATH = f'{BASE_PATH}/merged'    # where the combined raw CSV is read from
INPUT_PATH = f'{BASE_PATH}/inputs'     # not used by the cells shown here; only printed

# Echo the resolved locations so a wrong base path is spotted immediately.
path_summary = 'input path {} , merge path {}'
print(path_summary.format(INPUT_PATH, MERGED_PATH))

input path hdfs://localhost:9000/user/hadoop/inputs , merge path hdfs://localhost:9000/user/hadoop/merged


##### Read the data that we have merged

In [24]:
# Obtain a Spark session (getOrCreate returns the existing one if there is any)
spark = (
    SparkSession.builder
    .appName("Read CSV from HDFS")
    .getOrCreate()
)

# Location of the merged raw CSV on HDFS
file_path = f'{MERGED_PATH}/combined_raw_data.csv'

# Load the CSV: the first row supplies the column names and the column
# types are inferred from the data.
# NOTE(review): with inferSchema the "Time" column comes back as a full
# timestamp, and the saved output shows an unrelated date in it
# (2024-11-30 next to Date 2021-04-01). Presumably the raw values are
# time-of-day only -- confirm, and consider keeping only "HH:mm:ss".
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(file_path)
)

df.printSchema()


root
 |-- Line#: integer (nullable = true)
 |-- Date: string (nullable = true)
 |-- Time: timestamp (nullable = true)
 |-- Water Content (m3/m3): double (nullable = true)
 |-- Solar Radiation (W/m2): double (nullable = true)
 |-- Rain (mm): double (nullable = true)
 |-- Temperature (Celcius): double (nullable = true)
 |-- RH (%): double (nullable = true)
 |-- Wind Speed (m/s): double (nullable = true)
 |-- Gust Speed (m/s): double (nullable = true)
 |-- Wind Direction (Degree): double (nullable = true)
 |-- Dew Point (Celcius): double (nullable = true)



##### Drop the duplicate rows from the DataFrame

In [None]:
# Keep a single copy of every fully identical row (all columns are compared).
df_no_duplicates = df.dropDuplicates()

# Surplus copies removed by the deduplication. exceptAll is a multiset
# difference, so a row that occurs n times in df is listed n - 1 times here.
# (subtract is EXCEPT DISTINCT: it removes every row that also exists in
# df_no_duplicates, so it always produced an empty result.)
duplicates = df.exceptAll(df_no_duplicates)

# Show a sample of the duplicated rows
duplicates.show(5)

# Carry the deduplicated data forward so the remaining cleaning steps, and
# the file that is finally written, no longer contain the duplicates.
df = df_no_duplicates




+-----+----+----+---------------------+----------------------+---------+---------------------+------+----------------+----------------+-----------------------+-------------------+
|Line#|Date|Time|Water Content (m3/m3)|Solar Radiation (W/m2)|Rain (mm)|Temperature (Celcius)|RH (%)|Wind Speed (m/s)|Gust Speed (m/s)|Wind Direction (Degree)|Dew Point (Celcius)|
+-----+----+----+---------------------+----------------------+---------+---------------------+------+----------------+----------------+-----------------------+-------------------+
+-----+----+----+---------------------+----------------------+---------+---------------------+------+----------------+----------------+-----------------------+-------------------+





##### Convert the Date column from string to date type (yyyy-MM-dd)

In [27]:
# Parse the "yy/MM/dd" strings straight into a date column. A date value
# carries no display format of its own (Spark shows it as yyyy-MM-dd), so
# formatting it back to a string and parsing it again changed nothing.
df = df.withColumn("Date", to_date("Date", "yy/MM/dd"))

##### Verify the column has been modified

In [28]:
# Print the column names and types; "Date" should now be listed as date.
df.printSchema()

root
 |-- Line#: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Time: timestamp (nullable = true)
 |-- Water Content (m3/m3): double (nullable = true)
 |-- Solar Radiation (W/m2): double (nullable = true)
 |-- Rain (mm): double (nullable = true)
 |-- Temperature (Celcius): double (nullable = true)
 |-- RH (%): double (nullable = true)
 |-- Wind Speed (m/s): double (nullable = true)
 |-- Gust Speed (m/s): double (nullable = true)
 |-- Wind Direction (Degree): double (nullable = true)
 |-- Dew Point (Celcius): double (nullable = true)



##### Drop Line# Column 

In [29]:
# Remove the "Line#" column; all other columns are kept unchanged.
dropped_column = "Line#"
df = df.drop(dropped_column)

In [30]:
# Print the schema again to confirm that "Line#" is no longer present.
df.printSchema()

root
 |-- Date: date (nullable = true)
 |-- Time: timestamp (nullable = true)
 |-- Water Content (m3/m3): double (nullable = true)
 |-- Solar Radiation (W/m2): double (nullable = true)
 |-- Rain (mm): double (nullable = true)
 |-- Temperature (Celcius): double (nullable = true)
 |-- RH (%): double (nullable = true)
 |-- Wind Speed (m/s): double (nullable = true)
 |-- Gust Speed (m/s): double (nullable = true)
 |-- Wind Direction (Degree): double (nullable = true)
 |-- Dew Point (Celcius): double (nullable = true)



In [34]:
# Write the cleaned rows to HDFS as uncompressed CSV with a header row,
# overwriting whatever a previous run left at the same location.
cleaned_output_path = f'{CLEANED_PATH}/cleaned_data.csv'
(
    df.write
    .mode("overwrite")
    .option("header", "true")
    .option("compression", "none")
    .csv(cleaned_output_path)
)

                                                                                

##### Verify the file was written successfully

In [37]:
# Obtain a Spark session (getOrCreate returns the existing one if there is any)
spark = (
    SparkSession.builder
    .appName("Read CSV from HDFS")
    .getOrCreate()
)

# Location of the cleaned output on HDFS
file_path = f'{CLEANED_PATH}/cleaned_data.csv'

# Read the written data back with a header row and inferred column types.
# Note that this rebinds df to the data loaded from storage.
df = spark.read.options(header=True, inferSchema=True).csv(file_path)

# Preview the first rows, then list the inferred column types.
df.show(5)
df.printSchema()


+----------+-------------------+---------------------+----------------------+---------+---------------------+------+----------------+----------------+-----------------------+-------------------+
|      Date|               Time|Water Content (m3/m3)|Solar Radiation (W/m2)|Rain (mm)|Temperature (Celcius)|RH (%)|Wind Speed (m/s)|Gust Speed (m/s)|Wind Direction (Degree)|Dew Point (Celcius)|
+----------+-------------------+---------------------+----------------------+---------+---------------------+------+----------------+----------------+-----------------------+-------------------+
|2021-04-01|2024-11-30 00:05:00|               0.2532|                   1.0|      0.0|                28.02|  81.0|             0.0|             0.0|                  215.0|              24.49|
|2021-04-01|2024-11-30 00:10:00|               0.2524|                   1.0|      0.0|                28.07|  81.0|             0.0|             1.3|                  170.0|              24.53|
|2021-04-01|2024-11-30 00