## Data Cleaning

##### Import necessary modules

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date, date_format

##### Configure the hadoop dir path

In [2]:
BASE_PATH = 'hdfs://localhost:9000/user/hadoop'
INPUT_PATH = f'{BASE_PATH}/inputs'
MERGED_PATH = f'{BASE_PATH}/merged'
CLEANED_PATH = f'{BASE_PATH}/cleaned'

print('input path {} , merge path {}'.format(INPUT_PATH, MERGED_PATH))

input path hdfs://localhost:9000/user/hadoop/inputs , merge path hdfs://localhost:9000/user/hadoop/merged


##### Read the data that we have merged

In [8]:
# Initialize Spark session
spark = SparkSession.builder \
    .appName("Read CSV from HDFS") \
    .getOrCreate()

# Path to the CSV file on HDFS
file_path = f'{MERGED_PATH}/combined_raw_data.csv'

# Read the CSV file into a DataFrame
df = spark.read.csv(file_path, header=True, inferSchema=False)

df.printSchema()
df.show(5)


root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)

+----+--------+--------+------+---+---+-----+----+---+---+----+-----+
| _c0|     _c1|     _c2|   _c3|_c4|_c5|  _c6| _c7|_c8|_c9|_c10| _c11|
+----+--------+--------+------+---+---+-----+----+---+---+----+-----+
|3601|21/05/13|12:00:00| 0.278|233|  0|34.92|60.8|0.3|1.7| 125|26.24|
|3602|21/05/13|12:05:00|0.2776|751|  0|34.26|60.9|0.3|1.3| 124|25.65|
|3603|21/05/13|12:10:00|0.2776|963|  0|35.18|59.6|0.3|1.3| 114|26.15|
|3604|21/05/13|12:15:00|0.2776|956|  0| 35.8|57.5|0.3|1.3| 107|26.12|
|3605|21/05/13|12:20:00|0.2776|994|  0|35.98|58.8|0.3|1.3| 105|26.68|
+----+------

##### Let change the col. name for readable

In [16]:
from pyspark.sql import SparkSession

# Assuming 'df' is your DataFrame
# Rename columns
column_mapping = {
    "_c1": "date",
    "_c2": "time",
    "_c3": "water_content(m3/m3)",
    "_c4": "solar_radiation(w/m2)",
    "_c5": "rain(mm)",
    "_c6": "temperature(celcius)",
    "_c7": "rh(%)",
    "_c8": "wind_speed(m/s)",
    "_c9": "gust_speed(m/s)",
    "_c10": "wind_direction(degree)",
    "_c11": "dew_point(celcius)"
}

# Start with the original DataFrame
rename_cf = df

# Rename columns iteratively
for old_name, new_name in column_mapping.items():
    rename_cf = rename_cf.withColumnRenamed(old_name, new_name)

# Show the DataFrame schema after renaming
rename_cf.printSchema()
df.printSchema()


root
 |-- _c0: string (nullable = true)
 |-- date: string (nullable = true)
 |-- time: string (nullable = true)
 |-- water_content(m3/m3): string (nullable = true)
 |-- solar_radiation(w/m2): string (nullable = true)
 |-- rain(mm): string (nullable = true)
 |-- temperature(celcius): string (nullable = true)
 |-- rh(%): string (nullable = true)
 |-- wind_speed(m/s): string (nullable = true)
 |-- gust_speed(m/s): string (nullable = true)
 |-- wind_direction(degree): string (nullable = true)
 |-- dew_point(celcius): string (nullable = true)

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)



##### Change the format date from yy/MM/dd to dd-MM-yyyy

In [29]:
from pyspark.sql import functions as F

# Reformatting the date and time columns
df_transformed = rename_cf.withColumn("date", F.date_format(F.to_date("date", "yy/MM/dd"), "dd-MM-yyyy"))

df_transformed.show(5, truncate=False)


+----+----------+--------+--------------------+---------------------+--------+--------------------+-----+---------------+---------------+----------------------+------------------+
|_c0 |date      |time    |water_content(m3/m3)|solar_radiation(w/m2)|rain(mm)|temperature(celcius)|rh(%)|wind_speed(m/s)|gust_speed(m/s)|wind_direction(degree)|dew_point(celcius)|
+----+----------+--------+--------------------+---------------------+--------+--------------------+-----+---------------+---------------+----------------------+------------------+
|3601|13-05-2021|12:00:00|0.278               |233                  |0       |34.92               |60.8 |0.3            |1.7            |125                   |26.24             |
|3602|13-05-2021|12:05:00|0.2776              |751                  |0       |34.26               |60.9 |0.3            |1.3            |124                   |25.65             |
|3603|13-05-2021|12:10:00|0.2776              |963                  |0       |35.18               |5

##### Drop the dulpicate value from dataframe

In [20]:
# Drop duplicates from the DataFrame
df_no_duplicates = df_transformed.dropDuplicates()

# Find the duplicate rows by subtracting deduplicated data from the original
duplicates = df_transformed.subtract(df_no_duplicates)

# Show the duplicated rows
duplicates.show(5)


                                                                                

+---+----+----+--------------------+---------------------+--------+--------------------+-----+---------------+---------------+----------------------+------------------+
|_c0|date|time|water_content(m3/m3)|solar_radiation(w/m2)|rain(mm)|temperature(celcius)|rh(%)|wind_speed(m/s)|gust_speed(m/s)|wind_direction(degree)|dew_point(celcius)|
+---+----+----+--------------------+---------------------+--------+--------------------+-----+---------------+---------------+----------------------+------------------+
+---+----+----+--------------------+---------------------+--------+--------------------+-----+---------------+---------------+----------------------+------------------+



##### Drop _c0 Column 

In [22]:
# Drop the "_cO" column
drop_c0_colunm = df_no_duplicates.drop('_c0')

In [24]:
drop_c0_colunm.printSchema()

root
 |-- date: string (nullable = true)
 |-- time: string (nullable = true)
 |-- water_content(m3/m3): string (nullable = true)
 |-- solar_radiation(w/m2): string (nullable = true)
 |-- rain(mm): string (nullable = true)
 |-- temperature(celcius): string (nullable = true)
 |-- rh(%): string (nullable = true)
 |-- wind_speed(m/s): string (nullable = true)
 |-- gust_speed(m/s): string (nullable = true)
 |-- wind_direction(degree): string (nullable = true)
 |-- dew_point(celcius): string (nullable = true)



##### Write it into hadoop

In [25]:
drop_c0_colunm.write \
    .mode("overwrite") \
    .option("header", "true") \
    .option("compression", "none") \
    .csv(f'{CLEANED_PATH}/cleaned_data.csv')

                                                                                

##### Verify the file write successfully

In [28]:
# Initialize Spark session
spark = SparkSession.builder \
    .appName("Read CSV from HDFS") \
    .getOrCreate()

# Path to the CSV file on HDFS
file_path = f'{CLEANED_PATH}/cleaned_data.csv'

# Read the CSV file into a DataFrame
df = spark.read.csv(file_path, header=True)

df.show(5)
df.printSchema()


+----------+--------+--------------------+---------------------+--------+--------------------+-----+---------------+---------------+----------------------+------------------+
|      date|    time|water_content(m3/m3)|solar_radiation(w/m2)|rain(mm)|temperature(celcius)|rh(%)|wind_speed(m/s)|gust_speed(m/s)|wind_direction(degree)|dew_point(celcius)|
+----------+--------+--------------------+---------------------+--------+--------------------+-----+---------------+---------------+----------------------+------------------+
|15-05-2021|01:10:00|              0.2753|                    1|       0|               26.28| 89.5|              0|              0|               -888.88|             24.45|
|13-05-2021|04:55:00|               0.278|                    1|       0|               26.43| 94.8|              0|              0|               -888.88|             25.56|
|17-05-2021|04:45:00|              0.2739|                    1|       0|               27.16| 93.4|              0|         