## Data Partition

##### Import necessary modules

In [1]:
from pyspark.sql import SparkSession
from utils.Constants import Constants
from pyspark.sql.functions import month
from os.path import abspath
from pyspark.sql.functions import month, year, concat_ws

##### Configure the hadoop dir path

In [6]:
BASE_PATH = 'hdfs://localhost:9000/user/hadoop'
CLEANED_PATH = f'{BASE_PATH}/cleaned'
PARTITION_PATH = f'{BASE_PATH}/partitions'

print('cleand path {}'.format(CLEANED_PATH))

cleand path hdfs://localhost:9000/user/hadoop/cleaned


##### Read the data that we have cleaned

In [7]:
# Initialize Spark session
spark = SparkSession.builder \
    .appName("Read CSV from HDFS") \
    .getOrCreate()

# Path to the CSV file on HDFS
file_path = f'{CLEANED_PATH}/cleaned_data.csv'
schema = Constants.WEATHER_DATA_SCHEMA

# Read the CSV file into a DataFrame
df = spark.read.csv(file_path, header=True , schema=schema)

df.printSchema()
df.show(5)


root
 |-- date: date (nullable = true)
 |-- time: string (nullable = true)
 |-- water_content(m3/m3): float (nullable = true)
 |-- solar_radiation(w/m2): float (nullable = true)
 |-- rain(mm): float (nullable = true)
 |-- temperature(celcius): float (nullable = true)
 |-- rh(%): float (nullable = true)
 |-- wind_speed(m/s): float (nullable = true)
 |-- gust_speed(m/s): float (nullable = true)
 |-- wind_direction(degree): float (nullable = true)
 |-- dew_point(celcius): float (nullable = true)

+----------+--------+--------------------+---------------------+--------+--------------------+-----+---------------+---------------+----------------------+------------------+
|      date|    time|water_content(m3/m3)|solar_radiation(w/m2)|rain(mm)|temperature(celcius)|rh(%)|wind_speed(m/s)|gust_speed(m/s)|wind_direction(degree)|dew_point(celcius)|
+----------+--------+--------------------+---------------------+--------+--------------------+-----+---------------+---------------+-------------------

24/12/07 14:24:35 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


##### Let partition by date , becasue the col. date is not having high cardinality.

In [8]:
warehouse_location = abspath('spark-warehouse')
schema = Constants.WEATHER_DATA_SCHEMA

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("spark.sql.catalogImplementation", "hive") \
    .enableHiveSupport() \
    .getOrCreate()

# Read the CSV file with the defined schema
file_path = f'{CLEANED_PATH}/cleaned_data.csv'
schema = Constants.WEATHER_DATA_SCHEMA

# Read the CSV file into a DataFrame
data = spark.read.csv(file_path, header=True , schema=schema)
spark.sql("CREATE DATABASE IF NOT EXISTS mydb")

data = data \
    .withColumn("year_month", concat_ws("-", year(data["date"]), month(data["date"]))) \
    .orderBy("date", "time")

data.write \
    .format("parquet") \
    .mode("overwrite") \
    .partitionBy("year_month") \
    .saveAsTable("mydb.weather_data_partitioned_by_month")

print(f"Data has been partitioned by month and saved to Hive table 'mydb.weather_data_partitioned_by_month'.")


24/12/07 14:24:35 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.

Data has been partitioned by month and saved to Hive table 'mydb.weather_data_partitioned_by_month'.


                                                                                

##### Describe the table schema

In [12]:
warehouse_location = abspath('spark-warehouse')

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("spark.sql.catalogImplementation", "hive") \
    .enableHiveSupport() \
    .getOrCreate()

spark.sql("DESCRIBE mydb.weather_data_partitioned_by_month").show()

+--------------------+---------+-------+
|            col_name|data_type|comment|
+--------------------+---------+-------+
|                date|     date|   NULL|
|                time|   string|   NULL|
|water_content(m3/m3)|    float|   NULL|
|solar_radiation(w...|    float|   NULL|
|            rain(mm)|    float|   NULL|
|temperature(celcius)|    float|   NULL|
|               rh(%)|    float|   NULL|
|     wind_speed(m/s)|    float|   NULL|
|     gust_speed(m/s)|    float|   NULL|
|wind_direction(de...|    float|   NULL|
|  dew_point(celcius)|    float|   NULL|
|          year_month|   string|   NULL|
|# Partition Infor...|         |       |
|          # col_name|data_type|comment|
|          year_month|   string|   NULL|
+--------------------+---------+-------+



24/12/07 14:24:43 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


##### Test Load data

In [15]:
result = spark.sql("""
SELECT * 
FROM mydb.weather_data_partitioned_by_month
""")
result.show(5)


+----------+--------+--------------------+---------------------+--------+--------------------+-----+---------------+---------------+----------------------+------------------+----------+
|      date|    time|water_content(m3/m3)|solar_radiation(w/m2)|rain(mm)|temperature(celcius)|rh(%)|wind_speed(m/s)|gust_speed(m/s)|wind_direction(degree)|dew_point(celcius)|year_month|
+----------+--------+--------------------+---------------------+--------+--------------------+-----+---------------+---------------+----------------------+------------------+----------+
|2021-01-01|15:35:00|              0.3393|                221.0|     0.0|               26.57| 60.9|            1.7|            4.0|                 331.0|             18.43|    2021-1|
|2021-01-01|01:25:00|              0.3371|                  1.0|     0.0|               23.45| 63.4|            1.3|            3.7|                  21.0|             16.12|    2021-1|
|2021-01-01|02:50:00|              0.3371|                  1.0|     0

In [17]:
spark.sql("SHOW TABLES IN mydb").show()

+---------+--------------------+-----------+
|namespace|           tableName|isTemporary|
+---------+--------------------+-----------+
|     mydb|weather_data_part...|      false|
+---------+--------------------+-----------+

