## Data Partition

##### Import necessary modules

In [8]:
from pyspark.sql import SparkSession
from utils.Constants import Constants
from pyspark.sql import SparkSession
from pyspark.sql.functions import year, month, concat_ws, col
from pyspark.sql.types import StringType,DateType

##### Configure the hadoop dir path and hive location

In [9]:
# Root HDFS URI under which both the cleaned data and the Hive warehouse live
BASE_PATH = 'hdfs://localhost:8020/user'
# Warehouse directory, relative to BASE_PATH
HIVE_DIR = 'hive/warehouse'

# Absolute locations derived from the two settings above
CLEANED_PATH = BASE_PATH + '/hadoop/cleaned'
HIVE_LOCATION = BASE_PATH + '/' + HIVE_DIR

# Echo the resolved locations so the reader can verify them at a glance
summary_template = 'cleaned path and hive location {} , {}'
print(summary_template.format(CLEANED_PATH, HIVE_LOCATION))

cleaned path and hive location hdfs://localhost:8020/user/hadoop/cleaned , hdfs://localhost:8020/user/hive/warehouse


##### Read the data that we have cleaned

In [10]:
# Obtain a Spark session for reading the cleaned dataset
session_builder = SparkSession.builder.appName("Read CSV from HDFS")
spark = session_builder.getOrCreate()

# Schema to enforce on read, and the HDFS location of the cleaned CSV
schema = Constants.WEATHER_DATA_SCHEMA
file_path = f'{CLEANED_PATH}/cleaned_data.csv'

# Load the CSV with a header row and the explicit schema (no inference)
df = spark.read.csv(path=file_path, schema=schema, header=True)

# Quick sanity check: column types and the first few rows
df.printSchema()
df.show(5)

root
 |-- date: string (nullable = true)
 |-- time: string (nullable = true)
 |-- water_content: float (nullable = true)
 |-- solar_radiation: float (nullable = true)
 |-- rain: float (nullable = true)
 |-- temperature: float (nullable = true)
 |-- rh: float (nullable = true)
 |-- wind_speed: float (nullable = true)
 |-- gust_speed: float (nullable = true)
 |-- wind_direction: float (nullable = true)
 |-- dew_point: float (nullable = true)

+----------+--------+-------------+---------------+----+-----------+----+----------+----------+--------------+---------+
|      date|    time|water_content|solar_radiation|rain|temperature|  rh|wind_speed|gust_speed|wind_direction|dew_point|
+----------+--------+-------------+---------------+----+-----------+----+----------+----------+--------------+---------+
|2020-10-23|01:15:00|       0.3069|            1.0| 0.0|      25.77|92.4|       0.0|       0.3|         277.0|    24.47|
|2020-10-23|01:20:00|       0.3066|            1.0| 0.0|      25.74|92.

25/02/01 16:53:35 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


##### Let's partition by month (a `year_month` column derived from `date`), because it does not have high cardinality.

In [11]:
# Build (or reuse) a Spark session with Hive support.
# NOTE: getOrCreate() returns the already-running session when one exists; Spark
# then warns that "only runtime SQL configurations will take effect", so the
# non-runtime settings below are honoured only if this call is the one that
# actually creates the session.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", HIVE_LOCATION)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "2g")
    .config("spark.sql.shuffle.partitions", "50")
    .config("spark.memory.fraction", "0.7")
    .enableHiveSupport()
    .getOrCreate()
)

# Single source of truth for the target database, table and storage location,
# so the path written to and the LOCATION registered in Hive cannot drift apart.
database_name = "weatherdb"
table_name = "weather_data_partitioned_by_month"
qualified_table = f"{database_name}.{table_name}"
table_location = f"{HIVE_LOCATION}/{table_name}"

# Create database if it doesn't exist
spark.sql(f"CREATE DATABASE IF NOT EXISTS {database_name}")
spark.sql(f"USE {database_name}")

# Read the cleaned data
file_path = f"{CLEANED_PATH}/cleaned_data.csv"
data = spark.read.csv(file_path, header=True, schema=Constants.WEATHER_DATA_SCHEMA)

# Cast columns to appropriate types
data = data.withColumn("date", col("date").cast(DateType()))
data = data.withColumn("time", col("time").cast(StringType()))

# Add a year_month column for partitioning.
# month() yields an integer, so months 1-9 are not zero-padded (e.g. "2021-1");
# kept as-is so that existing partition names stay stable.
data = data.withColumn("year_month", concat_ws("-", year(data["date"]), month(data["date"])))

# Save the data as Parquet, one sub-directory per year_month value.
# "overwrite" replaces whatever was previously stored at table_location.
data.write \
    .format("parquet") \
    .mode("overwrite") \
    .partitionBy("year_month") \
    .save(table_location)

# Column list for the Hive DDL; the partition column is declared separately
# in PARTITIONED BY, so it is excluded here.
schema = ", ".join(
    f"{field.name} {field.dataType.simpleString()}"
    for field in data.schema.fields
    if field.name != "year_month"
)

# Register the Parquet files as a Hive table with explicit schema
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS {qualified_table} (
        {schema}
    )
    USING parquet
    PARTITIONED BY (year_month STRING)
    LOCATION '{table_location}'
""")

# Refresh the table metadata to recognize the partitions
spark.sql(f"MSCK REPAIR TABLE {qualified_table}")

print(f"Data has been partitioned by month and saved to Hive table '{qualified_table}'.")

25/02/01 16:53:35 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
25/02/01 16:53:36 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
25/02/01 16:53:36 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
25/02/01 16:53:36 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
25/02/01 16:53:36 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
25/02/01 16:53:36 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 63.33% for 12 writers
25/02/01 16:53:36 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap me

Data has been partitioned by month and saved to Hive table 'weatherdb.weather_data_partitioned_by_month'.


                                                                                

##### Describe the table schema

In [12]:

# Obtain the Hive-enabled session, configured step by step
hive_builder = SparkSession.builder.appName("Python Spark SQL Hive integration example")
hive_builder = hive_builder.config("spark.sql.warehouse.dir", HIVE_LOCATION)
hive_builder = hive_builder.config("hive.metastore.uris", "thrift://hive-metastore:9083")
spark = hive_builder.enableHiveSupport().getOrCreate()

# Show the columns, types and partition information of the registered table
table_description = spark.sql("DESCRIBE weatherdb.weather_data_partitioned_by_month")
table_description.show()

25/02/01 16:53:38 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


+--------------------+---------+-------+
|            col_name|data_type|comment|
+--------------------+---------+-------+
|                date|     date|   NULL|
|                time|   string|   NULL|
|       water_content|    float|   NULL|
|     solar_radiation|    float|   NULL|
|                rain|    float|   NULL|
|         temperature|    float|   NULL|
|                  rh|    float|   NULL|
|          wind_speed|    float|   NULL|
|          gust_speed|    float|   NULL|
|      wind_direction|    float|   NULL|
|           dew_point|    float|   NULL|
|          year_month|   string|   NULL|
|# Partition Infor...|         |       |
|          # col_name|data_type|comment|
|          year_month|   string|   NULL|
+--------------------+---------+-------+



##### Test Load data

In [13]:
# Obtain the Hive-enabled session using the same warehouse location and
# metastore URI as the cells that created and described the table, so this
# cell resolves the table identically even when it is the one that creates
# the session. (The relative HIVE_DIR used previously is not a full HDFS URI.)
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.warehouse.dir", HIVE_LOCATION) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .enableHiveSupport() \
    .getOrCreate()

# Smoke test: read a few rows back through the Hive table definition
result = spark.sql("""
SELECT * 
FROM weatherdb.weather_data_partitioned_by_month
""")
result.show(5)


25/02/01 16:53:38 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


+----------+--------+-------------+---------------+----+-----------+----+----------+----------+--------------+---------+----------+
|      date|    time|water_content|solar_radiation|rain|temperature|  rh|wind_speed|gust_speed|wind_direction|dew_point|year_month|
+----------+--------+-------------+---------------+----+-----------+----+----------+----------+--------------+---------+----------+
|2020-11-01|00:00:00|       0.3062|            1.0| 0.0|      26.84|94.8|       0.3|       1.0|         152.0|    25.97|   2020-11|
|2020-11-01|00:05:00|       0.3066|            1.0| 0.0|      26.77|94.7|       0.0|       1.0|         152.0|    25.88|   2020-11|
|2020-11-01|00:10:00|       0.3066|            1.0| 0.0|      26.72|94.7|       0.3|       1.3|         154.0|    25.83|   2020-11|
|2020-11-01|00:15:00|       0.3066|            1.0| 0.0|      26.65|94.6|       0.0|       0.0|         152.0|    25.74|   2020-11|
|2020-11-01|00:20:00|       0.3066|            1.0| 0.0|      26.55|94.7|   

In [14]:
# List the tables registered in the weatherdb database
weatherdb_tables = spark.sql("SHOW TABLES IN weatherdb")
weatherdb_tables.show()

+---------+--------------------+-----------+
|namespace|           tableName|isTemporary|
+---------+--------------------+-----------+
|weatherdb|weather_data_part...|      false|
+---------+--------------------+-----------+

